Hadoop——解压缩zip文件
我有很多zip格式的压缩文件（共计数GB），我想写一个仅含map阶段（map-only）的MapReduce作业来解压缩它们。我的Mapper类如下所示：
import java.util.zip.*;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.OutputCollector;
import java.io.*;
public class DecompressMapper extends Mapper <LongWritable, Text, LongWritable, Text>
{
private static final int BUFFER_SIZE = 4096;
public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> output, Context context) throws IOException
{
FileSplit fileSplit = (FileSplit)context.getInputSplit();
String fileName = fileSplit.getPath().getName();
this.unzip(fileName, new File(fileName).getParent() + File.separator + "/test_poc");
}
public void unzip(String zipFilePath, String destDirectory) throws IOException {
File destDir = new File(destDirectory);
if (!destDir.exists()) {
destDir.mkdir();
}
ZipInputStream zipIn = new ZipInputStream(new FileInputStream(zipFilePath));
ZipEntry entry = zipIn.getNextEntry();
// iterates over entries in the zip file
while (entry != null) {
String filePath = destDirectory + File.separator + entry.getName();
if (!entry.isDirectory()) {
// if the entry is a file, extracts it
extractFile(zipIn, filePath);
} else {
// if the entry is a directory, make the directory
File dir = new File(filePath);
dir.mkdir();
}
zipIn.closeEntry();
entry = zipIn.getNextEntry();
}
zipIn.close();
}
private void extractFile(ZipInputStream zipIn, String filePath) throws IOException {
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(filePath));
byte[] bytesIn = new byte[BUFFER_SIZE];
int read = 0;
while ((read = zipIn.read(bytesIn)) != -1) {
bos.write(bytesIn, 0, read);
}
bos.close();
}
}
我的Mapper类似乎工作得不好：我没有在所需的目录中得到解压缩后的文件。感谢您的帮助，谢谢。

【回答】上面的代码表面上没有什么语法问题，但我们在编写MapReduce程序时需要小心：Hadoop使用的是一套完全不同的文件系统（HDFS），编写代码时必须考虑到这一点，并且绝不能混用MR1和MR2两套API。Hadoop文件系统本身没有解压缩文件的现成方法，但经过长时间的研究，我找到了直接在Hadoop文件系统中解压缩的办法：前提是先把zip文件复制到HDFS的某个位置，然后运行MapReduce作业。显然，Hadoop不理解zip这种输入格式，因此我们需要定制InputFormat和RecordReader，以便控制Mapper接收的键值对。注意，这个MapReduce作业会在单个Mapper上运行，因为我们在自定义Hadoop的RecordReader类时禁用了split方法（即让isSplitable返回false）。这样，MapReduce以文件名作为键、以解压后的文件内容作为值。在Reducer消费这些数据时，我把输出的value设为null，这样Reducer只保留解压后的内容；同时把Reducer数量设置为1，使所有结果都写入同一个part文件。我们都知道Hadoop自身不能处理zip文件，但Java可以：ZipInputStream能读取zip文件内容，ZipEntry能读取其中的条目。因此我们编写一个扩展了FileInputFormat的自定义ZipFileInputFormat类。
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class ZipFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /** Tolerance flag for read errors; see {@link #setLenient(boolean)}. */
    private static boolean isLenient = false;

    /**
     * ZIP archives cannot be split into independent ranges, so every file is
     * processed by exactly one mapper.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Supplies a {@link ZipFileRecordReader} to parse each input file.
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new ZipFileRecordReader();
    }

    /**
     * Enables or disables lenient handling of corrupt-archive errors in the
     * record reader.
     *
     * @param lenient true to ignore certain zip read failures
     */
    public static void setLenient(boolean lenient) {
        isLenient = lenient;
    }

    /** Returns whether zip read failures are tolerated. */
    public static boolean getLenient() {
        return isLenient;
    }
}
请注意，在作业（Job）类中，我们已将InputFormatClass配置为ZipFileInputFormat类，将OutputFormatClass配置为TextOutputFormat类。
将项目Maven化，保持依赖项原样即可运行代码。导出Jar文件并将其部署到Hadoop集群上，已在CDH 5.5的YARN上测试和部署。POM文件的内容如下：
<!-- Maven build for the zip-extraction MapReduce job. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mithun</groupId>
<artifactId>CustomisedMapperReducer</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>CustomisedMapperReducer</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Hadoop client APIs (Hadoop 2.6.0, MR2). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!-- Jackson JSON library (legacy org.codehaus artifact). -->
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.9.13</version>
</dependency>
<!-- Unit tests only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
4.0.0
com.mithun
CustomisedMapperReducer
0.0.1-SNAPSHOT
jar
CustomisedMapperReducer
http://maven.apache.org
UTF-8
org.apache.hadoop
hadoop-client
2.6.0
org.codehaus.jackson
jackson-mapper-asl
1.9.13
junit
junit
3.8.1
test
【评论】你是在哪个目录中获得输出的？——我没有在所需的目录中获得输出：上面的代码并不会解压缩文件，而是在运行MapReduce作业时指定的输出文件夹中产生了不需要的输出。有关其他解决方案，请参阅下文。
import java.io.IOException;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class ZipFileRecordReader extends RecordReader<Text, BytesWritable> {
    /** InputStream used to read the ZIP file from the FileSystem. */
    private FSDataInputStream fsin;
    /** ZIP file parser/decompresser layered over {@link #fsin}. */
    private ZipInputStream zip;
    /** Name of the current uncompressed file (entry) inside the archive. */
    private Text currentKey;
    /** Contents of the current uncompressed file, held fully in memory. */
    private BytesWritable currentValue;
    /** Used to report progress: 0 until the archive is exhausted, then 1. */
    private boolean isFinished = false;

    /**
     * Opens the ZIP file backing this split via the Hadoop FileSystem API.
     */
    @Override
    public void initialize(InputSplit inputSplit,
            TaskAttemptContext taskAttemptContext) throws IOException,
            InterruptedException {
        FileSplit split = (FileSplit) inputSplit;
        Configuration conf = taskAttemptContext.getConfiguration();
        Path path = split.getPath();
        FileSystem fs = path.getFileSystem(conf);
        // Open the stream
        fsin = fs.open(path);
        zip = new ZipInputStream(fsin);
    }

    /**
     * Decompresses the next ZipEntry and readies it for the Mapper. The
     * contents of each file are held *in memory* in a BytesWritable object.
     *
     * If the ZipFileInputFormat has been set to lenient (not the default),
     * certain exceptions are gracefully ignored to keep a larger job from
     * failing.
     *
     * BUG FIX (duplication): the original contained two byte-for-byte
     * identical read loops -- one behind an {@code endsWith(".zip")} test and
     * one for all other entries. The branch changed nothing, so both paths
     * now share {@link #readEntryContents()}.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        ZipEntry entry = null;
        try {
            entry = zip.getNextEntry();
        } catch (ZipException e) {
            if (ZipFileInputFormat.getLenient() == false)
                throw e;
        }
        // null entry means the archive (or a tolerated error) ended the scan
        if (entry == null) {
            isFinished = true;
            return false;
        }
        // Key is the file name stored inside the archive
        currentKey = new Text(entry.getName());
        byte[] contents = readEntryContents();
        if (contents == null) {
            // truncated entry tolerated in lenient mode
            return false;
        }
        zip.closeEntry();
        currentValue = new BytesWritable(contents);
        return true;
    }

    /**
     * Reads the current entry fully into memory.
     *
     * @return the uncompressed bytes, or null when an EOFException was
     *         swallowed because lenient mode is enabled
     * @throws IOException on read failure (non-lenient mode)
     */
    private byte[] readEntryContents() throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        while (true) {
            int bytesRead;
            try {
                bytesRead = zip.read(buffer, 0, buffer.length);
            } catch (EOFException e) {
                if (ZipFileInputFormat.getLenient() == false)
                    throw e;
                return null;
            }
            if (bytesRead <= 0)
                break;
            bos.write(buffer, 0, bytesRead);
        }
        return bos.toByteArray();
    }

    /**
     * Rather than calculating progress, we just keep it simple.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isFinished ? 1 : 0;
    }

    /**
     * Returns the current key (name of the zipped file).
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    /**
     * Returns the current value (contents of the zipped file).
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException,
            InterruptedException {
        return currentValue;
    }

    /**
     * Close quietly, ignoring any exceptions.
     * BUG FIX: null-guards added so close() is safe even when initialize()
     * was never called or failed part-way.
     */
    @Override
    public void close() throws IOException {
        try {
            if (zip != null)
                zip.close();
        } catch (Exception ignore) {
            // best-effort close; deliberately swallowed
        }
        try {
            if (fsin != null)
                fsin.close();
        } catch (Exception ignore) {
            // best-effort close; deliberately swallowed
        }
    }
}
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<Text, BytesWritable, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Text key, BytesWritable value, Context context)
throws IOException, InterruptedException {
String filename = key.toString();
// We only want to process .txt files
if (filename.endsWith(".txt") == false)
return;
// Prepare the content
String content = new String(value.getBytes(), "UTF-8");
context.write(new Text(content), one);
}
}
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
// context.write(key, new IntWritable(sum));
context.write(new Text(key), null);
}
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.saama.CustomisedMapperReducer.MyMapper;
import com.saama.CustomisedMapperReducer.MyReducer;
import com.saama.CustomisedMapperReducer.ZipFileInputFormat;
import com.saama.CustomisedMapperReducer.ZipFileRecordReader;
public class MyJob {
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = new Job(conf);
job.setJarByClass(MyJob.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setInputFormatClass(ZipFileInputFormat.class);
job.setOutputKeyClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
ZipFileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
<!-- Maven build for the zip-extraction MapReduce job. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mithun</groupId>
<artifactId>CustomisedMapperReducer</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>CustomisedMapperReducer</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Hadoop client APIs (Hadoop 2.6.0, MR2). -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!-- Jackson JSON library (legacy org.codehaus artifact). -->
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.9.13</version>
</dependency>
<!-- Unit tests only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>