Java: How to format the output written by MapReduce in Hadoop
I am trying to reverse the content of a file word by word. My program runs fine, but the output I get looks like this:
1 dwp
2 seviG
3 eht
4 tnerruc
5 gnikdrow
6 yrotcerid
7 ridkm
8 desU
9 ot
10 etaerc
I want the output to look like this:
dwp seviG eht tnerruc gnikdrow yrotcerid ridkm desU
ot etaerc
The code I am using:
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

public class Reproduce {

    public static int temp = 0;

    public static class ReproduceMap extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, Text> {
        private Text word = new Text();

        @Override
        public void map(LongWritable arg0, Text value,
                OutputCollector<IntWritable, Text> output, Reporter reporter)
                throws IOException {
            String line = value.toString().concat("\n");
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(new StringBuffer(tokenizer.nextToken()).reverse().toString());
                temp++;
                output.collect(new IntWritable(temp), word);
            }
        }
    }

    public static class ReproduceReduce extends MapReduceBase implements Reducer<IntWritable, Text, IntWritable, Text> {
        @Override
        public void reduce(IntWritable arg0, Iterator<Text> arg1,
                OutputCollector<IntWritable, Text> arg2, Reporter arg3)
                throws IOException {
            String word = arg1.next().toString();
            Text word1 = new Text();
            word1.set(word);
            arg2.collect(arg0, word1);
        }
    }

    public static void main(String[] args) throws Exception {
        // was new JobConf(WordCount.class): WordCount does not exist in this file
        JobConf conf = new JobConf(Reproduce.class);
        conf.setJobName("wordcount");
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ReproduceMap.class);
        conf.setReducerClass(ReproduceReduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}
How can I modify the output, rather than writing another Java program to do it? Thanks in advance.

You can use NullWritable as the output value. NullWritable is just a placeholder, since you do not want the numbers to appear as part of the output. I have modified your reducer class below. Note: an import statement for NullWritable needs to be added.
public static class ReproduceReduce extends MapReduceBase implements Reducer<IntWritable, Text, Text, NullWritable> {
    @Override
    public void reduce(IntWritable arg0, Iterator<Text> arg1,
            OutputCollector<Text, NullWritable> arg2, Reporter arg3)
            throws IOException {
        String word = arg1.next().toString();
        Text word1 = new Text();
        word1.set(word);
        // NullWritable is a singleton; use get() instead of the (private) constructor
        arg2.collect(word1, NullWritable.get());
    }
}
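Since the map output types (IntWritable and Text) now differ from the final output types (Text and NullWritable), the driver also has to declare both pairs. A minimal sketch of the JobConf lines to adjust in main, assuming everything else stays as in the question:

conf.setMapOutputKeyClass(IntWritable.class);   // intermediate key emitted by the mapper
conf.setMapOutputValueClass(Text.class);        // intermediate value emitted by the mapper
conf.setOutputKeyClass(Text.class);             // final key written by the reducer
conf.setOutputValueClass(NullWritable.class);   // final value: placeholder, nothing printed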
In the Mapper, the key is incremented for every word, so every word is processed as a separate key-value pair. The steps below should solve the problem:

1) In the Mapper, just remove temp++, so that all the reversed words have the key 0 (temp = 0).

2) The Reducer then receives the key 0 and the list of reversed strings.
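A minimal sketch of the mapper with the increment removed, assuming the rest of the job stays the same: every reversed word is emitted under the constant key 0, so the reducer receives one group containing all the words.

public static class ReproduceMap extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, Text> {
    private final IntWritable zero = new IntWritable(0); // constant key shared by all words
    private Text word = new Text();

    @Override
    public void map(LongWritable arg0, Text value,
            OutputCollector<IntWritable, Text> output, Reporter reporter)
            throws IOException {
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(new StringBuffer(tokenizer.nextToken()).reverse().toString());
            // no temp++ here: every word goes out with key 0
            output.collect(zero, word);
        }
    }
}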
In the reducer, set the key to NullWritable and write the output. We can customize the output by writing a custom FileOutputFormat class. What you can try is to take a constant key (or simply NullWritable) and pass it as the key, with the whole line as the value (you can reverse it in the mapper class or in the reducer class). So your reducer will receive a constant key (or a placeholder, if you use NullWritable as the key) and the whole line as the value. Now you can simply reverse the line and write it to the output file. By not using the temp key you avoid writing unwanted numbers to the output file. Below is a simple example that shows how to use a custom FileOutputFormat; a sketch of a mapper for this approach follows after the record writer code.
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyTextOutputFormat extends FileOutputFormat<Text, List<IntWritable>> {
    @Override
    public RecordWriter<Text, List<IntWritable>> getRecordWriter(TaskAttemptContext arg0)
            throws IOException, InterruptedException {
        // get the configured output directory
        Path path = FileOutputFormat.getOutputPath(arg0);
        // create the full path: the output directory plus our file name
        Path fullPath = new Path(path, "result.txt");
        // create the file in the file system
        FileSystem fs = path.getFileSystem(arg0.getConfiguration());
        FSDataOutputStream fileOut = fs.create(fullPath, arg0);
        // create our record writer over the new file
        return new MyCustomRecordWriter(fileOut);
    }
}
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class MyCustomRecordWriter extends RecordWriter<Text, List<IntWritable>> {
    private DataOutputStream out;

    public MyCustomRecordWriter(DataOutputStream stream) {
        out = stream;
        try {
            out.writeBytes("results:\r\n");
        } catch (IOException ex) {
            // ignore a failed header write; later writes will surface the error
        }
    }

    @Override
    public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        // close our file
        out.close();
    }

    @Override
    public void write(Text arg0, List<IntWritable> arg1) throws IOException, InterruptedException {
        // write out our key
        out.writeBytes(arg0.toString() + ": ");
        // loop through all values associated with our key, separated by commas
        for (int i = 0; i < arg1.size(); i++) {
            if (i > 0) {
                out.writeBytes(",");
            }
            out.writeBytes(String.valueOf(arg1.get(i)));
        }
        out.writeBytes("\r\n");
    }
}
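As referenced above, here is a sketch of a mapper for the constant-key approach, using the question's old mapred API (the class name ReverseLineMap is made up for illustration): it reverses every word of the line and emits NullWritable as the key, so TextOutputFormat writes only the value and no numbers appear in the output.

public static class ReverseLineMap extends MapReduceBase implements Mapper<LongWritable, Text, NullWritable, Text> {
    private Text reversed = new Text();

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<NullWritable, Text> output, Reporter reporter)
            throws IOException {
        // reverse each word of the line while keeping the word order
        StringBuilder line = new StringBuilder();
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            if (line.length() > 0) {
                line.append(' ');
            }
            line.append(new StringBuffer(tokenizer.nextToken()).reverse());
        }
        reversed.set(line.toString());
        // NullWritable key: TextOutputFormat then writes only the value
        output.collect(NullWritable.get(), reversed);
    }
}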