Hadoop: WordCount code does not skip two words

This code counts words and is supposed to skip two given words ("in" and "of") in the file. Please explain why these words are not skipped:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class skipwc2 { // outer class: per the follow-up comment, all three classes are nested in skipwc2

    class skipwc_mapper extends
            Mapper<LongWritable, Text, Text, IntWritable> {

        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            String line = value.toString();
            StringTokenizer t = new StringTokenizer(line);
            Text word = null;
            while (t.hasMoreTokens()) {
                word = new Text(t.nextToken());
                context.write(word, new IntWritable(1));
            }
        }
    }

    class skipwc_reducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int tot = 0;
            if (key.toString() != "in" && key.toString() != "of") {
                while (values.iterator().hasNext()) {
                    tot += values.iterator().next().get();
                }
                context.write(key, new IntWritable(tot));
            }
        }
    }

    public static class skipwc_runner {
        public static void main(String[] args) throws IOException,
                InterruptedException, ClassNotFoundException {
            Configuration conf = new Configuration();
            Job job = new Job(conf);
            job.setJarByClass(skipwc_runner.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            job.setMapperClass(skipwc_mapper.class);
            job.setReducerClass(skipwc_reducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
}

Use the equals method to compare strings, like:

if (!"in".equals(key.toString()) && !"of".equals(key.toString()))

In Java, == and != compare object references, not contents. key.toString() builds a fresh String each time, so key.toString() != "in" is true even when the word really is "in", and the filter never triggers.
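To see why the original check never matches, here is a quick standalone illustration (a hypothetical snippet, not part of the original post):

// != compares object references; Text.toString() builds a fresh String,
// so a reference comparison against the literal "in" is always true.
public class EqualsDemo {
    public static void main(String[] args) {
        String fromText = new String("in");          // like Text.toString(): a new object
        System.out.println(fromText != "in");        // true  -- different references
        System.out.println(!"in".equals(fromText));  // false -- identical contents
    }
}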

Also, it would be better to skip "in" and "of" in the mapper rather than in the reducer: dropping the records before the sort-and-shuffle phase is much more efficient, so you avoid the extra IO. A sketch of such a mapper follows.
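For illustration, here is a minimal sketch of a mapper that applies the filter on the map side (the class name and the two stop words mirror the question's code; treat this as an assumption-laden sketch, not the poster's actual fix):

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch: drop "in" and "of" before emitting, so the skipped tokens
// never reach the sort-and-shuffle phase.
public class SkipWordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text(); // reuse the Writable instead of allocating per token

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer t = new StringTokenizer(value.toString());
        while (t.hasMoreTokens()) {
            String token = t.nextToken();
            // equals() compares string contents; == / != compare references
            if (!"in".equals(token) && !"of".equals(token)) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }
}

With this mapper in place, the reducer can simply sum the counts without any filtering.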

If skipwc is the package and all three classes are nested in one class skipwc2, then the command hadoop jar skipwc.jar skipwc.skipwc2.skipwc_runner /wordcount/UN.txt /wordcount/outdir throws a class-not-found error. That is most likely because the runtime name of a nested class uses $ rather than a dot, so the main class should be passed as skipwc.skipwc2$skipwc_runner instead.