MapReduce function with a JSON file and JSONParser

I am having some problems writing a MapReduce function. This is what I want to solve:

I have a JSON file containing about 1 million JSON objects that look like this:

 {"_id":3951,"title":"Two Family House (2000)","genres":["Drama"],"ratings":[{"userId":173,"rating":5},{"userId":195,"rating":5},{"userId":411,"rating":4},{"userId":593,"rating":2},{"userId":629,"rating":3},{"userId":830,"rating":3},{"userId":838,"rating":5},{"userId":850,"rating":4},{"userId":856,"rating":4},{"userId":862,"rating":5},{"userId":889,"rating":1},{"userId":928,"rating":5},{"userId":986,"rating":4},{"userId":1001,"rating":5},{"userId":1069,"rating":3},{"userId":1168,"rating":3},{"userId":1173,"rating":2},{"userId":1242,"rating":3},{"userId":1266,"rating":5},{"userId":1331,"rating":5},{"userId":1417,"rating":5},{"userId":1470,"rating":4},{"userId":1474,"rating":5},{"userId":1615,"rating":3},{"userId":1625,"rating":4},{"userId":1733,"rating":4},{"userId":1799,"rating":4},{"userId":1865,"rating":5},{"userId":1877,"rating":5},{"userId":1897,"rating":5},{"userId":1946,"rating":4},{"userId":2031,"rating":4},{"userId":2129,"rating":2},{"userId":2353,"rating":4},{"userId":2986,"rating":4},{"userId":3940,"rating":4},{"userId":3985,"rating":3},{"userId":4025,"rating":5},{"userId":4727,"rating":3},{"userId":5333,"rating":3}]}
and many more.

Each JSON object is a movie and contains an array of ratings. I want to count all the ratings in the JSON file.

I created a Maven project in IntelliJ with dependencies for Hadoop and the JSON parser. My MapReduce class is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.IOException;
import java.util.Iterator;

public class RatingCounter {

public static class RatingMapper extends Mapper<JSONObject, Text, Text, Text>{

    private Text id = new Text();
    private Text ratingAnzahl = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{
        JSONParser parser = new JSONParser();

        try {
            Object obj = parser.parse(value.toString());
            JSONObject jsonObject = (JSONObject) obj;

            String movieId = (String) jsonObject.get("_id");

            int count = 0;
            // loop array
            JSONArray ratings = (JSONArray) jsonObject.get("ratings");
            Iterator<String> iterator = ratings.iterator();
            while (iterator.hasNext()) {
                count++;
            }

        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}


public static class RatingReducer extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        Text resultValue = new Text();

        int allRatings = 0;

        while (values.hasNext()){
            allRatings += Integer.parseInt(values.toString());

        }
        resultValue.set(""+allRatings);
        context.write(key, resultValue);
    }
}

public static void main (String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "ratings count");
    job.setJarByClass(RatingCounter.class);
    job.setMapperClass(RatingMapper.class);
    job.setReducerClass(RatingReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

I do not know how to write the functions in the Mapper and the Reducer. Can anyone help me?

I have made a few changes to your mapper and reducer.

First, in your mapper you are not writing any output anywhere, and the signature you use when extending the Mapper class is also (arguably) wrong. The first input to any mapper is the LongWritable (or Object) offset of the line. You can see the changes below:

public static class RatingMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
        JSONParser parser = new JSONParser();

        try {
            Object obj = parser.parse(value.toString());
            JSONObject jsonObject = (JSONObject) obj;

            // "_id" is a number in the sample data, so json-simple returns it as a Long
            String movieId = String.valueOf(jsonObject.get("_id"));

            JSONArray ratings = (JSONArray) jsonObject.get("ratings");

            // emit one pair per movie: (movieId, number of ratings for that movie)
            context.write(new Text(movieId), new IntWritable(ratings.size()));
        } catch (ParseException e) {
            // map() cannot declare ParseException, so wrap it in an IOException
            throw new IOException(e);
        }
    }
}
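
Note that this relies on the job's default TextInputFormat, which hands the input to map() one line at a time, so it assumes every JSON object in the file sits on a line of its own, as in the sample above.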

I also see that you are not passing any key-value pairs from the mapper class to the reducer, which I guess is your actual problem. (Is there a reason for not using Spark or Hive?)
public static class RatingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int allRatings = 0;

        // sum up the counts emitted by the mapper for this movie
        for (IntWritable value : values) {
            allRatings += value.get();
        }
        context.write(key, new IntWritable(allRatings));
    }
}
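
Finally, because the mapper and reducer now emit IntWritable values instead of Text, the driver from your question needs its output value class adjusted as well, otherwise the job fails with a type mismatch between the declared and the actual map output. A minimal sketch of the adjusted main(), assuming the rest of the class stays as in your question (you also need to import org.apache.hadoop.io.IntWritable):

public static void main (String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Job.getInstance() is the non-deprecated replacement for new Job(conf, ...)
    Job job = Job.getInstance(conf, "ratings count");
    job.setJarByClass(RatingCounter.class);
    job.setMapperClass(RatingMapper.class);
    job.setReducerClass(RatingReducer.class);
    // both the map output and the final output are (Text, IntWritable) pairs
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

The job output then contains one line per movie with its rating count; for the sample object above that line would be 3951, a tab, and 40.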