MapReduce function with a JSON file and JSONParser

I am having some problems writing a MapReduce function. This is what I want to solve:

I have a JSON file containing about 1 million JSON objects that look like this:

 {"_id":3951,"title":"Two Family House (2000)","genres":["Drama"],"ratings":[{"userId":173,"rating":5},{"userId":195,"rating":5},{"userId":411,"rating":4},{"userId":593,"rating":2},{"userId":629,"rating":3},{"userId":830,"rating":3},{"userId":838,"rating":5},{"userId":850,"rating":4},{"userId":856,"rating":4},{"userId":862,"rating":5},{"userId":889,"rating":1},{"userId":928,"rating":5},{"userId":986,"rating":4},{"userId":1001,"rating":5},{"userId":1069,"rating":3},{"userId":1168,"rating":3},{"userId":1173,"rating":2},{"userId":1242,"rating":3},{"userId":1266,"rating":5},{"userId":1331,"rating":5},{"userId":1417,"rating":5},{"userId":1470,"rating":4},{"userId":1474,"rating":5},{"userId":1615,"rating":3},{"userId":1625,"rating":4},{"userId":1733,"rating":4},{"userId":1799,"rating":4},{"userId":1865,"rating":5},{"userId":1877,"rating":5},{"userId":1897,"rating":5},{"userId":1946,"rating":4},{"userId":2031,"rating":4},{"userId":2129,"rating":2},{"userId":2353,"rating":4},{"userId":2986,"rating":4},{"userId":3940,"rating":4},{"userId":3985,"rating":3},{"userId":4025,"rating":5},{"userId":4727,"rating":3},{"userId":5333,"rating":3}]}
and many more.

Each JSON object is a movie and contains an array of ratings. I want to count all the ratings in the JSON file.

I created a Maven project in IntelliJ with dependencies for Hadoop and the JSON parser. My MapReduce class is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.IOException;
import java.util.Iterator;

public class RatingCounter {

public static class RatingMapper extends Mapper<JSONObject, Text, Text, Text>{

    private Text id = new Text();
    private Text ratingAnzahl = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{
        JSONParser parser = new JSONParser();

        try {
            Object obj = parser.parse(value.toString());
            JSONObject jsonObject = (JSONObject) obj;

            String movieId = (String) jsonObject.get("_id");

            int count = 0;
            // loop array
            JSONArray ratings = (JSONArray) jsonObject.get("ratings");
            Iterator<String> iterator = ratings.iterator();
            while (iterator.hasNext()) {
                count++;
            }

        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}


public static class RatingReducer extends Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        Text resultValue = new Text();

        int allRatings = 0;

        while (values.hasNext()){
            allRatings += Integer.parseInt(values.toString());

        }
        resultValue.set(""+allRatings);
        context.write(key, resultValue);
    }
}

public static void main (String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "ratings count");
    job.setJarByClass(RatingCounter.class);
    job.setMapperClass(RatingMapper.class);
    job.setReducerClass(RatingReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

I do not know how to write the functions in the Mapper and the Reducer. Can anyone help me?

I have made a few changes to your mapper and reducer.

First, in your mapper you are not writing any output anywhere, and the signature you use when extending the Mapper class is also (arguably) wrong. The first input to any mapper is the LongWritable (or Object) offset of the line. You can see the changes below:

public static class RatingMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
        JSONParser parser = new JSONParser();

        try {
            Object obj = parser.parse(value.toString());
            JSONObject jsonObject = (JSONObject) obj;

            // "_id" is a number in the sample data, so json-simple returns it as a Long
            String movieId = String.valueOf(jsonObject.get("_id"));

            JSONArray ratings = (JSONArray) jsonObject.get("ratings");

            // emit one pair per movie: (movieId, number of ratings for that movie)
            context.write(new Text(movieId), new IntWritable(ratings.size()));
        } catch (ParseException e) {
            // map() cannot declare ParseException, so wrap it in an IOException
            throw new IOException(e);
        }
    }
}
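
Note that this relies on the job's default TextInputFormat, which hands the input to map() one line at a time, so it assumes every JSON object in the file sits on a line of its own, as in the sample above.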

I also see that you are not passing any key-value pairs from the mapper class to the reducer, which I guess is your actual problem. (Is there a reason for not using Spark or Hive?)
public static class RatingReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int allRatings = 0;

        // sum up the counts emitted by the mapper for this movie
        for (IntWritable value : values) {
            allRatings += value.get();
        }
        context.write(key, new IntWritable(allRatings));
    }
}
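
Finally, because the mapper and reducer now emit IntWritable values instead of Text, the driver from your question needs its output value class adjusted as well, otherwise the job fails with a type mismatch between the declared and the actual map output. A minimal sketch of the adjusted main(), assuming the rest of the class stays as in your question (you also need to import org.apache.hadoop.io.IntWritable):

public static void main (String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Job.getInstance() is the non-deprecated replacement for new Job(conf, ...)
    Job job = Job.getInstance(conf, "ratings count");
    job.setJarByClass(RatingCounter.class);
    job.setMapperClass(RatingMapper.class);
    job.setReducerClass(RatingReducer.class);
    // both the map output and the final output are (Text, IntWritable) pairs
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

The job output then contains one line per movie with its rating count; for the sample object above that line would be 3951, a tab, and 40.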