Finding the average by department with Spark groupBy in Java 1.8

I have the dataset below, where the first column is the department and the second is the salary. I want to compute the average salary per department:

IT  2000000
HR  2000000
IT  1950000
HR  2200000
Admin   1900000
IT  1900000
IT  2200000
I performed the following operation:

JavaPairRDD<String, Iterable<Long>> rddY = employees.groupByKey();
System.out.println("<=========================RDDY collect==================>" + rddY.collect());
and got the following output:

<=========================RDDY collect==================>[(IT,[2000000, 1950000, 1900000, 2200000]), (HR,[2000000, 2200000]), (Admin,[1900000])]
What I need is:

  • I want to compute both the overall average and the per-department average using Spark RDDs.

  • How can I compute the average using Spark's groupBy function? (A sketch follows below.)
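
For the groupBy part, a minimal sketch (assuming the employees JavaPairRDD<String, Long> from the snippet above) that averages each grouped Iterable directly:

    JavaPairRDD<String, Double> deptAvg = employees.groupByKey().mapValues(salaries -> {
        long sum = 0L;
        int count = 0;
        for (Long salary : salaries) { // walk the grouped salaries once
            sum += salary;
            count++;
        }
        return (double) sum / count;   // double division keeps the fractional part
    });
    System.out.println(deptAvg.collect());
    // e.g. [(IT,2012500.0), (HR,2100000.0), (Admin,1900000.0)] (ordering may vary)

Note that groupByKey ships every raw salary across the shuffle; the reduceByKey approach in the answer below combines (sum, count) pairs within each partition first and is generally preferred.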


  • Below is code that computes the average by key using a Spark JavaPairRDD. Hope this helps.

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.PairFunction;
    import scala.Tuple2;
    
    public class SparkAverageCalculation {

        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("Average Calculation").setMaster("local[2]");
            JavaSparkContext sc = new JavaSparkContext(conf);
            // input list of (key, value) pairs
            List<Tuple2<String, Integer>> inputList = new ArrayList<Tuple2<String, Integer>>();
            inputList.add(new Tuple2<String, Integer>("a1", 30));
            inputList.add(new Tuple2<String, Integer>("b1", 30));
            inputList.add(new Tuple2<String, Integer>("a1", 40));
            inputList.add(new Tuple2<String, Integer>("a1", 20));
            inputList.add(new Tuple2<String, Integer>("b1", 50));
            // parallelizePairs
            JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(inputList);
            // pair each value with a count of 1
            JavaPairRDD<String, Tuple2<Integer, Integer>> valueCount = pairRDD.mapValues(value -> new Tuple2<Integer, Integer>(value, 1));
            // sum values and counts per key with reduceByKey
            JavaPairRDD<String, Tuple2<Integer, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Integer, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));
            // calculate average
            JavaPairRDD<String, Integer> averagePair = reducedCount.mapToPair(getAverageByKey);
            // print average by key
            averagePair.foreach(data -> {
                System.out.println("Key=" + data._1() + " Average=" + data._2());
            });
            // stop sc
            sc.stop();
            sc.close();
        }

        private static PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer> getAverageByKey = (tuple) -> {
            Tuple2<Integer, Integer> val = tuple._2;
            int total = val._1;
            int count = val._2;
            Tuple2<String, Integer> averagePair = new Tuple2<String, Integer>(tuple._1, total / count);
            return averagePair;
        };
    }
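
One caveat about getAverageByKey above: total / count is integer division, so non-integral averages are truncated (80 / 3 prints as 26). A sketch of a double-returning variant (a hypothetical getAverageByKeyAsDouble; the rest of the class would be unchanged apart from the resulting JavaPairRDD<String, Double>):

    private static PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Double> getAverageByKeyAsDouble = (tuple) -> {
        Tuple2<Integer, Integer> val = tuple._2;
        // cast the sum to double before dividing so the fractional part survives
        return new Tuple2<String, Double>(tuple._1, (double) val._1 / val._2);
    };

The second example below applies the same (sum, count) pattern to documents read from Elasticsearch.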
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.function.PairFunction;
    import org.apache.spark.streaming.Durations;
    import org.apache.spark.streaming.api.java.JavaStreamingContext;
    import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
    import scala.Tuple2;
    
    import java.util.Map;
    
    public class ElasticsearchMetricProcessor {
    
        private static final String ES_HOST_PORT = "localhost:9200";
    
        private static PairFunction<Tuple2<String, Tuple2<Long, Integer>>,String,Long> getAverageByKey = (tuple) -> {
            Tuple2<Long, Integer> val = tuple._2;
            long total = val._1;
            int count = val._2;
            Tuple2<String, Long> averagePair = new Tuple2<String, Long>(tuple._1, total / count);
            return averagePair;
        };
    
        public static void main(String args[]) throws InterruptedException {
            System.setProperty("hadoop.home.dir","C:\\Users\\anki\\metering\\winutils");
            SparkConf sparkConf = new SparkConf().setAppName("StreamingApp").setMaster("local[2]");
            sparkConf.set("es.nodes.wan.only","false");
            sparkConf.set("es.nodes",ES_HOST_PORT);
            JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    
            // read each document's _source as a Map; values() drops the Elasticsearch document ids
            JavaRDD<Map<String, Object>> esRDD = JavaEsSpark.esRDD(jsc.sparkContext(), "portal_analytics/report-execution").values();

            // key each document by its "id" field and pair its "duration" with a count of 1
            JavaPairRDD<String, Tuple2<Long, Integer>> valueCount = esRDD.mapToPair(x -> new Tuple2<String, Long>(x.get("id").toString(), Long.valueOf(x.get("duration").toString()))).mapValues(value -> new Tuple2<Long, Integer>(value, 1));

            // sum durations and counts per id
            JavaPairRDD<String, Tuple2<Long, Integer>> reducedCount = valueCount.reduceByKey((tuple1, tuple2) -> new Tuple2<Long, Integer>(tuple1._1 + tuple2._1, tuple1._2 + tuple2._2));
            //calculate average
            JavaPairRDD<String, Long> averagePair = reducedCount.mapToPair(getAverageByKey);
            //print averageByKey
            averagePair.foreach(data -> {
                System.out.println("Key="+data._1() + " Average=" + data._2());
            });
            //stop sc
            jsc.stop();
            jsc.close();
        }
    }
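
As a side note, the mapValues + reduceByKey pair used in both examples can be collapsed into a single aggregateByKey, which folds each value straight into a (sum, count) accumulator. A sketch, assuming a hypothetical JavaPairRDD<String, Long> named durationsById (the (id, duration) pairs built above, before mapValues):

    JavaPairRDD<String, Tuple2<Long, Integer>> sumCount = durationsById.aggregateByKey(
            new Tuple2<Long, Integer>(0L, 0),                                      // zero value: (sum, count)
            (acc, value) -> new Tuple2<Long, Integer>(acc._1 + value, acc._2 + 1), // fold one value into a partition-local accumulator
            (a, b) -> new Tuple2<Long, Integer>(a._1 + b._1, a._2 + b._2));        // merge accumulators across partitions

getAverageByKey can then be mapped over sumCount exactly as before.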
    --------------------------------------------------------
    Elasticsearch Test Data
    
    {
    "took": 3,
    "timed_out": false,
    "_shards": {
    "total": 3,
    "successful": 3,
    "failed": 0
    },
    "hits": {
    "total": 16,
    "max_score": 1,
    "hits": [
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8aPGm2uMcgoWFwdx",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8eOcm2uMcgoWFwd3",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvTL5ACm2uMcgoWFweC",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 70
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvTL96Xm2uMcgoWFweD",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvTNrKPm2uMcgoWFweF",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "b37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8dWFm2uMcgoWFwdy",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8dlim2uMcgoWFwdz",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8d7am2uMcgoWFwd1",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8eX0m2uMcgoWFwd4",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 30
    }
    },
      {
    "_index": "portal_analytics",
    "_type": "report-execution",
    "_id": "AVvS8nplm2uMcgoWFwd7",
    "_score": 1,
    "_source": {
    "type": "report-execution",
    "id": "a37cacc3-71d5-40f0-a329-a051a3949ced",
    "date-time": 1475733719123,
    "tenant": "default",
    "user": "317f1e761f2faa8da781a4762b9dcc2c5cad209a",
    "report": "72efd670-bb95-11e5-632f-54ee7539b24c",
    "duration": 50
    }
    }
    ]
    }
    }
    
Another answer takes the same (sum, count) route in Scala, assuming data is an RDD[String] of the raw lines:

    val mapp = data.map(x => x.split("\\s+"))          // split on runs of whitespace; the sample rows use multiple spaces
    val dept = mapp.map(x => (x(0), (x(1).toInt, 1))) // (department, (salary, 1))
    val sumCount = dept.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
    val avg = sumCount.mapValues { case (sum, count) => sum / count }
    avg.foreach(println)
    
    (Admin,1900000)
    (HR,2100000)
    (IT,2012500)
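
None of the answers compute the overall average that the question also asked for. A minimal Java sketch, again assuming the employees JavaPairRDD<String, Long> of (department, salary) pairs from the question:

    // overall average: reduce (salary, 1) pairs over all values, then divide once
    Tuple2<Long, Long> sumCount = employees.values()
            .map(s -> new Tuple2<Long, Long>(s, 1L))
            .reduce((a, b) -> new Tuple2<Long, Long>(a._1 + b._1, a._2 + b._2));
    double overallAverage = (double) sumCount._1 / sumCount._2;
    System.out.println("Overall average = " + overallAverage); // 14150000 / 7 ≈ 2021428.57 for the sample data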