
Group by multiple columns and count on a DataFrame in Apache Spark (Java)


I have a CSV file (imported into HDFS on Hadoop) that describes taxi routes, with the following structure:

● taxi_id (e.g., id109243)
● pickup_date (e.g., 10/10/2019)
● passengers (e.g., 3)
● pick_lon (e.g., -70.0235)
● pick_lat (e.g., 40.19091)
Now, I have a lot of rows, close to a million, and I want to count the taxi routes per area and per day. Based on the average area I ended up with just 4 quarter areas (defined by latitude and longitude ranges), and for each taxi route I check its latitude and longitude to see which quarter it falls into. So my result should look like this:

 Day1: 10/10/2019 -> There are 145 taxi routes from Quarter Area 1
 Day1: 10/10/2019 -> There are 156 taxi routes from Quarter Area 2
 Day1: 10/10/2019 -> There are 214 taxi routes from Quarter Area 3
 Day1: 10/10/2019 -> There are 200 taxi routes from Quarter Area 4
.....
 Day2: 11/10/2019 -> There are 145 taxi routes from Quarter Area 1
 Day2: 11/10/2019 -> There are 156 taxi routes from Quarter Area 2
 Day2: 11/10/2019 -> There are 214 taxi routes from Quarter Area 3
 Day2: 11/10/2019 -> There are 200 taxi routes from Quarter Area 4
.........
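To make the grouping I want concrete, here is only a rough sketch of how the per-day, per-quarter count could be expressed with the DataFrame API; the lat/lon thresholds in the when(...) calls are placeholders, not my real quarter boundaries, and df_date_column is the DataFrame built in my code below (with functions.* statically imported):

        // Sketch only: assign a quarter number with when/otherwise, then count per day and quarter.
        // The lat/lon thresholds below are placeholders for the real quarter boundaries.
        Dataset<Row> counts = df_date_column
                .withColumn("quarter",
                        when(col("pick_lat").geq(40.0).and(col("pick_lon").lt(-73.9)), 1)
                        .when(col("pick_lat").geq(40.0).and(col("pick_lon").geq(-73.9)), 2)
                        .when(col("pick_lat").lt(40.0).and(col("pick_lon").lt(-73.9)), 3)
                        .otherwise(4))
                .groupBy("pickup_date", "quarter")
                .count()
                .orderBy("pickup_date", "quarter");
        counts.show(200, false);

Each row of counts would then correspond to one Day/Quarter line of the expected output above.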
My Java code [Java 8, Spark 2.4.4]:

package com.bigdata.taxi;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.*;


public class Main {

    public static void main(String[] args) {
        // TODO Auto-generated method stub

        SparkConf conf = new SparkConf();
        conf.setAppName("My 1st Spark app");
        conf.setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();

        //Now read csv , from hdfs source
        //[cloudera@quickstart ~]$ hdfs dfs -put /home/cloudera/Desktop/fares.csv hdfs://quickstart.cloudera:8020//user//cloudera//fares.csv
        Dataset<Row> df = sparkSession.read().option("header", true).option("inferSchema", "true").
                option("timestampFormat", "yyyy-MM-dd hh:mm:ss").csv("hdfs://quickstart.cloudera:8020//user//cloudera//fares.csv");
        //df.show(); //only showing top 20 rows
        //System.out.println(df.schema());

        //Add new column from the first dataframe, with only date.
        Dataset<Row> df_date_column = df.withColumn("pickup_date", date_format(col("pickup_datetime"), "yyyy-MM-dd"));
        //df_date_column.show();

        //Dataset<Row> df2 = df_date_column.groupBy("pickup_date").count();
        //df2.orderBy("pickup_date").show(182, false);

        //?? For each row, figure out which quarter area it belongs to
        df_date_column.foreach(row -> {
            getCoordinates(row);
        });

        //System.out.println(df.schema());
    }

    private static void getCoordinates(Row r){
        // Extract lon and lat from r, then see which quarter area the point belongs to.
        double x = r.getDouble(5);
        double y = r.getDouble(6);
        System.out.println("To x einai : " + x);
        System.out.println("To y einai : " + y);

        // NOTE: the four range checks below are still identical placeholders.
        if((40<=x && x<=56) && (-89<=y && y<=-56)){
            // Quarter Area 1
        }else if((40<=x && x<=56) && (-89<=y && y<=-56)){
            // Quarter Area 2
        }else if((40<=x && x<=56) && (-89<=y && y<=-56)){
            // Quarter Area 3
        }else if((40<=x && x<=56) && (-89<=y && y<=-56)){
            // Quarter Area 4
        }
    }
}

You can use the map function:

Encoder<DateAndQuarter> enc = Encoders.bean(DateAndQuarter.class);
Dataset<DateAndQuarter> daq = df_date_column.map(row -> {
    String date = row.getAs("pickup_date");
    double x = row.getDouble(5);
    double y = row.getDouble(6);
    Integer quarter = 0;
    if((40
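The answer's snippet is cut off at this point. Below is a minimal sketch of how the map approach might be completed; the DateAndQuarter bean and its date/quarter fields, the column indexes 5 and 6 (taken from the question's getCoordinates attempt), and the coordinate thresholds are all assumptions, not part of the original answer:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;

// Hypothetical bean for Encoders.bean, in its own file (e.g. DateAndQuarter.java);
// the original answer does not show its definition.
public class DateAndQuarter implements java.io.Serializable {
    private String date;
    private Integer quarter;
    public String getDate() { return date; }
    public void setDate(String date) { this.date = date; }
    public Integer getQuarter() { return quarter; }
    public void setQuarter(Integer quarter) { this.quarter = quarter; }
}

// Completion sketch (inside main): tag each row with a quarter, then group and count.
Encoder<DateAndQuarter> enc = Encoders.bean(DateAndQuarter.class);
Dataset<DateAndQuarter> daq = df_date_column.map((MapFunction<Row, DateAndQuarter>) row -> {
    String date = row.getAs("pickup_date");
    double x = row.getDouble(5);    // latitude column index, as in the question's code
    double y = row.getDouble(6);    // longitude column index, as in the question's code
    int quarter;
    if (x >= 40 && y < -73.9) {         // placeholder boundary for Quarter Area 1
        quarter = 1;
    } else if (x >= 40 && y >= -73.9) { // placeholder boundary for Quarter Area 2
        quarter = 2;
    } else if (x < 40 && y < -73.9) {   // placeholder boundary for Quarter Area 3
        quarter = 3;
    } else {                            // everything else -> Quarter Area 4
        quarter = 4;
    }
    DateAndQuarter result = new DateAndQuarter();
    result.setDate(date);
    result.setQuarter(quarter);
    return result;
}, enc);

daq.groupBy("date", "quarter").count().orderBy("date", "quarter").show(200, false);

The final groupBy then yields one count per (date, quarter) pair, matching the Day/Quarter lines in the expected output above.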