Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/java/317.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Java 如何按天累计?_Java_Apache Spark_Apache Spark Sql - Fatal编程技术网

Java 如何按天累计?

Java 如何按天累计?,java,apache-spark,apache-spark-sql,Java,Apache Spark,Apache Spark Sql,我有以下几点建议: public class MyPojo { Date startDate; Double usageAMount; // ... bla bla bla } 因此,我有一个MyPojo对象列表,作为参数传递给函数: public Map<Date, Double> getWeeklyCost(@NotNull List<MyPojo> reports) { JavaRDD<MyPojo> rdd = co

我有以下POJO:

// Usage report entry as posted in the question; only the two fields relevant to it are shown.
public class MyPojo {
   // Timestamp of the report. The sample values given in the question include an hour component.
   Date startDate;
   // Usage amount that the question wants summed.
   // NOTE(review): spelled "usageAMount" here, while getWeeklyCost calls getUsageAmount(); the getter is not shown.
   Double usageAMount;
   // ... remaining members were omitted by the author
}
因此,我有一个
MyPojo
对象列表,作为参数传递给函数:

/**
 * Sums usage amounts per distinct start date using a Spark RDD and returns the result as a map.
 *
 * NOTE(review): the key is the full {@code Date} returned by {@code getStartDate()}, and
 * {@code Date} equality is based on the exact instant. Two reports on the same calendar day
 * but at different hours therefore get different keys and are not merged. Despite the method
 * name, nothing in this body groups by week or by day, and no ordering is applied.
 *
 * @param reports the reports to aggregate
 * @return map from start date to the summed usage amount for that exact date value
 */
public Map<Date, Double> getWeeklyCost(@NotNull List<MyPojo> reports) {
        // `context` is defined outside this snippet; presumably a JavaSparkContext (TODO confirm).
        JavaRDD<MyPojo> rdd = context.parallelize(reports);
        // Build (startDate, usageAmount) pairs, then add up the amounts that share the same key.
        JavaPairRDD<Date, Double> result = rdd.mapToPair(
                (PairFunction<MyPojo, Date, Double>) x ->
                        new Tuple2<>(x.getStartDate(), x.getUsageAmount()))
                .reduceByKey((Function2<Double, Double, Double>) (x, y) -> x + y);

        // Materialize the pair RDD as a java.util.Map for the caller.
        return result.collectAsMap();
}
我想返回按天聚合的数据,按日期降序排序。 例如:

"2017-06-28T03:00:00.000+0000": 0.07777776000000002,
"2017-06-28T20:00:00.000+0000": 0.01944444,
属于同一天,因此它们的值(usageAmount)应当相加。我只关心日期,不关心小时。如何对RDD进行reduce或聚合以获得所需的结果?

**更新**答案必须是Spark RDD解决方案

相对容易(尽管需要大量代码)

让我们从Pojo的一个实现开始:

/**
 * Simple mutable holder pairing a timestamp with an amount.
 * Both fields are private and are accessed directly by the enclosing class's helpers.
 */
static class Record
{
    private Date date;
    private double amount;

    /**
     * @param d timestamp of the record
     * @param a amount associated with the timestamp
     */
    public Record(Date d, double a)
    {
        date = d;
        amount = a;
    }

    /** Renders the record as the date's string form, a tab character, then the amount. */
    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder(date.toString());
        text.append("\t").append(amount);
        return text.toString();
    }
}
现在有一种实用方法可以检查两条记录是否在同一天:

/**
 * Tells whether two records fall on the same calendar day.
 *
 * Each record's date is loaded into a default-constructed {@link GregorianCalendar}
 * (i.e. the JVM's default time zone and locale), and day-of-month, month and year
 * are compared field by field.
 *
 * @param r0 first record
 * @param r1 second record
 * @return {@code true} when day-of-month, month and year all match
 */
private static boolean sameDay(Record r0, Record r1)
{
    Calendar first = new GregorianCalendar();
    first.setTime(r0.date);

    Calendar second = new GregorianCalendar();
    second.setTime(r1.date);

    if (first.get(Calendar.DAY_OF_MONTH) != second.get(Calendar.DAY_OF_MONTH))
    {
        return false;
    }
    if (first.get(Calendar.MONTH) != second.get(Calendar.MONTH))
    {
        return false;
    }
    return first.get(Calendar.YEAR) == second.get(Calendar.YEAR);
}
有了这个方法,就可以开始编写算法的主体部分了。思路是先按日期对输入列表排序,然后遍历列表。对于正在处理的每条记录,检查它是否与聚合结果中最后一条记录属于同一天:如果是,就把该记录的金额(amount)累加到最后一条记录上;如果不是,就新增一个条目。

/**
 * Sums record amounts per calendar day.
 *
 * The input is copied and sorted by date ascending; consecutive records that
 * {@code sameDay} reports as the same day are folded into one output record
 * carrying the date of the earliest record of that day.
 *
 * Differences from the earlier version:
 * - an empty collection yields an empty list instead of throwing
 *   {@code IndexOutOfBoundsException};
 * - output records are fresh copies, so accumulating amounts no longer
 *   modifies the caller's {@code Record} instances.
 *
 * @param rs records to aggregate; must not be {@code null}
 * @return one record per day, ordered by date ascending
 */
public static List<Record> aggregate(Collection<Record> rs)
{
    List<Record> out = new ArrayList<>();
    if (rs.isEmpty())
    {
        return out;
    }

    // Work on a copy so the caller's collection keeps its order.
    List<Record> tmp = new ArrayList<>(rs);
    tmp.sort((a, b) -> a.date.compareTo(b.date));

    for (Record recordBeingProcessed : tmp)
    {
        Record last = out.isEmpty() ? null : out.get(out.size() - 1);
        if (last != null && sameDay(last, recordBeingProcessed))
        {
            last.amount += recordBeingProcessed.amount;
        }
        else
        {
            // Copy rather than reuse the input record: `last.amount += ...` above
            // would otherwise change an object owned by the caller.
            out.add(new Record(recordBeingProcessed.date, recordBeingProcessed.amount));
        }
    }

    return out;
}
相对容易(尽管需要很多代码)

让我们从Pojo的一个实现开始:

/**
 * Simple mutable holder pairing a timestamp with an amount.
 * Both fields are private and are accessed directly by the enclosing class's helpers.
 */
static class Record
{
    private Date date;
    private double amount;

    /**
     * @param d timestamp of the record
     * @param a amount associated with the timestamp
     */
    public Record(Date d, double a)
    {
        date = d;
        amount = a;
    }

    /** Renders the record as the date's string form, a tab character, then the amount. */
    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder(date.toString());
        text.append("\t").append(amount);
        return text.toString();
    }
}
现在有一种实用方法可以检查两条记录是否在同一天:

/**
 * Tells whether two records fall on the same calendar day.
 *
 * Each record's date is loaded into a default-constructed {@link GregorianCalendar}
 * (i.e. the JVM's default time zone and locale), and day-of-month, month and year
 * are compared field by field.
 *
 * @param r0 first record
 * @param r1 second record
 * @return {@code true} when day-of-month, month and year all match
 */
private static boolean sameDay(Record r0, Record r1)
{
    Calendar first = new GregorianCalendar();
    first.setTime(r0.date);

    Calendar second = new GregorianCalendar();
    second.setTime(r1.date);

    if (first.get(Calendar.DAY_OF_MONTH) != second.get(Calendar.DAY_OF_MONTH))
    {
        return false;
    }
    if (first.get(Calendar.MONTH) != second.get(Calendar.MONTH))
    {
        return false;
    }
    return first.get(Calendar.YEAR) == second.get(Calendar.YEAR);
}
有了这个方法,就可以开始编写算法的主体部分了。思路是先按日期对输入列表排序,然后遍历列表。对于正在处理的每条记录,检查它是否与聚合结果中最后一条记录属于同一天:如果是,就把该记录的金额(amount)累加到最后一条记录上;如果不是,就新增一个条目。

/**
 * Sums record amounts per calendar day.
 *
 * The input is copied and sorted by date ascending; consecutive records that
 * {@code sameDay} reports as the same day are folded into one output record
 * carrying the date of the earliest record of that day.
 *
 * Differences from the earlier version:
 * - an empty collection yields an empty list instead of throwing
 *   {@code IndexOutOfBoundsException};
 * - output records are fresh copies, so accumulating amounts no longer
 *   modifies the caller's {@code Record} instances.
 *
 * @param rs records to aggregate; must not be {@code null}
 * @return one record per day, ordered by date ascending
 */
public static List<Record> aggregate(Collection<Record> rs)
{
    List<Record> out = new ArrayList<>();
    if (rs.isEmpty())
    {
        return out;
    }

    // Work on a copy so the caller's collection keeps its order.
    List<Record> tmp = new ArrayList<>(rs);
    tmp.sort((a, b) -> a.date.compareTo(b.date));

    for (Record recordBeingProcessed : tmp)
    {
        Record last = out.isEmpty() ? null : out.get(out.size() - 1);
        if (last != null && sameDay(last, recordBeingProcessed))
        {
            last.amount += recordBeingProcessed.amount;
        }
        else
        {
            // Copy rather than reuse the input record: `last.amount += ...` above
            // would otherwise change an object owned by the caller.
            out.add(new Record(recordBeingProcessed.date, recordBeingProcessed.amount));
        }
    }

    return out;
}
公共类MyPojo{
起始日期;
双重用途;
静态日期格式=新的简化格式(“yyyy-mm-dd:hh”);
MyPojo(日期开始日期,双用途装载){
this.startDate=startDate;
this.usageAMount=usageAMount;
}
日期getStrartDate(){return startDate;}
Double getUsage(){return usageAMount;}
公共静态void main(字符串[]args)引发异常{
List reports=getReports();
//按日期排序
reports=reports.stream().sorted(getComperator()).collect(Collectors.toList());
产出(报告);
//您可以收集到地图,但地图键未排序
//和键(日期)必须是唯一的
映射结果=reports.stream().sorted(getComperator()).collect(收集器
.toMap(e->e.startDate,e->e.usageAMount));
}
私有静态列表getReports()引发ParseException{
列表报告=新建ArrayList();
报告。添加(新的MyPojo(format.parse(“2017-06-28:01”),0.02916666);
报告。添加(新的MyPojo(format.parse(“2017-06-29:01”),0.02916666);
添加报告(新的MyPojo(格式解析(“2017-06-27:01”),0.0388888);
添加报告(新的MyPojo(format.parse(“2017-06-26:01”),0.058333320000000001);
添加报告(新的MyPojo(格式解析(“2017-06-28:02”),0.0388888);
添加报告(新的MyPojo(格式解析(“2017-06-27:02”),0.0388888);
添加(新的MyPojo(format.parse(“2017-06-28:03”),0.077776000000002);
报告。添加(新的MyPojo(format.parse(“2017-06-28:04”),0.01944444);
添加(新的MyPojo(format.parse(“2017-06-30:01”),0.00972222));
返回报告;
}
私有静态比较器
公共类MyPojo{
起始日期;
双重用途;
静态日期格式=新的简化格式(“yyyy-mm-dd:hh”);
MyPojo(日期开始日期,双用途装载){
this.startDate=startDate;
this.usageAMount=usageAMount;
}
日期getStrartDate(){return startDate;}
Double getUsage(){return usageAMount;}
公共静态void main(字符串[]args)引发异常{
List reports=getReports();
//按日期排序
reports=reports.stream().sorted(getComperator()).collect(Collectors.toList());
产出(报告);
//您可以收集到地图,但地图键未排序
//和键(日期)必须是唯一的
映射结果=reports.stream().sorted(getComperator()).collect(收集器
.toMap(e->e.startDate,e->e.usageAMount));
}
私有静态列表getReports()引发ParseException{
列表报告=新建ArrayList();
报告。添加(新的MyPojo(format.parse(“2017-06-28:01”),0.02916666);
报告。添加(新的MyPojo(format.parse(“2017-06-29:01”),0.02916666);
添加报告(新的MyPojo(格式解析(“2017-06-27:01”),0.0388888);
添加报告(新的MyPojo(format.parse(“2017-06-26:01”),0.058333320000000001);
添加报告(新的MyPojo(格式解析(“2017-06-28:02”),0.0388888);
添加报告(新的MyPojo(格式解析(“2017-06-27:02”),0.0388888);
添加(新的MyPojo(format.parse(“2017-06-28:03”),0.077776000000002);
报告。添加(新的MyPojo(format.parse(“2017-06-28:04”),0.01944444);
添加(新的MyPojo(format.parse(“2017-06-30:01”),0.00972222));
返回报告;
}

私有静态比较程序可以使用Spark SQL的数据框架吗?这样以后编写和理解就容易多了。@JacekLaskowski数据来自MongoDB…没有公认的答案?可以使用Spark SQL的数据框架吗?这样以后编写和理解就容易多了。@JacekLaskowski数据来自MongoDB…没有公认的答案?
/**
 * Usage report entry plus a small demo that sorts a list of reports by start date
 * and collects them into a map keyed by date.
 */
public class MyPojo {

    Date startDate;
    Double usageAMount;

    // Pattern letters matter: "MM" is month-of-year and "HH" is hour-of-day (0-23).
    // The previous pattern "yyyy-mm-dd:hh" parsed the month digits as MINUTES (leaving
    // the month at January) and used the 12-hour clock, so "…:12" became midnight.
    // Note: SimpleDateFormat instances are not synchronized; this shared instance is
    // only used from a single thread in this demo.
    static DateFormat format = new SimpleDateFormat("yyyy-MM-dd:HH");

    /**
     * @param startDate   timestamp of the report
     * @param usageAMount usage amount recorded for that timestamp
     */
    MyPojo(Date startDate, Double usageAMount) {
        this.startDate = startDate;
        this.usageAMount = usageAMount;
    }

    /** @return the report's start date (method name kept as originally spelled) */
    Date getStrartDate() { return startDate; }

    /** @return the report's usage amount */
    Double getUsage() { return usageAMount; }

    /**
     * Demo entry point: builds the sample reports, prints them sorted by date,
     * and collects them into a date-to-amount map.
     *
     * @throws ParseException if a sample date literal does not match {@link #format}
     */
    public static void main(String[] args) throws ParseException {

        List<MyPojo> reports = getReports();

        // sort by date
        reports = reports.stream().sorted(getComperator()).collect(Collectors.toList());
        output(reports);

        // Collect to a map. The map's keys are not sorted. Entries that share the
        // same date are summed via the merge function; without it, toMap throws
        // IllegalStateException on the first duplicate key.
        Map<Date, Double> result = reports.stream().sorted(getComperator()).collect(Collectors
                .toMap(e -> e.startDate, e -> e.usageAMount, Double::sum));
    }

    /**
     * Builds the fixed list of sample reports used by the demo.
     *
     * @return a new mutable list of sample reports, in insertion order
     * @throws ParseException if a date literal does not match {@link #format}
     */
    private static List<MyPojo> getReports() throws ParseException {

        List<MyPojo> reports = new ArrayList<>();

        reports.add(new MyPojo(format.parse("2017-06-28:01"), 0.02916666));
        reports.add(new MyPojo(format.parse("2017-06-29:01"), 0.02916666));
        reports.add(new MyPojo(format.parse("2017-06-27:01"), 0.03888888));
        reports.add(new MyPojo(format.parse("2017-06-26:01"), 0.05833332000000001));
        reports.add(new MyPojo(format.parse("2017-06-28:02"), 0.03888888));
        reports.add(new MyPojo(format.parse("2017-06-27:02"), 0.03888888));
        reports.add(new MyPojo(format.parse("2017-06-28:03"), 0.07777776000000002));
        reports.add(new MyPojo(format.parse("2017-06-28:04"), 0.01944444));
        reports.add(new MyPojo(format.parse("2017-06-30:01"), 0.00972222));

        return reports;
    }

    /**
     * Comparator ordering reports by ascending start date, with {@code null}
     * reports placed before non-null ones.
     *
     * @return the comparator used for sorting reports
     */
    private static Comparator<? super MyPojo> getComperator() {

        return new Comparator<MyPojo>() {

            @Override
            public int compare(MyPojo o1, MyPojo o2) {

                // Identity check also covers the "both null" case.
                if (o1 == o2) {
                    return 0;
                }
                if (o1 == null) {
                    return -1;
                }
                if (o2 == null) {
                    return 1;
                }

                return o1.startDate.compareTo(o2.startDate);
            }
        };
    }

    /**
     * Prints each report as "formatted-date - amount" on its own line to standard output.
     *
     * @param reports reports to print, in the order given
     */
    static void output(List<MyPojo> reports) {

        for (MyPojo p : reports) {
            System.out.println(format.format(p.startDate) + " - " + p.usageAMount);
        }
    }
}