Joining Spark Datasets in Java (Java / Apache Spark / Dataset)


I have two datasets that I am trying to join:

Dataset 1 (machine):

String machineID;
List<Integer> machineCat; // (100, 200, 300)

Dataset 2 (car):

String carID;
List<Integer> carCat; // (30, 200, 100, 300)

The desired output is one row per matching category value:

machineID, machineCat(100), carID, carCat(100)
machineID, machineCat(200), carID, carCat(200)
machineID, machineCat(300), carID, carCat(300)

Any help on how to do this with a dataset join in Java would be appreciated. I looked at an option using arrays (shown below):

machine.foreachPartition((ForeachPartitionFunction<Machine>) iterator -> {

    while (iterator.hasNext()) {

        Machine machine = iterator.next();
        machine.getmachineCat().stream().filter(cat -> {

            LOG.info("matched");
            spark.sql(
                    "select * from machineDataset m"
                            + " join"
                            + " carDataset c "
                            + "where array_contains(m.machineCat,cat)");
            return true;
        });

    }
});


Comments:

Isn't this overly complicated??? You can explode the list values; that will produce machineId and machineCat (string, integer) columns. Do the same with dataset2, then join on machineCat = carCat, and that's it. Finally, if you want single-element arrays, use the array("machineCat") function.

@chlebek Dataset 1 (machine) and Dataset 2 (car) contain many more elements than just the list and the string (more than 30). Do you still think exploding the list would be easy? Could you elaborate?

As long as you don't explode multiple columns it shouldn't be a problem.

My result shows: [1172573, WrappedArray(141), 549, WrappedArray(141)] [1172573, WrappedArray(1653), 3155, WrappedArray(1653)] [1172573, WrappedArray(191), 1412, WrappedArray(191)]. How do I expand this WrappedArray?
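The answer below demonstrates the explode approach from the comments, end to end. The Machine and Car classes are not shown in the original post; a minimal sketch, assuming they are plain Java beans (Spark's bean encoder needs a no-argument constructor plus getters and setters), might look like this:

import java.io.Serializable;
import java.util.List;

public class Machine implements Serializable {
    private String machineId;
    private List<Integer> machineCat;

    public Machine() {} // required by the bean encoder

    public Machine(String machineId, List<Integer> machineCat) {
        this.machineId = machineId;
        this.machineCat = machineCat;
    }

    public String getMachineId() { return machineId; }
    public void setMachineId(String machineId) { this.machineId = machineId; }
    public List<Integer> getMachineCat() { return machineCat; }
    public void setMachineCat(List<Integer> machineCat) { this.machineCat = machineCat; }
}

// Car is analogous, with catId and carCat fields.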
import static org.apache.spark.sql.functions.*; // before the main class

Machine machine = new Machine("m1", Arrays.asList(100, 200, 300));
Car car = new Car("c1", Arrays.asList(30, 200, 100, 300));

// Create one DataFrame per bean list.
Dataset<Row> mDF = spark.createDataFrame(Arrays.asList(machine), Machine.class);
mDF.show();
Dataset<Row> cDF = spark.createDataFrame(Arrays.asList(car), Car.class);
cDF.show();
+---------------+---------+
|     machineCat|machineId|
+---------------+---------+
|[100, 200, 300]|       m1|
+---------------+---------+

+-------------------+-----+
|             carCat|catId|
+-------------------+-----+
|[30, 200, 100, 300]|   c1|
+-------------------+-----+
// Explode each array column into one row per element.
Dataset<Row> mDF2 = mDF.select(col("machineId"), explode(col("machineCat")).as("machineCat"));
Dataset<Row> cDF2 = cDF.select(col("catId"), explode(col("carCat")).as("carCat"));
// Join the exploded rows on the matching category value.
Dataset<Row> joinedDF = mDF2.join(cDF2).where(mDF2.col("machineCat").equalTo(cDF2.col("carCat")));
// Wrap each matched value back into a single-element array.
Dataset<Row> finalDF = joinedDF.select(col("machineId"), array(col("machineCat")), col("catId"), array(col("carCat")));
finalDF.show();
+---------+-----------------+-----+-------------+
|machineId|array(machineCat)|catId|array(carCat)|
+---------+-----------------+-----+-------------+
|       m1|            [100]|   c1|        [100]|
|       m1|            [200]|   c1|        [200]|
|       m1|            [300]|   c1|        [300]|
+---------+-----------------+-----+-------------+
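The schema of finalDF (as printed by finalDF.printSchema()):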

root
 |-- machineId: string (nullable = true)
 |-- array(machineCat): array (nullable = false)
 |    |-- element: integer (containsNull = true)
 |-- catId: string (nullable = true)
 |-- array(carCat): array (nullable = false)
 |    |-- element: integer (containsNull = true)
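As for the WrappedArray question in the comments: on the Java side, the array columns can be read back as java.util.List rather than Scala's WrappedArray. A minimal sketch, assuming the finalDF above is collected to the driver:

import java.util.List;
import org.apache.spark.sql.Row;

// Row.getList converts the underlying Scala WrappedArray to a java.util.List.
for (Row row : finalDF.collectAsList()) {
    String machineId = row.getString(0);
    List<Integer> machineCat = row.getList(1); // e.g. [100]
    String catId = row.getString(2);
    List<Integer> carCat = row.getList(3);     // e.g. [100]
    System.out.println(machineId + " " + machineCat + " " + catId + " " + carCat);
}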