Why does a select after a join throw an exception in a Java Spark DataFrame?

Tags: java, apache-spark, apache-spark-sql

I have two DataFrames, a left one and a right one. They consist of three columns (src, relation, dst) and hold the same values.

1 - I try to join these DataFrames on the condition that dst of the left equals src of the right, but it does not work. Where is the mistake?

Dataset<Row> r = left
  .join(right, left.col("dst").equalTo(right.col("src")));
2 - If I rename dst on the left to dst2 and the src column on the right to dst2, the join works. But if I then try to select a column from the obtained DataFrame, it throws an exception. Where is my mistake?

 Dataset<Row> left = input_df.withColumnRenamed("dst", "dst2");
 Dataset<Row> right = input_df.withColumnRenamed("src", "dst2");  
 Dataset<Row> r = left.join(right, left.col("dst2").equalTo(right.col("dst2")));
Which gives:

+---+---------+----+
|src|predicate|dst2|
+---+---------+----+
|  a|       r1| :b1|
|  a|       r2|   k|
|:b1|       r3| :b4|
|:b1|      r10|   d|
|:b4|       r4|   f|
|:b4|       r5| :b5|
|:b5|       r9|   t|
|:b5|      r10|   e|
+---+---------+----+
+----+---------+---+
|dst2|predicate|dst|
+----+---------+---+
|   a|       r1|:b1|
|   a|       r2|  k|
| :b1|       r3|:b4|
| :b1|      r10|  d|
| :b4|       r4|  f|
| :b4|       r5|:b5|
| :b5|       r9|  t|
| :b5|      r10|  e|
+----+---------+---+

Result:

+---+---------+----+----+---------+---+
|src|predicate|dst2|dst2|predicate|dst|
+---+---------+----+----+---------+---+
|  a|       r1|  b1|  b1|      r10|  d|
|  a|       r1|  b1|  b1|       r3| b4|
| b1|       r3|  b4|  b4|       r5| b5|
| b1|       r3|  b4|  b4|       r4|  f|
+---+---------+----+----+---------+---+


Dataset<Row> r = left
  .join(right, left.col("dst2").equalTo(right.col("dst2")))
  .select(left.col("src"),right.col("dst"));
Exception in thread "main" org.apache.spark.sql.AnalysisException: resolved attribute(s) dst#45 missing from dst2#177,src#43,predicate#197,predicate#44,dst2#182,dst#198 in operator !Project [src#43, dst#45];
3 - Assuming the select works, how can I add the obtained DataFrame to the left DataFrame?

I am working in Java.

You are using:

r = r.select(left.col("src"), right.col("dst"));
Spark does not seem to find the lineage back to the right DataFrame. That is not too surprising, since the query plan goes through a lot of optimization.
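
One consequence, shown in full in the options below, is that dst has to be resolved through the joined Dataset r rather than through the original right reference:

r = r.select(left.col("src"), r.col("dst"));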

Assuming the output you expect is:

+---+---+
|src|dst|
+---+---+
| b1|:b5|
| b1|  f|
|:b4|  e|
|:b4|  t|
+---+---+
You can use one of the following three options:

- resolve the column through the joined Dataset itself: r.col("dst")
- use the statically imported col() method: col("src"), col("dst")
- use the column names directly: select("src", "dst")

Here is the complete source code:

package net.jgp.books.spark.ch12.lab990_others;

import static org.apache.spark.sql.functions.col;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * Self join.
 * 
 * @author jgp
 */
public class SelfJoinAndSelectApp {

  /**
   * main() is your entry point to the application.
   * 
   * @param args
   */
  public static void main(String[] args) {
    SelfJoinAndSelectApp app = new SelfJoinAndSelectApp();
    app.start();
  }

  /**
   * The processing code.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Self join")
        .master("local[*]")
        .getOrCreate();

    Dataset<Row> inputDf = createDataframe(spark);
    inputDf.show(false);

    Dataset<Row> left = inputDf.withColumnRenamed("dst", "dst2");
    left.show();

    Dataset<Row> right = inputDf.withColumnRenamed("src", "dst2");
    right.show();

    Dataset<Row> r = left.join(
        right,
        left.col("dst2").equalTo(right.col("dst2")));
    r.show();

    Dataset<Row> resultOption1Df = r.select(left.col("src"), r.col("dst"));
    resultOption1Df.show();

    Dataset<Row> resultOption2Df = r.select(col("src"), col("dst"));
    resultOption2Df.show();

    Dataset<Row> resultOption3Df = r.select("src", "dst");
    resultOption3Df.show();
  }

  private static Dataset<Row> createDataframe(SparkSession spark) {
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField(
            "src",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "predicate",
            DataTypes.StringType,
            false),
        DataTypes.createStructField(
            "dst",
            DataTypes.StringType,
            false) });

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create("a", "r1", ":b1"));
    rows.add(RowFactory.create("a", "r2", "k"));
    rows.add(RowFactory.create("b1", "r3", ":b4"));
    rows.add(RowFactory.create("b1", "r10", "d"));
    rows.add(RowFactory.create(":b4", "r4", "f"));
    rows.add(RowFactory.create(":b4", "r5", ":b5"));
    rows.add(RowFactory.create(":b5", "r9", "t"));
    rows.add(RowFactory.create(":b5", "r10", "e"));

    return spark.createDataFrame(rows, schema);
  }
}
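
A related alternative that is not part of the original answer: for a self-join like this one, you can also alias each side before the join and resolve the columns through the aliases. This is a minimal sketch assuming the same SparkSession and inputDf as in the code above; the alias names l and r are illustrative, and it relies on the statically imported col() function.

// Sketch only: Dataset.alias() gives each side of the self-join its own name,
// so qualified column references stay resolvable after the join.
Dataset<Row> aliasedLeft = inputDf.withColumnRenamed("dst", "dst2").alias("l");
Dataset<Row> aliasedRight = inputDf.withColumnRenamed("src", "dst2").alias("r");

Dataset<Row> joined = aliasedLeft.join(
    aliasedRight,
    col("l.dst2").equalTo(col("r.dst2")));

joined.select(col("l.src"), col("r.dst")).show();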

Comments:

Could you add the error you are getting and a code snippet? @Vijay_Shinde please check the updated post.

Try selecting col("src"), col("dst"), or just select("src", "dst"); after the join a different DataFrame is created, and left and right may no longer be recognized. Adding show() and printSchema() of the DataFrames before and after the transformation would really help in understanding your use case.

@ApurbaPandey In Java you cannot just use col("colName"). When I join left and right I get the columns src | predicate | dst2 | dst2 | predicate | dst, and printSchema() of the right DataFrame is root |-- dst2: string (nullable = true) |-- predicate: string (nullable = true) |-- dst: string (nullable = true). So why is dst not recognized?

I think you should add a complete show() and printSchema() to your question... it is not very readable here. You can edit it. Could you also add the code used to build left and right before the join? See if that helps. Happy to help more if needed.

Thank you, that is great. My last question: if I run an SQL query on the DataFrames, such as input.registerTempTable("R7"); Dataset<Row> r7 = spark.sql("SELECT a.src, a.predicate, b.dst ... WHERE a.dst = b.src"); which solution would you prefer in that case, as an expert, and why?

@Moudi, ask that as a separate question and tag me there; it is hard to answer in such a small space :). Glad I could help, feel free to upvote my solution :).
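
For reference, here is a minimal sketch of the SQL variant raised in the last comments. The query in the comment is truncated, so the FROM clause below is an assumption; only the view name R7 and the selected columns come from the comment.

// createOrReplaceTempView is the non-deprecated counterpart of registerTempTable.
inputDf.createOrReplaceTempView("R7");

// Self-join expressed in SQL; the aliases a and b keep the select unambiguous.
Dataset<Row> r7 = spark.sql(
    "SELECT a.src, a.predicate, b.dst "
        + "FROM R7 a JOIN R7 b ON a.dst = b.src");
r7.show();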