
Apache Spark: Modifying the structure of a Spark DataFrame


Suppose I have a Spark DataFrame with the following columns:

| header1 | location | precision | header2 | velocity | data |
(this df also contains some data)

Now I want to transform the df into a new structure with two top-level columns, each containing nested fields, something like this:

|          gps                   |         velocity          |
| header1 | location | precision | header2 | velocity | data |
Ideally, I would like to be able to call a method like this:

df1 = createStructure(df, "gps", ["header1", "gps", "precision"])
df2 = createStructure(df1, "velocity", ["header2", "velocity", "data"])
I tried using "withColumn", but had no luck.

Try this:

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> val df1 = Seq(("h1-4", "loc4", "prec4", "h2-4", "vel4", "d4"), ("h1-5", "loc5", "prec5", "h2-5", "vel5", "d5")).toDF("header1", "location", "precision", "header2", "velocity", "data")
df1: org.apache.spark.sql.DataFrame = [header1: string, location: string ... 4 more fields]

scala> df1.show(false)
+-------+--------+---------+-------+--------+----+
|header1|location|precision|header2|velocity|data|
+-------+--------+---------+-------+--------+----+
|h1-4   |loc4    |prec4    |h2-4   |vel4    |d4  |
|h1-5   |loc5    |prec5    |h2-5   |vel5    |d5  |
+-------+--------+---------+-------+--------+----+


scala> val outputDF = df1.withColumn("gps", struct($"header1", $"location", $"precision")).withColumn("velocity", struct($"header2", $"velocity", $"data")).select("gps", "velocity")
outputDF: org.apache.spark.sql.DataFrame = [gps: struct<header1: string, location: string ... 1 more field>, velocity: struct<header2: string, velocity: string ... 1 more field>]

scala> outputDF.printSchema
root
|-- gps: struct (nullable = false)
|    |-- header1: string (nullable = true)
|    |-- location: string (nullable = true)
|    |-- precision: string (nullable = true)
|-- velocity: struct (nullable = false)
|    |-- header2: string (nullable = true)
|    |-- velocity: string (nullable = true)
|    |-- data: string (nullable = true)


scala> outputDF.show(false)
+-------------------+----------------+
|gps                |velocity        |
+-------------------+----------------+
|[h1-4, loc4, prec4]|[h2-4, vel4, d4]|
|[h1-5, loc5, prec5]|[h2-5, vel5, d5]|
+-------------------+----------------+
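
If you want to wrap this pattern into the createStructure helper from the question, here is a minimal sketch. Note that createStructure is the hypothetical name from the question, not part of the Spark API, and the sketch assumes all listed columns exist in the DataFrame:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, struct}

// Hypothetical helper: collapse the listed columns into a single struct
// column named `name`, then drop the originals. Columns whose name equals
// `name` are not dropped, so the new struct survives even when it reuses
// an input column name (as with "velocity" above).
def createStructure(df: DataFrame, name: String, columns: Seq[String]): DataFrame =
  df.withColumn(name, struct(columns.map(col): _*))
    .drop(columns.filterNot(_ == name): _*)

// Usage, reproducing the two-step transformation from the question:
val withGps      = createStructure(df1, "gps", Seq("header1", "location", "precision"))
val withVelocity = createStructure(withGps, "velocity", Seq("header2", "velocity", "data"))
withVelocity.select("gps", "velocity").show(false)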

Thanks! That worked.