Handling “related” lines in Spark (Scala, Apache Spark)

Assume a directory contains several files, each holding rows like the following:

File 1:

20100101|12.34|...
20100101|12.34|...
20100101|36.00|...
20100102|36.00|...
20100101|14.00|...
20100101|14.00|...

File 2:

20100101|12.34|...
20100101|12.34|...
20100101|36.00|...
20100102|36.00|...
20100101|14.00|...
20100101|14.00|...

Adjacent rows with the same date and value correspond to the same event. Two rows from two separate files can never be adjacent.

Expected result:

e1|20100101|12.34|...
e1|20100101|12.34|...
e2|20100101|36.00|...
e3|20100102|36.00|...
e4|20100101|14.00|...
e4|20100101|14.00|...
e5|20100101|12.34|...
e5|20100101|12.34|...
e6|20100101|36.00|...
e7|20100102|36.00|...
e8|20100101|14.00|...
e8|20100101|14.00|...
where eN is an arbitrary value (e1, e2, e3, ...) used only to make the sample clearer.

Does the following code assign a unique event id to every row across all the files?

case class Event(
  LineNumber: Long, var EventId: Long,
  Date: String, Value: String //, ...
)

val lines = sc.textFile("directory")
val rows = lines.filter(l => !l.startsWith("someString")).zipWithUniqueId
  .map(l => l._2.toString +: l._1.split("\\|", -1))

// mutable state held in the driver and captured by the closure below
var lastValue: String = ""
var lastDate: String = "00000101"
var eventId: Long = 0

val rowDF = rows
  .map(c => {
    val e = Event(
      c(0).toLong, 0, c(1), c(2) //, ...
    )
    if (e.Date != lastDate || e.Value != lastValue) {
      lastDate = e.Date
      lastValue = e.Value
      eventId = e.LineNumber
    }
    e.EventId = eventId
    e
  }).toDF()
Basically, I use the unique line number produced by zipWithUniqueId as the key for a run of adjacent rows.

I think my underlying question is: could the second map operation split a file's content across multiple processes?
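
For reference, zipWithUniqueId numbers elements per partition (partition k of n gets the ids k, k + n, k + 2n, ...), so the ids are unique but neither contiguous nor ordered across partitions. A minimal sketch with a hypothetical toy RDD, not the question's data:

// Minimal sketch (hypothetical 2-partition RDD, not the question's data):
// zipWithUniqueId gives partition k the ids k, k + n, k + 2n, ...
val ids = sc.parallelize(Seq("a", "b", "c", "d"), 2).zipWithUniqueId
ids.collect()   // e.g. Array((a,0), (b,2), (c,1), (d,3))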

Here is an idiomatic solution; hope this helps. I use the file name to distinguish between files: a groupBy on value and filename, then zipWithUniqueId, and finally a join back to the original input DataFrame, which produces the desired output.

import org.apache.spark.sql.functions._
import org.apache.spark.sql._
import org.apache.spark.sql.types._


scala> val lines = spark.read.textFile("file:///home/fsdjob/theDir").withColumn("filename", input_file_name())

scala> lines.show(false)
+--------------+------------------------------------+
|value         |filename                            |
+--------------+------------------------------------+
|20100101|12.34|file:///home/fsdjob/theDir/file1.txt|
|20100101|12.34|file:///home/fsdjob/theDir/file1.txt|
|20100101|36.00|file:///home/fsdjob/theDir/file1.txt|
|20100102|36.00|file:///home/fsdjob/theDir/file1.txt|
|20100101|14.00|file:///home/fsdjob/theDir/file1.txt|
|20100101|14.00|file:///home/fsdjob/theDir/file1.txt|
|20100101|12.34|file:///home/fsdjob/theDir/file2.txt|
|20100101|12.34|file:///home/fsdjob/theDir/file2.txt|
|20100101|36.00|file:///home/fsdjob/theDir/file2.txt|
|20100102|36.00|file:///home/fsdjob/theDir/file2.txt|
|20100101|14.00|file:///home/fsdjob/theDir/file2.txt|
|20100101|14.00|file:///home/fsdjob/theDir/file2.txt|
+--------------+------------------------------------+

scala> val linesGrpWithUid = lines.groupBy("value", "filename").count.drop("count").rdd.zipWithUniqueId
linesGrpWithUid: org.apache.spark.rdd.RDD[(org.apache.spark.sql.Row, Long)] = MapPartitionsRDD[135] at zipWithUniqueId at <console>:31

scala> val linesGrpWithIdRdd = linesGrpWithUid.map( x => { org.apache.spark.sql.Row(x._1.get(0),x._1.get(1), x._2) })
linesGrpWithIdRdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[136] at map at <console>:31


scala> val schema =
    |   StructType(
    |     StructField("value", StringType, false) ::
    |     StructField("filename", StringType, false) ::
    |     StructField("id", LongType, false) ::
    |     Nil)
schema: org.apache.spark.sql.types.StructType = StructType(StructField(value,StringType,false), StructField(filename,StringType,false), StructField(id,LongType,false))

scala> val linesGrpWithIdDF = spark.createDataFrame(linesGrpWithIdRdd, schema)
linesGrpWithIdDF: org.apache.spark.sql.DataFrame = [value: string, filename: string ... 1 more field]

scala> linesGrpWithIdDF.show(false)
+--------------+------------------------------------+---+
|value         |filename                            |id |
+--------------+------------------------------------+---+
|20100101|12.34|file:///home/fsdjob/theDir/file2.txt|3  |
|20100101|36.00|file:///home/fsdjob/theDir/file2.txt|6  |
|20100102|36.00|file:///home/fsdjob/theDir/file2.txt|20 |
|20100102|36.00|file:///home/fsdjob/theDir/file1.txt|30 |
|20100101|14.00|file:///home/fsdjob/theDir/file1.txt|36 |
|20100101|14.00|file:///home/fsdjob/theDir/file2.txt|56 |
|20100101|36.00|file:///home/fsdjob/theDir/file1.txt|146|
|20100101|12.34|file:///home/fsdjob/theDir/file1.txt|165|
+--------------+------------------------------------+---+


scala> val output = lines.join(linesGrpWithIdDF, Seq("value", "filename"))
output: org.apache.spark.sql.DataFrame = [value: string, filename: string ... 1 more field]

scala> output.show(false)
+--------------+------------------------------------+---+
|value         |filename                            |id |
+--------------+------------------------------------+---+
|20100101|12.34|file:///home/fsdjob/theDir/file2.txt|3  |
|20100101|12.34|file:///home/fsdjob/theDir/file2.txt|3  |
|20100101|36.00|file:///home/fsdjob/theDir/file2.txt|6  |
|20100102|36.00|file:///home/fsdjob/theDir/file2.txt|20 |
|20100102|36.00|file:///home/fsdjob/theDir/file1.txt|30 |
|20100101|14.00|file:///home/fsdjob/theDir/file1.txt|36 |
|20100101|14.00|file:///home/fsdjob/theDir/file1.txt|36 |
|20100101|14.00|file:///home/fsdjob/theDir/file2.txt|56 |
|20100101|14.00|file:///home/fsdjob/theDir/file2.txt|56 |
|20100101|36.00|file:///home/fsdjob/theDir/file1.txt|146|
|20100101|12.34|file:///home/fsdjob/theDir/file1.txt|165|
|20100101|12.34|file:///home/fsdjob/theDir/file1.txt|165|
+--------------+------------------------------------+---+
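
If the date and the value are needed as separate columns downstream, the pipe-delimited value column can be split with the built-in split function. A minimal sketch (the column names "date" and "amount" are illustrative, not from the original post):

// Minimal sketch: split the '|'-delimited value column of the output above.
// The column names "date" and "amount" are illustrative.
import org.apache.spark.sql.functions.{split, col}
val parsed = output
  .withColumn("date",   split(col("value"), "\\|").getItem(0))
  .withColumn("amount", split(col("value"), "\\|").getItem(1))
parsed.show(false)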

Do the files have a tab delimiter?
@C.S.ReddyGadipally No, they are '|'-delimited.
I meant to ask whether they use a '|' delimiter. Please let me know if my answer works for you.
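
For reference, a minimal sketch (assumes Spark's built-in CSV reader with an explicit separator; not part of the answer above) of reading the '|'-separated files directly into one column per field:

// Minimal sketch (not part of the answer above): read the '|'-separated files
// with the CSV reader; columns come out as _c0, _c1, ... by default.
val cols = spark.read.option("sep", "|").csv("file:///home/fsdjob/theDir")
cols.show(false)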