
Scala Spark UDF to split a column value into multiple columns

Tags: scala, apache-spark, apache-spark-sql, apache-spark-2.0

I have a dataframe column named "description" whose values have the following format:

ABC XXXXXXXXXXXX STORE NAME ABC TYPE1
I want to parse it into 3 separate columns, as shown below:

|  mode |  type  |  store       |  description                           |
|------------------------------------------------------------------------|
|  ABC  |  TYPE1 |  STORE NAME  | ABC XXXXXXXXXXXX STORE NAME ABC TYPE1  |
I tried something along these lines. It works for simple UDF functions, but not for the one I wrote. The challenge is that the store value can be more than 2 words, i.e. there is no fixed word count.

def myFunc1: (String => (String, String, String)) = { description =>
  var descripe = description.split(" ")
  val `type` = descripe(descripe.size - 1)
  descripe = description.substring(description.indexOf("ABC") + 4, description.lastIndexOf("ABC")).split(" ")
  val mode = descripe(0)
  descripe(0) = ""
  val store = descripe.mkString(" ").trim
  (mode, store, `type`)
}

import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val schema = StructType(Array(
  StructField("mode", StringType, true),
  StructField("store", StringType, true),
  StructField("type", StringType, true)
))

val myUDF = udf(myFunc1, schema)

val test = pos.withColumn("test", myUDF(col("description")))
test.printSchema()

val a = test.withColumn("mode", col("test").getItem("_1"))
  .withColumn("store", col("test").getItem("_2"))
  .withColumn("type", col("test").getItem("_3"))
  .drop(col("test"))

a.printSchema()
a.show(5, false)
When I execute this, I get the following error:

18/10/06 21:38:02 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
org.apache.spark.SparkException: Failed to execute user defined function($anonfun$myFunc1$1$1: (string) => struct(mode:string,store:string,type:string))
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:234)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:228)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.StringIndexOutOfBoundsException: String index out of range: -4
    at java.lang.String.substring(String.java:1967)
    at com.hasif.bank.track.trasaction.TransactionParser$$anonfun$myFunc1$1$1.apply(TransactionParser.scala:26)
    at com.hasif.bank.track.trasaction.TransactionParser$$anonfun$myFunc1$1$1.apply(TransactionParser.scala:22)
    ... 16 more

Any suggestions on this would be appreciated.
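For reference, the "-4" in the wrapped StringIndexOutOfBoundsException is what String.substring throws when its end index turns out smaller than its begin index. In myFunc1 this happens for any description that does not contain two occurrences of "ABC", because indexOf("ABC") + 4 then lands past lastIndexOf("ABC"). A minimal illustration (not from the original post):

// Illustrative only: a description with no "ABC" at all.
val description = "XYZ 123 SOME STORE NAME TYPE9"
val begin = description.indexOf("ABC") + 4   // -1 + 4 = 3
val end   = description.lastIndexOf("ABC")   // -1
// description.substring(begin, end)
// => java.lang.StringIndexOutOfBoundsException: String index out of range: -4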

Check this out:

scala> val df = Seq("ABC XXXXXXXXXXXX STORE NAME ABC TYPE1").toDF("desc")
df: org.apache.spark.sql.DataFrame = [desc: string]

scala> df.withColumn("mode",split('desc," ")(0)).withColumn("type",split('desc," ")(5)).withColumn("store",concat(split('desc," ")(2), lit(" "), split('desc," ")(3))).show(false)
+-------------------------------------+----+-----+----------+
|desc                                 |mode|type |store     |
+-------------------------------------+----+-----+----------+
|ABC XXXXXXXXXXXX STORE NAME ABC TYPE1|ABC |TYPE1|STORE NAME|
+-------------------------------------+----+-----+----------+


scala>
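A side note on this version (not part of the original answer): the literal indices 0, 5, 2 and 3 assume the description always has exactly six space-separated tokens. With a three-word store name the positions shift, for example:

// Illustrative, in the same spark-shell session: with 7 tokens, index 5 is the trailing "ABC",
// so the "type" column would come out wrong and the fixed store indices 2 and 3 would miss "XYZ".
val longer = Seq("ABC XXXXXXXXXXXX STORE NAME XYZ ABC TYPE1").toDF("desc")
longer.withColumn("type", split('desc, " ")(5)).show(false)

The update below removes that assumption by counting the tokens at runtime.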
Update 1:

scala> def splitStore(x:String):String=
     | return x.split(" ").drop(2).init.init.mkString(" ")
splitStore: (x: String)String

scala> val mysplitstore = udf(splitStore(_:String):String)
mysplitstore: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))

scala> val df2 = Seq("ABC XXXXXXXXXXXX STORE NAME XYZ ABC TYPE1").toDF("desc")
df2: org.apache.spark.sql.DataFrame = [desc: string]

scala> val df3 = df2.withColumn("length",split('desc," "))
df3: org.apache.spark.sql.DataFrame = [desc: string, length: array<string>]

scala> val df4 = df3.withColumn("mode",split('desc," ")(size('length)-2)).withColumn("type",split('desc," ")(size('length)-1)).withColumn("store",mysplitstore('desc))
df4: org.apache.spark.sql.DataFrame = [desc: string, length: array<string> ... 3 more fields]

scala> df4.drop('length).show(false)
+-----------------------------------------+----+-----+--------------+
|desc                                     |mode|type |store         |
+-----------------------------------------+----+-----+--------------+
|ABC XXXXXXXXXXXX STORE NAME XYZ ABC TYPE1|ABC |TYPE1|STORE NAME XYZ|
+-----------------------------------------+----+-----+--------------+


scala>
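As a minor variation (a sketch, reusing df2 and mysplitstore from the spark-shell session above), the intermediate "length" column is not strictly needed; the token array can be referenced inline:

// Same result as df4 above, without the helper column.
val tokens = split('desc, " ")
df2.withColumn("mode",  tokens(size(tokens) - 2))
   .withColumn("type",  tokens(size(tokens) - 1))
   .withColumn("store", mysplitstore('desc))
   .show(false)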

Thanks for the response. The challenge here is that the store value can be more than 2 words.

I am assuming mode and type are always the last 2 words, the first 2 words belong to the description prefix, and anything in between is the store name?

I had missed that edge case. Updated again, please check.
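Finally, since the question asks for a single UDF that yields several columns: the tuple-returning approach from the question can also be made to work once the substring arithmetic is replaced by token counting. A minimal sketch, assuming Spark 2.x, the layout agreed on above (last two tokens are mode and type, the first two tokens are a prefix, everything in between is the store) and the question's pos DataFrame; parseDesc, parseDescUdf and parsed are illustrative names:

import org.apache.spark.sql.functions.{col, udf}

// Split "<prefix> <prefix> STORE ... NAME <mode> <type>" into (mode, store, type).
def parseDesc: String => (String, String, String) = { description =>
  val tokens = description.trim.split("\\s+")
  if (tokens.length < 4) ("", "", "")                    // row does not match the expected layout
  else (tokens(tokens.length - 2),                       // mode  = second-to-last token
        tokens.drop(2).dropRight(2).mkString(" "),       // store = everything in between
        tokens.last)                                     // type  = last token
}

// Without an explicit schema the result is a struct<_1,_2,_3>, because the function returns a Tuple3.
val parseDescUdf = udf(parseDesc)

val parsed = pos
  .withColumn("parsed", parseDescUdf(col("description")))
  .withColumn("mode",  col("parsed").getField("_1"))
  .withColumn("store", col("parsed").getField("_2"))
  .withColumn("type",  col("parsed").getField("_3"))
  .drop("parsed")

parsed.show(5, false)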