Spark Scala: converting RDD of SQL Rows to a Vector


I need to convert the SQL rows held in a var named rows into a vector. I use the steps below:

import org.apache.spark.mllib.linalg.Vectors

val df = sqlContext.sql("SELECT age, gender FROM test.test2")
val rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = df.rdd
val doubVals = rows.map { row => row.getDouble(0) }
val vector = Vectors.dense{ doubVals.collect }
But it throws a lot of exceptions, such as ClassNotFoundException:

scala> val vector = Vectors.dense{ doubVals.collect}
 WARN  2017-07-14 02:12:09,477 org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 2.0 (TID 7, 192.168.110.200):
 java.lang.ClassNotFoundException: $line31.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:348)
    at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
    at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1826)
    at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)
    at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
    at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

    [Stage 2:>                                                          (0 + 3) / 7]
 ERROR 2017-07-14 02:12:09,787 org.apache.spark.scheduler.TaskSetManager: Task 2 in stage 2.0 failed 4 times; aborting job
 org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 2.0 failed 4 times, most recent failure: Lost task 2.3 in stage 2.0 (TID 21, 192.168.110.200): java.lang.ClassNotFoundException: $anonfun$1
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:348)
    at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
    at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1826)
    at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:422)
    at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
    at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:748)

Can you help me fix this error?

Look at the following steps (they work for me):

scala> val df = Seq(2.0, 3.0, 3.2, 2.3, 1.2).toDF("col")
df: org.apache.spark.sql.DataFrame = [col: double]

scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors

scala> val rows = df.rdd
rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[3] at rdd at <console>:31

scala> val doubVals = rows.map{ row => row.getDouble(0) }
doubVals: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[4] at map at <console>:33

scala> val vector = Vectors.dense{ doubVals.collect }
vector: org.apache.spark.mllib.linalg.Vector = [2.0,3.0,3.2,2.3,1.2]

This should give you a hint for debugging your own code.
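
One possibility raised in the comments below is a column-type mismatch: if age is stored as an integer in test.test2, row.getDouble(0) fails on the executors. A minimal sketch (the CAST is my suggestion, not the poster's verified fix) that forces the column to DOUBLE before extraction:

import org.apache.spark.mllib.linalg.Vectors

// Cast in SQL so every Row genuinely carries a Double
// (assumption: age is an INT column in test.test2)
val df = sqlContext.sql("SELECT CAST(age AS DOUBLE) AS age FROM test.test2")
val doubVals = df.rdd.map(row => row.getDouble(0))
val vector = Vectors.dense(doubVals.collect)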


My answer shows that if you do it this way, you don't get the error. Did you follow the same steps?

It crashes on the line rows.map{ row => row.getDouble(0) }

row(0) is an age, so it must be an integer. Could I see a sample of your DataFrame df?
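
If age really is an integer column, a type-tolerant extraction avoids the hard getDouble cast. A small sketch (the getAs[Number] idiom is my assumption about how Spark boxes numeric values, not something from the original thread):

// Pull the value out as a boxed java.lang.Number, then widen to Double
// (assumption: the column is numeric -- Int, Long, or Double)
val doubVals = rows.map(row => row.getAs[Number](0).doubleValue)
val vector = Vectors.dense(doubVals.collect)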