Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/scala/18.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Scala 将数据帧保存到Spark中的TFRecords时出错_Scala_Apache Spark_Apache Spark Sql_Tfrecord - Fatal编程技术网

Scala 将数据帧保存到Spark中的TFRecords时出错

Scala 将数据帧保存到Spark中的TFRecords时出错,scala,apache-spark,apache-spark-sql,tfrecord,Scala,Apache Spark,Apache Spark Sql,Tfrecord,我试图将dataframe保存到spark shell中的TFrecord文件中,该文件需要spark tensorflow连接器jar的依赖项,因此我运行 spark-shell --jars xxx/xxx/spark-tensorflow-connector_2.11-1.11.0.jar 然后在spark shell中运行以下代码: scala> import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} imp

我试图将dataframe保存到spark shell中的TFrecord文件中,该文件需要spark tensorflow连接器jar的依赖项,因此我运行

spark-shell --jars xxx/xxx/spark-tensorflow-connector_2.11-1.11.0.jar
然后在spark shell中运行以下代码:

scala> import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

scala> val df = Seq((8, "bat"),(8, "abc"), (1, "xyz"), (2, "aaa")).toDF("number", "word")
df: org.apache.spark.sql.DataFrame = [number: int, word: string]

scala> df.show
+------+----+
|number|word|
+------+----+
|     8| bat|
|     8| abc|
|     1| xyz|
|     2| aaa|
+------+----+

scala> var s = df.write.mode(SaveMode.Overwrite).format("tfrecords").option("recordType", "Example")
s: org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row] = org.apache.spark.sql.DataFrameWriter@da1382f

scala> s.save("tmp/tfrecords")
java.lang.NoClassDefFoundError: scala/Product$class                             
  at org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.<init>(TensorflowRelation.scala:29)
  at org.tensorflow.spark.datasources.tfrecords.DefaultSource.createRelation(DefaultSource.scala:78)
  at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
  at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
  at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
  at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
  at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
  at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
  at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:944)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
  at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
  at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
  at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
  at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:944)
  at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:396)
  at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:380)
  at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:269)
  ... 47 elided
Caused by: java.lang.ClassNotFoundException: scala.Product$class
  at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
  at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
  ... 70 more

scala>import org.apache.spark.sql.{DataFrame,SaveMode,SparkSession}
导入org.apache.spark.sql.{DataFrame,SaveMode,SparkSession}
scala> val df = Seq((8, "bat"), (8, "abc"), (1, "xyz"), (2, "aaa")).toDF("number", "word")
df:org.apache.spark.sql.DataFrame=[number:int,word:string]
scala>df.show
+------+----+
|数字|字|
+------+----+
|8 |蝙蝠|
|8 | abc|
|1 | xyz|
|2 | aaa|
+------+----+
scala> var s = df.write.mode(SaveMode.Overwrite).format("tfrecords").option("recordType", "Example")
s: org.apache.spark.sql.DataFrameWriter[org.apache.spark.sql.Row] = org.apache.spark.sql.DataFrameWriter@da1382f
scala> s.save("tmp/tfrecords")
java.lang.NoClassDefFoundError:scala/Product$class
位于org.tensorflow.spark.datasources.tfrecords.TensorflowRelation.&lt;init&gt;(TensorflowRelation.scala:29)
位于org.tensorflow.spark.datasources.tfrecords.DefaultSource.createRelation(DefaultSource.scala:78)
位于org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
位于org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
位于org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
位于org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
位于org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
位于org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
位于org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
位于org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
位于org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
位于org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
位于org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
位于org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:944)
位于org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
在org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
位于org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
位于org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
位于org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
位于org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:944)
位于org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:396)
位于org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:380)
位于org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:269)
... 47删去
原因:java.lang.ClassNotFoundException:scala.Product$class
位于java.net.URLClassLoader.findClass(URLClassLoader.java:382)
位于java.lang.ClassLoader.loadClass(ClassLoader.java:418)
位于sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
位于java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 70多

Spark版本是3.0.0,使用Scala版本2.12.10(Java HotSpot(TM)64位服务器虚拟机,Java 1.8.0_261)

问题是，您使用的是使用Scala 2.11编译的Tensorflow连接器（注意jar名称中的
_2.11
部分），而Spark 3.0是使用Scala 2.12编译的


到目前为止,还没有为Spark 3.0编译的Tensorflow连接器,因此您需要使用使用Scala 2.11编译的Spark 2.4.6。

问题是您使用的Tensorflow连接器是使用Scala 2.11编译的（注意jar名称中的
_2.11
部分），而Spark 3.0是使用Scala 2.12编译的


到目前为止，还没有为Spark 3.0编译的Tensorflow连接器，因此您需要使用以Scala 2.11编译的Spark 2.4.6。

在另一台机器上使用Spark2成功,非常感谢!在另一台机器上使用Spark2成功,非常感谢!