Scala 无法使用内置toLocalIterator()将Spark数据集转换为迭代器
我试图将 Spark 数据集（Dataset）转换为迭代器，以便将数据写入 XDB。在构建了所需的数据集之后，我需要把它转换为迭代器传递给 XDB 写入器。但是，当对 Dataset 类调用内置函数 toLocalIterator() 时就会出现问题，我得到了下面的异常。相关代码如下：
/**
 * Aggregates the input frame into one row count per (download_date, unixtime)
 * pair and maps each group to a `TimeMetric`.
 *
 * NOTE(review): relies on `$`-interpolation and the tuple encoder from
 * `spark.implicits._`, plus `org.apache.spark.sql.functions.count` — confirm
 * those imports exist at file level.
 *
 * @param df     raw frame with at least `download_date` and `unixtime` columns
 * @param params conversion parameters (unused here, part of the trait contract)
 * @param config application configuration (unused here, part of the trait contract)
 * @return one `TimeMetric` per distinct (download_date, unixtime) group
 */
override def gatherTimeMetrics(df: DataFrame)
  (implicit params: ConversionParams, config: Config): Dataset[TimeMetric] = {
  df
    .select($"download_date", $"unixtime".cast("long") as "unixtime")
    .groupBy("download_date", "unixtime")
    .agg(count("*") as "rows")
    // Collapse to a single partition so a later toLocalIterator() fetches
    // everything from one task.
    .repartition(1)
    .as[(String, Long, Long)]
    .map { case (downloadDate, unixtime, rows) =>
      TimeMetric(
        unixtime,
        // Fixed: the pasted code had "rows" - > rows, which does not compile;
        // the tuple arrow is `->` with no space.
        Map("rows" -> rows),
        Map("download_date" -> downloadDate)
      )
    }
}
返回的数据集在此处被使用：
/** Gathers time metrics for `df` and writes them through the metrics service. */
def run(df: DataFrame)(implicit params: T, config: Config): Unit = {
  // Materialize the metrics lazily on the driver via a local iterator.
  val timeMetricsIter = gatherTimeMetrics(df).toLocalIterator()
  metricsService.write(Metrics(getMeasurementName, timeMetricsIter, getCommonTags))
}
我原本期望内置的 toLocalIterator() 能把数据集转换为迭代器，但却得到了如下异常：
Exception in thread "main" java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter$
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.buildReaderWithPartitionValues(ParquetFileFormat.scala:301)
at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:285)
at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:283)
at org.apache.spark.sql.execution.FileSourceScanExec.inputRDDs(DataSourceScanExec.scala:303)
at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:42)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:386)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:88)
at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:124)
at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:115)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
at org.apache.spark.sql.execution.exchange.ShuffleExchange.doExecute(ShuffleExchange.scala:115)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:252)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
at org.apache.spark.sql.execution.DeserializeToObjectExec.inputRDDs(objects.scala:79)
at org.apache.spark.sql.execution.MapElementsExec.inputRDDs(objects.scala:215)
at org.apache.spark.sql.execution.SerializeFromObjectExec.inputRDDs(objects.scala:116)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:386)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:228)
at org.apache.spark.sql.execution.SparkPlan.executeToIterator(SparkPlan.scala:290)
at org.apache.spark.sql.Dataset$$anonfun$toLocalIterator$1.apply(Dataset.scala:2421)
at org.apache.spark.sql.Dataset$$anonfun$toLocalIterator$1.apply(Dataset.scala:2416)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841)
at org.apache.spark.sql.Dataset.toLocalIterator(Dataset.scala:2416)
下面这个依赖（pom.xml 片段）中包含类
org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>1.6.1</version>
</dependency>
用于打印 classpath 的调用代码是：
// Print every classpath URL for diagnosis. `foreach` returns Unit, so binding
// its result to a val (as the original did) was meaningless — just run the
// side effect. `urlsinclasspath` is defined elsewhere in the project.
urlsinclasspath(getClass.getClassLoader).foreach(println)
我的项目使用 sbt，我以为本地已经有了 spark-sql 的 jar —— 在我的外部库（External Libraries）里确实能找到 ParquetSchemaConverter 类。这是我的 build.sbt：

    val sparkVersion = "2.2.1"
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-core"  % sparkVersion % Provided,
      "org.apache.spark" %% "spark-sql"   % sparkVersion % Provided,
      "org.apache.spark" %% "spark-mllib" % sparkVersion % Provided,
      "org.apache.spark" %% "spark-hive"  % sparkVersion % Provided
    )

`Provided` 的含义是：该依赖在运行时由运行环境提供；如果你在构建 fat/uber jar，这些依赖不会被打进 fat jar。maven 和 sbt 只是语法不同，语义相同。你在 driver 端试过 urlsinclasspath 吗？它应该能提供足够的运行时 classpath 信息。
// Print every classpath URL for diagnosis. `foreach` returns Unit, so binding
// its result to a val (as the original did) was meaningless — just run the
// side effect. `urlsinclasspath` is defined elsewhere in the project.
urlsinclasspath(getClass.getClassLoader).foreach(println)