Apache Spark：为什么保存超过 10000 列的 Parquet 文件会导致 JaninoRuntimeException？

Apache Spark：为什么保存超过 10000 列的 Parquet 文件会导致 JaninoRuntimeException？,apache-spark,apache-spark-sql,parquet,Apache Spark,Apache Spark Sql,Parquet,我有一段生成随机 DataFrame 的代码，并在 Spark 2.1 中将其作为 Parquet 文件写入磁盘。当列数达到 100000 时会遇到问题，但对于 10000 列，它似乎可以正常工作 在 100000 列的情况下，Spark 只是在屏幕上打印出一大段生成的代码，并抛出如下错误 我怎样才能在没有错误的情况下将其写入 Parquet 文件 import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType} import org.apache.spark.

我有一段生成随机 DataFrame 的代码，并在 Spark 2.1 中将其作为 Parquet 文件写入磁盘。当列数达到 100000 时会遇到问题，但对于 10000 列，它似乎可以正常工作。

在 100000 列的情况下，Spark 只是在屏幕上打印出一大段生成的代码，并抛出如下错误：

我怎样才能在没有错误的情况下将其写入 Parquet 文件？

import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.util.Random
import scala.math

// Reproduction script: build an nRows x nCols DataFrame of random doubles and
// write it to disk as Parquet. In Spark 2.1 this fails at 100000 columns because
// the generated `SpecificUnsafeProjection` class overflows the JVM constant pool.
val nRows = 10000
val nCols = 100000

// One Row per element; each cell is a random double in (0, 1000], rounded up.
val rD = sc.parallelize(0 until nRows).map { _ =>
  val cells = Seq.fill(nCols)(math.ceil(Random.nextDouble() * 1000))
  Row.fromSeq(cells)
}

// Nullable DoubleType columns named C0 .. C{nCols-1}.
val schema = StructType((0 until nCols).map(i => StructField(s"C$i", DoubleType, nullable = true)))

val df = spark.createDataFrame(rD, schema)
df.select("*").write.format("parquet").save("df.parquet")

/* 379357 */   private void apply_22702(InternalRow i) {
/* 379358 */
/* 379359 */
/* 379360 */     boolean isNull90808 = i.isNullAt(90808);
/* 379361 */     double value90808 = isNull90808 ? -1.0 :  (i.getDouble(90808));
/* 379362 */     if (isNull90808) {
/* 379363 */       rowWriter.setNullAt(90808);
/* 379364 */     } else { 
/* 379365 */       rowWriter.write(90808, value90808);
/* 379366 */     }
/* 379367 */
/* 379368 */
/* 379369 */     boolean isNull90809 = i.isNullAt(90809);  
/* 379370 */     double value90809 = isNull90809 ? -1.0 : (i.getDouble(90809));
/* 379371 */     if (isNull90809) {
/* 379372 */       rowWriter.setNullAt(90809);
/* 379373 */     } else {
/* 379374 */       rowWriter.write(90809, value90809);
/* 379375 */     }
/* 379376 */
/* 379377 */
/* 379378 */     boolean isNull90810 = i.isNullAt(90810);
/* 379379 */     double value90810 = isNull90810 ? -1.0 : (i.getDouble(90810)); 
/* 379380 */     if (isNull90810) {
/* 379381 */       rowWriter.setNullAt(90810);
/* 379382 */     } else {
/* 379383 */       rowWriter.write(90810, value90810);
/* 379384 */     }
/* 379385 */
.
.
.
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:941)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:998)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:995)
    at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
    at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
    at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
    ... 25 more
Caused by: org.codehaus.janino.JaninoRuntimeException: Constant pool for class org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection has grown past JVM limit of 0xFFFF
    at org.codehaus.janino.util.ClassFile.addToConstantPool(ClassFile.java:499)
    at org.codehaus.janino.util.ClassFile.addConstantIntegerInfo(ClassFile.java:395)
    at org.codehaus.janino.UnitCompiler.addConstantIntegerInfo(UnitCompiler.java:11137)
    at org.codehaus.janino.UnitCompiler.pushConstant(UnitCompiler.java:9681)
    at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4911)
    at org.codehaus.janino.UnitCompiler.access$7700(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3776)
    at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3762)
    at org.codehaus.janino.Java$IntegerLiteral.accept(Java.java:4635)
    at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
    at org.codehaus.janino.UnitCompiler.fakeCompile(UnitCompiler.java:3128)
    at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4927)
    at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4526)
    at org.codehaus.janino.UnitCompiler.access$7500(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3774)
    at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3762)
    at org.codehaus.janino.Java$MethodInvocation.accept(Java.java:4328)
    at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
    at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4933)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2330)
    at org.codehaus.janino.UnitCompiler.access$2600(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1386)
    at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1370)
    at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:2974)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1370)
    at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1450)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:2811)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1262)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1234)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:538)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:890)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:894)
    at org.codehaus.janino.UnitCompiler.access$600(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:377)
    at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:369)
    at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1128)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1209)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:564)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:420)
    at org.codehaus.janino.UnitCompiler.access$400(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:374)
    at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:369)
    at org.codehaus.janino.Java$AbstractPackageMemberClassDeclaration.accept(Java.java:1309)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
    at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:345)
    at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:396)
    at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:311)
    at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:229)
    at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:196)
    at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:91)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:935)
    ... 30 more

这看起来像是代码生成（codegen）触及 JVM 限制的那类令人讨厌的问题之一——这里并非常见的 64KB 方法大小限制，而是生成类的常量池条目数超过了 0xFFFF 的上限（相关讨论可参见 SPARK-18016 等 JIRA 工单）。


您可能想尝试一下每夜构建的 2.2.0-SNAPSHOT 版本之一，看看即将发布的版本是否已经解决了您的问题。

2.2.0已经发布