at ...(Unknown Source)
at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$generateResultProjection$1.apply(AggregationIterator.scala:235)
at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$generateResultProjection$1.apply(AggregationIterator.scala:224)
at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:86)
at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.next(ObjectAggregationIterator.scala:33)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$GroupedIterator.takeDestructively(Iterator.scala:1074)
at scala.collection.Iterator$GroupedIterator.go(Iterator.scala:1089)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1126)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:368)
at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.writeIteratorToStream(PythonUDFRunner.scala:50)
at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:430)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2136)
at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:236)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2362)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2350)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2349)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2349)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1102)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1102)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1102)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2582)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2529)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2517)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:897)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2280)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2378)
at org.apache.spark.sql.execution.collect.Collector.runSparkJobs(Collector.scala:245)
at org.apache.spark.sql.execution.collect.Collector.collect(Collector.scala:280)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:80)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:86)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:480)
at org.apache.spark.sql.execution.SparkPlan.executeCollectResult(SparkPlan.scala:325)
at org.apache.spark.sql.Dataset$$anonfun$50.apply(Dataset.scala:3358)
at org.apache.spark.sql.Dataset$$anonfun$50.apply(Dataset.scala:3357)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3492)
at org.apache.spark.sql.Dataset$$anonfun$54.apply(Dataset.scala:3487)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:113)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:242)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:99)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:172)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3487)
at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3357)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: Cannot grow BufferHolder by size 176 because size after growing exceeds size limit 2147483632
at org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:71)
at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow(UnsafeWriter.java:62)
at org.apache.spark.sql.ca
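
If I am reading the error correctly, the 2147483632-byte limit is just under the maximum JVM byte-array length, and BufferHolder backs a single UnsafeRow with one such array. So the entire collect_list result, which ends up in a single row, has to fit in roughly 2 GiB. This is my assumption about the limit, not something from the Spark docs; a quick check of the number:

### Assumption: the cap in the error message is just under 2 GiB (max JVM array length)
print(2147483632 == 2**31 - 16)   ## True -> a single row must stay below ~2 GiB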
### Import relevant modules / functions
import random
from pyspark.context import SparkContext
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
from pyspark.sql.functions import * 
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from collections import Counter ## used for dictionary work
from itertools import product ## product('AB', 'xyz') == 'Ax' 'Ay' 'Az' 'Bx' 'By' 'Bz'

### Define UDFs to calculate all col(A) and col(B) 'dot product' pairings
udf_flatten3Dto2DList = udf(lambda x: [pair for sublist in x for pair in sublist], ArrayType(ArrayType(LongType()))) ## flatten a list of lists of pairs into a flat list of pairs
def product_lists(x, y):
  return [list(pair) for pair in product(x, y)] ## cartesian product of the two lists, each pair returned as a list
udf_product = udf(product_lists, ArrayType(ArrayType(LongType())))
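
As a quick sanity check, here is what the two helpers produce on plain Python lists (made-up inputs, no Spark involved):

### Local check of the helpers (plain Python, outside Spark)
print(product_lists([2, 3], [1, 2]))
## -> [[2, 1], [2, 2], [3, 1], [3, 2]]
flatten = lambda x: [pair for sublist in x for pair in sublist]  ## same logic as udf_flatten3Dto2DList
print(flatten([[[2, 1], [2, 2]], [[3, 1], [3, 2]]]))
## -> [[2, 1], [2, 2], [3, 1], [3, 2]]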

############ EXAMPLE THAT WORKS AS EXPECTED ################
### Define parameters to control scale of problem
pair_index_options = 3
num_rows = 10

### Create dummy dataframe
a = spark.createDataFrame([(1, [random.randrange(1,pair_index_options + 1), random.randrange(1,pair_index_options + 1)], [random.randrange(1,pair_index_options + 1), random.randrange(1,pair_index_options + 1)]) for i in range(num_rows)], ['ID', 'A', 'B'])
a.show()

>>> OUTPUT:
+---+------+------+
| ID|     A|     B|
+---+------+------+
|  1|[2, 2]|[3, 3]|
|  1|[2, 3]|[1, 2]|
|  1|[1, 2]|[2, 1]|
|  1|[1, 3]|[2, 1]|
|  1|[2, 1]|[1, 1]|
|  1|[3, 1]|[2, 1]|
|  1|[3, 1]|[2, 3]|
|  1|[1, 1]|[1, 1]|
|  1|[1, 1]|[3, 1]|
|  1|[2, 2]|[2, 1]|
+---+------+------+


pair_lists = (a.withColumn('product', udf_product('A','B'))
               .select(f.collect_list('product').alias('product'))
               .withColumn('product_pairs', udf_flatten3Dto2DList('product'))
               .collect()[0]['product_pairs'])
print(pair_lists)

>>> OUTPUT:
[[1, 2], [1, 1], [2, 2], [2, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 2], [1, 1], [3, 2], [3, 1], [1, 3], [1, 1], [1, 3], [1, 1], [2, 1], [2, 2], [3, 1], [3, 2], [3, 2], [3, 3], [1, 2], [1, 3], [2, 1], [2, 1], [1, 1], [1, 1], [2, 2], [2, 1], [2, 2], [2, 1], [2, 3], [2, 3], [2, 3], [2, 3], [3, 2], [3, 1], [1, 2], [1, 1]]

### Convert lists to tuples to enable creation of a counter Dictionary (lists are not 'keyable' as they are not immutable)
pair_tuples = [tuple(pair) for pair in pair_lists]
dict_Counter = Counter(pair_tuples)
print(dict_Counter)

>>> OUTPUT:
Counter({(1, 1): 11, (2, 1): 6, (1, 2): 4, (2, 2): 4, (3, 2): 4, (2, 3): 4, (3, 1): 3, (1, 3): 3, (3, 3): 1})


############ EXAMPLE THAT CRASHES ################
pair_index_options = 200 ## Scale such that there are approximately 40,000 pair combinations
num_pairs = int(1e8) ### Scale such that there are approximately 100,000,000 rows to process

a = spark.createDataFrame([(1, [random.randrange(1,pair_index_options + 1), random.randrange(1,pair_index_options + 1)], [random.randrange(1,pair_index_options + 1), random.randrange(1,pair_index_options + 1)]) for i in range(num_pairs)], ['ID', 'A', 'B'])
a.show()
pair_lists = (a.withColumn('product', udf_product('A', 'B'))
               .select(f.collect_list('product').alias('product'))
               .withColumn('product_pairs', udf_flatten3Dto2DList('product'))
               .collect()[0]['product_pairs'])
print(pair_lists)
print("")
pair_tuples = [tuple(pair) for pair in pair_lists]
dict_Counter = Counter(pair_tuples)
print(dict_Counter)
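
For scale, my rough back-of-envelope estimate of the single row produced by collect_list in the crashing example (assuming ~8 bytes per long and ignoring UnsafeArrayData headers, offsets and null bitmaps, so a lower bound):

### Rough size estimate of the single collected row (lower-bound assumption: 8 bytes per long)
rows = int(1e8)            ## num_pairs above
pairs_per_row = 2 * 2      ## product of two 2-element arrays -> 4 pairs per row
bytes_per_pair = 2 * 8     ## 2 longs per pair
estimated_bytes = rows * pairs_per_row * bytes_per_pair
print(estimated_bytes)                  ## 6400000000, i.e. ~6.4 GB
print(estimated_bytes > 2147483632)     ## True -> well over the BufferHolder size limit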