Python 构造ClassDict的参数应为零(对于pyspark.ml.linalg.SparseVector)

Python 构造ClassDict的参数应为零(对于pyspark.ml.linalg.SparseVector),python,apache-spark,pyspark,apache-spark-mllib,lda,Python,Apache Spark,Pyspark,Apache Spark Mllib,Lda,我正在努力创建一个LDA模型 以下是我到目前为止所做的工作——创建了一个unigram,并基于它将数据帧转换为RDD 代码如下: countVectors = CountVectorizer(inputCol="unigrams", outputCol="features", vocabSize=3, minDF=2.0) model = countVectors.fit(res) result = model.transform(res) result.show(5, truncate=Fa

我正在努力创建一个LDA模型

以下是我到目前为止所做的工作——创建了一个unigram,并基于它将数据帧转换为RDD

代码如下:

countVectors = CountVectorizer(inputCol="unigrams", outputCol="features", vocabSize=3, minDF=2.0)
model = countVectors.fit(res)

result = model.transform(res)
result.show(5, truncate=False)
这是数据集

+------------------------------------------------------------------------+---+-------------------+
|unigrams                                                                |id |features           |
+------------------------------------------------------------------------+---+-------------------+
|[born, furyth, leaguenemesi, rise, (the, leaguenemesi, rise, seri, book]|0  |(3,[0,1],[1.0,1.0])|
|[hous, raven, (the, nightfal, chronicl, book]                           |1  |(3,[0,1],[1.0,1.0])|
|[law, 101everyth, need, know, american, law, fourth, edit]              |2  |(3,[],[])          |
|[hot, summer, night]                                                    |3  |(3,[],[])          |
|[wet, bundlemega, collect, sex, stori, (30, book, box, set)]            |4  |(3,[0],[1.0])      |
+------------------------------------------------------------------------+---+-------------------+
根据上面的基本数据,我创建了MLLib所需的rdd,该rdd基于我正在关注的databrick文章

from pyspark.mllib.linalg import Vector, Vectors
rdd_convert = result.rdd

corpus = rdd_convert.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
corpus.take(4)
上述代码生成以下数据:

[[0,
  Row(unigrams=['born', 'furyth', 'leaguenemesi', 'rise', '(the', 'leaguenemesi', 'rise', 'seri', 'book'], id=0, features=SparseVector(3, {0: 1.0, 1: 1.0}))],
 [1,
  Row(unigrams=['hous', 'raven', '(the', 'nightfal', 'chronicl', 'book'], id=1, features=SparseVector(3, {0: 1.0, 1: 1.0}))],
 [2,
  Row(unigrams=['law', '101everyth', 'need', 'know', 'american', 'law', 'fourth', 'edit'], id=2, features=SparseVector(3, {}))],
 [3,
  Row(unigrams=['hot', 'summer', 'night'], id=3, features=SparseVector(3, {}))]]
现在我想在RDD上使用LDA

from pyspark.mllib.clustering import LDA, LDAModel
# Cluster the documents into three topics using LDA

from pyspark.mllib.linalg import Vectors


type(corpus)

rdd = spark.sparkContext.parallelize(corpus.collect())
type(rdd)
如果运行ldaModel=LDA.train(rdd),则会出现以下错误:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-33-2abff4618359> in <module>()
----> 1 ldaModel = LDA.train(rdd)

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/clustering.py in train(cls, rdd, k, maxIterations, docConcentration, topicConcentration, seed, checkpointInterval, optimizer)
   1037         model = callMLlibFunc("trainLDAModel", rdd, k, maxIterations,
   1038                               docConcentration, topicConcentration, seed,
-> 1039                               checkpointInterval, optimizer)
   1040         return LDAModel(model)
   1041 

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/common.py in callMLlibFunc(name, *args)
    128     sc = SparkContext.getOrCreate()
    129     api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130     return callJavaFunc(sc, api, *args)
    131 
    132 

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/common.py in callJavaFunc(sc, func, *args)
    121     """ Call Java Function """
    122     args = [_py2java(sc, a) for a in args]
--> 123     return _java2py(sc, func(*args))
    124 
    125 

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling o401.trainLDAModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 81, localhost, executor driver): net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for pyspark.ml.linalg.SparseVector)
    at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1353)
    at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1352)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
    at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
    at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1354)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
    at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
    at org.apache.spark.mllib.clustering.EMLDAOptimizer.initialize(LDAOptimizer.scala:166)
    at org.apache.spark.mllib.clustering.EMLDAOptimizer.initialize(LDAOptimizer.scala:80)
    at org.apache.spark.mllib.clustering.LDA.run(LDA.scala:331)
    at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainLDAModel(PythonMLLibAPI.scala:552)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Thread.java:748)
Caused by: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for pyspark.ml.linalg.SparseVector)
    at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1353)
    at org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1352)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at scala.collection.AbstractIterator.to(Iterator.scala:1336)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
    at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
    at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$29.apply(RDD.scala:1354)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:108)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    ... 1 more
---------------------------------------------------------------------------
Py4JJavaError回溯(最近一次调用)
在()
---->1 ldaModel=LDA.列车(rdd)
列车中的~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/clustering.py(cls、rdd、k、maxIterations、docConcentration、topicConcentration、种子、检查点间隔、优化器)
1037 model=callMLlibFunc(“trainLDAModel”),rdd,k,maxIterations,
1038 DOC浓度,主题浓度,种子,
->1039检查点间隔,优化器)
1040返回LDAModel(型号)
1041
callMLlibFunc(name,*args)中的~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/common.py
128 sc=SparkContext.getOrCreate()
129 api=getattr(sc.\u jvm.PythonMLLibAPI(),名称)
-->130返回callJavaFunc(sc、api、*args)
131
132
callJavaFunc(sc,func,*args)中的~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/mllib/common.py
121“调用Java函数”
122 args=[[u py2java(sc,a)表示args中的a]
-->123返回_java2py(sc,func(*args))
124
125
~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_-gateway.py in uuu调用(self,*args)
1131 answer=self.gateway\u client.send\u命令(command)
1132返回值=获取返回值(
->1133应答,self.gateway\u客户端,self.target\u id,self.name)
1134
1135对于临时参数中的临时参数:
装饰中的~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/pyspark/sql/utils.py(*a,**kw)
61 def装饰(*a,**千瓦):
62尝试:
--->63返回f(*a,**kw)
64除py4j.protocol.Py4JJavaError外的其他错误为e:
65 s=e.java_exception.toString()
获取返回值中的~/Documents/spark/spark-2.2.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py(答案、网关客户端、目标id、名称)
317 raise Py4JJavaError(
318“调用{0}{1}{2}时出错。\n”。
-->319格式(目标id,“.”,名称),值)
320其他:
321升起Py4JError(
Py4JJavaError:调用o401.trainLDAModel时出错。
:org.apache.SparkException:Job因阶段失败而中止:阶段26.0中的任务0失败1次,最近的失败:阶段26.0中的任务0.0丢失(TID 81,localhost,executor driver):net.razorvine.pickle.pickle异常:构建ClassDict的参数应为零(对于pyspark.ml.linalg.SparseVector)
位于net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
net.razorvine.pickle.Unpickler.load\u reduce(Unpickler.java:707)
位于net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
位于net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
加载net.razorvine.pickle.Unpickler.load(Unpickler.java:112)
位于org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1353)
位于org.apache.spark.mllib.api.python.SerDeBase$$anonfun$pythonToJava$1$$anonfun$apply$2.apply(PythonMLLibAPI.scala:1352)
位于scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
位于scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
位于scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
位于scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
在scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
位于scala.collection.Iterator$class.foreach(Iterator.scala:893)
位于scala.collection.AbstractIterator.foreach(迭代器.scala:1336)
在scala.collection.generic.growtable$class.$plus$plus$eq(growtable.scala:59)
在scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
在scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
在scala.collection.TraversableOnce$class.to处(TraversableOnce.scala:310)
在scala.collection.AbstractIterator.to(Iterator.scala:1336)
在scala.collection.TraversableOnce$class.toBuffer处(TraversableOnce.scala:302)
位于scala.collection.AbstractIterator.toBuffer(Iterator.scala:1336)
位于scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
位于scala.collection.AbstractIterator.toArray(Iterator.scala:1336)
在org.apache.spark.rdd.rdd$$anonfun$take$1$$anonfun$29.apply上(rdd.scala:1354)
在org.apache.spark.rdd.rdd$$anonfun$take$1$$anonfun$29.apply上(rdd.scala:1354)
位于org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
位于org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2069)
位于org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
位于org.apache.spark.scheduler.Task.run(Task.scala:108)
位于org.apache.spark.executor.executor$TaskRunner.run(executor.scala:338)
位于java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
位于java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
运行(Thread.java:748)
驱动程序堆栈跟踪:
位于org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
位于org.apache.spark.scheduler.DAGScheduler$$anonfun$abortSt
from pyspark.ml.clustering import LDA

LDA().fit(result)
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.clustering import LDA as MLlibLDA


MLlibLDA.train(
  result.select("id", "features").rdd.mapValues(MLlibVectors.fromML).map(list)
)