
Apache Spark: regression fit error in Spark's regression algorithm

Tags: apache-spark, pyspark, regression

I am doing regression work on Apache Spark, and I get the same error every time; I have not been able to resolve it. The call that fails is:

model = LinearRegression(featuresCol='features', labelCol='label').fit(df)

The full traceback follows (a minimal sketch of the fit call is appended after it).

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-80-3156cdf2ca47> in <module>
----> 1 model = lr.fit(df)

C:\ProgramData\Anaconda3\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
    130                 return self.copy(params)._fit(dataset)
    131             else:
--> 132                 return self._fit(dataset)
    133         else:
    134             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

C:\ProgramData\Anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
    293 
    294     def _fit(self, dataset):
--> 295         java_model = self._fit_java(dataset)
    296         model = self._create_model(java_model)
    297         return self._copyValues(model)

C:\ProgramData\Anaconda3\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
    290         """
    291         self._transfer_params_to_java()
--> 292         return self._java_obj.fit(dataset._jdf)
    293 
    294     def _fit(self, dataset):

C:\ProgramData\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

C:\ProgramData\Anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

C:\ProgramData\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o762.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 66.0 failed 1 times, most recent failure: Lost task 3.0 in stage 66.0 (TID 133, localhost, executor driver): scala.MatchError: [null,1.0,[34.0,147.0,249.0,2006.0,7.0,30.0,38.0,2.0]] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
    at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$2(LinearRegression.scala:325)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:160)
    at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:158)
    at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
    at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:217)
    at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:217)
    at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
    at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1145)
    at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1146)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:801)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:801)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1889)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1877)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1876)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:274)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
    at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1098)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.fold(RDD.scala:1092)
    at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1161)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1137)
    at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:105)
    at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$1(LinearRegression.scala:345)
    at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:183)
    at scala.util.Try$.apply(Try.scala:213)
    at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)
    at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:319)
    at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:176)
    at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: scala.MatchError: [null,1.0,[34.0,147.0,249.0,2006.0,7.0,30.0,38.0,2.0]] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
    at org.apache.spark.ml.regression.LinearRegression.$anonfun$train$2(LinearRegression.scala:325)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:160)
    at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:158)
    at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1429)
    at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:217)
    at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:217)
    at scala.collection.AbstractIterator.aggregate(Iterator.scala:1429)
    at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1145)
    at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1146)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:801)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:801)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:411)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    ... 1 more
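
The offending row in the MatchError above begins with null ([null,1.0,[34.0,...]]), which suggests the label column contains null values at the point where LinearRegression reads (label, weight, features) rows. Below is a minimal, self-contained sketch of the same kind of fit call on a toy DataFrame; the column names 'label' and 'features', the toy data, and the cast-and-drop-nulls step are assumptions added for illustration, not a confirmed fix for this exact dataset.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName("lr-fit-sketch").getOrCreate()

# Toy stand-in for the question's df: a 'features' vector column and a
# numeric 'label' column, including one null label like the row in the trace.
df = spark.createDataFrame(
    [
        (1.0, Vectors.dense([1.0, 2.0])),
        (None, Vectors.dense([3.0, 4.0])),  # null label, as in the [null, ...] error row
        (2.0, Vectors.dense([2.0, 1.0])),
        (3.0, Vectors.dense([4.0, 3.0])),
    ],
    ["label", "features"],
)

# Cast the label to double and drop rows where it is null before fitting;
# this is an illustrative precaution against the null label, not a verified solution.
clean_df = (
    df.withColumn("label", col("label").cast("double"))
      .na.drop(subset=["label"])
)

lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(clean_df)
print(model.coefficients, model.intercept)

With the null-label row removed, the same lr.fit(...) call completes on the toy data; checking df for nulls in the label (and features) columns would be a reasonable first diagnostic step on the real dataset.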