Python: Running a Cross-Validator Estimator in Spark
I am building a recommendation system in Spark. I have been able to evaluate and run the algorithm on my dataset with manually chosen initial hyper-parameter values, and I now want to automate this by having a cross-validator estimator pick from a grid of hyper-parameter values. So I wrote the following function for this:
def recommendation(train):
    """ This function trains a collaborative filtering
        algorithm on a ratings training dataset.
        We use a Cross Validator and Grid Search to find the right hyper-parameter values.

        Param:
            train ---> training data

        TUNING PARAMETERS:
            alpha ---> alpha value for computing the confidence matrix (only for implicit datasets)
            rank ----> no. of latent factors in the resulting X, Y matrices
            reg -----> regularization parameter for penalising the X, Y factors

        Returns:
            cvModel ---> CrossValidatorModel fitted on the training data
            evaluator -> the evaluator used for model selection
    """
    from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.recommendation import ALS

    alsImplicit = ALS(implicitPrefs=True)
    #model = als.fit(train)

    # Grid of hyper-parameter values for the cross validator to search over
    paramMapImplicit = ParamGridBuilder() \
        .addGrid(alsImplicit.rank, [20, 120]) \
        .addGrid(alsImplicit.maxIter, [10, 15]) \
        .addGrid(alsImplicit.regParam, [0.01, 1.0]) \
        .addGrid(alsImplicit.alpha, [10.0, 40.0]) \
        .build()

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                              labelCol="rating",
                                              metricName="areaUnderROC")

    # Build the recommendation model using ALS on the training data
    #als = ALS(rank=120, maxIter=15, regParam=0.01, implicitPrefs=True)
    #model = als.fit(train)

    cvEstimator = CrossValidator(estimator=alsImplicit,
                                 estimatorParamMaps=paramMapImplicit,
                                 evaluator=evaluator)
    cvModel = cvEstimator.fit(train)

    return cvModel, evaluator
However, since the algorithm picks the right hyper-parameter values by testing them on the cross-validation datasets inside the main CrossValidator class, I am not sure how to change the data type of the prediction probabilities while the cross-validator estimator is running. Can someone guide me here?

Comments: "Possible duplicate." Thanks, @LostInOverflow, but I am not sure I fully understood that answer: it does not use Spark's cross-validation method, and I want to use CrossValidator to get the best model. "Just try it with CrossValidator." I did try CrossValidator; my point is that CrossValidator requires an evaluator argument and calls it internally, and since the predictions returned by the model's transform method are not of vector data type, it raises the error.

The problem is that when I call this function, I get the following error:

# Running the ALS function to train the data
model,evaluator=recommendation(train)
---------------------------------------------------------------------------
IllegalArgumentException Traceback (most recent call last)
<ipython-input-21-ea5de889f984> in <module>()
1 # Running the ALS function to train the data
2
----> 3 model,evaluator=recommendation(train)
<ipython-input-15-0fb855b138b1> in recommendation(train)
138 cvEstimator= CrossValidator(estimator=alsImplicit, estimatorParamMaps=paramMapImplicit, evaluator=evaluator)
139
--> 140 cvModel=cvEstimator.fit(train)
141
142 return cvModel,evaluator
/Users/i854319/spark/python/pyspark/ml/pipeline.pyc in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/Users/i854319/spark/python/pyspark/ml/tuning.pyc in _fit(self, dataset)
239 model = est.fit(train, epm[j])
240 # TODO: duplicate evaluator to take extra params from input
--> 241 metric = eva.evaluate(model.transform(validation, epm[j]))
242 metrics[j] += metric
243
/Users/i854319/spark/python/pyspark/ml/evaluation.pyc in evaluate(self, dataset, params)
67 return self.copy(params)._evaluate(dataset)
68 else:
---> 69 return self._evaluate(dataset)
70 else:
71 raise ValueError("Params must be a param map but got %s." % type(params))
/Users/i854319/spark/python/pyspark/ml/evaluation.pyc in _evaluate(self, dataset)
97 """
98 self._transfer_params_to_java()
---> 99 return self._java_obj.evaluate(dataset._jdf)
100
101 def isLargerBetter(self):
/Users/i854319/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/Users/i854319/spark/python/pyspark/sql/utils.pyc in deco(*a, **kw)
51 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
52 if s.startswith('java.lang.IllegalArgumentException: '):
---> 53 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
54 raise
55 return deco
IllegalArgumentException: u'requirement failed: Column prediction must be of type org.apache.spark.mllib.linalg.VectorUDT@f71b0bce but was actually FloatType.'
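The error arises because BinaryClassificationEvaluator requires its rawPredictionCol to be a vector column, while ALS.transform emits a scalar FloatType prediction. One commonly suggested alternative (a sketch under my own assumptions, not a verified fix for this exact setup) is to tune ALS with a RegressionEvaluator instead, which consumes a scalar prediction column; note that Spark 2.x evaluators accept any numeric prediction column, while the Spark 1.6 in this traceback may still require an explicit cast to DoubleType.

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

alsImplicit = ALS(implicitPrefs=True)

# Same grid as above, shortened for the sketch
paramMapImplicit = ParamGridBuilder() \
    .addGrid(alsImplicit.rank, [20, 120]) \
    .addGrid(alsImplicit.regParam, [0.01, 1.0]) \
    .build()

# RegressionEvaluator scores a scalar prediction column directly,
# so no vector conversion is needed inside CrossValidator
rmseEvaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="rating",
                                    metricName="rmse")

cv = CrossValidator(estimator=alsImplicit,
                    estimatorParamMaps=paramMapImplicit,
                    evaluator=rmseEvaluator)
#cvModel = cv.fit(train)   # train: the same ratings DataFrame as above

Whether RMSE on raw ALS scores is a meaningful selection metric for an implicit-feedback dataset is a separate question; this only sidesteps the type requirement.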
For reference, this is the evaluation function that works when I run the steps manually and convert the prediction column to a vector myself:

def calcEval(testDF, predictions, evaluator):
    """ This function checks the evaluation metric for the recommendation algorithm
        testDF -------> validation or test data to check the evaluation metric on
        predictions --> output of the model's transform method on that data
        evaluator ----> the evaluator whose metric is computed
    """
    from pyspark.sql.functions import udf
    from pyspark.mllib.linalg import VectorUDT, DenseVector
    from pyspark.sql.types import DoubleType
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    #predictions=model.transform(testDF)
    #print "Total Count of the predictions data is {}".format(predictions.count())

    ## Converting the data type of the prediction column from a float to a
    ## two-element probability-style vector, as the evaluator expects
    as_prob = udf(lambda x: DenseVector([1 - x, x]), VectorUDT())
    predictions = predictions.withColumn("prediction", as_prob(predictions["prediction"]))

    # Converting the rating column to DoubleType()
    #predictions=predictions.withColumn("rating", predictions["rating"].cast(DoubleType()))

    predictions.show(5)

    # Calculating the AUC
    print evaluator.getMetricName(), "The AUC of the model is {}".format(evaluator.evaluate(predictions))
    print "The AUC under the PR curve is {}".format(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}))