Custom evaluator during cross-validation in PySpark


My goal is to add a rank-based evaluator to the CrossValidator function (PySpark).

I need to pass the evaluated predictions DataFrame into the function, but I don't know how to do that part:

class rnkEvaluate():
    def __init__(self, user_col="user", rating_col="rating", prediction_col="prediction"):
        self._user_col = user_col
        self._rating_col = rating_col
        self._prediction_col = prediction_col

    def isLargerBetter(self):
        return True

    def evaluate(self, predictions):
        denominator = predictions.groupBy().sum(self._rating_col).collect()[0][0]
        # TODO: rest of the calculation (numerator) ...
        return numerator / denominator

Somehow I need to pass the predictions DataFrame in on every fold iteration, but I haven't been able to manage it.
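
For what it's worth, the stock CrossValidator already hands each fold's predictions DataFrame to evaluator.evaluate(...), provided the evaluator subclasses pyspark.ml.evaluation.Evaluator and implements _evaluate. A minimal sketch along those lines (the numerator below is a placeholder expression, not the actual ranking metric):

from pyspark.ml.evaluation import Evaluator
from pyspark.sql import functions as F

class RnkEvaluator(Evaluator):
    # Subclassing Evaluator is enough for the stock CrossValidator: on every
    # fold it calls evaluate(predictions), which dispatches to _evaluate below.
    def __init__(self, user_col="user", rating_col="rating", prediction_col="prediction"):
        super(RnkEvaluator, self).__init__()
        self._user_col = user_col
        self._rating_col = rating_col
        self._prediction_col = prediction_col

    def _evaluate(self, predictions):
        denominator = predictions.groupBy().sum(self._rating_col).collect()[0][0]
        # Placeholder numerator (sum of rating * prediction); substitute the
        # real ranking computation here.
        numerator = predictions.agg(
            F.sum(F.col(self._rating_col) * F.col(self._prediction_col))).collect()[0][0]
        return numerator / denominator

    def isLargerBetter(self):
        return True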

I have solved this problem; here is the code:

import numpy as np

from pyspark.ml.tuning import CrossValidator, CrossValidatorModel
from pyspark.sql.functions import rand

result = []
class CrossValidatorVerbose(CrossValidator):

    @staticmethod
    def writeResult(result):
        # Append one result line to the log file.
        with open('executions/results.txt', 'a') as resfile:
            resfile.write("\n")
            resfile.write(result)

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)

        eva = self.getOrDefault(self.evaluator)
        metricName = eva.getMetricName()

        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
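        # h is the width of each fold's slice of [0, 1); a uniform random
        # column then assigns every row to exactly one validation fold.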
        h = 1.0 / nFolds

        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        for i in range(nFolds):
            foldNum = i + 1
            print("Comparing models on fold %d" % foldNum)

            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)

            for j in range(numModels):
                paramMap = epm[j]
                model = est.fit(train, paramMap)

                predictions = model.transform(validation, paramMap)
                #print(predictions.show())
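                # NOTE: relies on a global `spark` session and a custom
                # evaluator whose evaluate() accepts these keyword arguments,
                # unlike the stock pyspark.ml Evaluator API.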
                metric = eva.evaluate(spark=spark, predictions=predictions)
                metrics[j] += metric

                avgSoFar = metrics[j] / foldNum

                res=("params: %s\t%s: %f\tavg: %f" % (
                    {param.name: val for (param, val) in paramMap.items()},
                    metricName, metric, avgSoFar))
                self.writeResult(res)
                result.append(res)
                print(res)

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestParams = epm[bestIndex]
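        # Refit on the full dataset using the winning parameter map.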
        bestModel = est.fit(dataset, bestParams)
        avgMetrics = [m / nFolds for m in metrics]
        bestAvg = avgMetrics[bestIndex]
        print("Best model:\nparams: %s\t%s: %f" % (
            {param.name: val for (param, val) in bestParams.items()},
            metricName, bestAvg))

        return self._copyValues(CrossValidatorModel(bestModel, avgMetrics))


evaluator = RankUserWeighted("user", "rating", "prediction")

cvImplicit = CrossValidatorVerbose(estimator=customImplicit,
                                   estimatorParamMaps=paramMap,
                                   evaluator=evaluator,
                                   numFolds=8)
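
Usage would then look something like this (customImplicit and paramMap come from the author's setup; trainDF and testDF are hypothetical names):

cvModel = cvImplicit.fit(trainDF)                      # runs the verbose 8-fold CV
bestPredictions = cvModel.bestModel.transform(testDF)  # score with the best model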
I have the same problem, but I don't understand your answer. Where is RankUserWeighted implemented?
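
The answer never shows RankUserWeighted. Purely as an illustration, here is a hypothetical duck-typed stand-in that matches the three calls CrossValidatorVerbose._fit makes on the evaluator (getMetricName(), isLargerBetter(), and evaluate(spark=..., predictions=...)); the metric body, a rating-weighted percentile rank, is an assumption, not the original implementation:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

class RankUserWeighted(object):
    # Hypothetical stand-in: only the methods CrossValidatorVerbose._fit
    # actually calls are implemented.
    def __init__(self, user_col, rating_col, prediction_col):
        self.user_col = user_col
        self.rating_col = rating_col
        self.prediction_col = prediction_col

    def getMetricName(self):
        return "rankUserWeighted"

    def isLargerBetter(self):
        # Assumed: a higher weighted-rank score means a better model.
        return True

    def evaluate(self, spark, predictions):
        # Rank each user's items by descending predicted score, then weight
        # each rating by how close to the top of the list it lands.
        w = Window.partitionBy(self.user_col).orderBy(F.desc(self.prediction_col))
        ranked = predictions.withColumn("pct_rank", F.percent_rank().over(w))
        row = ranked.agg(
            F.sum(F.col(self.rating_col) * (1.0 - F.col("pct_rank"))).alias("num"),
            F.sum(self.rating_col).alias("den")).collect()[0]
        return row["num"] / row["den"]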