Custom evaluator during PySpark cross-validation
My goal is to add a rank-based evaluator to the CrossValidator function (PySpark). I need to pass the predictions DataFrame computed on each fold into the evaluator, but I do not know how to complete that part:
class rnkEvaluate():
    def __init__(self, user_col="user", rating_col="rating", prediction_col="prediction"):
        self._user_col = user_col
        self._rating_col = rating_col
        self._prediction_col = prediction_col

    def isLargerBetter(self):
        return True

    def evaluate(self, predictions):
        denominator = predictions.groupBy().sum(self._rating_col).collect()[0][0]
        # TODO: rest of the calculation (numerator) elided by the author ...
        return numerator / denominator
Somehow I need to pass the predictions DataFrame on every fold iteration, but I cannot manage it.
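(For reference: the stock CrossValidator already hands each fold's predictions DataFrame to evaluator.evaluate(...), so an evaluator that subclasses pyspark.ml.evaluation.Evaluator plugs in without touching CrossValidator itself. A minimal sketch follows; the rating-weighted percentile-rank formula is only an assumption standing in for the calculation elided above:

    from pyspark.ml.evaluation import Evaluator
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    class RankBasedEvaluator(Evaluator):
        # Hypothetical rank-based evaluator; the metric below is an
        # illustrative placeholder, not the asker's actual formula.
        def __init__(self, user_col="user", rating_col="rating",
                     prediction_col="prediction"):
            super(RankBasedEvaluator, self).__init__()
            self._user_col = user_col
            self._rating_col = rating_col
            self._prediction_col = prediction_col

        def isLargerBetter(self):
            # Lower mean percentile rank is better for this variant.
            return False

        def _evaluate(self, predictions):
            # Percentile rank of each item within its user's prediction list.
            w = Window.partitionBy(self._user_col) \
                      .orderBy(F.desc(self._prediction_col))
            ranked = predictions.withColumn("pct_rank", F.percent_rank().over(w))
            row = ranked.agg(
                F.sum(F.col(self._rating_col) * F.col("pct_rank")).alias("numerator"),
                F.sum(self._rating_col).alias("denominator")).collect()[0]
            return row["numerator"] / row["denominator"]

With this shape, the unmodified CrossValidator(estimator=..., evaluator=RankBasedEvaluator(), ...) works as-is.)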
I have solved this problem; here is the code:

import numpy as np
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel
from pyspark.sql.functions import rand

result = []

class CrossValidatorVerbose(CrossValidator):

    @staticmethod
    def writeResult(result):
        # Append each fold's metrics to a log file.
        with open('executions/results.txt', 'a') as resfile:
            resfile.write("\n")
            resfile.write(result)

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)

        eva = self.getOrDefault(self.evaluator)
        metricName = eva.getMetricName()

        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds

        # Split folds on a uniform random column, exactly as the stock
        # CrossValidator does.
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        for i in range(nFolds):
            foldNum = i + 1
            print("Comparing models on fold %d" % foldNum)

            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)

            for j in range(numModels):
                paramMap = epm[j]
                model = est.fit(train, paramMap)
                predictions = model.transform(validation, paramMap)
                # This is where each fold's predictions DataFrame reaches the
                # custom evaluator; `spark` (the SparkSession) must be in scope.
                metric = eva.evaluate(spark=spark, predictions=predictions)
                metrics[j] += metric

                avgSoFar = metrics[j] / foldNum
                res = ("params: %s\t%s: %f\tavg: %f" % (
                    {param.name: val for (param, val) in paramMap.items()},
                    metricName, metric, avgSoFar))
                self.writeResult(res)
                result.append(res)
                print(res)

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestParams = epm[bestIndex]
        bestModel = est.fit(dataset, bestParams)
        avgMetrics = [m / nFolds for m in metrics]
        bestAvg = avgMetrics[bestIndex]
        print("Best model:\nparams: %s\t%s: %f" % (
            {param.name: val for (param, val) in bestParams.items()},
            metricName, bestAvg))

        return self._copyValues(CrossValidatorModel(bestModel, avgMetrics))

evaluator = RankUserWeighted("user", "rating", "prediction")
cvImplicit = CrossValidatorVerbose(estimator=customImplicit,
                                   estimatorParamMaps=paramMap,
                                   evaluator=evaluator,
                                   numFolds=8)
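For completeness: customImplicit and paramMap are never shown in the answer. A plausible setup, assuming an implicit-feedback ALS estimator and a small parameter grid (both would be defined before the CrossValidatorVerbose construction above; the names and grid values here are illustrative only), might look like:

    from pyspark.ml.recommendation import ALS
    from pyspark.ml.tuning import ParamGridBuilder

    # Hypothetical stand-ins for the undefined customImplicit and paramMap.
    customImplicit = ALS(userCol="user", itemCol="item", ratingCol="rating",
                         implicitPrefs=True)
    paramMap = (ParamGridBuilder()
                .addGrid(customImplicit.rank, [10, 20])
                .addGrid(customImplicit.regParam, [0.01, 0.1])
                .build())

    # Running the verbose cross-validation; `ratings` is an assumed
    # DataFrame with user/item/rating columns.
    cvModel = cvImplicit.fit(ratings)
    print(cvModel.avgMetrics)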
I have the same problem, but I don't understand your answer. Where is RankUserWeighted implemented?
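(RankUserWeighted is never posted in the thread. Judging only from how _fit calls it, it would need at least this shape; the reconstruction below is hypothetical and leaves the metric body open:

    class RankUserWeighted(object):
        # Hypothetical reconstruction inferred from the calls in _fit above;
        # the actual implementation was never shared in the thread.
        def __init__(self, user_col, rating_col, prediction_col):
            self._user_col = user_col
            self._rating_col = rating_col
            self._prediction_col = prediction_col

        def getMetricName(self):
            return "rank-user-weighted"

        def isLargerBetter(self):
            return False

        def evaluate(self, spark, predictions):
            # Rank-based metric over the fold's predictions DataFrame,
            # e.g. the weighted percentile rank sketched earlier.
            raise NotImplementedError
)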