如何有效地使用pyspark模型?
我现在正在用 Prophet 训练股票模型。有 3000 只股票需要训练,我需要为每个模型找到最佳参数,所以我用 Prophet 搭配 Spark,但效率不高。代码如下所示:
# --- Load 5-minute bars for the selected stock codes from MySQL via JDBC. ---
# FIX(review): in the original, the .option("numPartitions"...) .. .option("upperBound"...)
# lines had no trailing backslash, so they were not part of the read chain
# (dead expressions).  Parenthesizing the chain makes continuations safe.
filter_code = "'000001'"
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://local:3306?useSSL=false")
    .option("dbtable",
            "(select code,datetime,open from table where code in (%s)) as table" % filter_code)
    .option("user", "user")
    .option("password", "password")
    .option("numPartitions", 1)
    .option("partitionColumn", "code")
    .option("lowerBound", "000001")
    .option("upperBound", "000001")
    .load()
)
# Cast 'open' to float and parse the 'datetime' string into a real timestamp.
df = (
    df.withColumn('open', df['open'].cast('float'))
      .withColumn('date', to_timestamp(df['datetime'], 'yyyy/MM/dd HH:mm:ss').cast('timestamp'))
)
# Per-stock Prophet hyper-parameters, joined onto the collected series below.
parameters = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306?useSSL=false")
    .option("dbtable", "table")
    .option("user", "user")
    .option("password", "password")
    .load()
)
evaluate = df.groupBy('code').agg(collect_list(struct(['date', 'open'])).alias("ds"))
evaluate_param = (
    evaluate.join(parameters, evaluate.code == parameters.code_p, 'left_outer')
            .select('code', 'ds', 'changepoint_prior_scale', 'seasonality_prior_scale',
                    'monthly_seasonality', 'yearly_seasonality')
)
# PERFORMANCE FIX: repartition(1) funnelled every stock into a single task,
# so all Prophet fits ran serially on one core.  Partitioning by 'code'
# spreads the per-stock model fits across all executors in parallel.
evaluate_result = evaluate_param.repartition('code') \
    .rdd.mapPartitions(evaluationPartion)
print(evaluate_result.collect())
def evaluationPartion(partitions):
    """Fit one Prophet model per input row and yield evaluation metrics.

    Each element of ``partitions`` is a Spark Row with fields:
      code, ds (list of (date, open) structs),
      changepoint_prior_scale, seasonality_prior_scale,
      monthly_seasonality, yearly_seasonality.

    Yields, per row, a flat list:
      [code, 'open', predicted_price, actual_price, train MAE, test MAE,
       increase accuracy %, decrease accuracy %, in-range %, then the four
       hyper-parameters echoed back].
    """
    # Imports are function-local so Spark workers resolve them on their side.
    # (Removed unused pyarrow/pyarrow.parquet imports and the dead `result`
    # accumulator from the original.)
    import pandas as pd
    import fbprophet
    import numpy as np
    for row in partitions:
        stock = pd.DataFrame.from_records(row.ds, columns=['date', 'open'])
        # Prophet requires columns named 'ds' (timestamp) and 'y' (value).
        stock['ds'] = stock['date']
        stock['y'] = stock['open']
        monthly_seasonality = row.monthly_seasonality != 0
        yearly_seasonality = row.yearly_seasonality != 0
        # Last month is held out for testing; the preceding 3 years train.
        max_date = stock['date'].max()
        start_date = max_date - pd.DateOffset(months=1)
        end_date = max_date
        training_years = 3
        train = stock[(stock['date'] < start_date) &
                      (stock['date'] > (start_date - pd.DateOffset(years=training_years)))]
        train_max_date = train['date'].max()
        # Number of 5-minute steps to forecast beyond the training window.
        time_diff = int((end_date.to_pydatetime() - train_max_date.to_pydatetime())
                        .total_seconds() / 60 / 5)
        test = stock[(stock['date'] >= start_date) & (stock['date'] <= end_date)]
        model = fbprophet.Prophet(daily_seasonality='auto',
                                  weekly_seasonality='auto',
                                  yearly_seasonality=yearly_seasonality,
                                  changepoint_prior_scale=row.changepoint_prior_scale,
                                  changepoints=None,
                                  seasonality_mode='additive',
                                  seasonality_prior_scale=row.seasonality_prior_scale,
                                  mcmc_samples=0,
                                  interval_width=0.80,
                                  uncertainty_samples=100)
        if monthly_seasonality:
            # Custom monthly seasonality (Prophet has no built-in monthly term).
            model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
        model.fit(train)
        # Make a future dataframe and predictions.
        future = model.make_future_dataframe(periods=time_diff, freq='5min')
        future = model.predict(future)
        # Merge predictions with the known values.
        test = pd.merge(test, future, on='ds', how='inner')
        train = pd.merge(train, future, on='ds', how='inner')
        # Direction-of-move accuracy from consecutive differences.
        test['pred_diff'] = test['yhat'].diff()
        test['real_diff'] = test['y'].diff()
        test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
        increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
        decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
        # Mean absolute error on test and train.
        test_mean_error = np.mean(abs(test['y'] - test['yhat']))
        train_mean_error = np.mean(abs(train['y'] - train['yhat']))
        # FIX(review): the original looped row-by-row with DataFrame.ix,
        # which was removed in pandas 1.0 and was O(n) Python-level work.
        # A vectorized boolean mask is equivalent and fast.
        in_range = (test['y'] < test['yhat_upper']) & (test['y'] > test['yhat_lower'])
        in_range_accuracy = 100 * np.mean(in_range)
        predict_price = future['yhat'].iloc[-1]
        actual_price = test['y'].iloc[-1]
        yield [row.code, 'open', predict_price, actual_price,
               train_mean_error, test_mean_error,
               increase_accuracy, decrease_accuracy, in_range_accuracy,
               row.changepoint_prior_scale, row.seasonality_prior_scale,
               row.monthly_seasonality, row.yearly_seasonality]
评估参数结果如下所示:
ds
是时间序列数据;changepoint_prior_scale、seasonality_prior_scale、monthly_seasonality、yearly_seasonality
是预测参数
evaluationPartion的功能如下:
# --- Load 5-minute bars for the selected stock codes from MySQL via JDBC. ---
# FIX(review): in the original, the .option("numPartitions"...) .. .option("upperBound"...)
# lines had no trailing backslash, so they were not part of the read chain
# (dead expressions).  Parenthesizing the chain makes continuations safe.
filter_code = "'000001'"
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://local:3306?useSSL=false")
    .option("dbtable",
            "(select code,datetime,open from table where code in (%s)) as table" % filter_code)
    .option("user", "user")
    .option("password", "password")
    .option("numPartitions", 1)
    .option("partitionColumn", "code")
    .option("lowerBound", "000001")
    .option("upperBound", "000001")
    .load()
)
# Cast 'open' to float and parse the 'datetime' string into a real timestamp.
df = (
    df.withColumn('open', df['open'].cast('float'))
      .withColumn('date', to_timestamp(df['datetime'], 'yyyy/MM/dd HH:mm:ss').cast('timestamp'))
)
# Per-stock Prophet hyper-parameters, joined onto the collected series below.
parameters = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306?useSSL=false")
    .option("dbtable", "table")
    .option("user", "user")
    .option("password", "password")
    .load()
)
evaluate = df.groupBy('code').agg(collect_list(struct(['date', 'open'])).alias("ds"))
evaluate_param = (
    evaluate.join(parameters, evaluate.code == parameters.code_p, 'left_outer')
            .select('code', 'ds', 'changepoint_prior_scale', 'seasonality_prior_scale',
                    'monthly_seasonality', 'yearly_seasonality')
)
# PERFORMANCE FIX: repartition(1) funnelled every stock into a single task,
# so all Prophet fits ran serially on one core.  Partitioning by 'code'
# spreads the per-stock model fits across all executors in parallel.
evaluate_result = evaluate_param.repartition('code') \
    .rdd.mapPartitions(evaluationPartion)
print(evaluate_result.collect())
def evaluationPartion(partitions):
    """Fit one Prophet model per input row and yield evaluation metrics.

    Each element of ``partitions`` is a Spark Row with fields:
      code, ds (list of (date, open) structs),
      changepoint_prior_scale, seasonality_prior_scale,
      monthly_seasonality, yearly_seasonality.

    Yields, per row, a flat list:
      [code, 'open', predicted_price, actual_price, train MAE, test MAE,
       increase accuracy %, decrease accuracy %, in-range %, then the four
       hyper-parameters echoed back].
    """
    # Imports are function-local so Spark workers resolve them on their side.
    # (Removed unused pyarrow/pyarrow.parquet imports and the dead `result`
    # accumulator from the original.)
    import pandas as pd
    import fbprophet
    import numpy as np
    for row in partitions:
        stock = pd.DataFrame.from_records(row.ds, columns=['date', 'open'])
        # Prophet requires columns named 'ds' (timestamp) and 'y' (value).
        stock['ds'] = stock['date']
        stock['y'] = stock['open']
        monthly_seasonality = row.monthly_seasonality != 0
        yearly_seasonality = row.yearly_seasonality != 0
        # Last month is held out for testing; the preceding 3 years train.
        max_date = stock['date'].max()
        start_date = max_date - pd.DateOffset(months=1)
        end_date = max_date
        training_years = 3
        train = stock[(stock['date'] < start_date) &
                      (stock['date'] > (start_date - pd.DateOffset(years=training_years)))]
        train_max_date = train['date'].max()
        # Number of 5-minute steps to forecast beyond the training window.
        time_diff = int((end_date.to_pydatetime() - train_max_date.to_pydatetime())
                        .total_seconds() / 60 / 5)
        test = stock[(stock['date'] >= start_date) & (stock['date'] <= end_date)]
        model = fbprophet.Prophet(daily_seasonality='auto',
                                  weekly_seasonality='auto',
                                  yearly_seasonality=yearly_seasonality,
                                  changepoint_prior_scale=row.changepoint_prior_scale,
                                  changepoints=None,
                                  seasonality_mode='additive',
                                  seasonality_prior_scale=row.seasonality_prior_scale,
                                  mcmc_samples=0,
                                  interval_width=0.80,
                                  uncertainty_samples=100)
        if monthly_seasonality:
            # Custom monthly seasonality (Prophet has no built-in monthly term).
            model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
        model.fit(train)
        # Make a future dataframe and predictions.
        future = model.make_future_dataframe(periods=time_diff, freq='5min')
        future = model.predict(future)
        # Merge predictions with the known values.
        test = pd.merge(test, future, on='ds', how='inner')
        train = pd.merge(train, future, on='ds', how='inner')
        # Direction-of-move accuracy from consecutive differences.
        test['pred_diff'] = test['yhat'].diff()
        test['real_diff'] = test['y'].diff()
        test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
        increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
        decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
        # Mean absolute error on test and train.
        test_mean_error = np.mean(abs(test['y'] - test['yhat']))
        train_mean_error = np.mean(abs(train['y'] - train['yhat']))
        # FIX(review): the original looped row-by-row with DataFrame.ix,
        # which was removed in pandas 1.0 and was O(n) Python-level work.
        # A vectorized boolean mask is equivalent and fast.
        in_range = (test['y'] < test['yhat_upper']) & (test['y'] > test['yhat_lower'])
        in_range_accuracy = 100 * np.mean(in_range)
        predict_price = future['yhat'].iloc[-1]
        actual_price = test['y'].iloc[-1]
        yield [row.code, 'open', predict_price, actual_price,
               train_mean_error, test_mean_error,
               increase_accuracy, decrease_accuracy, in_range_accuracy,
               row.changepoint_prior_scale, row.seasonality_prior_scale,
               row.monthly_seasonality, row.yearly_seasonality]
def evaluationPartion(partitions):
    import pandas as pd
    import pyarrow
    import pyarrow.parquet as pq
    import fbprophet
    import numpy as np
    result = []
    for partition in partitions:
        stock = pd.DataFrame.from_records(partition.ds, columns=['date','open'])
        changepoint_prior_scale = partition.changepoint_prior_scale
        weekly_seasonality = 'auto'
        daily_seasonality = 'auto'
        monthly_seasonality = False if partition.monthly_seasonality == 0 else True
        yearly_seasonality = False if partition.yearly_seasonality == 0 else True
        changepoints = None
        seasonality_mode = 'additive'
        seasonality_prior_scale = partition.seasonality_prior_scale
        mcmc_samples = 0
        interval_width = 0.80
        uncertainty_samples = 100
        stock['ds'] = stock['date']
        stock['y'] = stock['open']
        max_date = max(stock['date'])
        min_date = min(stock['date'])
        start_date = max_date - pd.DateOffset(months=1)
        end_date = max_date
        training_years = 3
        train = stock[(stock['date'] < start_date) & (stock['date'] > (start_date - pd.DateOffset(years=training_years)))]
        # get periods
        train_max_date = max(train['date'])
        time_diff = int((end_date.to_pydatetime()-train_max_date.to_pydatetime()).total_seconds()/60/5)
        # Testing data is specified in the range
        test = stock[(stock['date'] >= start_date) & (stock['date'] <= end_date)]
        model = fbprophet.Prophet(daily_seasonality=daily_seasonality,
                                  weekly_seasonality=weekly_seasonality,
                                  yearly_seasonality=yearly_seasonality,
                                  changepoint_prior_scale=changepoint_prior_scale,
                                  changepoints=changepoints,
                                  seasonality_mode=seasonality_mode,
                                  seasonality_prior_scale=seasonality_prior_scale,
                                  mcmc_samples=mcmc_samples,
                                  interval_width=interval_width,
                                  uncertainty_samples=uncertainty_samples)
        if monthly_seasonality:
            # Add monthly seasonality
            model.add_seasonality(name = 'monthly', period = 30.5, fourier_order = 5)
        model.fit(train)
        # Make a future dataframe and predictions
        future = model.make_future_dataframe(periods = time_diff, freq='5min')
        future = model.predict(future)
        # Merge predictions with the known values
        test = pd.merge(test, future, on = 'ds', how = 'inner')
        train = pd.merge(train, future, on = 'ds', how = 'inner')
        # Calculate the differences between consecutive measurements
        test['pred_diff'] = test['yhat'].diff()
        test['real_diff'] = test['y'].diff()
        # Correct is when we predicted the correct direction
        test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
        # Accuracy when we predict increase and decrease
        increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
        decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
        # Calculate mean absolute error
        test_errors = abs(test['y'] - test['yhat'])
        test_mean_error = np.mean(test_errors)
        train_errors = abs(train['y'] - train['yhat'])
        train_mean_error = np.mean(train_errors)
        # Calculate percentage of time actual value within prediction range
        test['in_range'] = False
        for i in test.index:
            if (test.ix[i, 'y'] < test.ix[i, 'yhat_upper']) & (test.ix[i, 'y'] > test.ix[i, 'yhat_lower']):
                test.ix[i, 'in_range'] = True
        in_range_accuracy = 100 * np.mean(test['in_range'])
        predict_price = future.ix[len(future) - 1, 'yhat']
        actual_price = test.ix[len(test) - 1, 'y']
        yield [partition.code]+['open',predict_price,actual_price,train_mean_error,test_mean_error,increase_accuracy,decrease_accuracy,in_range_accuracy]+[partition.changepoint_prior_scale,partition.seasonality_prior_scale,partition.monthly_seasonality,partition.yearly_seasonality]
现在训练过程很慢:evaluate_result 的 count 是 100,训练这 100 个模型需要 5 个小时。我该如何提高训练速度?谢谢