如何有效地使用pyspark模型?

如何有效地使用pyspark模型?,pyspark,facebook-prophet,Pyspark,Facebook Prophet,我现在正在和prophet一起训练股票模型。有3000只股票需要训练。我需要找到每个模型的最佳参数。所以我用prophet搭配spark,但效率不高。代码如下所示: filter_code = "'000001'" df = spark.read.format("jdbc") \ .option("url", "jdbc:mysql://local:3306?useSSL=false") \ .option("dbtable", "(select

我现在正在和prophet一起训练股票模型。有3000只股票需要训练。我需要找到每个模型的最佳参数。所以我用prophet搭配spark,但效率不高。代码如下所示:

    filter_code = "'000001'"
    df = spark.read.format("jdbc") \
        .option("url", "jdbc:mysql://local:3306?useSSL=false") \
        .option("dbtable", "(select code,datetime,open from table where code in (%s)) as table" % filter_code) \
        .option("user", "user") \
        .option("password","password") \
        .option("numPartitions",1)
        .option("partitionColumn","code")
        .option("lowerBound", "000001")
        .option("upperBound","000001")
        .load()
    df = df.withColumn('open',df['open'].cast('float'))
        .withColumn('date',to_timestamp(df['datetime'],'yyyy/MM/dd HH:mm:ss').cast('timestamp'))
    parameters = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306?useSSL=false").option("dbtable", "table").option("user", "user").option("password", "password").load()
    evaluate = df.groupBy('code').agg(collect_list(struct(['date','open'])).alias("ds"))
    evaluate_param = evaluate.join(parameters,evaluate.code==parameters.code_p,'left_outer').select('code','ds','changepoint_prior_scale','seasonality_prior_scale','monthly_seasonality','yearly_seasonality')
    evaluate_result = evaluate_param.repartition(1) \
                        .rdd.mapPartitions(evaluationPartion)
    print(evaluate_result.collect())
    def evaluationPartion(partitions):
        import pandas as pd
        import pyarrow
        import pyarrow.parquet as pq
        import fbprophet
        import numpy as np
        result = []
        for partition in partitions:
            stock = pd.DataFrame.from_records(partition.ds, columns=['date','open'])
            changepoint_prior_scale = partition.changepoint_prior_scale
            weekly_seasonality = 'auto'
            daily_seasonality = 'auto'
            monthly_seasonality = False if partition.monthly_seasonality == 0 else True
            yearly_seasonality = False if partition.yearly_seasonality == 0 else True
            changepoints = None
            seasonality_mode = 'additive'
            seasonality_prior_scale = partition.seasonality_prior_scale
            mcmc_samples = 0
            interval_width = 0.80
            uncertainty_samples = 100
            stock['ds'] = stock['date']
            stock['y'] = stock['open']
            max_date = max(stock['date'])
            min_date = min(stock['date'])
            start_date = max_date - pd.DateOffset(months=1)
            end_date = max_date
            training_years = 3
            train = stock[(stock['date'] < start_date) &  (stock['date'] > (start_date - pd.DateOffset(years=training_years)))]
            # get periods
            train_max_date = max(train['date'])
            time_diff = int((end_date.to_pydatetime()-train_max_date.to_pydatetime()).total_seconds()/60/5)
            # Testing data is specified in the range
            test = stock[(stock['date'] >= start_date) & (stock['date'] <= end_date)]
            model = fbprophet.Prophet(daily_seasonality=daily_seasonality,
                                      weekly_seasonality=weekly_seasonality,
                                      yearly_seasonality=yearly_seasonality,
                                      changepoint_prior_scale=changepoint_prior_scale,
                                      changepoints=changepoints,
                                      seasonality_mode=seasonality_mode,
                                      seasonality_prior_scale=seasonality_prior_scale,
                                      mcmc_samples=mcmc_samples,
                                      interval_width=interval_width,
                                      uncertainty_samples=uncertainty_samples)
            if monthly_seasonality:
                # Add monthly seasonality
                model.add_seasonality(name = 'monthly', period = 30.5, fourier_order = 5)
            model.fit(train)
            # Make a future dataframe and predictions
            future = model.make_future_dataframe(periods = time_diff, freq='5min')
            future = model.predict(future)
            # Merge predictions with the known values
            test = pd.merge(test, future, on = 'ds', how = 'inner')
            train = pd.merge(train, future, on = 'ds', how = 'inner')
            # Calculate the differences between consecutive measurements
            test['pred_diff'] = test['yhat'].diff()
            test['real_diff'] = test['y'].diff()
            # Correct is when we predicted the correct direction
            test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
            # Accuracy when we predict increase and decrease
            increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
            decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
            # Calculate mean absolute error
            test_errors = abs(test['y'] - test['yhat'])
            test_mean_error = np.mean(test_errors)
            train_errors = abs(train['y'] - train['yhat'])
            train_mean_error = np.mean(train_errors)
            # Calculate percentage of time actual value within prediction range
            test['in_range'] = False
            for i in test.index:
                if (test.ix[i, 'y'] < test.ix[i, 'yhat_upper']) & (test.ix[i, 'y'] > test.ix[i, 'yhat_lower']):
                    test.ix[i, 'in_range'] = True
            in_range_accuracy = 100 * np.mean(test['in_range'])
            predict_price = future.ix[len(future) - 1, 'yhat']
            actual_price = test.ix[len(test) - 1, 'y']
            yield [partition.code]+['open',predict_price,actual_price,train_mean_error,test_mean_error,increase_accuracy,decrease_accuracy,in_range_accuracy] +[partition.changepoint_prior_scale,partition.seasonality_prior_scale,partition.monthly_seasonality,partition.yearly_seasonality]
评估参数结果如下所示:

ds
时间序列数据
changepoint\u prior\u scale、季节性\u prior\u scale、月度季节性、年度季节性
是预测参数

evaluationPartion的功能如下:

    filter_code = "'000001'"
    df = spark.read.format("jdbc") \
        .option("url", "jdbc:mysql://local:3306?useSSL=false") \
        .option("dbtable", "(select code,datetime,open from table where code in (%s)) as table" % filter_code) \
        .option("user", "user") \
        .option("password","password") \
        .option("numPartitions",1)
        .option("partitionColumn","code")
        .option("lowerBound", "000001")
        .option("upperBound","000001")
        .load()
    df = df.withColumn('open',df['open'].cast('float'))
        .withColumn('date',to_timestamp(df['datetime'],'yyyy/MM/dd HH:mm:ss').cast('timestamp'))
    parameters = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306?useSSL=false").option("dbtable", "table").option("user", "user").option("password", "password").load()
    evaluate = df.groupBy('code').agg(collect_list(struct(['date','open'])).alias("ds"))
    evaluate_param = evaluate.join(parameters,evaluate.code==parameters.code_p,'left_outer').select('code','ds','changepoint_prior_scale','seasonality_prior_scale','monthly_seasonality','yearly_seasonality')
    evaluate_result = evaluate_param.repartition(1) \
                        .rdd.mapPartitions(evaluationPartion)
    print(evaluate_result.collect())
    def evaluationPartion(partitions):
        import pandas as pd
        import pyarrow
        import pyarrow.parquet as pq
        import fbprophet
        import numpy as np
        result = []
        for partition in partitions:
            stock = pd.DataFrame.from_records(partition.ds, columns=['date','open'])
            changepoint_prior_scale = partition.changepoint_prior_scale
            weekly_seasonality = 'auto'
            daily_seasonality = 'auto'
            monthly_seasonality = False if partition.monthly_seasonality == 0 else True
            yearly_seasonality = False if partition.yearly_seasonality == 0 else True
            changepoints = None
            seasonality_mode = 'additive'
            seasonality_prior_scale = partition.seasonality_prior_scale
            mcmc_samples = 0
            interval_width = 0.80
            uncertainty_samples = 100
            stock['ds'] = stock['date']
            stock['y'] = stock['open']
            max_date = max(stock['date'])
            min_date = min(stock['date'])
            start_date = max_date - pd.DateOffset(months=1)
            end_date = max_date
            training_years = 3
            train = stock[(stock['date'] < start_date) &  (stock['date'] > (start_date - pd.DateOffset(years=training_years)))]
            # get periods
            train_max_date = max(train['date'])
            time_diff = int((end_date.to_pydatetime()-train_max_date.to_pydatetime()).total_seconds()/60/5)
            # Testing data is specified in the range
            test = stock[(stock['date'] >= start_date) & (stock['date'] <= end_date)]
            model = fbprophet.Prophet(daily_seasonality=daily_seasonality,
                                      weekly_seasonality=weekly_seasonality,
                                      yearly_seasonality=yearly_seasonality,
                                      changepoint_prior_scale=changepoint_prior_scale,
                                      changepoints=changepoints,
                                      seasonality_mode=seasonality_mode,
                                      seasonality_prior_scale=seasonality_prior_scale,
                                      mcmc_samples=mcmc_samples,
                                      interval_width=interval_width,
                                      uncertainty_samples=uncertainty_samples)
            if monthly_seasonality:
                # Add monthly seasonality
                model.add_seasonality(name = 'monthly', period = 30.5, fourier_order = 5)
            model.fit(train)
            # Make a future dataframe and predictions
            future = model.make_future_dataframe(periods = time_diff, freq='5min')
            future = model.predict(future)
            # Merge predictions with the known values
            test = pd.merge(test, future, on = 'ds', how = 'inner')
            train = pd.merge(train, future, on = 'ds', how = 'inner')
            # Calculate the differences between consecutive measurements
            test['pred_diff'] = test['yhat'].diff()
            test['real_diff'] = test['y'].diff()
            # Correct is when we predicted the correct direction
            test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
            # Accuracy when we predict increase and decrease
            increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
            decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])
            # Calculate mean absolute error
            test_errors = abs(test['y'] - test['yhat'])
            test_mean_error = np.mean(test_errors)
            train_errors = abs(train['y'] - train['yhat'])
            train_mean_error = np.mean(train_errors)
            # Calculate percentage of time actual value within prediction range
            test['in_range'] = False
            for i in test.index:
                if (test.ix[i, 'y'] < test.ix[i, 'yhat_upper']) & (test.ix[i, 'y'] > test.ix[i, 'yhat_lower']):
                    test.ix[i, 'in_range'] = True
            in_range_accuracy = 100 * np.mean(test['in_range'])
            predict_price = future.ix[len(future) - 1, 'yhat']
            actual_price = test.ix[len(test) - 1, 'y']
            yield [partition.code]+['open',predict_price,actual_price,train_mean_error,test_mean_error,increase_accuracy,decrease_accuracy,in_range_accuracy] +[partition.changepoint_prior_scale,partition.seasonality_prior_scale,partition.monthly_seasonality,partition.yearly_seasonality]
def evaluationpartition(分区):
作为pd进口熊猫
导入pyarrow
导入pyarrow.parquet作为pq
进口食品
将numpy作为np导入
结果=[]
对于分区中的分区:
stock=pd.DataFrame.from_记录(partition.ds,columns=['date','open'])
changepoint\u prior\u scale=分区。changepoint\u prior\u scale
每周\季节性=‘自动’
每日季节性=‘自动’
如果分区为0,则月度季节性=False。如果分区为0,则月度季节性=True
如果分区为,则年度季节性=False。如果分区为,则年度季节性=0,否则为True
变更点=无
季节性_模式=‘加性’
季节性优先等级=分区。季节性优先等级
mcmc_样本=0
间隔宽度=0.80
样本的不确定度=100
股票['ds']=股票['date']
股票['y']=股票['open']
max_date=max(库存['日期])
min_date=min(股票['日期])
开始日期=最大日期-pd.DateOffset(月数=1)
结束日期=最长日期
培训年限=3年
培训=库存[(库存['date']<开始日期]和(库存['date']>(开始日期-pd.日期偏移(年=培训年))]
#月经来潮
列车最大日期=最大(列车[‘日期’])
time_diff=int((end_date.to_pydatetime()-train_max_date.to_pydatetime())。总秒数()/60/5)
#测试数据在范围内指定
测试=库存[(库存['date']>=开始日期)和(库存['date']0]['correct'])
降低精度=100*np.平均值(测试[测试['pred_diff']<0]['correct'])
#计算平均绝对误差
测试误差=abs(测试['y']-测试['yhat'])
测试平均误差=np.平均值(测试误差)
列车错误=防抱死制动系统(列车['y']-列车['yhat'])
序列平均误差=np.平均值(序列误差)
#计算预测范围内时间实际值的百分比
测试['in_range']=False
对于test.index中的i:
如果(test.ix[i,'y']test.ix[i,'yhat_下']):
test.ix[i,‘在_范围内’]=True
在_范围内_精度=100*np.平均值(测试['在_范围内])
预测价格=未来.ix[未来-1,'yhat']
实际价格=test.ix[len(test)-1,'y']
收益率[partition.code]+[open],预测价格,实际价格,训练平均误差,测试平均误差,提高精度,降低精度,范围内精度]+[partition.changepoint\u prior\u scale,partition.季节性\u prior\u scale,partition.monthly\u季节性,partition.year\u季节性]
现在训练过程很慢,
evaluate\u结果
count是100,我需要5个小时来训练100次。我如何提高我的训练速度?谢谢