Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/300.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/solr/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 有没有更好的方法来编写这个SparkSQL语句?_Python_Apache Spark_Pyspark_Apache Spark Sql_Spark Dataframe - Fatal编程技术网

Python 有没有更好的方法来编写这个SparkSQL语句?

Python 有没有更好的方法来编写这个SparkSQL语句?,python,apache-spark,pyspark,apache-spark-sql,spark-dataframe,Python,Apache Spark,Pyspark,Apache Spark Sql,Spark Dataframe,编写SparkSQL语句的最佳方法是什么 对此类声明的任何建议: import pyspark.sql.functions as psf df_final = df_01\ .select(df_01['*'], psf.when(df_01.record_two>\ df_01.record_three, d

编写SparkSQL语句的最佳方法是什么

对此类声明的任何建议:

import pyspark.sql.functions as psf

# Frames whose `record_one` values act as exclusion lists: a df_01 row
# passes the filter when its record_one is missing from AT LEAST ONE of
# these frames (the original OR-chained 19 `.isin(...) == False` terms).
# NOTE(review): the original listed df_violet04 twice (a no-op under OR);
# the duplicate has been removed. Also note Column.isin expects literal
# values — passing another DataFrame's column is likely not doing what
# the author intends; a left_anti join would be the robust fix. TODO confirm.
_exclusion_frames = [
    df_red01, df_blue01, df_violet01, df_green01, df_black01, df_white01,
    df_red02, df_blue02, df_violet02, df_green02, df_black02, df_white02,
    df_blue03, df_violet03, df_green03, df_black03,
    df_violet04, df_green04,
]

# Build the OR-chain programmatically instead of 19 hand-written lines.
# `~` is the idiomatic Column negation (instead of `== False`).
_missing_from_any = ~df_01.record_one.isin(_exclusion_frames[0].record_one)
for _frame in _exclusion_frames[1:]:
    _missing_from_any = _missing_from_any | ~df_01.record_one.isin(_frame.record_one)

df_final = (
    df_01
    .select(
        df_01['*'],
        # Row-wise max of record_two / record_three under a new name.
        psf.when(df_01.record_two > df_01.record_three, df_01.record_two)
           .otherwise(df_01.record_three)
           .alias("some_new_alias"),
    )
    .where("some_field > 1000")
    .where(_missing_from_any)
    .select("record_one", "some_new_alias")
)


# The original spelled out nine `.when` branches in groups of three with
# identical results. Branches with the same result are merged with `|`;
# first-match evaluation order (2000-group, then 20-group) is preserved,
# and a NULL condition short-circuits to the next branch exactly as in
# the unrolled version. The three 9999-branches were byte-identical to
# the `.otherwise` fallback and are therefore dropped with no behavior
# change.
df_another_test_frame = (
    df_jibber01
    .select(
        df_jibber01.field01,
        df_jibber01.field02,
        df_jibber01.field03,
        df_jibber01.field04,
        df_jibber01.field05,
        df_jibber01.field06,
        df_jibber01.field07,
        df_jibber01.field08,
        df_jibber01.field09,
        psf.when((df_jibber01.field04 <= 100)
                 | (df_jibber01.field05 >= 1)
                 | (df_jibber01.field06 >= 2),
                 psf.round(2000 * df_jibber01.field10 / 59, 10))
           .when((df_jibber01.field04 <= 3)
                 | (df_jibber01.field05 >= 4)
                 | (df_jibber01.field06 >= 5),
                 psf.round(20 * df_jibber01.field10 / 59, 10))
           .otherwise(psf.round(9999 * df_jibber01.field10 / 59, 10))
           .alias("field11"),
    )
)
import pyspark.sql.functions as psf

df_final = df_01\
            .select(df_01['*'],
                    psf.when(df_01.record_two >
                             df_01.record_three,
                             df_01.record_two)
                       .otherwise(df_01.record_three)
                       .alias("some_new_alias"))\
            .where("some_field > 1000")\
            .where((df_01.record_one.isin(df_red01.record_one)==False) |
                   (df_01.record_one.isin(df_blue01.record_one)==False) |
                   (df_01.record_one.isin(df_violet01.record_one)==False) |
                   (df_01.record_one.isin(df_green01.record_one)==False) |
                   (df_01.record_one.isin(df_black01.record_one)==False) |
                   (df_01.record_one.isin(df_white01.record_one)==False) |
                   (df_01.record_one.isin(df_red02.record_one)==False) |
                   (df_01.record_one.isin(df_blue02.record_one)==False) |
                   (df_01.record_one.isin(df_violet02.record_one)==False) |
                   (df_01.record_one.isin(df_green02.record_one)==False) |
                   (df_01.record_one.isin(df_black02.record_one)==False) |
                   (df_01.record_one.isin(df_white02.record_one)==False) |
                   (df_01.record_one.isin(df_blue03.record_one)==False) |
                   (df_01.record_one.isin(df_violet03.record_one)==False) |
                   (df_01.record_one.isin(df_green03.record_one)==False) |
                   (df_01.record_one.isin(df_black03.record_one)==False) |
                   (df_01.record_one.isin(df_violet04.record_one)==False) |
                   (df_01.record_one.isin(df_green04.record_one)==False) |
                   (df_01.record_one.isin(df_violet04.record_one)==False))\
            .select("record_one", "some_new_alias")

df_another_test_frame = df_jibber01\
                         .select(df_jibber01.field01,
                                 df_jibber01.field02,
                                 df_jibber01.field03,
                                 df_jibber01.field04,
                                 df_jibber01.field05,
                                 df_jibber01.field06,
                                 df_jibber01.field07,
                                 df_jibber01.field08,
                                 df_jibber01.field09,
                                 psf.when(df_jibber01.field04 <= 100,
                                          psf.round(2000*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field05 >= 1,
                                          psf.round(2000*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field06 >= 2,
                                          psf.round(2000*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field04 <= 3,
                                          psf.round(20*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field05 >= 4,
                                          psf.round(20*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field06 >= 5,
                                          psf.round(20*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field04 <= 6,
                                          psf.round(9999*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field05 >= 7,
                                          psf.round(9999*df_jibber01.field10/59, 10))
                                    .when(df_jibber01.field06 >= 8,
                                          psf.round(9999*df_jibber01.field10/59, 10))
                                    .otherwise(psf.round(9999*df_jibber01.field10/59, 10))
                                    .alias("field11")
                                )
存在多个 `where` 和 `when` 条件。有没有更简洁的写法?我有 100 条这样的语句


任何建议都会有帮助。

我也遇到过类似的问题，最佳做法是将这些规则保存在单独的文件或表中，例如对于 df_another_test_frame：

rules_table(ruled_id int ,field04_from int,field04_to int ,field05_from int 
,field05_to int ,field06_from int,field06_to int,ponder_for_field_10 decimal)
然后,您总是与规则表有相同的通用联接,独立于更改,并且您可以为不同的任务存储更多的规则

如果可以切换到SparkSession.sql,则可以生成动态sql并将规则从文本文件直接添加到sql字符串,而无需联接。这种方法的问题在于,开发部门的人员必须维护该文件

如果您有简单的规则表,则客户或业务分析师可以维护这些规则