PySpark SQL:复杂查询中的 = NULL 与 NOT IN
我有这样一个spark数据框:PySpark sql,用于复杂的sql=空&空;不在,pyspark,apache-spark-sql,spark-dataframe,pyspark-sql,Pyspark,Apache Spark Sql,Spark Dataframe,Pyspark Sql,我有这样一个spark数据框: sc = CassandraSparkContext(conf=conf) sql = SQLContext(sc) log = sc.cassandraTable("test","log_a")\ .select("m_date","userid","fsa","fsid").toDF() sql.registerDataFrameAsTable(log, "log") query_str = ("select * from log w
# Read the Cassandra table into a DataFrame and register it as a SQL temp table.
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)
log = sc.cassandraTable("test", "log_a")\
    .select("m_date", "userid", "fsa", "fsid").toDF()
sql.registerDataFrameAsTable(log, "log")

# Simple date-range query: works as-is.
query_str = ("select * from log where m_date >= %s and m_date < %s"
             % (1497052766, 1498059766))
temp = sql.sql(query_str)
temp.show()

# The original complex query had three defects:
#   1. `userid != NULL` never matches in SQL (comparison with NULL yields
#      NULL, not true) — it must be `userid IS NOT NULL`.
#   2. The appended "and m_date ..." fragment had no leading space, producing
#      the invalid text "...null)and m_date ...".
#   3. Spark 2.2 refuses a null-aware `NOT IN (subquery)` predicate nested
#      under OR ("Null-aware predicate sub-queries cannot be used in nested
#      conditions"). Predicate subqueries must appear as top-level conjuncts.
# Rewrite as a UNION of the two disjuncts so the anti-join stands alone.
# NOTE(review): the two branches are disjoint for non-NULL fsa (a row with
# userid set contributes its own fsa to the subquery), so UNION's dedup only
# collapses genuine duplicate source rows — confirm that is acceptable.
query_str = (
    "select * from log "
    "where userid is not null "
    "and m_date > %s and m_date < %s "
    "union "
    "select l.* from log l "
    "left anti join (select fsa from log where userid is not null) k "
    "on l.fsa = k.fsa "
    "where l.m_date > %s and l.m_date < %s"
    % (1497052766, 1498059766, 1497052766, 1498059766)
)
temp = sql.sql(query_str)
我可以很容易地查询m_date中的范围,如下所示:
# Read the Cassandra table into a DataFrame and register it as a SQL temp table.
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)
log = sc.cassandraTable("test", "log_a")\
    .select("m_date", "userid", "fsa", "fsid").toDF()
sql.registerDataFrameAsTable(log, "log")

# Simple date-range query: works as-is.
query_str = ("select * from log where m_date >= %s and m_date < %s"
             % (1497052766, 1498059766))
temp = sql.sql(query_str)
temp.show()

# Fixes over the original complex query:
#   1. `userid != NULL` is never true in SQL; use `userid IS NOT NULL`.
#   2. Add the space that was missing before the appended "and m_date ..."
#      (the original produced "...null)and m_date ...").
#   3. Avoid a null-aware `NOT IN (subquery)` nested under OR — Spark 2.2
#      raises "Null-aware predicate sub-queries cannot be used in nested
#      conditions" for it. Express the NOT IN as a LEFT ANTI JOIN and split
#      the OR into a UNION.
# NOTE(review): for non-NULL fsa the two branches cannot both match the same
# row, so UNION's dedup only affects genuine duplicate rows — verify.
query_str = (
    "select * from log "
    "where userid is not null "
    "and m_date > %s and m_date < %s "
    "union "
    "select l.* from log l "
    "left anti join (select fsa from log where userid is not null) k "
    "on l.fsa = k.fsa "
    "where l.m_date > %s and l.m_date < %s"
    % (1497052766, 1498059766, 1497052766, 1498059766)
)
temp = sql.sql(query_str)
query_str = ("select * from log where m_date >= %s and m_date < %s" % (1497052766, 1498059766))
temp = sql.sql(query_str)
temp.show()
通过这个简单的查询,一切都可以。但对于下面这个更复杂的查询,我却无法执行成功:
# Read the Cassandra table into a DataFrame and register it as a SQL temp table.
sc = CassandraSparkContext(conf=conf)
sql = SQLContext(sc)
log = sc.cassandraTable("test", "log_a")\
    .select("m_date", "userid", "fsa", "fsid").toDF()
sql.registerDataFrameAsTable(log, "log")

# Simple date-range query: works as-is.
query_str = ("select * from log where m_date >= %s and m_date < %s"
             % (1497052766, 1498059766))
temp = sql.sql(query_str)
temp.show()

# Corrected complex query. The original failed because:
#   1. `userid != NULL` compares against NULL and never matches; the SQL
#      idiom is `userid IS NOT NULL`.
#   2. Concatenating "and m_date ..." without a leading space yielded the
#      malformed text "...null)and m_date ...".
#   3. Spark 2.2's analyzer rejects a null-aware `NOT IN (subquery)` that is
#      nested under OR (the "Null-aware predicate sub-queries cannot be used
#      in nested conditions" AnalysisException seen below). The workaround is
#      a LEFT ANTI JOIN for the NOT IN, with the OR split into a UNION.
# NOTE(review): branches are disjoint for non-NULL fsa, so UNION's implicit
# dedup only removes true duplicate rows — confirm that matches intent.
query_str = (
    "select * from log "
    "where userid is not null "
    "and m_date > %s and m_date < %s "
    "union "
    "select l.* from log l "
    "left anti join (select fsa from log where userid is not null) k "
    "on l.fsa = k.fsa "
    "where l.m_date > %s and l.m_date < %s"
    % (1497052766, 1498059766, 1497052766, 1498059766)
)
temp = sql.sql(query_str)
query_str = "select * from log "\
"where userid != NULL "\
"or fsa not in ("\
"select fsa from log where userid is not null)"
query_str = query_str + ("and m_date > %s and m_date < %s" % (1497052766, 1498059766))
temp = sql.sql(query_str)
我遇到了这个问题:
Py4JJavaError Traceback (most recent call last)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a, **kw)
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
Py4JJavaError: An error occurred while calling o25.sql.
: org.apache.spark.sql.AnalysisException: Null-aware predicate sub-queries cannot be used in nested conditions: (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))));;
Project [m_date#0L, userid#1, fsa#2, fsid#3]
+- Filter (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))))
: +- Project [fsa#2]
: +- Filter isnotnull(userid#1)
: +- SubqueryAlias log
: +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]
+- SubqueryAlias log
+- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:207)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:78)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at scala.collection.immutable.List.foreach(List.scala:381)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:78)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:91)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:52)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:632)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
During handling of the above exception, another exception occurred:
AnalysisException Traceback (most recent call last)
E:\FPT\project-spark-streaming\spark-calculate-newuser-daily.py in <module>()
76 "select fsa from log where userid is not null)"
77 query_str=query_str+ ("and m_date > %s and m_date < %s" %(1497052766,1498059766))
---> 78 temp=sql.sql(query_str)
79 pass
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\context.py in sql(self, sqlQuery)
382 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
383 """
--> 384 return self.sparkSession.sql(sqlQuery)
385
386 @since(1.0)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\session.py in sql(self, sqlQuery)
601 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
602 """
--> 603 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
604
605 @since(2.0)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: 'Null-aware predicate sub-queries cannot be used in nested conditions: (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))));;\nProject [m_date#0L, userid#1, fsa#2, fsid#3]\n+- Filter (NOT (userid#1 = null) || ((NOT fsa#2 IN (list#62 []) && (m_date#0L > cast(1497052766 as bigint))) && (m_date#0L < cast(1498059766 as bigint))))\n : +- Project [fsa#2]\n : +- Filter isnotnull(userid#1)\n :
+- SubqueryAlias log\n : +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]\n +- SubqueryAlias log\n +- LogicalRDD [m_date#0L, userid#1, fsa#2, fsid#3]\n'
17/12/24 20:53:17 WARN SparkEnv: Exception while deleting Spark temp dir: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
java.io.IOException: Failed to delete: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1031)
at org.apache.spark.SparkEnv.stop(SparkEnv.scala:103)
at org.apache.spark.SparkContext$$anonfun$stop$11.apply$mcV$sp(SparkContext.scala:1944)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1317)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1943)
at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:581)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1948)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
17/12/24 20:53:17 ERROR ShutdownHookManager: Exception while deleting Spark temp dir: C:\Users\hptphuong\AppData\Local\Temp\spark-c9fd644d-de1a-47c9-9e19-cbd0b01df138\userFiles-412a0e89-c56f-4897-98e7-05cd6114855f
Py4JJavaError回溯(最近一次调用)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a,**kw)
62尝试:
--->63返回f(*a,**kw)
64除py4j.protocol.Py4JJavaError外的其他错误为e:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get\u return\u值(答案、网关\u客户端、目标\u id、名称)
318“调用{0}{1}{2}时出错。\n”。
-->319格式(目标id,“.”,名称),值)
320其他:
Py4JJavaError:调用o25.sql时出错。
:org.apache.spark.sql.AnalysisException:空感知谓词子查询不能在嵌套条件中使用:(NOT(userid#1=Null)| |(NOT fsa#2 in(list#62[])&&(mu date 0L>cast(149752727766作为bigint))&(mu date#0L转换(1497052766作为bigint))&(MU date#0L<转换(1498059766作为bigint)))
:+-项目[fsa#2]
:+-Filter isnotnull(userid#1)
:+-子QueryAlias日志
:+-LogicalRDD[m#U日期#0L,用户ID#1,fsa#2,fsid#3]
+-亚Queryalias测井
+-逻辑RDD[m#U日期#0L,用户ID#1,fsa#2,fsid#3]
位于org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
位于org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:91)
位于org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$CheckAnalysis$1.apply(CheckAnalysis.scala:207)
位于org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$CheckAnalysis$1.apply(CheckAnalysis.scala:78)
位于org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
位于org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
位于org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
位于scala.collection.immutable.List.foreach(List.scala:381)
位于org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
位于org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.CheckAnalysis(CheckAnalysis.scala:78)
位于org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:91)
位于org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:52)
位于org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
位于org.apache.spark.sql.SparkSession.sql(SparkSession.scala:632)
在sun.reflect.NativeMethodAccessorImpl.invoke0(本机方法)处
位于sun.reflect.NativeMethodAccessorImpl.invoke(未知源)
在sun.reflect.DelegatingMethodAccessorImpl.invoke处(未知源)
位于java.lang.reflect.Method.invoke(未知源)
位于py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
位于py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
在py4j.Gateway.invoke处(Gateway.java:280)
位于py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
在py4j.commands.CallCommand.execute(CallCommand.java:79)
在py4j.GatewayConnection.run处(GatewayConnection.java:214)
位于java.lang.Thread.run(未知源)
在处理上述异常期间,发生了另一个异常:
AnalysisException回溯(最近一次调用上次)
E:\FPT\project spark streaming\spark-calculate-newuser-daily.py in()
76“从用户ID不为null的日志中选择fsa)”
77 query_str=query_str+(“和m_日期>%s和m_日期<%s%”(149052766161498059766))
--->78 temp=sql.sql(查询)
79关
sql中的C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\context.py(self,sqlQuery)
382[行(f1=1,f2=u'row1')、行(f1=2,f2=u'row2')、行(f1=3,f2=u'row3')]
383 """
-->384返回self.sparkSession.sql(sqlQuery)
385
386@自(1.0)
sql中的C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\session.py(self,sqlQuery)
601[行(f1=1,f2=u'row1')、行(f1=2,f2=u'row2')、行(f1=3,f2=u'row3')]
602 """
-->603返回数据帧(self.\u jsparkSession.sql(sqlQuery),self.\u包装)
604
605@自(2.0)
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java\u gateway.py in\uuu调用(self,*args)
1131 answer=self.gateway\u client.send\u命令(command)
1132返回值=获取返回值(
->1133应答,self.gateway\u客户端,self.target\u id,self.name)
1134
1135对于临时参数中的临时参数:
C:\opt\spark\spark-2.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\sql\utils.py in deco(*a,**kw)
67 e.java_exception.getStackTrace())
68如果s.StartWith('org.apache.spark.sql.AnalysisException:'):
--->69 raise AnalysisException(s.split(“:”,1)[1],stackTrace)
70如果s.startswith('org.apache.spark.sql。