Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/variables/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 在pyspark中按字典列表中的值筛选dataframe_Python_Pyspark_Apache Spark Sql - Fatal编程技术网

Python 在pyspark中按字典列表中的值筛选dataframe

Python 在pyspark中按字典列表中的值筛选dataframe,python,pyspark,apache-spark-sql,Python,Pyspark,Apache Spark Sql,在pyspark中,如何根据特定字典键的值筛选具有字典列表列的dataframe 也就是说,过滤foo_data字典在我的列表中具有name属性的任何值的行 # The dataframe # df.show() foo_data bar_id 0 [{'name': 'Foo 1'}, {'name': 'Foo 2'}] 42189321899fewa32 1 [{'name': 'Foo 1'}, {'

在 pyspark 中,如何根据字典中某个键的值,筛选含有“字典列表”列的 dataframe?

也就是说,筛选出 foo_data 列中任意一个字典的 name 值出现在我的列表中的行。

# The dataframe
# df.show()

   foo_data                                   bar_id
0  [{'name': 'Foo 1'}, {'name': 'Foo 2'}]     42189321899fewa32
1  [{'name': 'Foo 1'}, {'name': 'Foo 3'}]     13829a38291dm2198
2  [{'name': 'Foo 2'}, {'name': 'Foo 3'}]     3910m312091412812
3  [{'name': 'Foo 2'}, {'name': 'Foo 4'}]     2189d2n18u9218219

# The values for the "name" key in the dictionaries of the column "foo_data"
foo_list = [
    "Foo 1",
    "Foo 4"
]

# df_filtered = df.filter...?
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, BooleanType
# Build the sample DataFrame: each row pairs a list of {'name': ...} dicts
# with a string identifier.
df = spark.createDataFrame(
    [
        ([{"name": "Foo 1"}, {"name": "Foo 2"}], "42189321899fewa32"),
        ([{"name": "Foo 1"}, {"name": "Foo 3"}], "13829a38291dm2198"),
        ([{"name": "Foo 2"}, {"name": "Foo 4"}], "2189d2n18u9218219"),
        ([{"name": "Foo 2"}, {"name": "Foo 3"}], "239d2n18u92154619"),
    ],
    ["foo_data", "bar_id"],
)

# Values of the "name" key that a row must contain to be kept.
foo_list = ["Foo 1", "Foo 4"]

df.show(truncate=False)
+----------------------------------------+-----------------+
|foo_data                                |bar_id           |
+----------------------------------------+-----------------+
|[Map(name -> Foo 1), Map(name -> Foo 2)]|42189321899fewa32|
|[Map(name -> Foo 1), Map(name -> Foo 3)]|13829a38291dm2198|
|[Map(name -> Foo 2), Map(name -> Foo 4)]|2189d2n18u9218219|
|[Map(name -> Foo 2), Map(name -> Foo 3)]|239d2n18u92154619|
+----------------------------------------+-----------------+

#Creating a UDF of a function
def list_values(col, names=None):
    """Return True if any dict in *col* has a 'name' value in *names*.

    Args:
        col: A list of dicts (the value of one 'foo_data' cell), or None
            when the cell is null.
        names: Container of accepted 'name' values. When omitted, the
            module-level ``foo_list`` is used, so the one-argument call
            made through the UDF keeps working.

    Returns:
        bool: True when at least one dict's 'name' value is in the
        accepted container; False otherwise, including for a null or
        empty list.
    """
    # A null cell cannot match anything; returning False instead of
    # raising lets the surrounding filter simply drop the row.
    if col is None:
        return False
    wanted = foo_list if names is None else names
    # Skip null elements and dicts without a 'name' key rather than
    # raising, so one malformed row does not abort the whole job.
    return any(
        entry is not None and entry.get('name') in wanted
        for entry in col
    )

# Wrap the Python predicate as a Spark UDF that returns a boolean.
list_values_udf = udf(list_values, BooleanType())

# Keep only the rows where at least one dict in 'foo_data' has a 'name'
# found in the user-supplied 'foo_list'. Filtering on the UDF result
# directly avoids a temporary 'bool' column, which would overwrite and
# then drop any existing column that happened to have that name.
df = df.filter(list_values_udf(df.foo_data))
df.show(truncate=False)
+----------------------------------------+-----------------+
|foo_data                                |bar_id           |
+----------------------------------------+-----------------+
|[Map(name -> Foo 1), Map(name -> Foo 2)]|42189321899fewa32|
|[Map(name -> Foo 1), Map(name -> Foo 3)]|13829a38291dm2198|
|[Map(name -> Foo 2), Map(name -> Foo 4)]|2189d2n18u9218219|
+----------------------------------------+-----------------+