Arrays 根据条件创建单独的数组元素
标签：arrays, dataframe, pyspark。我在 PySpark 数据帧中有以下数组列：
+-------------------------------------------------+
|typed_phone_numbers |
+-------------------------------------------------+
|[-5594162570~222222-PHONE~FAX-17-TEST] |
|[-2812597115~1111111-PHONE~FAX-17-TESTB] |
+-------------------------------------------------+
如果第一个元素中同时包含 PHONE 和 FAX，我想把它拆分成数组中的两个元素；如果只有 PHONE 或只有 FAX，则无需创建额外的元素。
预期产出
+-------------------------------------------------------+
|typed_phone_numbers |
+-------------------------------------------------------+
|["-5594162570-PHONE-17-TEST","-222222-FAX-17-TEST"] |
|["-2812597115-PHONE-17-TESTB","-1111111-FAX-17-TESTB"] |
+-------------------------------------------------------+
首先，您可以按 `-` 和 `~` 进行拆分，删除拆分后产生的空字符串，检查 PHONE 和 FAX 是否都存在（在 `when` 子句中使用高阶函数 `filter`），然后使用 `element_at`、`concat` 和 `concat_ws` 应用您的逻辑。（需要 Spark 2.4+）
#sample data
#df.show()
#+----------------------------------------+
#|typed_phone_numbers |
#+----------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST] |
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+----------------------------------------+
from pyspark.sql import functions as F
# Split the first array element of "typed_phone_numbers" into a PHONE entry and
# a FAX entry when it carries both, and print the result. Only element [0] of
# the array is inspected; other rows keep their original array.
(
    df
    # Tokenise the first element on '-' or '~'. The pattern is a raw string
    # because '\-' is not a valid Python escape sequence (it triggers a
    # SyntaxWarning on recent Python versions); the regex itself is unchanged.
    .withColumn("yo", F.split(F.col("typed_phone_numbers")[0], r'\-|~'))
    # Drop empty tokens (the sample entries start with '-', which yields one).
    .withColumn("yo", F.expr("""filter(yo,x-> x!='')"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(
            # Two tokens equal to PHONE or FAX are taken to mean both are present.
            F.size(F.expr("""filter(yo,x-> x='PHONE' or x='FAX')""")) == 2,
            F.array(
                # element_at is 1-based. With the sample layout
                # [num1, num2, PHONE, FAX, code, label], tokens 1,3,5,6 form
                # the PHONE entry ...
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 1),
                        F.element_at("yo", 3),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
                # ... and tokens 2,4,5,6 form the FAX entry.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 2),
                        F.element_at("yo", 4),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
            ),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+---------------------------------------------------+
更新:
#sample data
#df.show()
#+----------------------------------------+
#|typed_phone_numbers |
#+----------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST] |
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+----------------------------------------+
from pyspark.sql import functions as F
# Split the first array element of "typed_phone_numbers" into a PHONE entry and
# a FAX entry when it carries both, and print the result. Only element [0] of
# the array is inspected; other rows keep their original array.
(
    df
    # Tokenise the first element on '-' or '~'. The pattern is a raw string
    # because '\-' is not a valid Python escape sequence (it triggers a
    # SyntaxWarning on recent Python versions); the regex itself is unchanged.
    .withColumn("yo", F.split(F.col("typed_phone_numbers")[0], r'\-|~'))
    # Drop empty tokens (the sample entries start with '-', which yields one).
    .withColumn("yo", F.expr("""filter(yo,x-> x!='')"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(
            # Two tokens equal to PHONE or FAX are taken to mean both are present.
            F.size(F.expr("""filter(yo,x-> x='PHONE' or x='FAX')""")) == 2,
            F.array(
                # element_at is 1-based. With the sample layout
                # [num1, num2, PHONE, FAX, code, label], tokens 1,3,5,6 form
                # the PHONE entry ...
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 1),
                        F.element_at("yo", 3),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
                # ... and tokens 2,4,5,6 form the FAX entry.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 2),
                        F.element_at("yo", 4),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
            ),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+---------------------------------------------------+
使用高阶函数 transform 将逻辑应用于数组中的每个元素：
#sample data
#df.show()
#+------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST] |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB] |
#|[-2812597115~1111111-PHONE] |
#+------------------------------------------------------------------------------+
from pyspark.sql import functions as F
# Apply the PHONE/FAX split to every element of "typed_phone_numbers" and print
# the result.
# NOTE: the PHONE+FAX check looks at the first element only (yo[0]); when it
# passes, every element is assumed to have the same six-token layout.
(
    df
    # For each element: skip the first character (the leading '-' in the
    # sample data) and tokenise on '-' or '~'. Raw string, because '\-' is not
    # a valid Python escape sequence; the text handed to Spark is unchanged.
    .withColumn(
        "yo",
        F.expr(r"""(transform(typed_phone_numbers,x-> split(substring(x,2,length(x)),'\-|~')))"""),
    )
    .withColumn(
        "typed_phone_numbers",
        F.when(
            F.size(F.expr("""filter(yo[0],x->x='PHONE' or x='FAX')""")) == 2,
            # Each element becomes a two-item array (PHONE entry, FAX entry);
            # flatten merges those pairs back into one flat array.
            # SQL array indexing is 0-based here: y[0],y[2] = first number and
            # PHONE, y[1],y[3] = second number and FAX, y[4],y[5] = shared tail.
            F.flatten(
                F.expr(
                    "transform(yo,y->"
                    "array(concat('-',concat_ws('-',y[0],y[2],y[4],y[5])),"
                    "concat('-',concat_ws('-',y[1],y[3],y[4],y[5]))))"
                )
            ),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB] |
#|[-2812597115~1111111-PHONE] |
#+---------------------------------------------------------------------------------------------------+
如果数组的任意一行中可能出现只有 PHONE 或只有 FAX 的元素（即使同一数组中还有其他 PHONE+FAX 元素），可以使用以下写法：
#+------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE] |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB, -2812597115~1111111-FAX] |
#|[-2812597115~1111111-PHONE] |
#+------------------------------------------------------------------------------+
from pyspark.sql import functions as F
# Decide per element whether to split it into a PHONE entry and a FAX entry,
# then print the result. Elements that do not contain both PHONE and FAX are
# kept as a single entry with their tokens re-joined by '-'.
(
    df
    # For each element: skip the first character (the leading '-' in the
    # sample data) and tokenise on '-' or '~'. Raw string, because '\-' is not
    # a valid Python escape sequence; the text handed to Spark is unchanged.
    .withColumn(
        "yo",
        F.expr(r"""(transform(typed_phone_numbers,x-> split(substring(x,2,length(x)),'\-|~')))"""),
    )
    .withColumn(
        "typed_phone_numbers",
        # Every element maps to an array of one or two entries; flatten merges
        # them into one flat array.
        F.flatten(
            F.expr(
                "transform(yo,y->"
                # array_contains already yields a boolean, so no '==True' is needed.
                "IF(array_contains(y,'PHONE') and array_contains(y,'FAX'),"
                "array(concat('-',concat_ws('-',y[0],y[2],y[4],y[5])),"
                "concat('-',concat_ws('-',y[1],y[3],y[4],y[5]))),"
                "array(concat('-',concat_ws('-',y)))))"
            )
        ),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-1111111-PHONE] |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB, -2812597115-1111111-FAX] |
#|[-2812597115-1111111-PHONE] |
#+---------------------------------------------------------------------------------------------------+
您可以组合使用 regexp_replace(str, pattern, replacement) 和 split(str, pattern)，如下所示：
你的数据
解决方案
评论：
- 请在您的问题文本中提供一个最小的可复现示例，包括您尝试过的代码。
- 我完全不知道该如何实现这个场景。
- 如果数组中有多个元素，该代码只会处理数组的第一个元素，例如 ['-5594162570~2222222-PHONE~FAX-17-TEST','-2812597115~1111111-PHONE~FAX-17-TESTB']，在这种情况下，代码仅适用于第 0 个元素。
- 如果数组中有多个元素，该代码只会处理数组的第一个元素，例如 ['-5594162570~2222-PHONE~FAX-17-TEST','-2812597115~1111-PHONE~FAX-17-TESTB']，在这种情况下，代码仅适用于第 0 个元素。
from pyspark.sql.functions import col, regexp_replace, split
# Rewrite the first array element with two regex passes, split it into an
# array, and print the result. Only element [0] of the array is used.
first_entry = col('typed_phone_numbers')[0]

# Pass 1: when the entry has the full "-num~num-PHONE~FAX-digits-word" shape,
# regroup the captures into "<num1><-PHONE><tail>,<~num2><~FAX><tail>".
# Entries that do not match the pattern are left untouched by this pass.
regrouped = regexp_replace(
    first_entry,
    '^(-\\d+)(~\\d+)(-PHONE)(~FAX)(-\\d+-\\w+)$',
    '$1$3$5,$2$4$5',
)

# Pass 2: turn every remaining '~' separator into '-'.
dashed = regexp_replace(regrouped, '~', '-')

# The ',' inserted by pass 1 marks where the entry splits into two elements.
df.withColumn('typed_phone_numbers', split(dashed, ',')).show(truncate=False)
+---------------------------------------------------+
|typed_phone_numbers |
+---------------------------------------------------+
|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
|[-5594162570-PHONE-17-TEST] |
|[-2812597115-FAX-17-TESTB] |
+---------------------------------------------------+