Arrays 根据条件创建单独的数组元素

Arrays 根据条件创建单独的数组元素,arrays,dataframe,pyspark,Arrays,Dataframe,Pyspark,我在数据帧中有以下数组 +-------------------------------------------------+ |typed_phone_numbers | +-------------------------------------------------+ |[-5594162570~222222-PHONE~FAX-17-TEST] | |[-2812597115~1111111-PHONE~FA

我在数据帧中有以下数组

+-------------------------------------------------+
|typed_phone_numbers                              |
+-------------------------------------------------+
|[-5594162570~222222-PHONE~FAX-17-TEST]           |
|[-2812597115~1111111-PHONE~FAX-17-TESTB]         |
+-------------------------------------------------+
如果PHONE和FAX都存在于第一个元素中,我想在数组中创建另一个元素。如果只有电话或传真,则无需创建其他元素

预期产出

+-------------------------------------------------------+
|typed_phone_numbers                                    |
+-------------------------------------------------------+
|["-5594162570-PHONE-17-TEST","-222222-FAX-17-TEST"]    |
|["-2812597115-PHONE-17-TESTB","-1111111-FAX-17-TESTB"] |
+-------------------------------------------------------+

首先,可以按 `-` 和 `~` 对数组的第一个元素进行拆分,并去掉拆分产生的空字符串;接着检查 PHONE 和 FAX 是否同时存在(在 `when` 子句中使用高阶函数 `filter`);最后使用 `element_at`、`concat` 和 `concat_ws` 应用您的逻辑。
(需要 Spark 2.4+)

#sample data
#df.show()
#+----------------------------------------+
#|typed_phone_numbers                     |
#+----------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST]  |
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE]             |
#+----------------------------------------+  



from pyspark.sql import functions as F
# Expand a first array entry that carries both PHONE and FAX into two separate
# entries; rows that do not qualify keep their original array.
(
    df
    # Tokenise the first array entry on '-' or '~'. The pattern is a raw string
    # so that '\-' is not an invalid Python escape sequence; the regex handed
    # to Spark is unchanged.
    .withColumn("yo", F.split(F.col("typed_phone_numbers")[0], r'\-|~'))
    # Remove empty tokens (the sample entries begin with '-', which yields one).
    .withColumn("yo", F.expr("""filter(yo,x-> x!='')"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(
            # Rewrite only when exactly two tokens equal PHONE or FAX.
            F.size(F.expr("""filter(yo,x-> x='PHONE' or x='FAX')""")) == 2,
            F.array(
                # Tokens 1, 3, 5, 6 (1-based) -- in the sample layout: first
                # number, PHONE, then the two shared trailing tokens.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 1),
                        F.element_at("yo", 3),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
                # Tokens 2, 4, 5, 6 -- in the sample layout: second number,
                # FAX, then the same trailing tokens.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 2),
                        F.element_at("yo", 4),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
            ),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)


#+---------------------------------------------------+
#|typed_phone_numbers                                |
#+---------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST]   |
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE]                        |
#+---------------------------------------------------+
更新:

#sample data
#df.show()
#+----------------------------------------+
#|typed_phone_numbers                     |
#+----------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST]  |
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE]             |
#+----------------------------------------+  



from pyspark.sql import functions as F
# Expand a first array entry that carries both PHONE and FAX into two separate
# entries; rows that do not qualify keep their original array.
(
    df
    # Tokenise the first array entry on '-' or '~'. The pattern is a raw string
    # so that '\-' is not an invalid Python escape sequence; the regex handed
    # to Spark is unchanged.
    .withColumn("yo", F.split(F.col("typed_phone_numbers")[0], r'\-|~'))
    # Remove empty tokens (the sample entries begin with '-', which yields one).
    .withColumn("yo", F.expr("""filter(yo,x-> x!='')"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(
            # Rewrite only when exactly two tokens equal PHONE or FAX.
            F.size(F.expr("""filter(yo,x-> x='PHONE' or x='FAX')""")) == 2,
            F.array(
                # Tokens 1, 3, 5, 6 (1-based) -- in the sample layout: first
                # number, PHONE, then the two shared trailing tokens.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 1),
                        F.element_at("yo", 3),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
                # Tokens 2, 4, 5, 6 -- in the sample layout: second number,
                # FAX, then the same trailing tokens.
                F.concat(
                    F.lit('-'),
                    F.concat_ws(
                        '-',
                        F.element_at("yo", 2),
                        F.element_at("yo", 4),
                        F.element_at("yo", 5),
                        F.element_at("yo", 6),
                    ),
                ),
            ),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)


#+---------------------------------------------------+
#|typed_phone_numbers                                |
#+---------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST]   |
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE]                        |
#+---------------------------------------------------+
使用高阶函数 `transform` 将该逻辑应用于数组中的每个元素:

#sample data
#df.show()
#+------------------------------------------------------------------------------+
#|typed_phone_numbers                                                           |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST]                                        |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]                                      |
#|[-2812597115~1111111-PHONE]                                                   |
#+------------------------------------------------------------------------------+


from pyspark.sql import functions as F
# Expand every entry of a row into a PHONE entry and a FAX entry, provided the
# row's first entry carries both markers; other rows keep their original array.
(
    df
    # Skip each entry's first character (substring from position 2; '-' in the
    # sample data) and tokenise the rest on '-' or '~'. The raw string keeps
    # '\-' from being an invalid Python escape sequence; the SQL text handed to
    # Spark is unchanged.
    .withColumn("yo", F.expr(r"""(transform(typed_phone_numbers,x-> split(substring(x,2,length(x)),'\-|~')))"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(
            # The PHONE/FAX test inspects the first entry only (yo[0]).
            F.size(F.expr("""filter(yo[0],x->x='PHONE' or x='FAX')""")) == 2,
            # Each entry becomes a two-element array built from tokens
            # 0,2,4,5 and 1,3,4,5; flatten merges those pairs into one array.
            # The backslash-newlines below are string continuations, so this
            # literal must stay non-raw.
            F.flatten(F.expr("""transform(yo,y->\
                                                   array(concat('-',concat_ws('-',y[0],y[2],y[4],y[5])),\
                                                         concat('-',concat_ws('-',y[1],y[3],y[4],y[5]))))""")),
        ).otherwise(F.col("typed_phone_numbers")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)


#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers                                                                                |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST]                                                   |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]                                                |
#|[-2812597115~1111111-PHONE]                                                                        |
#+---------------------------------------------------------------------------------------------------+
如果数组的某一行中可能出现只含 PHONE 或只含 FAX 的元素(即使同一行中还有其他同时含 PHONE 和 FAX 的元素),可以使用下面的写法:

#+------------------------------------------------------------------------------+
#|typed_phone_numbers                                                           |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE]             |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB, -2812597115~1111111-FAX]             |
#|[-2812597115~1111111-PHONE]                                                   |
#+------------------------------------------------------------------------------+

from pyspark.sql import functions as F
# Decide per entry: an entry containing both PHONE and FAX is split into two
# entries, any other entry is kept as a single entry with '~' replaced by '-'.
(
    df
    # Skip each entry's first character (substring from position 2; '-' in the
    # sample data) and tokenise the rest on '-' or '~'. The raw string keeps
    # '\-' from being an invalid Python escape sequence; the SQL text handed to
    # Spark is unchanged.
    .withColumn("yo", F.expr(r"""(transform(typed_phone_numbers,x-> split(substring(x,2,length(x)),'\-|~')))"""))
    .withColumn(
        "typed_phone_numbers",
        # transform yields one array per entry (two elements when both markers
        # are present, otherwise one element rebuilt by joining all tokens with
        # '-'); flatten merges them into a single array. The backslash-newlines
        # below are string continuations, so this literal must stay non-raw.
        F.flatten(F.expr("""transform(yo,y->\
                          IF((array_contains(y,'PHONE')==True) and (array_contains(y,'FAX')==True),\
                                                   array(concat('-',concat_ws('-',y[0],y[2],y[4],y[5])),\
                                                         concat('-',concat_ws('-',y[1],y[3],y[4],y[5]))),\
                                                         array(concat('-',concat_ws('-',y)))))""")),
    )
    # "yo" is only a scratch column.
    .drop("yo")
    .show(truncate=False)
)


#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers                                                                                |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-1111111-PHONE]                        |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB, -2812597115-1111111-FAX]                       |
#|[-2812597115-1111111-PHONE]                                                                        |
#+---------------------------------------------------------------------------------------------------+

您可以组合使用 `regexp_replace(str, pattern, replacement)` 和 `split(col, pattern)`,如下所示:

你的数据 解决方案
请在您的问题文本中提供一个最小的可复制示例,包括您尝试过的代码。我没有一条线索如何实现该场景。如果数组中有多个元素,该代码将查找数组的第一个元素,例如['-5594162570~2222222-PHONE~FAX-17-TEST','-2812597115~1111111-PHONE~FAX-17-TESTB']在这种情况下,代码仅适用于第0个元素如果数组中有多个元素,该代码将查找数组的第一个元素,例如['-5594162570~2222-PHONE~FAX-17-TEST','-2812597115~1111-PHONE~FAX-17-TESTB']在这种情况下,代码仅适用于第0个元素
from pyspark.sql.functions import col, regexp_replace, split
# Only the first entry of the array is rewritten.
first_entry = col('typed_phone_numbers')[0]

# An entry shaped like "-<digits>~<digits>-PHONE~FAX-<digits>-<word>" is
# rearranged into two comma-separated records: groups 1,3,5 and groups 2,4,5.
# Entries that do not match the pattern pass through unchanged.
paired_records = regexp_replace(
    first_entry,
    '^(-\\d+)(~\\d+)(-PHONE)(~FAX)(-\\d+-\\w+)$',
    '$1$3$5,$2$4$5',
)

# Every remaining '~' becomes '-'.
dash_separated = regexp_replace(paired_records, '~', '-')

# Splitting on the comma turns the string back into an array column.
df.withColumn('typed_phone_numbers', split(dash_separated, ',')).show(truncate=False)
+---------------------------------------------------+                           
|typed_phone_numbers                                |
+---------------------------------------------------+
|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST]   |
|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
|[-5594162570-PHONE-17-TEST]                        |
|[-2812597115-FAX-17-TESTB]                         |
+---------------------------------------------------+