Сначала можно разбить строку по обоим разделителям, `-` и `~`, удалить получившиеся пустые строки, затем в предложении `when` проверить (с помощью функции высшего порядка `filter`), что присутствуют и PHONE, и FAX, и после этого применить свою логику с помощью `element_at`, `concat` и `concat_ws` (Spark 2.4+).
#sample data
#df.show()
#+----------------------------------------+
#|typed_phone_numbers |
#+----------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST] |
#|[-2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+----------------------------------------+
from pyspark.sql import functions as F
# Rewrite a combined "-<phone>~<fax>-PHONE~FAX-<a>-<b>" entry (first array
# element only) into two separate entries, one for PHONE and one for FAX.
# Rows whose first entry does not carry both markers are left unchanged.

# Tokenise the first array entry on either delimiter. The character class
# "[-~]" matches the same two characters as the previous '\-|~' pattern but
# contains no backslash, so Python no longer sees the invalid "\-" escape.
tokens = F.split(F.col("typed_phone_numbers")[0], "[-~]")

# True when exactly two of the tokens are the PHONE / FAX markers.
both_present = F.size(F.expr("""filter(yo,x-> x='PHONE' or x='FAX')""")) == 2

# element_at is 1-based. For the sample rows the non-empty tokens are
# [phone number, fax number, 'PHONE', 'FAX', <a>, <b>]; concat_ws skips nulls.
phone_entry = F.concat(
    F.lit("-"),
    F.concat_ws("-", *(F.element_at("yo", pos) for pos in (1, 3, 5, 6))),
)
fax_entry = F.concat(
    F.lit("-"),
    F.concat_ws("-", *(F.element_at("yo", pos) for pos in (2, 4, 5, 6))),
)

(
    df.withColumn("yo", tokens)
    # The leading "-" of each entry yields an empty first token; drop it.
    .withColumn("yo", F.expr("""filter(yo,x-> x!='')"""))
    .withColumn(
        "typed_phone_numbers",
        F.when(both_present, F.array(phone_entry, fax_entry)).otherwise(
            F.col("typed_phone_numbers")
        ),
    )
    # "yo" is only a scratch column for the tokens.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE] |
#+---------------------------------------------------+
UPDATE:
Используйте функцию высшего порядка `transform`,
чтобы применить эту логику к каждому элементу массива.
#sample data
#df.show()
#+------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST] |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB] |
#|[-2812597115~1111111-PHONE] |
#+------------------------------------------------------------------------------+
from pyspark.sql import functions as F
# Apply the PHONE/FAX expansion to every entry of the array column.
#
# Each entry is stripped of its leading "-" (substring from position 2) and
# split on "-" or "~". The delimiter regex is written as the character class
# '[-~]': it needs no backslash, so neither Python (invalid "\-" escape in a
# non-raw literal) nor Spark's SQL string-literal parser has to interpret one.
split_entries_sql = (
    "transform(typed_phone_numbers, "
    "x -> split(substring(x, 2, length(x)), '[-~]'))"
)

# For each token list y (0-based indexing in SQL) build the two output entries
# "-<y0>-<y2>-<y4>-<y5>" and "-<y1>-<y3>-<y4>-<y5>".
expand_entries_sql = (
    "transform(yo, y -> "
    "array(concat('-', concat_ws('-', y[0], y[2], y[4], y[5])), "
    "concat('-', concat_ws('-', y[1], y[3], y[4], y[5]))))"
)

# NOTE: only the FIRST entry (yo[0]) is inspected to decide whether the row is
# expanded; when it carries both markers, every entry of the row is expanded.
first_has_both = (
    F.size(F.expr("""filter(yo[0],x->x='PHONE' or x='FAX')""")) == 2
)

(
    df.withColumn("yo", F.expr(split_entries_sql))
    .withColumn(
        "typed_phone_numbers",
        # transform yields an array of 2-element arrays; flatten merges them.
        F.when(first_has_both, F.flatten(F.expr(expand_entries_sql))).otherwise(
            F.col("typed_phone_numbers")
        ),
    )
    # "yo" is only a scratch column for the token lists.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST] |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB] |
#|[-2812597115~1111111-PHONE] |
#+---------------------------------------------------------------------------------------------------+
Если в каком-либо массиве встречается элемент только с PHONE или только с FAX
(даже рядом с другими элементами PHONE+FAX),
можно использовать следующий вариант.
#+------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+------------------------------------------------------------------------------+
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE] |
#|[-5594162570~222222-PHONE~FAX-17-TEST, -2812597115~1111111-PHONE~FAX-17-TESTB]|
#|[-2812597115~1111111-PHONE~FAX-17-TESTB, -2812597115~1111111-FAX] |
#|[-2812597115~1111111-PHONE] |
#+------------------------------------------------------------------------------+
from pyspark.sql import functions as F
# Expand PHONE+FAX entries individually, leaving single-marker entries intact.
#
# Each entry is stripped of its leading "-" (substring from position 2) and
# split on "-" or "~". The delimiter regex is written as the character class
# '[-~]': it needs no backslash, so neither Python (invalid "\-" escape in a
# non-raw literal) nor Spark's SQL string-literal parser has to interpret one.
split_entries_sql = (
    "transform(typed_phone_numbers, "
    "x -> split(substring(x, 2, length(x)), '[-~]'))"
)

# Per token list y:
#   * both markers present -> two entries, "-<y0>-<y2>-<y4>-<y5>" and
#     "-<y1>-<y3>-<y4>-<y5>";
#   * otherwise -> one entry with all tokens re-joined by "-".
# array_contains already returns a boolean, so it is used directly instead of
# being compared with True.
expand_entries_sql = (
    "transform(yo, y -> "
    "IF(array_contains(y, 'PHONE') and array_contains(y, 'FAX'), "
    "array(concat('-', concat_ws('-', y[0], y[2], y[4], y[5])), "
    "concat('-', concat_ws('-', y[1], y[3], y[4], y[5]))), "
    "array(concat('-', concat_ws('-', y)))))"
)

(
    df.withColumn("yo", F.expr(split_entries_sql))
    # transform yields an array of arrays; flatten merges them into one array.
    .withColumn("typed_phone_numbers", F.flatten(F.expr(expand_entries_sql)))
    # "yo" is only a scratch column for the token lists.
    .drop("yo")
    .show(truncate=False)
)
#+---------------------------------------------------------------------------------------------------+
#|typed_phone_numbers |
#+---------------------------------------------------------------------------------------------------+
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-1111111-PHONE] |
#|[-5594162570-PHONE-17-TEST, -222222-FAX-17-TEST, -2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB]|
#|[-2812597115-PHONE-17-TESTB, -1111111-FAX-17-TESTB, -2812597115-1111111-FAX] |
#|[-2812597115-1111111-PHONE] |
#+---------------------------------------------------------------------------------------------------+