Начиная со Spark 2.4 в Spark SQL доступны функции высшего порядка (например, `filter` с лямбдой).
Попробуйте следующее — SQL-решение одно и то же как для Scala, так и для Python:
// Sample input: (ID, customer name, country).
// Runtime strings are kept byte-identical to the original data.
val df = List(
  ("161", "xyz Limited", "U.K."),
  ("262", "ABC Limited", "U.K."),
  ("165", "Sons & Sons", "U.K."),
  ("361", "TÜV GmbH", "Germany"),
  ("462", "Mueller GmbH", "Germany"),
  ("369", "Schneider AG", "Germany"),
  ("467", "Sahm UG", "Germany")
).toDF("ID", "customers", "country")

// Print the full column values (truncate = false), then expose the
// DataFrame to Spark SQL under the view name "secil".
df.show(false)
df.createOrReplaceTempView("secil")
// Categorize each row by testing whether its id starts with one of the
// prefixes in a1 ('16','26') or a2 ('36','46'), using the higher-order
// function `filter` (Spark >= 2.4). NOTE: Spark SQL allows omitting the
// optional AS keyword in CTE definitions (`with t1 ( ... )`).
// The SQL text below is byte-identical to the original query.
val query =
  """ with t1 ( select id, customers, country, array('16','26') as a1, array('36','46') as a2 from secil),
t2 (select id, customers, country, filter(a1, x -> id like x||'%') a1f, filter(a2, x -> id like x||'%') a2f from t1),
t3 (select id, customers, country, a1f, a2f,
case when size(a1f) > 0 then 1 else 0 end a1r,
case when size(a2f) > 0 then 2 else 0 end a2r
from t2)
select id, customers, country, a1f, a2f, a1r, a2r, a1r+a2r as Cat_ID from t3
"""
spark.sql(query).show(false)
Результаты:
+---+------------+-------+
|ID |customers |country|
+---+------------+-------+
|161|xyz Limited |U.K. |
|262|ABC Limited |U.K.   |
|165|Sons & Sons |U.K. |
|361|TÜV GmbH |Germany|
|462|Mueller GmbH|Germany|
|369|Schneider AG|Germany|
|467|Sahm UG |Germany|
+---+------------+-------+
+---+------------+-------+----+----+---+---+------+
|id |customers |country|a1f |a2f |a1r|a2r|Cat_ID|
+---+------------+-------+----+----+---+---+------+
|161|xyz Limited |U.K. |[16]|[] |1 |0 |1 |
|262|ABC Limited |U.K.   |[26]|[]  |1  |0  |1     |
|165|Sons & Sons |U.K. |[16]|[] |1 |0 |1 |
|361|TÜV GmbH |Germany|[] |[36]|0 |2 |2 |
|462|Mueller GmbH|Germany|[] |[46]|0 |2 |2 |
|369|Schneider AG|Germany|[] |[36]|0 |2 |2 |
|467|Sahm UG |Germany|[] |[46]|0 |2 |2 |
+---+------------+-------+----+----+---+---+------+