Предположим, что входные данные представляют собой строки следующего вида:
EQU EB.AR.DESCRIPT TO 1,EB.AR.ASSET.CLASS TO 2
EB.AR.CURRENCY TO 3, EB.AR.ORIGINAL.VALUE TO 4
Попробуйте следующее решение на Scala с использованием Spark DataFrame:
scala> val df = Seq(("EQU EB.AR.DESCRIPT TO 1,EB.AR.ASSET.CLASS TO 2"),("EB.AR.CURRENCY TO 3, EB.AR.ORIGINAL.VALUE TO 4")).toDF("a")
df: org.apache.spark.sql.DataFrame = [a: string]
scala> df.show(false)
+----------------------------------------------+
|a |
+----------------------------------------------+
|EQU EB.AR.DESCRIPT TO 1,EB.AR.ASSET.CLASS TO 2|
|EB.AR.CURRENCY TO 3, EB.AR.ORIGINAL.VALUE TO 4|
+----------------------------------------------+
scala> val df2 = df.select(split($"a",",").getItem(0).as("a1"),split($"a",",").getItem(1).as("a2"))
df2: org.apache.spark.sql.DataFrame = [a1: string, a2: string]
scala> df2.show(false)
+-----------------------+--------------------------+
|a1 |a2 |
+-----------------------+--------------------------+
|EQU EB.AR.DESCRIPT TO 1|EB.AR.ASSET.CLASS TO 2 |
|EB.AR.CURRENCY TO 3 | EB.AR.ORIGINAL.VALUE TO 4|
+-----------------------+--------------------------+
scala> val df3 = df2.flatMap( r => { (0 until r.size).map( i=> r.getString(i) ) })
df3: org.apache.spark.sql.Dataset[String] = [value: string]
scala> df3.show(false)
+--------------------------+
|value |
+--------------------------+
|EQU EB.AR.DESCRIPT TO 1 |
|EB.AR.ASSET.CLASS TO 2 |
|EB.AR.CURRENCY TO 3 |
| EB.AR.ORIGINAL.VALUE TO 4|
+--------------------------+
scala> df3.select(regexp_extract($"value",""" TO (\d+)\s*$""",1).as("key"),regexp_replace($"value",""" TO (\d+)\s*$""","").as("value")).show(false)
+---+---------------------+
|key|value |
+---+---------------------+
|1 |EQU EB.AR.DESCRIPT |
|2 |EB.AR.ASSET.CLASS |
|3 |EB.AR.CURRENCY |
|4 | EB.AR.ORIGINAL.VALUE|
+---+---------------------+
Если вы хотите получить эти пары в одном столбце типа map, то:
scala> val df4 = df3.select(regexp_extract($"value",""" TO (\d+)\s*$""",1).as("key"),regexp_replace($"value",""" TO (\d+)\s*$""","").as("value")).select(map($"key",$"value").as("kv"))
df4: org.apache.spark.sql.DataFrame = [kv: map<string,string>]
scala> df4.show(false)
+----------------------------+
|kv |
+----------------------------+
|[1 -> EQU EB.AR.DESCRIPT] |
|[2 -> EB.AR.ASSET.CLASS] |
|[3 -> EB.AR.CURRENCY] |
|[4 ->  EB.AR.ORIGINAL.VALUE]|
+----------------------------+
scala> df4.printSchema
root
|-- kv: map (nullable = false)
| |-- key: string
| |-- value: string (valueContainsNull = true)
scala>