Если вы хотите преобразовать свой RDD строк в DataFrame, попробуйте приведённый ниже код — он должен вам помочь.
// Sample raw record: a CSV-ish line with embedded quoted "From:"/"To:" fields
// and runs of empty columns, parallelized into an RDD[String].
val rddStrs = spark.sparkContext.parallelize(List("Canada,0,,0,,,1,,,,,\"From: \"\"nitesh\"\" <nitesh@abc.com>\",Sub: RE: X Support Notification - Service Request #<4-20659465477> has been created.,\"To: \"\"'ABC Update'\"\" <support_reply@xyz.com>,; .<sunny@bcd.com>,; .<anchit@xyz.com>; \",,0,0,0,0,0,"))
val colName = List("start", "from", "subject", "to", "last")

/** Splits one raw record into its five logical sections, keyed on the
 *  literal "From:" / "Sub:" / "To:" markers and the last ';' separator.
 *
 *  NOTE(review): assumes every record contains all three markers and at
 *  least one ';' — otherwise indexOf/lastIndexOf returns -1 and substring
 *  would throw StringIndexOutOfBoundsException. Confirm against real data.
 */
def parseRecord(raw: String): (String, String, String, String, String) = {
  // Collapse runs of two-or-more commas to a single comma and strip all
  // double quotes. One pass suffices: the original applied this twice,
  // but the transform is idempotent, so the second pass was dead work.
  val cleaned = raw.replaceAll(",{2,}", ",").replace("\"", "")
  val fromIdx = cleaned.indexOf("From:")
  val subIdx  = cleaned.indexOf("Sub:")
  val toIdx   = cleaned.indexOf("To:")
  val lastIdx = cleaned.lastIndexOf(";")
  (cleaned.substring(0, fromIdx),         // leading columns before "From:"
   cleaned.substring(fromIdx, subIdx),    // sender section
   cleaned.substring(subIdx, toIdx),      // subject section
   cleaned.substring(toIdx, lastIdx),     // recipient list up to last ';'
   cleaned.substring(lastIdx + 1).trim()) // trailing columns after last ';'
}

// Parse each record into a 5-tuple and name the resulting columns.
val df = spark.createDataFrame(rddStrs.map(parseRecord)).toDF(colName: _*)
df.show()
//Sample output
+-------------+--------------------+--------------------+--------------------+-----------+
| start| from| subject| to| last|
+-------------+--------------------+--------------------+--------------------+-----------+
|Canada,0,0,1,|From: nitesh <nit...|Sub: RE: X Suppor...|To: 'ABC Update' ...|,0,0,0,0,0,|
+-------------+--------------------+--------------------+--------------------+-----------+