Вы можете сделать это, используя `regexp_extract` из `pyspark.sql.functions`.
Мой подход будет выглядеть примерно так:
# regexp_extract lives in pyspark.sql.functions and must be imported explicitly.
from pyspark.sql.functions import regexp_extract

# Read with a separator that does not occur in the data ('|'), so the whole
# line lands in a single column instead of being split.
df = spark.read.csv('filename', header=True, sep='|')

# Rename the single auto-generated column to 'irr' to make it easy to reference.
newcolnames = ['irr']
for old_name, new_name in zip(df.columns, newcolnames):
    df = df.withColumnRenamed(old_name, new_name)

# Extract the fields with regexp_extract. The chain of method calls must be
# wrapped in parentheses so the multi-line expression parses as one statement.
# Group 1 of r'(\d+)' pulls the first run of digits as the ID; replace
# 'your_regex_pattern' with the pattern matching the Name field.
(df.withColumn('ID', regexp_extract(df['irr'], r'(\d+)', 1))
   .withColumn('Name', regexp_extract(df['irr'], 'your_regex_pattern', 0))
   .drop('irr')  # drop by column name — portable across Spark versions
   .show())