Использование when()
spark.version
'2.3.2'
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
# Build a three-column, all-string, nullable schema for the demo records.
schema = T.StructType(
    [T.StructField(col_name, T.StringType(), True)
     for col_name in ("Name", "RolNo", "Dept")]
)

# Sample data mixing several flavors of "missing": the sentinel string "NA",
# a float NaN (np.nan), a Python None, and the sentinel string "NULL".
rows = sc.parallelize([
    ("priy", "345", "cse"),
    ("james", "NA", np.nan),
    (None, "567", "NULL"),
])

myDF = spark.createDataFrame(rows, schema)
myDF.show()
+-----+-----+----+
| Name|RolNo|Dept|
+-----+-----+----+
| priy| 345| cse|
|james| NA| NaN|
| null| 567|NULL|
+-----+-----+----+
# Count "missing" markers per column: float NaN, SQL NULL, and the sentinel
# strings 'NA' / 'NULL'.
# NOTE(fix): the original used F.col(i).contains('NA') / .contains('NULL'),
# which is a substring match and would also flag legitimate values that merely
# contain those letters (e.g. 'DNA', 'NULLIFY'). isin() tests exact equality
# and yields the same counts on the data shown above.
myDF = myDF.select([
    F.count(
        F.when(
            F.isnan(i)                      # float NaN (string 'NaN' casts to NaN)
            | F.col(i).isin('NA', 'NULL')   # exact sentinel strings only
            | F.col(i).isNull(),            # true SQL NULL (Python None)
            i,
        )
    ).alias(i)
    for i in myDF.columns
])
myDF.show()
+----+-----+----+
|Name|RolNo|Dept|
+----+-----+----+
| 1| 1| 2|
+----+-----+----+