Something along these lines — adapt as needed:
# Build a small DataFrame from an RDD of (category, count, description) tuples,
# using an explicit schema so column names/types are not inferred.
# NOTE(review): relies on an ambient `spark` SparkSession being in scope — confirm.
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

data = [
    ('Category A', 100, "This is category A"),
    ('Category B', 120, "This is category B"),
    ('Category C', 150, "This is category C"),
]
rdd = spark.sparkContext.parallelize(data)

# BUGFIX: the original `rdd.collect` (no parentheses) was a no-op attribute
# access — the action never ran. Call it and print the materialized rows.
print(rdd.collect())

# Generate a pipelined RDD with some dummy logic (the predicate is always
# True, so this only forces a pipelined/filtered RDD, it drops nothing).
rdd = rdd.filter(lambda x: x[2] == x[2])

schema = StructType([
    StructField('Category', StringType(), True),
    StructField('Count', IntegerType(), True),
    StructField('Description', StringType(), True),
])

df = spark.createDataFrame(rdd, schema)
print(df.schema)
df.show()