С помощью функций `bin` и `lpad` (оба варианта дают одинаковый вывод):
import pyspark.sql.functions as f
from pyspark import Row
from pyspark.shell import spark
# Sample data: column 'b' holds decimal integers stored as strings.
sample_rows = [
    Row(a=1, b='28', c='11', d='foo'),
    Row(a=2, b='28', c='44', d='bar'),
    Row(a=3, b='28', c='22', d='foo'),
]
df = spark.createDataFrame(sample_rows)

# bin() yields the binary digits without leading zeros; lpad() then
# left-pads that string with '0' up to a width of 9 characters.
binary_b = f.bin(df['b'])
padded_b = f.lpad(binary_b, 9, '0')
df = df.withColumn('b', padded_b)
df.show()
С помощью UDF:
import pyspark.sql.functions as f
from pyspark import Row
from pyspark.shell import spark
# Sample data: column 'b' holds decimal integers stored as strings.
sample_rows = [
    Row(a=1, b='28', c='11', d='foo'),
    Row(a=2, b='28', c='44', d='bar'),
    Row(a=3, b='28', c='22', d='foo'),
]
df = spark.createDataFrame(sample_rows)
@f.udf()
def to_binary(value):
    """Return ``value`` as a zero-padded binary string of width 9.

    Wrapped by ``f.udf()`` so it can be applied to a DataFrame column.

    Args:
        value: An integer, or a string of decimal digits such as ``'28'``;
            ``None`` when the cell is null.

    Returns:
        The binary representation padded with leading zeros to at least
        9 characters (``'28'`` -> ``'000011100'``), or ``None`` when
        ``value`` is ``None``.

    Raises:
        ValueError: If ``value`` is a string that ``int()`` cannot parse.
    """
    # Spark hands SQL NULL to a Python UDF as None; int(None) would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if value is None:
        return None
    return format(int(value), "09b")
# Overwrite column 'b' with the UDF result, then print the DataFrame.
binary_b = to_binary(df['b'])
df = df.withColumn('b', binary_b)
df.show()
Вывод:
+---+---------+---+---+
|  a|        b|  c|  d|
+---+---------+---+---+
|  1|000011100| 11|foo|
|  2|000011100| 44|bar|
|  3|000011100| 22|foo|
+---+---------+---+---+