Можно ли использовать передаваемый фрейм данных в UDF приложения pyspark SQl.
Мой код вызывает транслируемый фрейм данных внутри фрейма данных pyspark, как показано ниже.
fact_ent_df_data =
sparkSession.sparkContext.broadcast(fact_ent_df.collect())
def generate_lookup_code(col1,col2,col3):
fact_ent_df_count=fact_ent_df_data.
select(fact_ent_df_br.TheDate.between(col1,col2),
fact_ent_df_br.Ent.isin('col3')).count()
return fact_ent_df_count
sparkSession.udf.register("generate_lookup_code" , generate_lookup_code )
sparkSession.sql('select sample4,generate_lookup_code(sample1,sample2,sample 3) as count_hol from table_t')
Я получаю локальную переменную, используемую перед ошибкой присваивания, когда я использую транслируемый df_bc. Любая помощь приветствуется
И ошибка, которую я получаю,
Traceback (most recent call last):
File "C:/Users/Vignesh/PycharmProjects/gettingstarted/aramex_transit/spark_driver.py", line 46, in <module>
sparkSession.udf.register("generate_lookup_code" , generate_lookup_code )
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\sql\udf.py", line 323, in register
self.sparkSession._jsparkSession.udf().registerPython(name, register_udf._judf)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\sql\udf.py", line 148, in _judf
self._judf_placeholder = self._create_judf()
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\sql\udf.py", line 157, in _create_judf
wrapped_func = _wrap_function(sc, self.func, self.returnType)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\sql\udf.py", line 33, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\rdd.py", line 2391, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\serializers.py", line 575, in dumps
return cloudpickle.dumps(obj, 2)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\cloudpickle.py", line 918, in dumps
cp.dump(obj)
File "D:\spark-2.3.2-bin-hadoop2.6\spark-2.3.2-bin-hadoop2.6\python\pyspark\cloudpickle.py", line 249, in dump
raise pickle.PicklingError(msg)
pickle.PicklingError: Could not serialize object: Py4JError: An error occurred while calling o24.__getnewargs__. Trace:
py4j.Py4JException: Method __getnewargs__([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)