Pyspark, prophet, pandas UDF - no access to the results dataframe
07 May 2020

I am trying to build a solution similar to this one: https://databricks.com/blog/2020/01/27/time-series-forecasting-prophet-spark.html. I am using Facebook Prophet, PySpark and a pandas UDF, but I cannot manipulate, or even see, the final result.

Can someone help me figure out what is going on? I have checked several similar solutions, but I have never come across this particular problem.
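For reference, the pattern I am trying to reproduce from that post is, as far as I understand it: group the Spark DataFrame by the key column and apply a GROUPED_MAP pandas UDF whose decorator carries the output schema. A minimal toy sketch of that pattern (the data, names and schema here are placeholders of mine, not taken from the blog or from my real code):

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.getOrCreate()

# Toy input: one value column per product key
sdf = spark.createDataFrame(
    pd.DataFrame({"prod": ["A", "A", "B", "B"], "y": [1.0, 2.0, 3.0, 4.0]})
)

# Output schema of the UDF (placeholder columns)
out_schema = StructType([
    StructField("prod", StringType(), True),
    StructField("y_mean", DoubleType(), True),
])

# The decorator takes the UDF's *output* schema; the function receives one
# group at a time as a pandas DataFrame and returns a pandas DataFrame.
@pandas_udf(out_schema, PandasUDFType.GROUPED_MAP)
def summarize(pdf):
    return pd.DataFrame({"prod": [pdf["prod"].iloc[0]],
                         "y_mean": [pdf["y"].mean()]})

# The UDF is applied to the Spark DataFrame's groupBy, not to a pandas groupby
results = sdf.groupBy("prod").apply(summarize)
results.show()

My own attempt is below.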

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the raw sales data and keep only the product, date and value columns
df = pd.read_excel("All Mylan products value&units 2015-2020.xlsx")
df.drop(['Region', 'Channel', 'Product - Level 0', 'Product - Level 1',
         'Time - Level 0', 'Time - Level 1', 'Sales RET [BGN]'],
        axis=1, inplace=True)

df

df.columns = ['prod', 'ds', 'y']
df['ds'] = pd.to_datetime(df['ds'])
df.fillna(0, inplace=True)

# Quick look at the per-product grouping
df1 = df.groupby(by='prod')
df1.head()

import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType

# Create a Spark session
spark = SparkSession.builder.getOrCreate()

# Schema of the input Spark DataFrame
schema = StructType([
        StructField("prod", StringType(), True),
        StructField("ds", DateType(), True),
        StructField("y", DoubleType(), True)
    ])

# Enable Arrow-based transfers between pandas and Spark
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "0"
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

sparkdf = spark.createDataFrame(df, schema)
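(A quick sanity check of the conversion, not part of the pipeline itself, would be something like:)

# Optional: confirm the pandas -> Spark conversion and the resulting schema
sparkdf.printSchema()
sparkdf.show(5, truncate=False)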

from fbprophet import Prophet

sparkschema = sparkdf.schema

# Intended output schema: the product key plus the columns returned by Prophet's predict()
mySchema = StructType([
        StructField("prod", StringType(), True),
        StructField("ds", DateType(), True),
        StructField("trend", DoubleType(), True),
        StructField("yhat_lower", DoubleType(), True),
        StructField("yhat_upper", DoubleType(), True),
        StructField("trend_lower", DoubleType(), True),
        StructField("trend_upper", DoubleType(), True),
        StructField("monthly", DoubleType(), True),
        StructField("monthly_lower", DoubleType(), True),
        StructField("monthly_upper", DoubleType(), True),
        StructField("multiplicative_terms", DoubleType(), True),
        StructField("multiplicative_terms_lower", DoubleType(), True),
        StructField("multiplicative_terms_upper", DoubleType(), True),
        StructField("additive_terms", DoubleType(), True),
        StructField("additive_terms_lower", DoubleType(), True),
        StructField("additive_terms_upper", DoubleType(), True),
        StructField("yhat", DoubleType(), True)
    ])
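(For reference, an equivalent and less typo-prone way to build the same output schema would be:)

# Same schema, built from a list of Prophet forecast column names
forecast_cols = ["trend", "yhat_lower", "yhat_upper", "trend_lower", "trend_upper",
                 "monthly", "monthly_lower", "monthly_upper",
                 "multiplicative_terms", "multiplicative_terms_lower",
                 "multiplicative_terms_upper", "additive_terms",
                 "additive_terms_lower", "additive_terms_upper", "yhat"]
mySchema = StructType(
    [StructField("prod", StringType(), True), StructField("ds", DateType(), True)]
    + [StructField(c, DoubleType(), True) for c in forecast_cols]
)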

@pandas_udf(sparkschema, PandasUDFType.GROUPED_MAP)
def forecast(df):

    model = Prophet(
        growth="linear",
        interval_width=0.10,
        seasonality_mode="multiplicative",
        yearly_seasonality=False,
        weekly_seasonality=False,
        daily_seasonality=False,
    ).add_seasonality(name="monthly", period=12 * 30.5, fourier_order=12)

    model.fit(df.loc[:, ["ds", "y"]])

    futper = model.make_future_dataframe(periods=12, freq="M")
    results_pd = model.predict(futper)
    results_pd = pd.concat([results_pd, df["prod"]], axis=1)

    return pd.DataFrame(results_pd, columns=mySchema.fieldNames())

results = df.groupby(['prod','ds']).apply(forecast)
results.show()

Running this fails with the following traceback:

TypeError                                 Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
    724             try:
--> 725                 result = self._python_apply_general(f)
    726             except Exception:

~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
    741     def _python_apply_general(self, f):
--> 742         keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
    743 

~\Anaconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
    236             group_axes = _get_axes(group)
--> 237             res = f(group)
    238             if not _is_indexed_like(res, group_axes):

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\udf.py in wrapper(*args)
    226         def wrapper(*args):
--> 227             return self(*args)
    228 

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\udf.py in __call__(self, *cols)
    206         sc = SparkContext._active_spark_context
--> 207         return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
    208 

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in _to_seq(sc, cols, converter)
     64     if converter:
---> 65         cols = [converter(c) for c in cols]
     66     return sc._jvm.PythonUtils.toSeq(cols)

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in <listcomp>(.0)
     64     if converter:
---> 65         cols = [converter(c) for c in cols]
     66     return sc._jvm.PythonUtils.toSeq(cols)

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in _to_java_column(col)
     52             "For column literals, use 'lit', 'array', 'struct' or 'create_map' "
---> 53             "function.".format(col, type(col)))
     54     return jcol

TypeError: Invalid argument, not a string or column:                                         prod         ds    y
0  ABACAVIR/LAMIV.MYL F.C.TABS 300 600 MG 30 2015-01-20  0.0 of type <class 'pandas.core.frame.DataFrame'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-134-c2532e8e46e8> in <module>
     19     return pd.DataFrame(results_pd, columns=mySchema.fieldNames())
     20 
---> 21 results = df.groupby(['prod','ds']).apply(forecast)
     22 results.show()

~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
    735 
    736                 with _group_selection_context(self):
--> 737                     return self._python_apply_general(f)
    738 
    739         return result

~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
    740 
    741     def _python_apply_general(self, f):
--> 742         keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
    743 
    744         return self._wrap_applied_output(

~\Anaconda3\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
    235             # group might be modified
    236             group_axes = _get_axes(group)
--> 237             res = f(group)
    238             if not _is_indexed_like(res, group_axes):
    239                 mutated = True

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\udf.py in wrapper(*args)
    225         @functools.wraps(self.func, assigned=assignments)
    226         def wrapper(*args):
--> 227             return self(*args)
    228 
    229         wrapper.__name__ = self._name

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\udf.py in __call__(self, *cols)
    205         judf = self._judf
    206         sc = SparkContext._active_spark_context
--> 207         return Column(judf.apply(_to_seq(sc, cols, _to_java_column)))
    208 
    209     # This function is for improving the online help system in the interactive interpreter.

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in _to_seq(sc, cols, converter)
     63     """
     64     if converter:
---> 65         cols = [converter(c) for c in cols]
     66     return sc._jvm.PythonUtils.toSeq(cols)
     67 

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in <listcomp>(.0)
     63     """
     64     if converter:
---> 65         cols = [converter(c) for c in cols]
     66     return sc._jvm.PythonUtils.toSeq(cols)
     67 

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\column.py in _to_java_column(col)
     51             "{0} of type {1}. "
     52             "For column literals, use 'lit', 'array', 'struct' or 'create_map' "
---> 53             "function.".format(col, type(col)))
     54     return jcol
     55 

TypeError: Invalid argument, not a string or column:      y
0  0.0 of type <class 'pandas.core.frame.DataFrame'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

Trying to collect the result to pandas fails as well:

pdres = results.toPandas()

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-114-fea8c4aadf2a> in <module>
----> 1 pdres = results.toPandas()

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\dataframe.py in toPandas(self)
   2224 
   2225         # Below is toPandas without Arrow optimization.
-> 2226         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
   2227 
   2228         dtype = {}

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\dataframe.py in collect(self)
    561         """
    562         with SCCallSiteSync(self._sc) as css:
--> 563             sock_info = self._jdf.collectToPython()
    564         return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
    565 

~\Anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258 
   1259         for temp_arg in temp_args:

C:\SPARK\spark\spark-3.0.0-preview2-bin-hadoop2.7\python\pyspark\sql\utils.py in deco(*a, **kw)
     96     def deco(*a, **kw):
     97         try:
---> 98             return f(*a, **kw)
     99         except py4j.protocol.Py4JJavaError as e:
    100             converted = convert_exception(e.java_exception)

~\Anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o1193.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 22.0 failed 1 times, most recent failure: Lost task 7.0 in stage 22.0 (TID 89, DESKTOP-LE3RNA6, executor driver): java.io.FileNotFoundException: C:\Users\PZashev\AppData\Local\Temp\blockmgr-d86ed11f-022a-4dfa-8616-a9b2e2f9784b\31\temp_shuffle_e9340b3c-7c60-4fd9-b8ae-5eff51d457bd (The system cannot find the path specified)
    at java.io.FileOutputStream.open0(Native Method)
    at java.io.FileOutputStream.open(FileOutputStream.java:270)
    at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
    at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:105)
    at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:118)
    at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:245)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:158)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:441)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:444)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:1989)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1977)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1976)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1976)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:956)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:956)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:956)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2206)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2155)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2144)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:758)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2116)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2137)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2156)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2181)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:365)
    at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3310)
    at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3472)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$4(SQLExecution.scala:100)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:87)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3468)
    at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3307)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: C:\Users\PZashev\AppData\Local\Temp\blockmgr-d86ed11f-022a-4dfa-8616-a9b2e2f9784b\31\temp_shuffle_e9340b3c-7c60-4fd9-b8ae-5eff51d457bd (The system cannot find the path specified)
    at java.io.FileOutputStream.open0(Native Method)
    at java.io.FileOutputStream.open(FileOutputStream.java:270)
    at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
    at org.apache.spark.storage.DiskBlockObjectWriter.initialize(DiskBlockObjectWriter.scala:105)
    at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:118)
    at org.apache.spark.storage.DiskBlockObjectWriter.write(DiskBlockObjectWriter.scala:245)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:158)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:127)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:441)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:444)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    ... 1 more
