I am trying to use PySpark to preprocess data for a prediction model. I get an error when I call spark.createDataFrame on the output of my preprocessing. Is there a way to inspect what processedRDD looks like before converting it to a DataFrame?
import findspark
findspark.init('/usr/local/spark')
import os
import pandas as pd
import geohash2
import pyspark
from pyspark.sql import SQLContext

sc = pyspark.SparkContext('local', 'sentinel')
sql = SQLContext(sc)  # a single SQLContext is enough

working_dir = os.getcwd()

# `data`, `g`, `b`, `minutes_per_bin` and `data_cleaner` are defined earlier in the script
df = sql.createDataFrame(data)
df = df.select(['starttime', 'latstart', 'lonstart', 'latfinish', 'lonfinish', 'trip_type'])
df.show(10, False)

processedRDD = df.rdd
processedRDD = processedRDD \
    .map(lambda row: (row, g, b, minutes_per_bin)) \
    .map(data_cleaner) \
    .filter(lambda row: row is not None)
print(processedRDD)  # prints only the RDD object, not its contents

featuredDf = sql.createDataFrame(processedRDD, ['year', 'month', 'day', 'time_cat', 'time_num', 'time_cos',
                                                'time_sin', 'day_cat', 'day_num', 'day_cos', 'day_sin', 'weekend',
                                                'x_start', 'y_start', 'z_start', 'location_start', 'location_end', 'trip_type'])
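print(processedRDD) only shows something like PythonRDD[...] at RDD, because RDDs are lazy. I assume the way to actually look at the rows is to materialize a few of them, roughly like this (a minimal sketch; take() pulls just the first few elements to the driver):

for row in processedRDD.take(5):
    print(row)
print(processedRDD.count())  # forces evaluation of the whole map/filter chain

But since even take() runs the map/filter pipeline, I suspect it would raise the same exception that createDataFrame does.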
I get this error:
[Stage 1:> (0 + 1) / 1]2019-10-24 15:37:56 ERROR Executor:91 - Exception in task 0.0 in stage 1.0 (TID 1)
raise AppRegistryNotReady("Apps aren't loaded yet.")
django.core.exceptions.AppRegistryNotReady: Apps aren't loaded yet.
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
I do not understand how this error is connected to importing a Django app.
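Nothing here looks Django-specific except possibly data_cleaner. My only guess is that data_cleaner (or something it imports) pulls in Django models; AppRegistryNotReady is what Django raises when models are imported before the app registry is populated, and the Python worker on each executor starts with a fresh interpreter. If that is the cause, each worker would need something like the sketch below before the import (the settings path and module name are hypothetical placeholders, not my real layout):

import os
import django

# populate Django's app registry before importing any models
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'myproject.settings')  # hypothetical settings module
django.setup()

from myapp.preprocessing import data_cleaner  # hypothetical location of data_cleaner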