I am working on a Databricks cluster that has 240 GB of memory and 64 cores. These are the settings I have defined:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as fs
from pyspark.sql.functions import col, count, countDistinct
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator

spark.conf.set("spark.sql.shuffle.partitions", 1000)

# Recommended settings for using GeoSpark
spark.conf.set("spark.driver.memory", "20g")
spark.conf.set("spark.network.timeout", "1000s")
spark.conf.set("spark.driver.maxResultSize", "10g")
spark.conf.set("spark.serializer", KryoSerializer.getName)
spark.conf.set("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)

upload_jars()
SparkContext.setSystemProperty("geospark.global.charset", "utf8")
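For context, options such as spark.driver.memory and spark.serializer only take effect when the driver JVM and SparkSession are created, so setting them with spark.conf.set on an already running session generally has no effect. Below is a minimal sketch of the builder-style setup used in the GeoSpark documentation, assuming an environment where the session is created by hand (on Databricks the equivalent place would be the cluster's Spark config, since the spark session already exists); the application name is made up for illustration.

# Minimal sketch, assuming the SparkSession is created by hand
# (not the pre-existing `spark` object that Databricks provides).
from pyspark.sql import SparkSession
from geospark.register import GeoSparkRegistrator, upload_jars
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer

upload_jars()  # make the GeoSpark jars available to the session

spark = (
    SparkSession.builder
    .appName("geospark-example")  # hypothetical app name
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
    .config("spark.sql.shuffle.partitions", 1000)
    .getOrCreate()
)

# Register GeoSpark's SQL types and functions on this session.
GeoSparkRegistrator.registerAll(spark)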
I am working with large datasets, and this is the error I get after several hours of running:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 10.0 failed 4 times, most recent failure: Lost task 3.3 in stage 10.0 (TID 6054, 10.17.21.12, executor 7):
ExecutorLostFailure (executor 7 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 170684 ms