спарк-оболочка не может прочитать файл партера - java.lang.NoClassDefFoundError - PullRequest
0 голосов
/ 25 октября 2018

Я использую предварительно загруженную версию Spark 2.3.0 для Hadoop версии 2.7+.Когда я запускаю spark-shell и пытаюсь прочитать файл паркета, он жалуется на отсутствующую зависимость:

scala> val df = spark.read.parquet("/home/spark/spark-2.3.0-bin-hadoop2.7/sample_data/titanic.parquet")
[Stage 0:>                                                          (0 + 1) / 1]2018-10-25 13:10:56 ERROR Executor:91 - Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
    at org.json4s.package$.<init>(package.scala:33)
    at org.json4s.package$.<clinit>(package.scala)
    at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at scala.Option.flatMap(Option.scala:171)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    ... 24 more
2018-10-25 13:10:56 WARN  TaskSetManager:66 - Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
    at org.json4s.package$.<init>(package.scala:33)
    at org.json4s.package$.<clinit>(package.scala)
    at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at scala.Option.flatMap(Option.scala:171)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    ... 24 more

2018-10-25 13:10:56 ERROR TaskSetManager:70 - Task 0 in stage 0.0 failed 1 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
    at org.json4s.package$.<init>(package.scala:33)
    at org.json4s.package$.<clinit>(package.scala)
    at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
    at scala.util.Try$.apply(Try.scala:192)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
    at scala.Option.flatMap(Option.scala:171)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    ... 24 more

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:612)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:241)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$8.apply(DataSource.scala:202)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$8.apply(DataSource.scala:202)
  at scala.Option.orElse(Option.scala:289)
  at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:201)
  at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:392)
  at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
  at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
  at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:620)
  at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:604)
  ... 49 elided
Caused by: java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
  at org.json4s.package$.<init>(package.scala:33)
  at org.json4s.package$.<clinit>(package.scala)
  at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
  at scala.util.Try$.apply(Try.scala:192)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
  at scala.Option.flatMap(Option.scala:171)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
  at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
  at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.scheduler.Task.run(Task.scala:109)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
  at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
  at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
  ... 24 more

Но если я пытаюсь импортировать этот пакет, он не жалуется:

scala> import org.json4s.JsonAST
import org.json4s.JsonAST

что мне не хватает?

...