Я использую предварительно собранную (pre-built) версию Spark 2.3.0 для Hadoop 2.7+. Когда я запускаю spark-shell
и пытаюсь прочитать файл Parquet, он сообщает об отсутствующей зависимости:
scala> val df = spark.read.parquet("/home/spark/spark-2.3.0-bin-hadoop2.7/sample_data/titanic.parquet")
[Stage 0:> (0 + 1) / 1]2018-10-25 13:10:56 ERROR Executor:91 - Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
at org.json4s.package$.<init>(package.scala:33)
at org.json4s.package$.<clinit>(package.scala)
at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at scala.Option.flatMap(Option.scala:171)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 24 more
2018-10-25 13:10:56 WARN TaskSetManager:66 - Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
at org.json4s.package$.<init>(package.scala:33)
at org.json4s.package$.<clinit>(package.scala)
at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at scala.Option.flatMap(Option.scala:171)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 24 more
2018-10-25 13:10:56 ERROR TaskSetManager:70 - Task 0 in stage 0.0 failed 1 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
at org.json4s.package$.<init>(package.scala:33)
at org.json4s.package$.<clinit>(package.scala)
at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at scala.Option.flatMap(Option.scala:171)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 24 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:612)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:241)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$8.apply(DataSource.scala:202)
at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$8.apply(DataSource.scala:202)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:201)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:392)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:620)
at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:604)
... 49 elided
Caused by: java.lang.NoClassDefFoundError: org/json4s/JsonAST$JLong$
at org.json4s.package$.<init>(package.scala:33)
at org.json4s.package$.<clinit>(package.scala)
at org.apache.spark.sql.types.DataType$.fromJson(DataType.scala:113)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString$3.apply(ParquetFileFormat.scala:649)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.org$apache$spark$sql$execution$datasources$parquet$ParquetFileFormat$$deserializeSchemaString(ParquetFileFormat.scala:649)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readSchemaFromFooter$1.apply(ParquetFileFormat.scala:642)
at scala.Option.flatMap(Option.scala:171)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readSchemaFromFooter(ParquetFileFormat.scala:642)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:600)
at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$11.apply(ParquetFileFormat.scala:582)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.json4s.JsonAST$JLong$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 24 more
Но если я пытаюсь импортировать этот пакет, он не жалуется:
scala> import org.json4s.JsonAST
import org.json4s.JsonAST
Что я упускаю?