Моя среда Spark: Scala 2.10.5, Spark 1.6.0, Hadoop 2.6.0.
Приложение использует библиотеку Jackson для некоторых задач сериализации/десериализации.
При отправке приложения в Spark (режим yarn-client):
spark-submit --class noce.train.Train_Grid --master yarn-client --num-executors 10 --executor-cores 2 --driver-memory 10g --executor-memory 12g --conf spark.yarn.executor.memoryOverhead=2048 \
--conf spark.executor.extraClassPath=./guava-15.0.jar:./jackson-annotations-2.4.4.jar:./jackson-core-2.4.4.jar:./jackson-databind-2.4.4.jar:./jackson-module-scala_2.10-2.4.4.jar \
--conf spark.driver.extraClassPath=/home/ck/lib/guava-15.0.jar:/home/ck/lib/jackson-annotations-2.4.4.jar:/home/ck/lib/jackson-core-2.4.4.jar:/home/ck/lib/jackson-databind-2.4.4.jar:/home/ck/lib/jackson-module-scala_2.10-2.4.4.jar \
--jars /home/ck/lib/guava-15.0.jar,/home/ck/lib/jackson-annotations-2.4.4.jar,/home/ck/lib/jackson-core-2.4.4.jar,/home/ck/lib/jackson-databind-2.4.4.jar,/home/ck/lib/jackson-module-scala_2.10-2.4.4.jar \
/home/ck/gnoce_scala.jar
Я получил ошибки:
18/09/12 09:46:47 WARN scheduler.TaskSetManager: Lost task 39.0 in stage 7.0 (TID 893, host-9-138): java.lang.NoClassDefFoundError: Could not initialize class noce.grid.Grid$
at noce.train.Train_Grid$$anonfun$3.apply(Train_Grid.scala:80)
at noce.train.Train_Grid$$anonfun$3.apply(Train_Grid.scala:79)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
18/09/12 09:46:47 INFO scheduler.TaskSetManager: Lost task 198.0 in stage 7.0 (TID 897) on executor host-9-136: java.lang.NoClassDefFoundError (Could not initialize class noce.grid.Grid$) [duplicate 1]
18/09/12 09:46:47 WARN scheduler.TaskSetManager: Lost task 58.0 in stage 7.0 (TID 890, host-9-136): java.lang.AbstractMethodError: noce.grid.Grid$$anon$1.com$fasterxml$jackson$module$scala$experimental$ScalaObjectMapper$_setter_$com$fasterxml$jackson$module$scala$experimental$ScalaObjectMapper$$typeCache_$eq(Lorg/spark-project/guava/cache/LoadingCache;)V
at com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper$class.$init$(ScalaObjectMapper.scala:50)
at noce.grid.Grid$$anon$1.<init>(Grid.scala:75)
at noce.grid.Grid$.<init>(Grid.scala:75)
at noce.grid.Grid$.<clinit>(Grid.scala)
at noce.train.Train_Grid$$anonfun$3.apply(Train_Grid.scala:80)
at noce.train.Train_Grid$$anonfun$3.apply(Train_Grid.scala:79)
... ...
Код выглядит следующим образом:
//Train_Grid.scala
val newGridData: RDD[(Long, Grid)] = data.map(nr => { //line 79
val grid = Grid(nr) //line 80
(grid.id, grid)
}).reduceByKey(_.merge(_))
//Grid.scala
object Grid {
val mapper = new ObjectMapper() with ScalaObjectMapper //line 75
mapper.registerModule(DefaultScalaModule)
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
Я вывел classpath на драйвере:
val cl = ClassLoader.getSystemClassLoader
cl.asInstanceOf[java.net.URLClassLoader].getURLs.take(20).foreach(println)
file:/home/ck/lib/guava-15.0.jar
file:/home/ck/lib/jackson-annotations-2.4.4.jar
file:/home/ck/lib/jackson-core-2.4.4.jar
file:/home/ck/lib/jackson-databind-2.4.4.jar
file:/home/ck/lib/jackson-module-scala_2.10-2.4.4.jar
file:/etc/spark/conf.cloudera.spark_on_yarn/
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/spark-assembly-1.6.0-cdh5.7.2-hadoop2.6.0-cdh5.7.2.jar
file:/etc/spark/conf.cloudera.spark_on_yarn/yarn-conf/
file:/etc/hive/conf.cloudera.hive/
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/ST4-4.0.4.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/accumulo-core-1.6.0.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/accumulo-fate-1.6.0.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/accumulo-start-1.6.0.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/accumulo-trace-1.6.0.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/activation-1.1.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/ant-1.9.1.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/ant-launcher-1.9.1.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/antisamy-1.4.3.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/antlr-2.7.7.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/antlr-runtime-3.4.jar
и на исполнителях:
val x = sc.parallelize(0 to 1, 2)
val p = x.flatMap { i =>
val cl = ClassLoader.getSystemClassLoader
cl.asInstanceOf[java.net.URLClassLoader].getURLs.take(20).map(_.toString)
}
p.collect().foreach(println)
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/guava-15.0.jar
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/jackson-annotations-2.4.4.jar
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/jackson-core-2.4.4.jar
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/jackson-databind-2.4.4.jar
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/jackson-module-scala_2.10-2.4.4.jar
file:/DATA2/yarn/nm/usercache/ck/appcache/application_1533542623806_5351/container_1533542623806_5351_01_000007/
file:/DATA7/yarn/nm/usercache/ck/filecache/745/__spark_conf__2134162299477543917.zip/
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/spark-assembly-1.6.0-cdh5.7.2-hadoop2.6.0-cdh5.7.2.jar
file:/etc/hadoop/conf.cloudera.yarn/
file:/var/run/cloudera-scm-agent/process/2147-yarn-NODEMANAGER/
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-column-1.5.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-format-2.1.0-cdh5.7.2-sources.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-jackson-1.5.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-scala_2.10-1.5.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-hadoop-1.5.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/hadoop-common-2.6.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/parquet-avro-1.5.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/hadoop-auth-2.6.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/hadoop-aws-2.6.0-cdh5.7.2.jar
file:/opt/cloudera/parcels/CDH-5.7.2-1.cdh5.7.2.p0.18/jars/hadoop-common-2.6.0-cdh5.7.2-tests.jar
... ...
Но, очевидно, по-прежнему используется неверная версия Guava (org.spark-project.guava.cache.LoadingCache).
А если я устанавливаю spark.{driver,executor}.userClassPathFirst в true, то получаю:
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.xerial.snappy.SnappyNative.maxCompressedLength(I)I
Итак, есть ли какие-нибудь предложения?