How to run a Hive query inside sparkContext.parallelize(...).map()?
0 votes
/ 17 December 2018

I can't get the code below to run. It tries to execute a Hive query against a Hive table using a SparkSession inside SparkContext's runJob() method.

import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.sql.{DataFrame, SparkSession}

val lines = sparkSession.sparkContext.parallelize(Seq("hello world"), 1)

// Try to build a second SparkSession and run a Hive query inside the task
sparkSession.sparkContext.runJob(lines, (t: TaskContext, it: Iterator[String]) => {

  val conf = new SparkConf().setAppName("Testing")
  val session = SparkSession.builder().master("local[*]").config(conf).enableHiveSupport().getOrCreate()

  val df: DataFrame = session.sql(s"select count(*) from employee")
  println("Task attempt id => " + t.taskAttemptId() + "  and  count:" + df.first)

})

I am able to get the result when running locally from IntelliJ. However, when I run the JAR in "yarn cluster" mode, I get the exception below:

18/12/17 12:18:27 ERROR Executor: Exception in task 0.3 in stage 0.0 (TID 3)
java.io.StreamCorruptedException: invalid type code: 00
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1377)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
    at java.io.ObjectInputStream.defaultReadObject(ObjectInputStream.java:500)
    at org.apache.spark.rdd.ParallelCollectionPartition$$anonfun$readObject$1.apply$mcV$sp(ParallelCollectionRDD.scala:74)
    at org.apache.spark.rdd.ParallelCollectionPartition$$anonfun$readObject$1.apply(ParallelCollectionRDD.scala:70)
    at org.apache.spark.rdd.ParallelCollectionPartition$$anonfun$readObject$1.apply(ParallelCollectionRDD.scala:70)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1269)
    at org.apache.spark.rdd.ParallelCollectionPartition.readObject(ParallelCollectionRDD.scala:70)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1893)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:370)
    at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
    at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:298)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)


Caused by: java.io.IOException: java.util.NoSuchElementException: key not found: 0
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1276)
    at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:206)
    at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:66)
    at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:66)
    at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:96)
    at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
    at org.apache.spark.rdd.HadoopRDD.getJobConf(HadoopRDD.scala:147)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:198)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.UnionRDD$$anonfun$1.apply(UnionRDD.scala:84)
    at org.apache.spark.rdd.UnionRDD$$anonfun$1.apply(UnionRDD.scala:84)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
    at scala.collection.immutable.List.foreach(List.scala:381)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
    at scala.collection.immutable.List.map(List.scala:285)
    at org.apache.spark.rdd.UnionRDD.getPartitions(UnionRDD.scala:84)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
    at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
    at org.apache.spark.sql.execution.exchange.ShuffleExchange$.prepareShuffleDependency(ShuffleExchange.scala:261)
    at org.apache.spark.sql.execution.exchange.ShuffleExchange.prepareShuffleDependency(ShuffleExchange.scala:84)
    at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:121)
    at org.apache.spark.sql.execution.exchange.ShuffleExchange$$anonfun$doExecute$1.apply(ShuffleExchange.scala:112)
    at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
    ... 38 more
Caused by: java.util.NoSuchElementException: key not found: 0
    at scala.collection.MapLike$class.default(MapLike.scala:228)
    at scala.collection.AbstractMap.default(Map.scala:59)
    at scala.collection.mutable.HashMap.apply(HashMap.scala:65)
    at org.apache.spark.storage.BlockInfoManager.lockForReading(BlockInfoManager.scala:196)
    at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:450)
    at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$readBroadcastBlock$1.apply(TorrentBroadcast.scala:210)
    at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1269)

How can I fix this problem?

1 Answer

0 votes
/ 17 December 2018

This code tries to execute a Hive query against a Hive table using a SparkSession inside SparkContext's runJob() method.

That is simply not possible, and neither Spark SQL nor Spark Core supports it.

A Hive query in Spark SQL is translated into an RDD, which is itself just a description of a distributed computation.

SparkContext.runJob is meant to execute a distributed computation as a Spark job (which is itself a set of tasks).

Do you see the difference? The former is a description, the latter is execution. They are different things and will not work together.
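The answer does not spell out a workaround, but a minimal sketch of the usual pattern would be to run the Hive query on the driver, collect the small result there, and only ship a plain value into the task. The table, column, and the `sparkSession` value below are assumptions carried over from the question (a session built with enableHiveSupport()), not a definitive implementation.

import org.apache.spark.TaskContext

// Run the Hive query on the driver; count(*) comes back as a bigint (Long).
val employeeCount: Long =
  sparkSession.sql("select count(*) from employee").first().getLong(0)

val lines = sparkSession.sparkContext.parallelize(Seq("hello world"), 1)
sparkSession.sparkContext.runJob(lines, (t: TaskContext, it: Iterator[String]) => {
  // Only the plain Long is captured by the closure and shipped to the executor;
  // no SparkSession is created inside the task.
  println("Task attempt id => " + t.taskAttemptId() + "  and  count:" + employeeCount)
})

If each record really needs to be matched against Hive data, the usual route is a DataFrame join or broadcasting the driver-side result, rather than opening a session inside a task.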
