Structured Spark Streaming throws an exception - PullRequest
0 votes
/ June 14, 2019

I am getting started with Structured Spark Streaming with a Kafka source, following a simple tutorial. My Kafka server is OSS Kafka. My producer code is as follows:

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class LogGenerator {
    public static void main(String[] args) throws Exception {
        // Load producer settings (bootstrap.servers, serializers, ...) from the classpath
        Properties prop = new Properties();
        prop.load(ClassLoader.getSystemResourceAsStream("kafka-producer.properties"));

        KafkaProducer<String, String> producer = new KafkaProducer<>(prop);

        // Send 10,000 records whose key and value are both the loop index
        for (int i = 0; i < 10000; i++) {
            System.out.printf("%d ", i);
            producer.send(new ProducerRecord<>("my-log-topic", Integer.toString(i), Integer.toString(i)));
        }

        // send() is asynchronous; flush and close so buffered records are actually delivered
        producer.flush();
        producer.close();
    }
}

The producer simply writes the pairs (0,0) through (9999,9999).
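
The producer loads its configuration from kafka-producer.properties on the classpath. As a minimal sketch, that file might look like the following; the broker addresses are an assumption, while the String serializers are required because the producer is typed <String, String>:

# Assumed broker addresses - replace with your actual Kafka brokers
bootstrap.servers=0.0.0.0:9091,0.0.0.0:9092
# Required: this producer sends String keys and String values
key.serializer=org.apache.kafka.common.serialization.StringSerializer
value.serializer=org.apache.kafka.common.serialization.StringSerializer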

The Structured Streaming code that reads from this topic is as follows:

import org.apache.spark.sql.SparkSession

object DimaStreamer {

  // initialize configs
  val spark: SparkSession = SparkSession.builder().master("local").appName("StreamingData").getOrCreate()

  def main(args: Array[String]): Unit = {
    val dfRefundStream = spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "0.0.0.0:9091,0.0.0.0:9092")
      .option("subscribe", "my-log-topic")
      .load()

    import org.apache.spark.sql.functions._
    dfRefundStream.printSchema()
    dfRefundStream.select(col("value")).writeStream
      .format("console")
      .queryName("inittest")
      .start()
      .awaitTermination()
  }
}
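
As an aside, the Kafka source exposes key and value as binary columns (as the printSchema output shows), so selecting col("value") directly writes raw byte arrays to the console. A commonly used variant, shown here as a sketch, casts them to strings first:

// Cast the binary Kafka columns to readable strings before the console sink
dfRefundStream
  .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
  .writeStream
  .format("console")
  .queryName("inittest")
  .start()
  .awaitTermination()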

This is a Maven project. The job is submitted as follows, with the --jars option used to pass the comma-separated dependency jars:

spark-submit --class com.apple.arcadia.solr.batch.DimaStreamer --jars $JARSPATH target/mainproject-1.0-SNAPSHOT.jar
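
As an alternative to collecting the jars by hand, spark-submit's --packages flag can resolve the Kafka connector and its transitive dependencies from Maven coordinates; a sketch for the versions pinned in this POM:

spark-submit --class com.apple.arcadia.solr.batch.DimaStreamer \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1 \
  target/mainproject-1.0-SNAPSHOT.jar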

The job throws the following exception:

19/06/13 15:10:05 INFO internals.ConsumerCoordinator: Setting newly assigned partitions [my-log-topic-1, my-log-topic-0] for group spark-kafka-source-0ce55060-09ad-430a-9916-e0063d8103fe--48657453-driver-0
19/06/13 15:10:06 INFO kafka010.KafkaSource: Initial offsets: {"my-log-topic":{"1":9878,"0":10122}}
19/06/13 15:10:06 INFO streaming.StreamExecution: Committed offsets for batch 0. Metadata OffsetSeqMetadata(0,1560463806021,Map(spark.sql.shuffle.partitions -> 200))
19/06/13 15:10:06 INFO kafka010.KafkaSource: GetBatch called with start = None, end = {"my-log-topic":{"1":9878,"0":10122}}
19/06/13 15:10:06 INFO kafka010.KafkaSource: Partitions added: Map()
19/06/13 15:10:06 INFO kafka010.KafkaSource: GetBatch generating RDD of offset range: KafkaSourceRDDOffsetRange(my-log-topic-0,10122,10122,None), KafkaSourceRDDOffsetRange(my-log-topic-1,9878,9878,None)
19/06/13 15:10:06 ERROR streaming.StreamExecution: Query inittest [id = 305e10aa-a446-4f42-a4e9-8d2372250fa8, runId = 2218049f-08a4-40bf-9b46-b7e898321b85] terminated with error
java.lang.NoSuchMethodError: org.apache.spark.sql.SQLContext.internalCreateDataFrame(Lorg/apache/spark/rdd/RDD;Lorg/apache/spark/sql/types/StructType;Z)Lorg/apache/spark/sql/Dataset;
    at org.apache.spark.sql.kafka010.KafkaSource.getBatch(KafkaSource.scala:301)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:614)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:610)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
    at org.apache.spark.sql.execution.streaming.StreamProgress.flatMap(StreamProgress.scala:25)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch(StreamExecution.scala:609)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(StreamExecution.scala:306)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:290)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:206)
Exception in thread "stream execution thread for inittest [id = 305e10aa-a446-4f42-a4e9-8d2372250fa8, runId = 2218049f-08a4-40bf-9b46-b7e898321b85]" java.lang.NoSuchMethodError: org.apache.spark.sql.SQLContext.internalCreateDataFrame(Lorg/apache/spark/rdd/RDD;Lorg/apache/spark/sql/types/StructType;Z)Lorg/apache/spark/sql/Dataset;
    at org.apache.spark.sql.kafka010.KafkaSource.getBatch(KafkaSource.scala:301)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:614)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:610)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
    at org.apache.spark.sql.execution.streaming.StreamProgress.flatMap(StreamProgress.scala:25)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch(StreamExecution.scala:609)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(StreamExecution.scala:306)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:290)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:206)
Exception in thread "main" org.apache.spark.sql.streaming.StreamingQueryException: org.apache.spark.sql.SQLContext.internalCreateDataFrame(Lorg/apache/spark/rdd/RDD;Lorg/apache/spark/sql/types/StructType;Z)Lorg/apache/spark/sql/Dataset;
=== Streaming Query ===
Identifier: inittest [id = 305e10aa-a446-4f42-a4e9-8d2372250fa8, runId = 2218049f-08a4-40bf-9b46-b7e898321b85]
Current Committed Offsets: {}
Current Available Offsets: {KafkaSource[Subscribe[my-log-topic]]: {"my-log-topic":{"1":9878,"0":10122}}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Project [value#1]
+- StreamingExecutionRelation KafkaSource[Subscribe[my-log-topic]], [key#0, value#1, topic#2, partition#3, offset#4L, timestamp#5, timestampType#6]

    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:343)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:206)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.SQLContext.internalCreateDataFrame(Lorg/apache/spark/rdd/RDD;Lorg/apache/spark/sql/types/StructType;Z)Lorg/apache/spark/sql/Dataset;
    at org.apache.spark.sql.kafka010.KafkaSource.getBatch(KafkaSource.scala:301)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:614)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2$$anonfun$apply$7.apply(StreamExecution.scala:610)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
    at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
    at org.apache.spark.sql.execution.streaming.StreamProgress.flatMap(StreamProgress.scala:25)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch$2.apply(StreamExecution.scala:610)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatch(StreamExecution.scala:609)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(StreamExecution.scala:306)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$apply$mcZ$sp$1.apply(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:279)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:294)
    at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:290)
    ... 1 more

Based on the exception, it seems the kafka010.KafkaSource class is missing a method with the signature SQLContext.internalCreateDataFrame(rdd, schema).

After checking the source code here [https://github.com/apache/spark/blob/master/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala]

I can see that the method exists. Has anyone seen this problem and resolved it? If so, could you help me understand what the issue is?

Finally, the POM details:

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.3.1</version>
      <classifier>tests</classifier>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.3.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>2.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
      <version>2.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.3.1</version>
    </dependency>

1 Answer

0 votes
/ June 14, 2019

I was able to get this running once I noticed that the Spark version behind my spark-submit was 2.2, while all the libraries (as you can see) are 2.3.1. I changed the Spark version and was able to run it successfully.
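
A quick way to catch this kind of mismatch up front is to compare the version reported by the spark-submit binary on your PATH against the spark-* artifact versions in the POM:

# Prints the Spark version that spark-submit will actually launch
spark-submit --version

# If this reports e.g. 2.2.x while the POM pins 2.3.1, binary-incompatible
# internal APIs such as SQLContext.internalCreateDataFrame can disappear
# at runtime and surface as NoSuchMethodError.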
