Невозможно отправить данные в MongoDB с использованием структурированной потоковой передачи Kafka-Spark - PullRequest
0 голосов
/ 22 апреля 2019

Я не могу сохранить структурированные потоковые данные из Kafka в MongoDB.Это первый раз, когда я внедряю данные структурированной потоковой передачи Kafka-Spark в приемник MongoDB.Я следил за этой статьей https://learningfromdata.blog/2017/04/16/real-time-data-ingestion-with-apache-spark-structured-streaming-implementation/ Она предлагает создать MongoForeachWriter и класс Helper вместе с программой структурированной потоковой передачи.Тем не менее, после этого я не смог увидеть данные в коллекции MongoDB.Может кто-то увидеть и исправить, где я иду не так ???

Error:

Error:

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Project [cast(value#8 as string) AS value#21]
+- StreamingExecutionRelation KafkaSource[Subscribe[TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT]], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]

        at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:295)
        at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 8, localhost, executor driver): java.lang.ClassCastException: java.lang.String cannot be cast to [B
        at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:42)
        at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:15)
        at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:53)
        at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:49)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
        at org.apache.spark.scheduler.Task.run(Task.scala:109)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
        at scala.Option.foreach(Option.scala:257)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:929)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:927)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
        at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:927)
        at org.apache.spark.sql.execution.streaming.ForeachSink.addBatch(ForeachSink.scala:49)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
        at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
        at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
        at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
        at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
        at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
        at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
        ... 1 more
Caused by: java.lang.ClassCastException: java.lang.String cannot be cast to [B
        at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:42)
        at example_new.MongoDBForeachWriter.process(MongoDBForeachWriter.scala:15)
        at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:53)
        at org.apache.spark.sql.execution.streaming.ForeachSink$$anonfun$addBatch$1.apply(ForeachSink.scala:49)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
        at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
        at org.apache.spark.scheduler.Task.run(Task.scala:109)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)
2019-04-22 19:40:26 INFO  SparkContext:54 - Invoking stop() from shutdown hook
2019-04-22 19:40:26 INFO  AbstractConnector:318 - Stopped Spark@907f2b7{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2019-04-22 19:40:26 INFO  SparkUI:54 - Stopped Spark web UI at http://W10BZVGSQ2.aus.amer.dell.com:4040
2019-04-22 19:40:26 INFO  MapOutputTrackerMasterEndpoint:54 - MapOutputTrackerMasterEndpoint stopped!
2019-04-22 19:40:26 INFO  MemoryStore:54 - MemoryStore cleared
2019-04-22 19:40:26 INFO  BlockManager:54 - BlockManager stopped
2019-04-22 19:40:26 INFO  BlockManagerMaster:54 - BlockManagerMaster stopped
2019-04-22 19:40:26 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2019-04-22 19:40:26 WARN  SparkEnv:87 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
        at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
        at org.apache.spark.SparkEnv.stop(SparkEnv.scala:103)
        at org.apache.spark.SparkContext$$anonfun$stop$11.apply$mcV$sp(SparkContext.scala:1940)
        at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1357)
        at org.apache.spark.SparkContext.stop(SparkContext.scala:1939)
        at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:572)
        at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at scala.util.Try$.apply(Try.scala:192)
        at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
        at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
2019-04-22 19:40:26 INFO  SparkContext:54 - Successfully stopped SparkContext
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Shutdown hook called
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
2019-04-22 19:40:26 ERROR ShutdownHookManager:91 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10
        at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:65)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:62)
        at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
        at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1.apply$mcV$sp(ShutdownHookManager.scala:62)
        at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at scala.util.Try$.apply(Try.scala:192)
        at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
        at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-992d4d7e-ea11-4295-9368-c4038b26f895
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\temporaryReader-4d362eeb-6ee5-4a48-9da9-3792a22ec1ca
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\temporary-add2fc32-1623-4784-8df1-f5cb0a1dd9fc
2019-04-22 19:40:26 INFO  ShutdownHookManager:54 - Deleting directory C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
2019-04-22 19:40:26 ERROR ShutdownHookManager:91 - Exception while deleting Spark temp dir: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
java.io.IOException: Failed to delete: C:\Users\raheem_mohammed\AppData\Local\Temp\spark-f9296938-c32b-42ff-af71-f90efcd49b10\userFiles-ee595b18-8c75-41be-b20e-f8c30628c765
        at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1070)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:65)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1$$anonfun$apply$mcV$sp$3.apply(ShutdownHookManager.scala:62)
        at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
        at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
        at org.apache.spark.util.ShutdownHookManager$$anonfun$1.apply$mcV$sp(ShutdownHookManager.scala:62)
        at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
        at scala.util.Try$.apply(Try.scala:192)
        at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
        at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
        at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)

Создано MongoForeachWriter.scala, Helper.scala и StructuredStreamingProgram.scala

package example

import java.util.concurrent.TimeUnit

import scala.concurrent.Await
import scala.concurrent.duration.Duration

import org.mongodb.scala._

object Helpers {

  implicit class DocumentObservable[C](val observable: Observable[Document]) extends ImplicitObservable[Document] {
    override val converter: (Document) => String = (doc) => doc.toJson
  }

  implicit class GenericObservable[C](val observable: Observable[C]) extends ImplicitObservable[C] {
    override val converter: (C) => String = (doc) => doc.toString
  }

  trait ImplicitObservable[C] {
    val observable: Observable[C]
    val converter: (C) => String

    def results(): Seq[C] = Await.result(observable.toFuture(), Duration(10, TimeUnit.SECONDS))
    def headResult() = Await.result(observable.head(), Duration(10, TimeUnit.SECONDS))
    def printResults(initial: String = ""): Unit = {
      if (initial.length > 0) print(initial)
      results().foreach(res => println(converter(res)))
    }
    def printHeadResult(initial: String = ""): Unit = println(s"${initial}${converter(headResult())}")
  }

}

package example
import java.util.Calendar

import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.Row
import org.apache.spark.sql.ForeachWriter
import org.mongodb.scala._
import org.mongodb.scala.bson.collection.mutable.Document
import org.mongodb.scala.bson._
import example.Helpers._

import scala.util.Try


class MongoDBForeachWriter(p_uri: String,
                           p_dbName: String,
                           p_collectionName: String,
                           p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {

  val mongodbURI = p_uri
  val dbName = p_dbName
  val collectionName = p_collectionName
  val messageCountAccum = p_messageCountAccum

  var mongoClient: MongoClient = null
  var db: MongoDatabase = null
  var collection: MongoCollection[Document] = null

  def ensureMongoDBConnection(): Unit = {
    if (mongoClient == null) {
      mongoClient = MongoClient(mongodbURI)
      db = mongoClient.getDatabase(dbName)
      collection = db.getCollection(collectionName)
    }
  }

  override def open(partitionId: Long, version: Long): Boolean = {
    true
  }

  override def process(record: Row): Unit = {
    val valueStr = new String(record.getAs[Array[Byte]]("value"))

    val doc: Document = Document(valueStr)
    doc += ("log_time" -> Calendar.getInstance().getTime())

    // lazy opening of MongoDB connection
    ensureMongoDBConnection()
    val result = collection.insertOne(doc)

    // tracks how many records I have processed
    if (messageCountAccum != null)
      messageCountAccum.add(1)
  }

  override def close(errorOrNull: Throwable): Unit = {
    if(mongoClient != null) {
      Try {
        mongoClient.close()
      }
    }
  }
}

package example
import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.util.LongAccumulator
import example.Helpers._
import java.util.Calendar

object StructuredStreamingProgram {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("OSB_Streaming_Model")
      .getOrCreate()

    import spark.implicits._

    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "10.160.172.45:9092, 10.160.172.46:9092, 10.160.172.100:9092")
      .option("subscribe", "TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT")
      .load()

    val dfs = df.selectExpr("CAST(value AS STRING)")

    // sends to MongoDB once every 20 seconds
    val mongodb_uri = "mongodb://dstk8sdev06.us.dell.com:27018"
    val mdb_name = "HANZO_MDB"
    val mdb_collection = "Spark"
    val CountAccum: LongAccumulator = spark.sparkContext.longAccumulator("mongostreamcount")

    val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)
    val query = dfs.writeStream
      .foreach(structuredStreamForeachWriter)
      .trigger(Trigger.ProcessingTime("20 seconds"))
      .start()

    while (!spark.streams.awaitAnyTermination(60000)) {
      println(Calendar.getInstance().getTime()+" :: mongoEventsCount = "+CountAccum.value)
    }

  }
}

Мне нужно сохранить структурированные потоковые данные в коллекцию Mongo

1 Ответ

2 голосов
/ 23 апреля 2019

В соответствии с ошибкой, у вас уже есть строка (вы уже сделали df.selectExpr("CAST(value AS STRING)")), поэтому вы должны попробовать получить событие Row как String, а не как Array[Byte]

Startизменив

val valueStr = new String(record.getAs[Array[Byte]]("value"))

на

val valueStr = record.getAs[String]("value")

Я понимаю, что у вас, возможно, уже есть кластер для запуска кода Spark, но я бы посоветовал все же изучить Kafka ConnectMongo Sink Connector , чтобы вам не приходилось писать и поддерживать свой собственный Mongo Writer в Spark-коде.

Или вы можете напрямую записывать наборы данных Spark в монго

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...