Question

Вот несколько примеров создания источников MQTT [1] [2] для Spark Streaming. Однако я хочу создать приемник MQTT, в котором я могу опубликовать sh результаты вместо использования метода print(). Я попытался создать один MqttSink, но получаю ошибку object not serializable. Затем я основываю код на этом блоге , но я не могу найти метод send, который я создал для объекта MqttSink.

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}

object TaxiRideCountCombineByKey {

  val mqttTopic: String = "spark-mqtt-sink"
  val qos: QoS = QoS.AT_LEAST_ONCE

  def main(args: Array[String]): Unit = {

    val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false

    // Create a local StreamingContext with two working thread and batch interval of 1 second.
    // The master requires 4 cores to prevent from a starvation scenario.
    val sparkConf = new SparkConf()
      .setAppName("TaxiRideCountCombineByKey")
      .setMaster("local[4]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    val stream = ssc.receiverStream(new TaxiRideSource())
    val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
    val countStream = driverStream.combineByKey(
      (v) => (v, 1), //createCombiner
      (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
      (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
      new HashPartitioner(3)
    )

    if (outputMqtt) {
      println("Use the command below to consume data:")
      println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)

      val mqttSink = ssc.sparkContext.broadcast(MqttSink)
      countStream.foreachRDD { rdd =>
        rdd.foreach { message =>
          mqttSink.value.send(mqttTopic, message.toString()) // "send" method does not exist
        }
      }
    } else {
      countStream.print()
    }

    ssc.start() // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}

class MqttSink(createProducer: () => FutureConnection) extends Serializable {
  lazy val producer = createProducer()
  def send(topic: String, message: String): Unit = {
    producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
  }
}

object MqttSink {
  def apply(): MqttSink = {
    val f = () => {
      val mqtt = new MQTT()
      mqtt.setHost("localhost", 1883)
      val producer = mqtt.futureConnection()
      producer.connect().await()
      sys.addShutdownHook {
        producer.disconnect().await()
      }
      producer
    }
    new MqttSink(f)
  }
}

mike · Answer 1 · 24 июня 2020

В качестве альтернативы вы также можете использовать Structure Streaming с Apache Bahir Spark Extension для MQTT .

Полный пример

build.sbt :

name := "MQTT_StructuredStreaming"
version := "0.1"

libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-sql_2.12" % "2.4.4"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.12" % "2.4.4" % "provided"
libraryDependencies += "org.apache.bahir" % "spark-sql-streaming-mqtt_2.12" % "2.4.0"

Основной. scala

import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object Main extends App {

  val brokerURL = "tcp://localhost:1883"
  val subTopicName = "/my/subscribe/topic"
  val pubTopicName = "/my/publish/topic"

  val spark: SparkSession = SparkSession
    .builder
    .appName("MQTT_StructStreaming")
    .master("local[*]")
    .config("spark.sql.streaming.checkpointLocation", "/my/sparkCheckpoint/dir")
    .getOrCreate

  spark.sparkContext.setLogLevel("ERROR")

  import spark.implicits._

  val lines: Dataset[String] = spark.readStream
    .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider")
    .option("topic", subTopicName)
    .option("clientId", "some-client-id")
    .option("persistence", "memory")
    .load(brokerURL)
    .selectExpr("CAST(payload AS STRING)").as[String]

  // Split the lines into words
  val words: Dataset[String] = lines.as[String].flatMap(_.split(";"))

  // Generate running word count
  val wordCounts: DataFrame = words.groupBy("value").count()

  // Start running the query that prints the running counts to the console
  val query: StreamingQuery = wordCounts.writeStream
    .format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSinkProvider")
    .outputMode("complete")
    .option("topic", pubTopicName)
    .option("brokerURL", brokerURL)
    .start

  query.awaitTermination()
}

Felipe · Answer 2 · 18 июня 2020

это рабочий пример, основанный на записи в блоге Шаблоны интеграции Spark и Kafka .

package org.sense.spark.app

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf}
import org.fusesource.mqtt.client.QoS
import org.sense.spark.util.{MqttSink, TaxiRideSource}

object TaxiRideCountCombineByKey {

  val mqttTopic: String = "spark-mqtt-sink"
  val qos: QoS = QoS.AT_LEAST_ONCE

  def main(args: Array[String]): Unit = {

    val outputMqtt: Boolean = if (args.length > 0 && args(0).equals("mqtt")) true else false

    // Create a local StreamingContext with two working thread and batch interval of 1 second.
    // The master requires 4 cores to prevent from a starvation scenario.
    val sparkConf = new SparkConf()
      .setAppName("TaxiRideCountCombineByKey")
      .setMaster("local[4]")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    val stream = ssc.receiverStream(new TaxiRideSource())
    val driverStream = stream.map(taxiRide => (taxiRide.driverId, 1))
    val countStream = driverStream.combineByKey(
      (v) => (v, 1), //createCombiner
      (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1), //mergeValue
      (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2), // mergeCombiners
      new HashPartitioner(3)
    )

    if (outputMqtt) {
      println("Use the command below to consume data:")
      println("mosquitto_sub -h 127.0.0.1 -p 1883 -t " + mqttTopic)

      val mqttSink = ssc.sparkContext.broadcast(MqttSink())
      countStream.foreachRDD { rdd =>
        rdd.foreach { message =>
          mqttSink.value.send(mqttTopic, message.toString()) // "send" method does not exist
        }
      }
    } else {
      countStream.print()
    }

    ssc.start() // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
package org.sense.spark.util

import org.fusesource.mqtt.client.{FutureConnection, MQTT, QoS}

class MqttSink(createProducer: () => FutureConnection) extends Serializable {

  lazy val producer = createProducer()

  def send(topic: String, message: String): Unit = {
    producer.publish(topic, message.toString().getBytes, QoS.AT_LEAST_ONCE, false)
  }
}

object MqttSink {
  def apply(): MqttSink = {
    val f = () => {
      val mqtt = new MQTT()
      mqtt.setHost("localhost", 1883)
      val producer = mqtt.futureConnection()
      producer.connect().await()
      sys.addShutdownHook {
        producer.disconnect().await()
      }
      producer
    }
    new MqttSink(f)
  }
}
package org.sense.spark.util

import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.util.Locale
import java.util.zip.GZIPInputStream

import org.apache.spark.storage._
import org.apache.spark.streaming.receiver._
import org.joda.time.DateTime
import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}

case class TaxiRide(rideId: Long, isStart: Boolean, startTime: DateTime, endTime: DateTime,
                    startLon: Float, startLat: Float, endLon: Float, endLat: Float,
                    passengerCnt: Short, taxiId: Long, driverId: Long)

object TimeFormatter {
  val timeFormatter: DateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withLocale(Locale.US).withZoneUTC()
}

class TaxiRideSource extends Receiver[TaxiRide](StorageLevel.MEMORY_AND_DISK_2) {
  val dataFilePath = "/home/flink/nycTaxiRides.gz";
  var dataRateListener: DataRateListener = _

  /**
   * Start the thread that receives data over a connection
   */
  def onStart() {
    dataRateListener = new DataRateListener()
    dataRateListener.start()
    new Thread("TaxiRide Source") {
      override def run() {
        receive()
      }
    }.start()
  }

  def onStop() {}

  /**
   * Periodically generate a TaxiRide event and regulate the emission frequency
   */
  private def receive() {
    while (!isStopped()) {
      val gzipStream = new GZIPInputStream(new FileInputStream(dataFilePath))
      val reader: BufferedReader = new BufferedReader(new InputStreamReader(gzipStream, StandardCharsets.UTF_8))
      try {
        var line: String = null
        do {
          // start time before reading the line
          val startTime = System.nanoTime

          // read the line on the file and yield the object
          line = reader.readLine
          if (line != null) {
            val taxiRide: TaxiRide = getTaxiRideFromString(line)
            store(taxiRide)
          }

          // regulate frequency of the source
          dataRateListener.busySleep(startTime)
        } while (line != null)
      } finally {
        reader.close
      }
    }
  }

  def getTaxiRideFromString(line: String): TaxiRide = {
    // println(line)
    val tokens: Array[String] = line.split(",")
    if (tokens.length != 11) {
      throw new RuntimeException("Invalid record: " + line)
    }

    val rideId: Long = tokens(0).toLong
    val (isStart, startTime, endTime) = tokens(1) match {
      case "START" => (true, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
      case "END" => (false, DateTime.parse(tokens(2), TimeFormatter.timeFormatter), DateTime.parse(tokens(3), TimeFormatter.timeFormatter))
      case _ => throw new RuntimeException("Invalid record: " + line)
    }
    val startLon: Float = if (tokens(4).length > 0) tokens(4).toFloat else 0.0f
    val startLat: Float = if (tokens(5).length > 0) tokens(5).toFloat else 0.0f
    val endLon: Float = if (tokens(6).length > 0) tokens(6).toFloat else 0.0f
    val endLat: Float = if (tokens(7).length > 0) tokens(7).toFloat else 0.0f
    val passengerCnt: Short = tokens(8).toShort
    val taxiId: Long = tokens(9).toLong
    val driverId: Long = tokens(10).toLong

    TaxiRide(rideId, isStart, startTime, endTime, startLon, startLat, endLon, endLat, passengerCnt, taxiId, driverId)
  }
}

Как создать приемник MQTT для потоковой передачи Spark?

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Полный пример

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Как создать приемник MQTT для потоковой передачи Spark?

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Полный пример

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы