Getting "Resetting offset" messages in Spark Structured Streaming

Spark (v2.4) program functionality:

  • Read JSON data from a Kafka topic in Spark Structured Streaming mode
  • Print the data read to the console

The problem:
- The log keeps repeating Resetting offset for partition nifi-log-batch-0 to offset 2826180. and no data ever reaches the console.

Source code:

package io.xyz.streaming

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object readKafkaJson {
    private val topic = "nifi-log-batch"
    // bootstrap.servers expects host:port pairs, without an http:// scheme
    private val kafkaUrl = "<hostname>:9092"
    private val chk = "/home/xyz/tmp/checkpoint"           // checkpoint location (not used below)
    private val outputFileLocation = "/home/xyz/abc/data"  // output path (not used below)
    // Schema of the incoming JSON log records
    private val sparkSchema = StructType(Array(
                StructField("timestamp", StringType),
                StructField("level", StringType),
                StructField("thread", StringType),
                StructField("class", StringType),
                StructField("message", StringType),
                StructField("updatedOn", StringType),
                StructField("stackTrace", StringType)))


    def main(args: Array[String]): Unit = {
        val spark = SparkSession
            .builder
            .appName("ConfluentConsumer")
            .master("local[*]")
            .getOrCreate()

        import spark.implicits._ 

        // ===================Read Kafka data in JSON==================
        val df = spark
            .readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", kafkaUrl)
            // "latest": only messages produced after the query starts are read
            .option("startingOffsets", "latest")
            .option("subscribe", topic)
            .load()

        // Cast the Kafka value bytes to a string, then parse the JSON payload
        // against the schema; a schema mismatch yields null columns
        val dfs1 = df
            .selectExpr("CAST(value AS STRING)")
            .select(from_json(col("value"), sparkSchema).alias("my_column"))
            .select("my_column.*")


        // ===================Write to console==================
        dfs1
            .writeStream
            .format("console")
            .start()
            .awaitTermination()

    }
}
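
With startingOffsets set to "latest", an empty batch is expected whenever no new messages arrive after the query starts, so the repeated Resetting offset INFO lines by themselves are not an error. A minimal batch-mode sketch to verify that the topic actually holds data, reading from the earliest retained offset instead (VerifyKafkaData is a hypothetical helper; it assumes the same topic and broker as above):

package io.xyz.streaming

import org.apache.spark.sql.SparkSession

object VerifyKafkaData {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder
            .appName("VerifyKafkaData")
            .master("local[*]")
            .getOrCreate()

        // Batch read (spark.read, not readStream): fetches whatever the topic currently retains
        val df = spark.read
            .format("kafka")
            .option("kafka.bootstrap.servers", "<hostname>:9092")  // host:port, no http:// scheme
            .option("subscribe", "nifi-log-batch")
            .option("startingOffsets", "earliest")                 // start from the beginning
            .load()

        // 0 here means there is simply nothing retained to consume
        println(s"Retained messages in topic: ${df.count()}")
        df.selectExpr("CAST(value AS STRING)").show(5, truncate = false)

        spark.stop()
    }
}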

Detailed console log of the issue:

2019-04-10 01:12:58 INFO  WriteToDataSourceV2Exec:54 - Start processing data source writer: org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057. The input RDD has 0 partitions.
2019-04-10 01:12:58 INFO  SparkContext:54 - Starting job: start at readKafkaJson.scala:70
2019-04-10 01:12:58 INFO  DAGScheduler:54 - Job 0 finished: start at readKafkaJson.scala:70, took 0.003870 s
2019-04-10 01:12:58 INFO  WriteToDataSourceV2Exec:54 - Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057 is committing.
-------------------------------------------
Batch: 0
-------------------------------------------
2019-04-10 01:12:58 INFO  CodeGenerator:54 - Code generated in 41.952695 ms
+---------+-----+------+-----+-------+---------+----------+
|timestamp|level|thread|class|message|updatedOn|stackTrace|
+---------+-----+------+-----+-------+---------+----------+
+---------+-----+------+-----+-------+---------+----------+

2019-04-10 01:12:58 INFO  WriteToDataSourceV2Exec:54 - Data source writer org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter@622d0057 committed.
2019-04-10 01:12:58 INFO  SparkContext:54 - Starting job: start at readKafkaJson.scala:70
2019-04-10 01:12:58 INFO  DAGScheduler:54 - Job 1 finished: start at readKafkaJson.scala:70, took 0.000104 s
2019-04-10 01:12:58 INFO  CheckpointFileManager:54 - Writing atomically to file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/0 using temp file file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/.0.eb290a31-1965-40e7-9028-d18f2eea0627.tmp
2019-04-10 01:12:58 INFO  CheckpointFileManager:54 - Renamed temp file file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/.0.eb290a31-1965-40e7-9028-d18f2eea0627.tmp to file:/tmp/temporary-df2fea18-7b2f-4146-bcfd-7923cfab65e7/commits/0
2019-04-10 01:12:58 INFO  MicroBatchExecution:54 - Streaming query made progress: {
  "id" : "fb44fbef-5d05-4bb8-ae72-3327b98af261",
  "runId" : "ececfe49-bbc6-4964-8798-78980cbec525",
  "name" : null,
  "timestamp" : "2019-04-10T06:12:56.414Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "processedRowsPerSecond" : 0.0,
  "durationMs" : {
    "addBatch" : 1324,
    "getBatch" : 10,
    "getEndOffset" : 1,
    "queryPlanning" : 386,
    "setOffsetRange" : 609,
    "triggerExecution" : 2464,
    "walCommit" : 55
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[nifi-log-batch]]",
    "startOffset" : null,
    "endOffset" : {
      "nifi-log-batch" : {
        "0" : 2826180
      }
    },
    "numInputRows" : 0,
    "processedRowsPerSecond" : 0.0
  } ],
  "sink" : {
    "description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider@6ced6212"
  }
}
2019-04-10 01:12:58 INFO  Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO  MicroBatchExecution:54 - Streaming query made progress: {
  "id" : "fb44fbef-5d05-4bb8-ae72-3327b98af261",
  "runId" : "ececfe49-bbc6-4964-8798-78980cbec525",
  "name" : null,
  "timestamp" : "2019-04-10T06:12:58.935Z",
  "batchId" : 1,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "processedRowsPerSecond" : 0.0,
  "durationMs" : {
    "getEndOffset" : 1,
    "setOffsetRange" : 11,
    "triggerExecution" : 15
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "KafkaV2[Subscribe[nifi-log-batch]]",
    "startOffset" : {
      "nifi-log-batch" : {
        "0" : 2826180
      }
    },
    "endOffset" : {
      "nifi-log-batch" : {
        "0" : 2826180
      }
    },
    "numInputRows" : 0,
    "inputRowsPerSecond" : 0.0,
    "processedRowsPerSecond" : 0.0
  } ],
  "sink" : {
    "description" : "org.apache.spark.sql.execution.streaming.ConsoleSinkProvider@6ced6212"
  }
}
2019-04-10 01:12:58 INFO  Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO  Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.
2019-04-10 01:12:58 INFO  Fetcher:583 - [Consumer clientId=consumer-1, groupId=spark-kafka-source-9a027b2b-0a3a-4773-a356-a585e488062c--81433247-driver-0] Resetting offset for partition nifi-log-batch-0 to offset 2826180.

Even when I run the equivalent code in PySpark, I face the same issue.
Please suggest how to resolve this.

  • Kafka: v2.1.0 (Confluent Platform)
  • Spark: 2.4

The job is submitted with the following command:

 spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 --jars /home/xyz/Softwares/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar --class io.xyz.streaming.readKafkaJson --master local[*] /home/xyz/ScalaCode/target/SparkSchemaKafka-0.0.1-SNAPSHOT-jar-with-dependencies.jar
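
Note that this command puts two different Kafka connectors on the classpath: the Structured Streaming connector (spark-sql-kafka-0-10) via --packages and the old DStream connector (spark-streaming-kafka-0-8-assembly) via --jars, and mixing the two can cause classpath conflicts. A sketch of the same submission with only the 0-10 connector, which is the one the code above actually uses (class name and jar paths unchanged from the original command):

 spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 --class io.xyz.streaming.readKafkaJson --master local[*] /home/xyz/ScalaCode/target/SparkSchemaKafka-0.0.1-SNAPSHOT-jar-with-dependencies.jar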
...