Есть ли способ объединить потоковую обработку Apache Spark (Structured Streaming) с Apache Hive и Apache Kafka в одном приложении?
При попытке получить записи из потока с помощью collectAsList и сохранить их в списке я получил следующую ошибку (трассировка стека приведена после кода).
Может ли кто-нибудь помочь мне решить эту проблему?
Заранее спасибо.
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
public class DatasetKafka {
public static void main(String[] args) throws IOException {
SparkSession spark = SparkSession
.builder()
.appName("Java Spark Hive Example").master("yarn")
.config("spark.sql.warehouse.dir", "hdfs://localhost:54310/user/hive/warehouse")
.enableHiveSupport()
.getOrCreate();
Logger.getRootLogger().setLevel(Level.ERROR);
Dataset<String> lines = spark
.readStream()
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "test")
.load().selectExpr("CAST(value AS STRING)")
.as(Encoders.STRING());
List<String> line=lines.collectAsList();
for(String li:line) {
String values[]=li.split(",");
String query="insert into table match values("+Integer.parseInt(
values[0])+
","+values[1]+
","+Integer.parseInt(values[2])+
","+Integer.parseInt(values[3])+
","+Integer.parseInt(values[4])+
","+values[5]+
","+Integer.parseInt(values[6])+
","+values[7]+
","+Integer.parseInt(values[8])+
","+Integer.parseInt(values[9])+
","+Integer.parseInt(values[10])+
","+values[11]+
","+Integer.parseInt(values[12])+
","+Integer.parseInt(values[13])+
","+Integer.parseInt(values[14])+
","+Integer.parseInt(values[15])+
","+Integer.parseInt(values[16])+
","+values[17]+
","+values[18]+")";
spark.sql(query);
}
// List<String> values=ll.collectAsList();
Dataset<String> words=lines.map((MapFunction<String, String>)k->{
return k;
}, Encoders.STRING());
Dataset<Row> wordCounts = words.flatMap(
(FlatMapFunction<String, String>) x -> Arrays.asList(x.split(",")).iterator(),
Encoders.STRING()).groupBy("value").count();
StreamingQuery query = wordCounts.writeStream()
.outputMode("complete")
.format("console")
.start();
try {
query.awaitTermination();
} catch (StreamingQueryException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
Exception in thread "main" org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
kafka
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.org$apache$spark$sql$catalyst$analysis$UnsupportedOperationChecker$$throwError(UnsupportedOperationChecker.scala:389)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:38)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$$anonfun$checkForBatch$1.apply(UnsupportedOperationChecker.scala:36)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:126)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126)
at org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker$.checkForBatch(UnsupportedOperationChecker.scala:36)
at org.apache.spark.sql.execution.QueryExecution.assertSupported(QueryExecution.scala:51)
at org.apache.spark.sql.execution.QueryExecution.withCachedData$lzycompute(QueryExecution.scala:62)
at org.apache.spark.sql.execution.QueryExecution.withCachedData(QueryExecution.scala:60)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:66)
at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:66)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:72)
at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:68)
at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3360)
at org.apache.spark.sql.Dataset.collectAsList(Dataset.scala:2794)
at com.ges.kafka.DatasetKafka.main(DatasetKafka.java:48)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)