Я реализую наивный байесовский классификатор на Scala (Spark ML). При запуске программы возникает ошибка в строке val trainingDF=spark.createDataFrame(training).toDF("sentence","label")
Пожалуйста, посмотрите на код:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession
/**
 * Trains a Naive Bayes text classifier on two labelled text files (one sentence per line)
 * and writes predictions for a separate test file as tab-separated CSV.
 *
 * Pipeline stages: Tokenizer -> HashingTF -> NaiveBayes.
 */
object Naive {

  /**
   * One labelled training example.
   *
   * @param sentence raw text of the example
   * @param label    class label; this program uses 0.0 for the "neg" file and 1.0 for the "pos" file
   */
  case class Sentence(sentence: String, label: Double)

  /**
   * Program entry point: builds a local Spark session, trains the model and saves predictions.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("Movies Reviews")
      .config("spark.master", "local")
      // Set on the builder instead of via a global JVM system property.
      .config("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")
      .getOrCreate()

    // Always release the session, even when a stage below throws.
    try {
      // Prepare training documents: every line of each file becomes a labelled Sentence.
      val neg = spark.sparkContext.textFile("file:///F:/KpIt Board/rt-neg.txt/").repartition(4)
        .map(line => Sentence(line, 0.0))
      val pos = spark.sparkContext.textFile("file:///F:/KpIt Board/rt-pos.txt/").repartition(4)
        .map(line => Sentence(line, 1.0))

      // Test documents as (file name without extension, whole file content) pairs.
      val test = spark.sparkContext.wholeTextFiles("file:///F:/KpIt Board/Trump Tweets Graph/TrumpTweets.txt").repartition(4)
        .map { case (file, sentence) => (file.split("/").last.split("\\.")(0), sentence) }
      // val test = spark.sparkContext.wholeTextFiles("file:F:/KpIt Board/rt-neg.txt/").repartition(4)
      // .map({case(file,sentence) => (file.split(" ").last.split("\\.")(0),sentence)})

      val training = neg.union(pos)

      // NOTE: the first DataFrame creation initialises the SQL session state, including the
      // SQL parser, which needs the ANTLR 4 runtime. A NoClassDefFoundError for
      // org.antlr.v4.runtime.misc.ParseCancellationException raised here means the
      // antlr4-runtime jar is missing from the classpath; it is a build/dependency problem,
      // not a defect in this statement.
      val trainingDF = spark.createDataFrame(training).toDF("sentence", "label")

      // Tuple RDDs get the default column names _1/_2; name them explicitly so the tokenizer
      // finds "sentence" and the final select finds "file".
      val testDF = spark.createDataFrame(test).toDF("file", "sentence")

      // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and Naive Bayes.
      val tokenizer = new Tokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
      val hashingTF = new HashingTF()
        .setInputCol(tokenizer.getOutputCol)
        .setOutputCol("features")
      val nb = new NaiveBayes()
      val pipeline = new Pipeline()
        .setStages(Array(tokenizer, hashingTF, nb))

      // Fit the pipeline to training documents.
      val model = pipeline.fit(trainingDF)

      // Make predictions on test documents and write them out as a single part file.
      model.transform(testDF).repartition(1)
        .select("file", "prediction")
        .write.format("csv")
        .option("header", "true")
        .option("delimiter", "\t")
        .save("/tmp/spark-prediction")
    } finally {
      spark.stop()
    }
  }
}
При запуске я получаю следующую ошибку:
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/03/01 15:18:36 INFO SparkContext: Running Spark version 2.4.0
19/03/01 15:18:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19/03/01 15:18:37 INFO SparkContext: Submitted application: Movies Reviews
19/03/01 15:18:37 INFO SecurityManager: Changing view acls to: FaiZii
19/03/01 15:18:37 INFO SecurityManager: Changing modify acls to: FaiZii
19/03/01 15:18:37 INFO SecurityManager: Changing view acls groups to:
19/03/01 15:18:37 INFO SecurityManager: Changing modify acls groups to:
19/03/01 15:18:37 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(FaiZii); groups with view permissions: Set(); users with modify permissions: Set(FaiZii); groups with modify permissions: Set()
19/03/01 15:18:38 INFO Utils: Successfully started service 'sparkDriver' on port 53071.
19/03/01 15:18:38 INFO SparkEnv: Registering MapOutputTracker
19/03/01 15:18:38 INFO SparkEnv: Registering BlockManagerMaster
19/03/01 15:18:38 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
19/03/01 15:18:38 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
19/03/01 15:18:38 INFO DiskBlockManager: Created local directory at C:\Users\Micro Center\AppData\Local\Temp\blockmgr-bbdbfc04-7072-4d47-8a76-fea33e471bb9
19/03/01 15:18:38 INFO MemoryStore: MemoryStore started with capacity 900.6 MB
19/03/01 15:18:38 INFO SparkEnv: Registering OutputCommitCoordinator
19/03/01 15:18:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
19/03/01 15:18:39 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
19/03/01 15:18:39 INFO Utils: Successfully started service 'SparkUI' on port 4042.
19/03/01 15:18:39 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://DESKTOP-SBMTUMB:4042
19/03/01 15:18:39 INFO Executor: Starting executor ID driver on host localhost
19/03/01 15:18:39 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53080.
19/03/01 15:18:39 INFO NettyBlockTransferService: Server created on DESKTOP-SBMTUMB:53080
19/03/01 15:18:39 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
19/03/01 15:18:39 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
19/03/01 15:18:39 INFO BlockManagerMasterEndpoint: Registering block manager DESKTOP-SBMTUMB:53080 with 900.6 MB RAM, BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
19/03/01 15:18:39 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
19/03/01 15:18:39 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 214.6 KB, free 900.4 MB)
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 20.4 KB, free 900.4 MB)
19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.4 KB, free: 900.6 MB)
19/03/01 15:18:40 INFO SparkContext: Created broadcast 0 from textFile at Naive.scala:24
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 214.7 KB, free 900.2 MB)
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 20.4 KB, free 900.1 MB)
19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.4 KB, free: 900.6 MB)
19/03/01 15:18:40 INFO SparkContext: Created broadcast 1 from textFile at Naive.scala:27
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 215.1 KB, free 899.9 MB)
19/03/01 15:18:40 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 20.5 KB, free 899.9 MB)
19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.5 KB, free: 900.5 MB)
19/03/01 15:18:40 INFO SparkContext: Created broadcast 2 from wholeTextFiles at Naive.scala:30
19/03/01 15:18:41 INFO FileInputFormat: Total input paths to process : 1
19/03/01 15:18:41 INFO FileInputFormat: Total input paths to process : 1
19/03/01 15:18:42 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:///tmp/spark-warehouse').
19/03/01 15:18:42 INFO SharedState: Warehouse path is 'file:///tmp/spark-warehouse'.
Exception in thread "main" java.lang.NoClassDefFoundError: org/antlr/v4/runtime/misc/ParseCancellationException
at org.apache.spark.sql.internal.BaseSessionStateBuilder.sqlParser$lzycompute(BaseSessionStateBuilder.scala:117)
at org.apache.spark.sql.internal.BaseSessionStateBuilder.sqlParser(BaseSessionStateBuilder.scala:116)
at org.apache.spark.sql.internal.BaseSessionStateBuilder.build(BaseSessionStateBuilder.scala:292)
at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1104)
at org.apache.spark.sql.SparkSession.$anonfun$sessionState$2(SparkSession.scala:146)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:144)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:141)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78)
at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:300)
at Naive$.main(Naive.scala:38)
at Naive.main(Naive.scala)
Caused by: java.lang.ClassNotFoundException: org.antlr.v4.runtime.misc.ParseCancellationException
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 12 more
19/03/01 15:18:43 INFO SparkContext: Invoking stop() from shutdown hook
19/03/01 15:18:43 INFO SparkUI: Stopped Spark web UI at http://DESKTOP-SBMTUMB:4042
19/03/01 15:18:43 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
19/03/01 15:18:43 INFO MemoryStore: MemoryStore cleared
19/03/01 15:18:43 INFO BlockManager: BlockManager stopped
19/03/01 15:18:43 INFO BlockManagerMaster: BlockManagerMaster stopped
19/03/01 15:18:43 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
19/03/01 15:18:43 INFO SparkContext: Successfully stopped SparkContext
19/03/01 15:18:43 INFO ShutdownHookManager: Shutdown hook called
19/03/01 15:18:43 INFO ShutdownHookManager: Deleting directory C:\Users\Micro Center\AppData\Local\Temp\spark-f691e6e4-9080-427e-ae2d-ca448411bce8