Implementing the naive Bayes algorithm in Scala: ParseCancellationException
Asked on March 1, 2019

I am implementing the naive Bayes algorithm in Scala, and when I run the code below I get an error at the line `val trainingDF=spark.createDataFrame(training).toDF("sentence","label")`. Please take a look at the code.

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.NaiveBayes
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
    import org.apache.spark.sql.SparkSession

    object Naive {

      // One labeled training example: a review sentence and its class (0.0 = negative, 1.0 = positive).
      case class Sentence(sentence: String, label: Double)

      def main(args: Array[String]): Unit = {

        System.setProperty("spark.sql.warehouse.dir", "file:///tmp/spark-warehouse")

        val spark = SparkSession
          .builder
          .appName("Movies Reviews")
          .config("spark.master", "local")
          .getOrCreate()

        // Prepare training documents: one review per line, labeled by source file.
        val neg = spark.sparkContext.textFile("file:///F:/KpIt Board/rt-neg.txt").repartition(4)
          .map(w => Sentence(w, 0.0))

        val pos = spark.sparkContext.textFile("file:///F:/KpIt Board/rt-pos.txt").repartition(4)
          .map(w => Sentence(w, 1.0))

        // Test documents as (file name without extension, file contents) pairs.
        val test = spark.sparkContext.wholeTextFiles("file:///F:/KpIt Board/Trump Tweets Graph/TrumpTweets.txt").repartition(4)
          .map { case (file, sentence) => (file.split("/").last.split("\\.")(0), sentence) }

        // val test = spark.sparkContext.wholeTextFiles("file:///F:/KpIt Board/rt-neg.txt").repartition(4)
        //   .map { case (file, sentence) => (file.split("/").last.split("\\.")(0), sentence) }

        val training = neg.union(pos)
        val trainingDF = spark.createDataFrame(training).toDF("sentence", "label")
        // Name the test columns so the Tokenizer finds "sentence" and the output keeps "file".
        val testDF = spark.createDataFrame(test).toDF("file", "sentence")

        // Configure an ML pipeline of three stages: tokenizer, hashingTF, and naive Bayes.
        val tokenizer = new Tokenizer()
          .setInputCol("sentence")
          .setOutputCol("words")
        val hashingTF = new HashingTF()
          .setInputCol(tokenizer.getOutputCol)
          .setOutputCol("features")
        val nb = new NaiveBayes()

        val pipeline = new Pipeline()
          .setStages(Array(tokenizer, hashingTF, nb))

        // Fit the pipeline to the training documents.
        val model = pipeline.fit(trainingDF)

        // Make predictions on the test documents and write them as a single TSV file.
        model.transform(testDF).repartition(1)
          .select("file", "prediction")
          .write.format("csv")
          .option("header", "true")
          .option("delimiter", "\t")
          .save("/tmp/spark-prediction")

        spark.stop()
      }
    }

and I get the following error:

    Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
    19/03/01 15:18:36 INFO SparkContext: Running Spark version 2.4.0
    19/03/01 15:18:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
    19/03/01 15:18:37 INFO SparkContext: Submitted application: Movies Reviews
    19/03/01 15:18:37 INFO SecurityManager: Changing view acls to: FaiZii
    19/03/01 15:18:37 INFO SecurityManager: Changing modify acls to: FaiZii
    19/03/01 15:18:37 INFO SecurityManager: Changing view acls groups to: 
    19/03/01 15:18:37 INFO SecurityManager: Changing modify acls groups to: 
    19/03/01 15:18:37 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(FaiZii); groups with view permissions: Set(); users  with modify permissions: Set(FaiZii); groups with modify permissions: Set()
    19/03/01 15:18:38 INFO Utils: Successfully started service 'sparkDriver' on port 53071.
    19/03/01 15:18:38 INFO SparkEnv: Registering MapOutputTracker
    19/03/01 15:18:38 INFO SparkEnv: Registering BlockManagerMaster
    19/03/01 15:18:38 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
    19/03/01 15:18:38 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
    19/03/01 15:18:38 INFO DiskBlockManager: Created local directory at C:\Users\Micro Center\AppData\Local\Temp\blockmgr-bbdbfc04-7072-4d47-8a76-fea33e471bb9
    19/03/01 15:18:38 INFO MemoryStore: MemoryStore started with capacity 900.6 MB
    19/03/01 15:18:38 INFO SparkEnv: Registering OutputCommitCoordinator
    19/03/01 15:18:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
    19/03/01 15:18:39 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
    19/03/01 15:18:39 INFO Utils: Successfully started service 'SparkUI' on port 4042.
    19/03/01 15:18:39 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://DESKTOP-SBMTUMB:4042
    19/03/01 15:18:39 INFO Executor: Starting executor ID driver on host localhost
    19/03/01 15:18:39 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 53080.
    19/03/01 15:18:39 INFO NettyBlockTransferService: Server created on DESKTOP-SBMTUMB:53080
    19/03/01 15:18:39 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
    19/03/01 15:18:39 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
    19/03/01 15:18:39 INFO BlockManagerMasterEndpoint: Registering block manager DESKTOP-SBMTUMB:53080 with 900.6 MB RAM, BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
    19/03/01 15:18:39 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
    19/03/01 15:18:39 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, DESKTOP-SBMTUMB, 53080, None)
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 214.6 KB, free 900.4 MB)
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 20.4 KB, free 900.4 MB)
    19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.4 KB, free: 900.6 MB)
    19/03/01 15:18:40 INFO SparkContext: Created broadcast 0 from textFile at Naive.scala:24
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 214.7 KB, free 900.2 MB)
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 20.4 KB, free 900.1 MB)
    19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.4 KB, free: 900.6 MB)
    19/03/01 15:18:40 INFO SparkContext: Created broadcast 1 from textFile at Naive.scala:27
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 215.1 KB, free 899.9 MB)
    19/03/01 15:18:40 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 20.5 KB, free 899.9 MB)
    19/03/01 15:18:40 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on DESKTOP-SBMTUMB:53080 (size: 20.5 KB, free: 900.5 MB)
    19/03/01 15:18:40 INFO SparkContext: Created broadcast 2 from wholeTextFiles at Naive.scala:30
    19/03/01 15:18:41 INFO FileInputFormat: Total input paths to process : 1
    19/03/01 15:18:41 INFO FileInputFormat: Total input paths to process : 1
    19/03/01 15:18:42 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir ('file:///tmp/spark-warehouse').
    19/03/01 15:18:42 INFO SharedState: Warehouse path is 'file:///tmp/spark-warehouse'.
    Exception in thread "main" java.lang.NoClassDefFoundError: org/antlr/v4/runtime/misc/ParseCancellationException
        at org.apache.spark.sql.internal.BaseSessionStateBuilder.sqlParser$lzycompute(BaseSessionStateBuilder.scala:117)
        at org.apache.spark.sql.internal.BaseSessionStateBuilder.sqlParser(BaseSessionStateBuilder.scala:116)
        at org.apache.spark.sql.internal.BaseSessionStateBuilder.build(BaseSessionStateBuilder.scala:292)
        at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$instantiateSessionState(SparkSession.scala:1104)
        at org.apache.spark.sql.SparkSession.$anonfun$sessionState$2(SparkSession.scala:146)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:144)
        at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:141)
        at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78)
        at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:300)
        at Naive$.main(Naive.scala:38)
        at Naive.main(Naive.scala)
    Caused by: java.lang.ClassNotFoundException: org.antlr.v4.runtime.misc.ParseCancellationException
        at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        ... 12 more
    19/03/01 15:18:43 INFO SparkContext: Invoking stop() from shutdown hook
    19/03/01 15:18:43 INFO SparkUI: Stopped Spark web UI at http://DESKTOP-SBMTUMB:4042
    19/03/01 15:18:43 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
    19/03/01 15:18:43 INFO MemoryStore: MemoryStore cleared
    19/03/01 15:18:43 INFO BlockManager: BlockManager stopped
    19/03/01 15:18:43 INFO BlockManagerMaster: BlockManagerMaster stopped
    19/03/01 15:18:43 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
    19/03/01 15:18:43 INFO SparkContext: Successfully stopped SparkContext
    19/03/01 15:18:43 INFO ShutdownHookManager: Shutdown hook called
    19/03/01 15:18:43 INFO ShutdownHookManager: Deleting directory C:\Users\Micro Center\AppData\Local\Temp\spark-f691e6e4-9080-427e-ae2d-ca448411bce8
    ...
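
Note that the stack trace points at `BaseSessionStateBuilder.sqlParser`: the error is raised while the `SparkSession` instantiates its session state, which happens on the first `createDataFrame` call, so the `trainingDF` line itself is not at fault. The missing class, `org.antlr.v4.runtime.misc.ParseCancellationException`, comes from the ANTLR 4 runtime that Spark's SQL parser depends on, which suggests the ANTLR jar is not on the application classpath. Below is a minimal sketch of a likely fix, assuming the project is built with sbt (the project name is hypothetical): declaring `spark-sql` and `spark-mllib` as ordinary dependencies pulls in `org.antlr:antlr4-runtime` transitively, and the runtime can also be pinned explicitly.

    // build.sbt -- a minimal sketch, assuming an sbt build targeting Spark 2.4.0.
    name := "naive-bayes-example" // hypothetical project name
    scalaVersion := "2.11.12"     // Spark 2.4.0 is published for Scala 2.11 and 2.12

    libraryDependencies ++= Seq(
      // spark-sql and spark-mllib transitively pull in org.antlr:antlr4-runtime,
      // which provides the missing ParseCancellationException class.
      "org.apache.spark" %% "spark-sql"   % "2.4.0",
      "org.apache.spark" %% "spark-mllib" % "2.4.0",
      // Pin the ANTLR runtime explicitly only if the Spark jars were added by hand
      // (e.g. individual jars on an IDE classpath) without their transitive deps:
      "org.antlr" % "antlr4-runtime" % "4.7"
    )

If the Spark jars were added manually in the IDE rather than through a build tool, adding `antlr4-runtime-4.7.jar` from the `jars/` directory of the Spark 2.4.0 distribution to the classpath should have the same effect.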