PySpark: java.lang.NoSuchMethodError: net.jpountz.lz4.LZ4BlockInputStream.<init>(Ljava/io/InputStream;Z)V
0 votes
29 October 2018

This question has no answer for PySpark specifically, so I am asking it again. I am running the simple wordcount.py example that ships with the Spark download, using PySpark.

The code is below. I ran mvn clean install, followed this suggestion, and added the following dependency to my pom.xml in the Spark examples folder, then ran mvn install again.

<dependency>
  <groupId>net.jpountz.lz4</groupId>
  <artifactId>lz4</artifactId>
  <version>1.3.0</version>
</dependency>
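
Since the Maven steps above are all about getting a compatible lz4 on the classpath, one quick sanity check is to ask the JVM which jar net.jpountz.lz4.LZ4BlockInputStream is actually loaded from. This is only a minimal sketch, assuming it is run from a pyspark shell where a SparkSession named spark already exists:

# Sketch: use the py4j gateway to resolve the class and print the jar it
# came from. An old net.jpountz lz4 1.3.x jar here would be consistent with
# the missing (InputStream, boolean) constructor in the error below.
jvm = spark.sparkContext._jvm
lz4_cls = jvm.java.lang.Class.forName("net.jpountz.lz4.LZ4BlockInputStream")
print(lz4_cls.getProtectionDomain().getCodeSource().getLocation())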
from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: wordcount <file>", file=sys.stderr)
        sys.exit(-1)

    spark = SparkSession\
        .builder\
        .appName("PythonWordCount")\
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    spark.stop()

The error I get:

 2018-10-29 15:19:01 ERROR Utils:91 - Uncaught exception in thread stdout writer for python
java.lang.NoSuchMethodError: net.jpountz.lz4.LZ4BlockInputStream.<init>(Ljava/io/InputStream;Z)V
    at org.apache.spark.io.LZ4CompressionCodec.compressedInputStream(CompressionCodec.scala:122)
    at org.apache.spark.serializer.SerializerManager.wrapForCompression(SerializerManager.scala:163)
    at org.apache.spark.serializer.SerializerManager.wrapStream(SerializerManager.scala:124)
    at org.apache.spark.shuffle.BlockStoreShuffleReader$$anonfun$2.apply(BlockStoreShuffleReader.scala:50)
    at org.apache.spark.shuffle.BlockStoreShuffleReader$$anonfun$2.apply(BlockStoreShuffleReader.scala:50)
    at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:417)
    at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:61)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:204)
    at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:407)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:215)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:170)
Exception in thread "stdout writer for python" java.lang.NoSuchMethodError: net.jpountz.lz4.LZ4BlockInputStream.<init>(Ljava/io/InputStream;Z)V
    at org.apache.spark.io.LZ4CompressionCodec.compressedInputStream(CompressionCodec.scala:122)
    at org.apache.spark.serializer.SerializerManager.wrapForCompression(SerializerManager.scala:163)
    at org.apache.spark.serializer.SerializerManager.wrapStream(SerializerManager.scala:124)
    at org.apache.spark.shuffle.BlockStoreShuffleReader$$anonfun$2.apply(BlockStoreShuffleReader.scala:50)
    at org.apache.spark.shuffle.BlockStoreShuffleReader$$anonfun$2.apply(BlockStoreShuffleReader.scala:50)
    at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:417)
    at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:61)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
    at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:204)
    at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:407)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread$$anonfun$run$1.apply(PythonRunner.scala:215)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1988)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:170)
^C2018-10-29 15:19:10 INFO  SparkContext:54 - Invoking stop() from shutdown hook
Traceback (most recent call last):
  File "/usr/local/Cellar/spark-2.3.0/spark/examples/src/main/python/wordcount.py", line 40, in <module>
    output = counts.collect()
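
The stack trace shows the failure happens when the shuffle blocks produced by reduceByKey are read back and decompressed (BlockStoreShuffleReader -> SerializerManager.wrapStream -> LZ4CompressionCodec), i.e. the lz4 class loaded at runtime does not have the constructor Spark 2.3 expects. As a purely diagnostic sketch (not a fix), switching the compression codec away from lz4 via spark.io.compression.codec shows whether the conflicting lz4 jar is the culprit; "snappy" here is just an illustrative alternative value:

# Diagnostic sketch: build the session with a non-lz4 compression codec to
# check whether the failure is tied to the lz4 jar on the classpath.
# "snappy" only sidesteps the error; it does not resolve the underlying
# dependency conflict.
spark = SparkSession \
    .builder \
    .appName("PythonWordCount") \
    .config("spark.io.compression.codec", "snappy") \
    .getOrCreate()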

1 Answer

0 votes
02 November 2018

I upgraded Spark to 2.3.2 and the error appears to be fixed.

Example:

scala> val lines = sc.parallelize(Array(('a', 1), ('a', 1), ('b', 1)))
lines: org.apache.spark.rdd.RDD[(Char, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24

scala> val y = lines.reduceByKey((x, y) => (x + y))
y: org.apache.spark.rdd.RDD[(Char, Int)] = ShuffledRDD[1] at reduceByKey at <console>:25

scala> y.collect()
res0: Array[(Char, Int)] = Array((a,2), (b,1))

It works!
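
For reference, a PySpark counterpart of the Scala check above (a sketch to run in a pyspark shell after upgrading) exercises the same reduceByKey shuffle path:

# Sketch: PySpark equivalent of the spark-shell verification above.
pairs = spark.sparkContext.parallelize([('a', 1), ('a', 1), ('b', 1)])
print(pairs.reduceByKey(lambda x, y: x + y).collect())
# expected (order may vary): [('a', 2), ('b', 1)]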
