No matter what I do, I cannot get past this error when starting Apache Spark. I am running it through a Jupyter Notebook that points to an Anaconda environment.
I am new to Spark and have gone through many blog posts trying to fix this error, but nothing has helped.
Any help fixing it would be appreciated. I am using Java 8.
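(For completeness: a common way to point a Jupyter notebook at a Spark installation is findspark. This is only a minimal sketch, assuming the findspark package is installed in the Anaconda environment and SPARK_HOME points at the Spark install directory.)

import findspark
findspark.init()  # prepends the pyspark under $SPARK_HOME to sys.path before importing pyspark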
The code I am running:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
lines = sc.textFile("C:/User/PG186028/Spark.txt")  # RDD of lines from the text file
lines.count()  # this is the call that fails
The error generated after lines.count():
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-6-c98a4dbe0cd9> in <module>()
----> 1 lines.count()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in count(self)
1051 3
1052 """
-> 1053 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1054
1055 def stats(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in sum(self)
1042 6.0
1043 """
-> 1044 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1045
1046 def count(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in fold(self, zeroValue, op)
913 # zeroValue provided to each partition is unique from the one provided
914 # to the final reduce call
--> 915 vals = self.mapPartitions(func).collect()
916 return reduce(op, vals, zeroValue)
917
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in collect(self)
812 """
813 with SCCallSiteSync(self.context) as css:
--> 814 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
815 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
816
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in _jrdd(self)
2472 self._jrdd_deserializer, profiler)
2473 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
-> 2474 self.preservesPartitioning)
2475 self._jrdd_val = python_rdd.asJavaRDD()
2476
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1523 answer = self._gateway_client.send_command(command)
1524 return_value = get_return_value(
-> 1525 answer, self._gateway_client, None, self._fqn)
1526
1527 for temp_arg in temp_args:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 332 format(target_id, ".", name, value))
333 else:
334 raise Py4JError(
Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace:
py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.MapPartitionsRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist
at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
at py4j.Gateway.invoke(Gateway.java:237)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
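From the blogs I read, this particular Py4JException ("Constructor ... does not exist") is usually attributed to a version mismatch between the pip-installed pyspark package and the Spark JVM it talks to. Below is a minimal sketch for comparing the two versions (both pyspark.__version__ and sc.version are standard PySpark attributes):

import pyspark
from pyspark import SparkConf, SparkContext

print(pyspark.__version__)  # version of the pip-installed Python package (driver side)

# version reported by the Spark JVM the driver actually connects to
sc = SparkContext(conf=SparkConf().setMaster("local").setAppName("version check"))
print(sc.version)
sc.stop()

If the two versions differ, would aligning them (same pyspark package and Spark installation version) fix this error?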