Я пытаюсь реализовать в Pyspark пример из книги High Performance Spark. Здесь есть простая концепция:
У вас есть такой набор данных:
# Sample input: (key, report-card text) pairs; the key 'adam' occurs twice.
test_data = [
    ('adam', 'My cutest panda is so happy it literally shits happiness from the inside'),
    ('magda', 'my panda is not so happy, as it requires a lot of attention and likes whenISaySoLongWordsItsCrazy'),
    ('jacek', 'I dont like pandas, cause I am dumb'),
    ('maciek', 'WTF happy happy happy happy'),
    ('adam', 'this is me'),
]
И вы хотите посчитать для каждого ключа несколько вещей: количество упоминаний слова «happy», длину самого длинного слова и среднее количество слов в одной записи.
Для этого подготовлен специальный класс:
from __future__ import annotations
from dataclasses import dataclass
from pyspark import RDD, SparkConf, SparkContext
class MetricsCalculator:
    """Mutable accumulator of word statistics over a group of report cards.

    ``seq_op`` folds one report-card text into the running totals,
    ``comp_op`` merges another accumulator into this one, and
    ``to_report_card`` converts the totals into a ``ReportCardMetrics``.
    Both ``seq_op`` and ``comp_op`` mutate and return ``self``.
    """

    def __init__(self, start_words: int, longest_word: int, happy_mentions: int):
        """Seed the running totals.

        Args:
            start_words: Initial total word count.
            longest_word: Initial length of the longest word seen.
            happy_mentions: Initial count of words equal to ``'happy'``.
        """
        self.total_words = start_words
        self.longest_word = longest_word
        self.happy_mentions = happy_mentions
        # Number of texts folded in so far; the divisor for the average.
        self.number_report_cards = 0

    def seq_op(self, report_card: str) -> "MetricsCalculator":
        """Fold one report-card text into this accumulator.

        Words are whitespace-separated tokens. Empty tokens (from an empty
        text or from repeated separators) are not counted as words.

        Args:
            report_card: The text to analyse.

        Returns:
            This accumulator, updated in place.
        """
        words = report_card.split()
        self.total_words += len(words)
        # Exact match only: a token such as 'happy,' is not a mention.
        self.happy_mentions += words.count('happy')
        self.longest_word = max(self.longest_word,
                                max(map(len, words), default=0))
        self.number_report_cards += 1
        return self

    def comp_op(self, other: "MetricsCalculator") -> "MetricsCalculator":
        """Merge the totals of ``other`` into this accumulator.

        Args:
            other: Accumulator whose totals are added; it is not modified.

        Returns:
            This accumulator, updated in place.
        """
        self.total_words += other.total_words
        self.longest_word = max(self.longest_word, other.longest_word)
        self.happy_mentions += other.happy_mentions
        self.number_report_cards += other.number_report_cards
        return self

    def to_report_card(self) -> "ReportCardMetrics":
        """Build the final metrics record from the accumulated totals.

        Returns:
            A ``ReportCardMetrics`` holding the longest word length, the
            number of 'happy' mentions and the average number of words per
            report card (0.0 when no report card has been folded in).
        """
        if self.number_report_cards:
            average_words = self.total_words / self.number_report_cards
        else:
            # Nothing aggregated yet: avoid ZeroDivisionError.
            average_words = 0.0
        return ReportCardMetrics(self.longest_word,
                                 self.happy_mentions,
                                 average_words)
и класс данных:
@dataclass
class ReportCardMetrics:
    """Final statistics record produced by ``MetricsCalculator.to_report_card``."""

    # Length of the longest word seen.
    longest_word: int
    # Number of words equal to 'happy'.
    happy_mentions: int
    # Total word count divided by the number of report cards.
    average_words: float
Все это используется внутри метода aggregateByKey:
def calculate_report_card_statistics(
        rdd: "RDD[tuple[str, str]]") -> "RDD[tuple[str, ReportCardMetrics]]":
    """Aggregate report-card texts per key into ``ReportCardMetrics``.

    Each key gets a ``MetricsCalculator`` accumulator: ``seq_op`` folds the
    texts of a key into it, ``comp_op`` merges accumulators, and the merged
    accumulator is finally converted with ``to_report_card`` so that the
    result matches the declared return type.

    Args:
        rdd: Pair RDD of (key, report-card text).

    Returns:
        Pair RDD of (key, ReportCardMetrics).

    NOTE(review): the traceback quoted in this post shows the call failing
    with a PicklingError while Spark serializes the job. Presumably this is
    because the classes are defined in ``__main__``; moving them to an
    importable module is the usual remedy -- confirm.
    """
    aggregated = rdd.aggregateByKey(
        MetricsCalculator(0, 0, 0),
        lambda report_card_metrics, report_card_text:
            report_card_metrics.seq_op(report_card_text),
        lambda x, y: x.comp_op(y),
    )
    # Convert the accumulators into the result records promised by the signature.
    return aggregated.mapValues(lambda metrics: metrics.to_report_card())
# Driver script: build the sample pairs, create a Spark context and print the
# aggregated statistics per key.
test_data = [
    ('adam', 'My cutest panda is so happy it literally shits happiness from the inside'),
    ('magda', 'my panda is not so happy, as it requires a lot of attention and likes whenISaySoLongWordsItsCrazy'),
    ('jacek', 'I dont like pandas, cause I am dumb'),
    ('maciek', 'WTF happy happy happy happy'),
    ('adam', 'to ja'),
]

conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("WordApp")
)
sc = SparkContext(conf=conf)

# Distribute the sample over 4 partitions.
rdd = sc.parallelize(test_data, 4)
print(
    calculate_report_card_statistics(rdd).collect()
)
И сообщение об ошибке:
Traceback (most recent call last):
File "/home/adam/dev/Python and Other/courses/pyspark/optimization/my_aggr_by_key.py", line 68, in <module>
print(calculate_report_card_statistics(rdd).collect())
File "/home/adam/dev/Python and Other/courses/pyspark/optimization/my_aggr_by_key.py", line 54, in calculate_report_card_statistics
report_card_text), lambda x, y: x.comp_op(y))
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 1889, in aggregateByKey
lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions, partitionFunc)
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 1865, in combineByKey
shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 1802, in partitionBy
keyed._jrdd.rdd()).asJavaPairRDD()
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 2532, in _jrdd
self._jrdd_deserializer, profiler)
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 2434, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/rdd.py", line 2420, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/home/adam/dev/Python and Other/courses/pyspark/venv/lib/python3.7/site-packages/pyspark/serializers.py", line 607, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: TypeError: can't pickle mappingproxy objects
Process finished with exit code 1
Похоже, что PySpark не принимает объект пользовательского класса в качестве нулевого значения (zeroValue). Я даже пробовал более простой пример:
# Minimal reproduction input: (key, int) pairs; the key 'adam' occurs twice.
test_data2 = [
    ('adam', 5),
    ('magda', 1),
    ('jacek', 4),
    ('maciek', 9),
    ('adam', 4),
]

# Distribute the pairs over 2 partitions.
rdd2 = sc.parallelize(test_data2, 2)
class TestClass:
    """Minimal mutable accumulator wrapping a single ``value``."""

    def __init__(self, val):
        """Store ``val`` as the starting ``value``."""
        self.value = val

    def add(self, x):
        """Add the raw value ``x`` in place and return this object."""
        self.value += x
        return self

    def add_comp(self, x):
        """Add the ``value`` of another accumulator ``x`` in place and return this object."""
        self.value += x.value
        return self
# Aggregate per key starting from TestClass(5): ``add`` folds in each value,
# ``add_comp`` merges two accumulators.
print(
    rdd2.aggregateByKey(
        TestClass(5),
        lambda test_object, y: test_object.add(y),
        lambda obj1, obj2: obj1.add_comp(obj2),
    ).collect()
)
Но он выдаёт другую ошибку (хотя тоже связанную с pickle):
_pickle.PicklingError: Can't pickle <class '__main__.TestClass'>: attribute lookup TestClass on __main__ failed