Вы можете использовать cartesian(), а затем с помощью filter()
оставить только необходимый треугольник (пары, в которых первый ключ меньше второго),
например:
In []:
def calculate_distance(a, b):
    """Return a placeholder description of the distance between two items.

    This is a stand-in for a real metric: it performs no numeric
    computation and only formats both arguments into the text
    ``d(<a>, <b>)`` so the pairing produced by the pipeline is visible.

    Args:
        a: First operand; rendered with its ``str()`` form.
        b: Second operand; rendered with its ``str()`` form.

    Returns:
        The string ``'d(<a>, <b>)'``.
    """
    return f'd({a}, {b})'  # f-string syntax requires Python 3.6+
# Toy data set: each record is an (id, list of items) tuple.
rdd = sc.parallelize([
    (1, ['a', 'b', 'c']),
    (2, ['c', 'd', 'e']),
    (3, ['e', 'f', 'g']),
])


def _is_upper_triangle(pair):
    """Return True when the left record's id is strictly below the right's."""
    left, right = pair
    return left[0] < right[0]


def _to_distance_row(pair):
    """Map a pair of records to (left id, right id, distance of their items)."""
    left, right = pair
    return (left[0], right[0], calculate_distance(left[1], right[1]))


# Pair every record with every record, then drop self-pairs and mirrored
# duplicates so each unordered pair is evaluated exactly once.
all_pairs = rdd.cartesian(rdd)
triangle_pairs = all_pairs.filter(_is_upper_triangle)
triangle_pairs.map(_to_distance_row).collect()
Out[]:
[(1, 2, "d(['a', 'b', 'c'], ['c', 'd', 'e'])"),
(1, 3, "d(['a', 'b', 'c'], ['e', 'f', 'g'])"),
(2, 3, "d(['c', 'd', 'e'], ['e', 'f', 'g'])")]