I am trying to implement Jaccard similarity using the MinHashLSH technique described in Spark MLlib. I have a DataFrame of customers and items. I am getting incorrect results: the similarity join only returns pairs with a Jaccard distance of 0.0 (each customer matched with itself). What am I doing wrong?
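For reference, this is the Jaccard distance I expect between my two customers, computed by hand in plain Python (just a sanity check on the toy data I build below; the set literals are the item sets of CUST_ID 1 and CUST_ID 2):

# Item sets taken from the toy data used below
items_cust1 = {1, 2}         # items of CUST_ID 1
items_cust2 = {1, 2, 3}      # items of CUST_ID 2

# Jaccard similarity = |intersection| / |union|; distance = 1 - similarity
similarity = len(items_cust1 & items_cust2) / len(items_cust1 | items_cust2)
distance = 1 - similarity
print(distance)              # 0.333..., so I expect the pair (1, 2) to show up

Here is my full code: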
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler
df = sc.parallelize([
    Row(CUST_ID=1, ITEM_ID=1),
    Row(CUST_ID=1, ITEM_ID=2),
    Row(CUST_ID=2, ITEM_ID=1),
    Row(CUST_ID=2, ITEM_ID=2),
    Row(CUST_ID=2, ITEM_ID=3)
]).toDF()
dfpivot=(df
.groupBy("CUST_ID").pivot("ITEM_ID").count().na.fill(0)
)
input_cols = [x for x in dfpivot.columns if x !="CUST_ID"]
dfassembler = (VectorAssembler(inputCols=input_cols, outputCol="features")
    .transform(dfpivot)
    .select("CUST_ID", "features"))
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)
model = mh.fit(dfassembler)
# Feature Transformation
print("The hashed dataset where hashed values are stored in the column
'хэшей': ")
model.transform (dfassembler) .show (3, False)
# self-join: compare every customer with every other customer
dfA = dfassembler
dfB = dfassembler
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.3, distCol="JaccardDistance")\
.select(col("datasetA.CUST_ID").alias("idA"),
col("datasetB.CUST_ID").alias("idB"),
col("JaccardDistance")).show()
Approximately joining dfA and dfB on distance smaller than 0.3:
+---+---+---------------+
|idA|idB|JaccardDistance|
+---+---+---------------+
|  1|  1|            0.0|
|  2|  2|            0.0|
+---+---+---------------+