Для этого можно просто использовать UDF (User Defined Function — пользовательскую функцию):
после соединения (join) двух DataFrame примените её к столбцам `ref`, как показано ниже:
import sparkSession.sqlContext.implicits._
// Sample inputs: both DataFrames share the columns (id, name, type, ref).
// They differ only in `ref` for ids 2, 3 and 6; "xx" appears as `ref` for id 2 in df1 and id 3 in df2.
val df1 = Seq(
  (1, "aaa", "a", "a1"),
  (2, "bbb", "b", "xx"),
  (3, "ccc", "c", "c3"),
  (4, "ddd", "d", "d4"),
  (6, "fff", "f", "f6")
).toDF("id", "name", "type", "ref")

val df2 = Seq(
  (1, "aaa", "a", "a1"),
  (2, "bbb", "b", "b2"),
  (3, "ccc", "c", "xx"),
  (4, "ddd", "d", "d4"),
  (6, "fff", "f", "f66")
).toDF("id", "name", "type", "ref")
/**
 * UDF that describes the difference between two `ref` values.
 *
 * Returns the string `"<ref1> -> <ref2>"` when the two values differ and
 * neither of them is the placeholder `"xx"`; otherwise returns `null`,
 * which Spark surfaces as SQL NULL in the resulting column.
 *
 * Null inputs (SQL NULL in either column) are handled symmetrically: a NULL
 * on one side and a value on the other counts as a difference, two NULLs
 * count as equal. The previous `ref1.equals(ref2)` threw a
 * NullPointerException whenever the left value was NULL.
 */
val diffCondition: UserDefinedFunction = udf {
  (ref1: String, ref2: String) =>
    val placeholder = "xx"
    // Scala's `!=` on references is null-safe, unlike calling `.equals` on a
    // possibly-null receiver.
    val isRealDifference = ref1 != ref2 && ref1 != placeholder && ref2 != placeholder
    // `null` is intentional here: it is how a String-returning UDF emits SQL NULL,
    // which the caller filters on with "difference is not null".
    if (isRealDifference) s"$ref1 -> $ref2" else null
}
// Step 1: equi-join the two DataFrames on the shared key columns; the key columns
// appear once in the result, while both `ref` columns are kept.
val joinedRefs = df1.join(df2, Seq("id", "name", "type"))

// Step 2: compute the textual difference between the left and right `ref` values.
val withDifference = joinedRefs.withColumn("difference", diffCondition(df1("ref"), df2("ref")))

// Step 3: keep only the rows where a difference was reported and print them.
withDifference
  .where("difference is not null")
  .show()
Вывод:
+---+----+----+---+---+----------+
| id|name|type|ref|ref|difference|
+---+----+----+---+---+----------+
|  6| fff|   f| f6|f66| f6 -> f66|
+---+----+----+---+---+----------+