// Fills in the missing (org, id, step) combinations of `df`: every id of an org
// is paired with every step seen for that org, and combinations that have no
// row in `df` come back with null in the remaining columns.

// Projections of `df` used to build the per-org id x step combinations.
val l = df.select("org", "step")
val r = df.select("org", "id")

// Deduplicate both sides before joining on "org"; otherwise the join pairs every
// row with every row of the same org and yields many repeated triples.
val right = l.distinct().join(r.distinct(), "org")

// The right outer join keeps every generated combination even when `df` has no
// matching row. The trailing distinct() still guards against duplicate rows in `df`.
val result = df
  .join(right, Seq("org", "id", "step"), "right_outer")
  .distinct()
  .orderBy("org", "id", "step")

result.show()
Дает:
+---+---+----+-----+
|org| id|step|value|
+---+---+----+-----+
| 1| 1| 1| 12|
| 1| 1| 2| 13|
| 1| 1| 3| 14|
| 1| 1| 4| 15|
| 1| 2| 1| 16|
| 1| 2| 2| 17|
| 1| 2| 3| null|
| 1| 2| 4| null|
| 2| 1| 1| 1|
| 2| 1| 2| 2|
+---+---+----+-----+
Бонус: SQL-запрос к таблице (orgs), отражающей содержимое df:
-- Pairs every id of an org with every step seen for that org and attaches the
-- matching "value" from orgs; combinations without a matching row get NULL.
select distinct o_right."org", o_right."id", o_right."step", o_left."value"
from orgs as o_left
right outer join (
    -- All (org, id, step) combinations within each org. No ORDER BY here:
    -- sorting a derived table does not determine the order of the outer
    -- result, which is sorted by the final ORDER BY below.
    select o_in_left."org", o_in_right."id", o_in_left."step"
    from orgs as o_in_right
    join (select "org", "step" from orgs) as o_in_left
        on o_in_right."org" = o_in_left."org"
) as o_right
    on o_left."org" = o_right."org"
    and o_left."step" = o_right."step"
    and o_left."id" = o_right."id"
order by "org", "id", "step"