Я не до конца понял, какие у вас входные данные, но надеюсь, что следующие примеры помогут.
Пример 1:
import org.apache.spark.sql.functions._
import sqlContext.implicits._
// First sample table: key column k1 plus three value columns (v1, v2, v3).
val df1 = Seq(
  ("foo", "bar", "too", "aaa"),
  ("bar", "bar", "aaa", "foo"),
  ("aaa", "bbb", "ccc", "ddd")
).toDF("k1", "v1", "v2", "v3")
// Second sample table: same key column k1, but only two value columns (v1, v2).
val df2 = Seq(
  ("aaa", "bbb", "ddd"),
  ("www", "eee", "rrr"),
  ("jjj", "rrr", "www")
).toDF("k1", "v1", "v2")
// Disabled: tagging every row with a generated id. df1 and df2 are vals,
// so these reassignments would not compile as written if uncommented.
// df1 = df1.withColumn("id", monotonically_increasing_id())
// df2 = df2.withColumn("id", monotonically_increasing_id())
df1.show()
df2.show()
// Outer join on the shared key k1: a key found in only one table is kept,
// with nulls in the columns coming from the other table.
// Any other join type ("inner", "left", "right", ...) can be used here instead,
// depending on your requirements.
// Note that v1 and v2 exist in both inputs, so both copies appear in df3.
val df3 = df2.join(df1, Seq("k1"), "outer")
df3.show()
Результаты:
+---+---+---+---+
| k1| v1| v2| v3|
+---+---+---+---+
|foo|bar|too|aaa|
|bar|bar|aaa|foo|
|aaa|bbb|ccc|ddd|
+---+---+---+---+
+---+---+---+
| k1| v1| v2|
+---+---+---+
|aaa|bbb|ddd|
|www|eee|rrr|
|jjj|rrr|www|
+---+---+---+
+---+----+----+----+----+----+
| k1| v1| v2| v1| v2| v3|
+---+----+----+----+----+----+
|jjj| rrr| www|null|null|null|
|aaa| bbb| ddd| bbb| ccc| ddd|
|bar|null|null| bar| aaa| foo|
|foo|null|null| bar| too| aaa|
|www| eee| rrr|null|null|null|
+---+----+----+----+----+----+
import org.apache.spark.sql.functions._
import sqlContext.implicits._
df1: org.apache.spark.sql.DataFrame = [k1: string, v1: string ... 2 more fields]
df2: org.apache.spark.sql.DataFrame = [k1: string, v1: string ... 1 more field]
df3: org.apache.spark.sql.DataFrame = [k1: string, v1: string ... 4 more fields]
Пример 2:
import org.apache.spark.sql.functions._
import sqlContext.implicits._
// Two small key/value tables that share the same keys ("1001" and "2001").
val df12 = sc.parallelize(Seq(
  ("1001", "vaquar"),
  ("2001", "khan1")
)).toDF("Key", "Value")
val df22 = sc.parallelize(Seq(
  ("1001", "Noman"),
  ("2001", "khan2")
)).toDF("Key", "Value")
df12.show()
df22.show()
// Left outer join on Key: every row of df22 is kept and the matching Value
// from df12 is appended, so the result carries two columns named Value.
val df33 = df22.join(df12, Seq("Key"), "left_outer")
df33.show()
Результаты:
+----+------+
| Key| Value|
+----+------+
|1001|vaquar|
|2001| khan1|
+----+------+
+----+-----+
| Key|Value|
+----+-----+
|1001|Noman|
|2001|khan2|
+----+-----+
+----+-----+------+
| Key|Value| Value|
+----+-----+------+
|2001|khan2| khan1|
|1001|Noman|vaquar|
+----+-----+------+
import org.apache.spark.sql.functions._
import sqlContext.implicits._
rdd1: org.apache.spark.sql.DataFrame = [Key: string, Value: string]
df12: org.apache.spark.sql.DataFrame = [Key: string, Value: string]
df22: org.apache.spark.sql.DataFrame = [Key: string, Value: string]
df33: org.apache.spark.sql.DataFrame = [Key: string, Value: string ... 1 more field]