Несколько левых объединений не работают, как ожидалось в Spark 2.0 (Scala) - PullRequest
0 голосов
/ 14 октября 2019

У меня есть датафрейм из БД

val listvaluesDF = spark.sqlContext.read.format("jdbc").option("driver" , "com.microsoft.sqlserver.jdbc.SQLServerDriver").option("url", "jdbc:sqlserver://azure.cloud.acme.com:14481;databaseName=dbadmin3").option("dbtable", "(select distinct [key], value, internal from dbadmin3.V_LIST_VALUES where internal in ('year', 'wmt0SBU', 'wmt0Department', 'wmt0DeptCategory', 'wmt0DotcomOnly', 'wmt0WalmartWeek', 'wmt0SetWeek', 'wmt0Event', 'wmt1Qtr','seasonType')) tmp").option("user", "aaa").option("password", "xxx").load()

, разделенный на несколько фреймов данных

listvaluesDF.createOrReplaceTempView("listvaluesDF")

var dfYear = spark.sql("select key, value from listvaluesDF where internal = 'year'")
var dfSBU = spark.sql("select key, value from listvaluesDF where internal = 'wmt0SBU'")
var dfDept = spark.sql("select key, value from listvaluesDF where internal = 'wmt0Department'")
var dfDeptCategory = spark.sql("select key, value from listvaluesDF where internal = 'wmt0DeptCategory'")
var dfDotcom = spark.sql("select key, value from listvaluesDF where internal = 'wmt0DotcomOnly'")
var dfWalmartWeek = spark.sql("select key, value from listvaluesDF where internal = 'wmt0WalmartWeek'")
var dfSetWeek = spark.sql("select key, value from listvaluesDF where internal = 'wmt0SetWeek'")
var dfEvent = spark.sql("select key, value from listvaluesDF where internal = 'wmt0Event'")
var dfQtr = spark.sql("select key, value from listvaluesDF where internal = 'wmt1Qtr'")
var dfseasonType = spark.sql("select key, value from listvaluesDF where internal = 'seasonType'")

, и я выполняю множественное левое соединение с Main DF, как

val seasonFinalDF = seasonsDF.alias("seasonsDF").join(paletteDF.alias("primaryPalette"), seasonsDF("primaryPalette") === paletteDF("id"), "left_outer").join(flextypeDF.alias("SBU"), seasonsDF("hierarchy") === flextypeDF("key"), "left_outer").join(dfYear.alias("fiscalYearEnding"), seasonsDF("fiscalYearEnding") === dfYear("key"), "left_outer").join(dfSBU.alias("SBU"), seasonsDF("SBU") === dfSBU("key"), "left_outer").join(dfDept.alias("department"), seasonsDF("department") === dfDept("key"), "left_outer").join(dfDeptCategory.alias("dept_Category"), seasonsDF("dept_Category") === dfDeptCategory("key"), "left_outer").join(dfDotcom.alias("dotcomOnly"), seasonsDF("dotcomOnly") === dfDotcom("key"), "left_outer").join(dfseasonType.alias("type"), seasonsDF("type") === dfseasonType("key"), "left_outer").join(dfWalmartWeek.alias("walmartWeek"), seasonsDF("walmartWeek") === dfWalmartWeek("key"), "left_outer").join(dfSetWeek.alias("setWeek"), seasonsDF("setWeek") === dfSetWeek("key"), "left_outer").join(dfEvent.alias("event"), seasonsDF("event") === dfEvent("key"), "left_outer").join(dfQtr.alias("quarter"), seasonsDF("quarter") === dfQtr("key"), "left_outer").select("seasonsDF.seasonMasterID","seasonsDF.seasonName","fiscalYearEnding.value","SBU.value","department.value","dept_Category.value","dotcomOnly.value","seasonsDF.active","type.value","seasonsDF.createdDate","seasonsDF.createdBy","seasonsDF.updatedDate","seasonsDF.modifiedBy","seasonsDF.seasonId","seasonsDF.flexID","primaryPalette.paletteName","walmartWeek.value","setWeek.value","event.value","quarter.value","hierarchy.DisplayName").toDF("seasonMasterID","seasonName","fiscalYearEnding","SBU","department","dept_Category","dotcomOnly","active","type","createdDate","createdBy","updatedDate","modifiedBy","seasonId","flexID","primaryPalette","walmartWeek","setWeek","event","quarter","hierarchy")

и, наконец, получим результат = DF, как

scala> seasonFinalDF.printSchema
root
 |-- seasonMasterID: long (nullable = true)
 |-- seasonName: string (nullable = true)
 |-- fiscalYearEnding: string (nullable = true)
 |-- SBU: string (nullable = true)
 |-- department: string (nullable = true)
 |-- dept_Category: string (nullable = true)
 |-- dotcomOnly: string (nullable = true)
 |-- active: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- createdDate: timestamp (nullable = true)
 |-- createdBy: long (nullable = true)
 |-- updatedDate: timestamp (nullable = true)
 |-- modifiedBy: long (nullable = true)
 |-- seasonId: long (nullable = true)
 |-- flexID: string (nullable = true)
 |-- primaryPalette: string (nullable = true)
 |-- walmartWeek: string (nullable = true)
 |-- setWeek: string (nullable = true)
 |-- event: string (nullable = true)
 |-- quarter: string (nullable = true)

Теперь, в этом случае все левые объединения становятся нулевыми, кроме первого, когда я делаю объяснение, они присоединяются к одному и тому же sql несколько раз

Я не уверен, где и как все идет не так, может ли кто-нибудь помочь найти правильный путь к левому соединению

...