Spark: как использовать подзапрос в качестве столбца? - PullRequest
0 голосов
/ 10 февраля 2020

Spark: как использовать подзапрос в качестве столбца?

Caused by: java.lang.RuntimeException: Couldn't find count(DISTINCT orderid)#69L in [storeid#4,combox_pid#6,pid#7,count(1)#61L]

select b.pid, (select count(distinct orderid) from a where a.pid = b.pid) as order_num, count(1) from b as b group by b.pid

import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SparkSession}

/**
 * Minimal local Spark job that reproduces the "scalar subquery as a column" question.
 *
 * It builds a six-row in-memory table (`tab_tmp`) with the string columns
 * `storeid`, `orderid`, `combox_pid` and `pid`, then shows how to obtain, for every
 * `(storeid, combox_pid, pid)` group, both the row count and the number of distinct
 * `orderid` values that share the same `pid`.
 */
object OrderCountTset {

  /**
   * Entry point: runs every query against a local Spark session and prints the results.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    // Use Spark's own API to lower the log level: it needs no logging-framework import
    // and works regardless of which log4j version the Spark distribution bundles.
    spark.sparkContext.setLogLevel("WARN")

    try {
      val data = Seq(
        Row("a", "100", "200", "300"),
        Row("a", "100", "200", "300"),
        Row("a", "101", "201", "300"),
        Row("a", "101", "200", "3001"),
        Row("a", "102", "200", "300"),
        Row("a", "103", "201", "300")
      )
      val schema = new StructType()
        .add("storeid", StringType)
        .add("orderid", StringType)
        .add("combox_pid", StringType)
        .add("pid", StringType)

      val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
      df.show()
      df.createOrReplaceTempView("tab_tmp")

      // Plain aggregation without any subquery.
      spark.sql("select storeid,orderid,combox_pid,pid,count(distinct orderid ),count(1) from tab_tmp group by storeid,orderid,combox_pid,pid").show()

      // Kept for reference: the single-statement form that mixes the correlated scalar
      // subquery with GROUP BY. This is the variant the question reports as failing.
      //    spark.sql(
      //      """
      //        |select b.storeid,b.combox_pid,b.pid,
      //        |(select count(distinct a.orderid) from tab_tmp as a where a.pid=b.pid) as order_num,
      //        |count(1)
      //        | from tab_tmp as b group by b.storeid,b.combox_pid,b.pid
      //      """.stripMargin).show()

      // Step 1: materialise the grouped row counts as their own view ...
      spark.sql(
        """
          |select storeid,combox_pid,pid,count(1) as num
          | from tab_tmp group by storeid,combox_pid,pid
        """.stripMargin).createOrReplaceTempView("tab_tmp_2")

      // Step 2: ... then attach the per-pid distinct order count via a correlated
      // scalar subquery over the already-aggregated view.
      spark.sql(
        """
          |select b.storeid,b.combox_pid,b.pid,num,
          |(select count(distinct a.orderid) from tab_tmp as a where a.pid=b.pid) as order_num
          | from tab_tmp_2 as b
        """.stripMargin).show()
    } finally {
      // Release the local Spark context even when one of the queries throws.
      spark.stop()
    }
  }

}

1 Ответ

0 голосов
/ 10 февраля 2020

Проблема в самом SQL-запросе, а не в подзапросах Spark. Замените ваш последний запрос на приведённый ниже запрос Spark SQL.

Если вы хотите получить количество различных orderid независимо от pid, используйте запрос ниже:

spark.sql(
     |       """
     |         |select b.storeid,b.combox_pid,b.pid,num,
     |         |(select count(distinct a.orderid) from tab_tmp as a join tab_tmp_2 as b on a.pid=b.pid) as order_num
     |         | from tab_tmp_2 as b
     |       """.stripMargin).show()
+-------+----------+----+---+---------+
|storeid|combox_pid| pid|num|order_num|
+-------+----------+----+---+---------+
|      a|       200| 300|  3|        4|
|      a|       201| 300|  2|        4|
|      a|       200|3001|  1|        4|
+-------+----------+----+---+---------+

Или, если вы хотите получить количество различных orderid для каждого pid, используйте запрос ниже:

scala> spark.sql("select a.pid, count(distinct a.orderid) as order_num from tab_tmp as a join tab_tmp_2 as b on a.pid=b.pid group by a.pid").createOrReplaceTempView("tab_tmp_3")

scala> spark.sql(
     |       """
     |         |select b.storeid,b.combox_pid,b.pid, c.order_num
     |         | from tab_tmp_2 as b left join tab_tmp_3 c on b.pid = c.pid
     |       """.stripMargin).show()
+-------+----------+----+---------+
|storeid|combox_pid| pid|order_num|
+-------+----------+----+---------+
|      a|       200| 300|        4|
|      a|       201| 300|        4|
|      a|       200|3001|        1|
+-------+----------+----+---------+
...