GitHub: https://github.com/mvasyliv/LearningSpark/blob/master/src/main/scala/spark/GroupListValueToColumn.scala
- исходный код {package spark
import org. apache .spark. sql .SparkSession import org. apache .spark. sql .functions ._
объект GroupListValueToColumn extends App {
val spark = SparkSession.builder () .master ("local") .appName ("Mapper") .getOrCreate ()
case case Customer ( cust_id: Int, addresstype: String)
import spark.implicits ._
val source = Seq (Customer (300312008, "credit_card"), Customer (300312008, "to"), Customer (300312008, "от"), Клиент (300312009, "до"), Клиент (300312009, "от"), Клиент (300312010, "до"), Клиент (300312010, "credit_card"), Клиент (300312010, "от ")) .toDF ()
val res = source.groupBy (" cust_id "). agg (collect_list (" addresstype "))
res.show (false) // + - -------- + ------------------------- + // | cust_id | collect_list (адресный тип) | // + --------- + ------------------------- + // | 300312010 | [to, credit_card, from ] | // | 300312008 | [credit_card, to, from] | // | 300312009 | [to, from] | // + --------- + ------------------------- +
val res1 = source. groupBy ("cust_id"). agg (collect_set ("addresstype"))
res1.show (false)
// + --------- + --- --------------------- + // | cust_id | collect_set (адресный тип) | // + --------- + ------------------------ + // | 300312010 | [от, до, кредитная карта] | // | 300312008 | [от, до, кредитная карта] | // | 300312009 | [от, до] | // + --------- + ------------------------ +}
}