В Spark: как извлечь массив из структуры и создать из него новое поле - PullRequest
1 голос
/ 04 августа 2020

У меня есть структура со схемой:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)

Теперь я хочу вынести product_color в отдельный столбец верхнего уровня, поэтому добавляю в DataFrame новый столбец, например:

  df.withColumn("product_color", col(currentNode + "." + fieldName))

С новым столбцом схема:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- color: string (nullable = true)
 |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |-- products_id: long (nullable = true)

Если посмотреть на схему нового столбца product_color, видно, что в ней появился лишний уровень вложенности — элемент типа array:

|-- element: array (containsNull = true)

Помогите, пожалуйста, понять, как создать новый столбец с точно такой же схемой, какая у product_color внутри структуры products.

Ожидаемая схема:

root
 |-- id: long (nullable = true)
 |-- products: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- product_color: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: integer (nullable = true)
 |    |    |    |    |-- color: string (nullable = true)
 |    |    |    |    |-- created_at: long (nullable = true)
 |    |    |    |    |-- updated_at: long (nullable = true)
 |    |    |    |    |-- products_id: long (nullable = true)
 |    |    |-- orders_id: long (nullable = true)
 |-- product_color: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- color: string (nullable = true)
 |    |    |-- created_at: long (nullable = true)
 |    |    |-- updated_at: long (nullable = true)
 |    |    |-- products_id: long (nullable = true)

Spark: 2.4.5 Язык: Scala

1 Ответ

2 голоса
/ 04 августа 2020

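Поскольку products — это массив, выражение products.product_color возвращает массив массивов. При добавлении нового столбца разверните этот массив с помощью explode, чтобы получить желаемую схему (учтите, что explode создаёт отдельную строку для каждого элемента products).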

Пример:

//sample df schema

df.printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)

df.withColumn("product_color",explode(col("products.product_color"))).printSchema
//root
// |-- id: long (nullable = true)
// |-- products: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- order_id: long (nullable = true)
// |    |    |-- product_color: array (nullable = true)
// |    |    |    |-- element: struct (containsNull = true)
// |    |    |    |    |-- color: string (nullable = true)
// |    |    |    |    |-- id: long (nullable = true)
// |    |    |    |    |-- products_id: long (nullable = true)
// |-- product_color: array (nullable = true)
// |    |-- element: struct (containsNull = true)
// |    |    |-- color: string (nullable = true)
// |    |    |-- id: long (nullable = true)
// |    |    |-- products_id: long (nullable = true)
...