Итак, вот один из возможных способов развернуть (unnest) все вложенные поля.
С помощью информации о схеме можно сформировать все вложенные имена, например entities.media.additional_media_info, а затем просто выбрать их с помощью SQL.
Это немного трудоёмко и, возможно, не обобщается на другие случаи, но это работает.
Хочется думать, что это к тому же должно работать быстро, поскольку это всего лишь оператор SELECT.
# Build one "<path> AS <alias>" SELECT expression per entry of the flattened
# schema of `sample_tbl`.
columns_to_flatten <- sdf_schema_json(sample_tbl, simplify = TRUE) %>%
  # flatten the nested schema list (rlist package); use.names keeps the
  # names of the flattened entries, which are what the next steps work on
  rlist::list.flatten(use.names = TRUE) %>%
  # keep only the names
  names() %>%
  # strip any parenthesised text and all whitespace from each name
  gsub("\\(.*?\\)|\\s", "", .) %>%
  # alias each path with its dots replaced by double underscores, so that
  # fields sharing a leaf name under different parents get distinct names
  {paste(., "AS", gsub("\\.", "__", .))} %>%
  # sub() rewrites only the first "variants" in each string, i.e. the one in
  # the path rather than the alias. Reported as required: without the [0]
  # index the query did not seem to work (presumably `variants` is an array
  # column -- TODO confirm against the schema)
  sub("variants", "variants[0]", .)
# Assemble the SELECT statement: the generated "path AS alias" expressions
# joined with ", ", placed between the SELECT keyword and the FROM clause.
sql_statement <- paste(
  c("SELECT", toString(columns_to_flatten), "FROM example"),
  collapse = " "
)
# Call the Spark session's `sql` method with the generated statement, then
# call `createOrReplaceTempView` on the result to register it under the
# name "flattened_example".
sparklyr::invoke(
  sparklyr::invoke(spark_session(sc), "sql", sql_statement),
  "createOrReplaceTempView",
  "flattened_example"
)
# Open the schema viewer on the newly registered view.
sdf_schema_viewer(tbl(sc, "flattened_example"))
![enter image description here](https://i.stack.imgur.com/OhHXJ.png)
Сгенерированный SQL выглядит так — он довольно простой, просто длинный:
SELECT contributors AS contributors, coordinates AS coordinates, created_at AS created_at, display_text_range AS display_text_range, entities.hashtags.indices AS entities__hashtags__indices, entities.hashtags.text AS entities__hashtags__text, entities.media.additional_media_info.description AS entities__media__additional_media_info__description, entities.media.additional_media_info.embeddable AS entities__media__additional_media_info__embeddable, entities.media.additional_media_info.monetizable AS entities__media__additional_media_info__monetizable, entities.media.additional_media_info.title AS entities__media__additional_media_info__title, entities.media.display_url AS entities__media__display_url, entities.media.expanded_url AS entities__media__expanded_url, entities.media.id AS entities__media__id, entities.media.id_str AS entities__media__id_str, entities.media.indices AS entities__media__indices, entities.media.media_url AS entities__media__media_url, entities.media.media_url_https AS entities__media__media_url_https, entities.media.sizes.large.h AS entities__media__sizes__large__h, entities.media.sizes.large.resize AS entities__media__sizes__large__resize, entities.media.sizes.large.w AS entities__media__sizes__large__w FROM example