В EMR 5.21.0 с Spark 2.4.0 и AWS Glue в качестве мета-хранилища я не могу создать таблицу с интервалом, используя следующий синтаксис
CREATE TABLE TABLE_NAME
USING PARQUET
PARTITIONED BY (abc) CLUSTERED BY (abc) SORTED BY (abc) INTO 50 buckets
OPTIONS ('compression'='snappy')
LOCATION 's3n://<path>'
SELECT * FROM ORIGINAL TABLE
Выдает следующую ошибку -:
19/05/26 19:34:55 WARN HiveExternalCatalog: Persisting bucketed data source table `db`.`table_name` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
java.lang.IllegalArgumentException: Can not create a Path from an empty string
at org.apache.hadoop.fs.Path.checkPathArg(Path.java:163)
at org.apache.hadoop.fs.Path.<init>(Path.java:175)
at org.apache.spark.sql.catalyst.catalog.CatalogUtils$.stringToURI(ExternalCatalogUtils.scala:236)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getDatabase$1$$anonfun$apply$2.apply(HiveClientImpl.scala:343)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getDatabase$1$$anonfun$apply$2.apply(HiveClientImpl.scala:339)
at scala.Option.map(Option.scala:146)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getDatabase$1.apply(HiveClientImpl.scala:339)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getDatabase$1.apply(HiveClientImpl.scala:345)
at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:275)
at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:213)
at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:212)
at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:258)
at org.apache.spark.sql.hive.client.HiveClientImpl.getDatabase(HiveClientImpl.scala:338)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getDatabase$1.apply(HiveExternalCatalog.scala:211)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$getDatabase$1.apply(HiveExternalCatalog.scala:211)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.getDatabase(HiveExternalCatalog.scala:210)
at org.apache.spark.sql.hive.HiveExternalCatalog.saveTableIntoHive(HiveExternalCatalog.scala:487)
at org.apache.spark.sql.hive.HiveExternalCatalog.org$apache$spark$sql$hive$HiveExternalCatalog$$createDataSourceTable(HiveExternalCatalog.scala:399)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply$mcV$sp(HiveExternalCatalog.scala:263)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:236)
at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createTable$1.apply(HiveExternalCatalog.scala:236)
at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97)
at org.apache.spark.sql.hive.HiveExternalCatalog.createTable(HiveExternalCatalog.scala:236)
at org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.createTable(ExternalCatalogWithListener.scala:94)
at org.apache.spark.sql.catalyst.catalog.SessionCatalog.createTable(SessionCatalog.scala:319)
at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:185)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:115)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:195)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:80)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642)
То же утверждение работает, если я удаляю предложение CLUSTERED BY AND BUCKETED. Есть ли какая-либо конфигурация, которую мне нужно установить? Чего мне не хватает?