У меня есть таблица в postgres, где один из столбцов имеет имя с двойной кавычкой (имя в формате, аналогичном Prefix column name "A"
).
Теперь я пытаюсь загрузить эту таблицу в df:
val df = spark.read.format("jdbc").option("url", "jdbc:postgresql://host.com:port/compose?user=user&password=password")
.option("dbtable", "schema.\"NameOfTable\"")
.option(JDBCOptions.JDBC_DRIVER_CLASS, "org.postgresql.Driver")
.option("stringtype", "unspecified").load()
Неудача, за исключением:
org.postgresql.util.PSQLException: ERROR: zero-length delimited identifier at or near """"
Position: 42
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2433)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2178)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:306)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:441)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:365)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:155)
at org.postgresql.jdbc.PgPreparedStatement.executeQuery(PgPreparedStatement.java:118)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:301)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
06/13/2019 12:32:42.332|WARN ||||org.apache.spark.internal.Logging$class.logWarning(Logging.scala:66)|Lost task 0.0 in stage 14.0 (TID 14, localhost, executor driver): org.postgresql.util.PSQLException: ERROR: zero-length delimited identifier at or near """"
Position: 42
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2433)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2178)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:306)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:441)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:365)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:155)
at org.postgresql.jdbc.PgPreparedStatement.executeQuery(PgPreparedStatement.java:118)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:301)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
06/13/2019 12:32:42.334|ERROR||||org.apache.spark.internal.Logging$class.logError(Logging.scala:70)|Task 0 in stage 14.0 failed 1 times; aborting job
06/13/2019 12:32:42.334|INFO ||||org.apache.spark.internal.Logging$class.logInfo(Logging.scala:54)|Removed TaskSet 14.0, whose tasks have all completed, from pool
06/13/2019 12:32:42.337|INFO ||||org.apache.spark.internal.Logging$class.logInfo(Logging.scala:54)|Cancelling stage 14
06/13/2019 12:32:42.338|INFO ||||org.apache.spark.internal.Logging$class.logInfo(Logging.scala:54)|ResultStage 14 (show at GenerateDataDistributionSpec.scala:530) failed in 1.030 s due to Job aborted due to stage failure: Task 0 in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage 14.0 (TID 14, localhost, executor driver): org.postgresql.util.PSQLException: ERROR: zero-length delimited identifier at or near """"
Position: 42
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2433)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2178)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:306)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:441)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:365)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:155)
at org.postgresql.jdbc.PgPreparedStatement.executeQuery(PgPreparedStatement.java:118)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD.compute(JDBCRDD.scala:301)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Для меня идеальным решением было бы добавить некоторые опции в строку кода искры для обработки двойных кавычек во время загрузки.Я хотел бы избежать именования всех столбцов в select.
ОБНОВЛЕНИЕ: Любое решение будет чрезвычайно полезным.
Знаете ли вы какой-либо способ его настройки, чтобы он работал?
Я пытался добавить .option("escape", "\\")
.Также причиной может быть тот факт, что когда я пытался экспортировать таблицу как csv, появился столбец с именем "Prefix column name ""A"""
, поэтому, возможно, способ, которым postgres пытается экспортировать его, вызывает проблемы.