com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a NullType (value: BsonString{value='10203701901867242'})

I tried to write a script that ingests data from MongoDB into an S3 bucket:

val mongoDF = spark.read.mongo(readConfig)
mongoDF.repartition(1).write.json(path.format(TABLE))
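
For context, readConfig above is built roughly like this (the URI, database and collection values here are placeholders, not my real settings):

import com.mongodb.spark.config.ReadConfig

val readConfig = ReadConfig(Map(
  "uri"        -> "mongodb://host:27017",   // placeholder connection string
  "database"   -> "mydb",                   // placeholder database name
  "collection" -> TABLE                     // collection name comes from TABLE
))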

and I also tried this other way:

val mongoDF = spark.read.mongo(readConfig)
val mongo_table = mongoDF.sqlContext.sql("""SELECT * FROM mongo_%s""".format(TABLE)).toDF()
mongo_table.repartition(1).write.json(path.format(TABLE))
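
The SQL variant assumes the DataFrame has already been registered as a temporary view named mongo_<TABLE>; I do that with something like the following (the view name is just my own convention):

// register the MongoDB DataFrame so it can be queried by name
mongoDF.createOrReplaceTempView("mongo_%s".format(TABLE))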

So I just want to convert my DF into JSON documents and ingest them into my bucket. I saw that a cast on the DataFrame could help me get rid of the NullType columns (I sketch what I mean after the stack trace below), but I don't know whether that would solve my problem. Can you help me? I got this error:



org.apache.spark.SparkException: Job aborted.
      at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:198)
      at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
      at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
      at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
      at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
      at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:156)
      at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
      at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
      at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
      at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
      at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
      at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
      at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
      at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
      at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
      at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
      at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
      at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
      at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
      at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
      at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:545)
      ... 91 elided
    Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 102.0 failed 4 times, most recent failure: Lost task 0.3 in stage 102.0 (TID 499, ip-172-35-6-115.ec2.internal, executor 207): com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a NullType (value: BsonString{value='10203701901867242'})
        at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:200)
        at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
        at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
        at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
        at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
        at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
        at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
        at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
        at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$1.apply(MongoRelation.scala:58)
        at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$1.apply(MongoRelation.scala:58)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
        at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
        at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
        at org.apache.spark.scheduler.Task.run(Task.scala:123)
        at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

    Driver stacktrace:
      at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2041)
      at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2029)
      at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2028)
      at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
      at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
      at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2028)
      at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:966)
      at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:966)
      at scala.Option.foreach(Option.scala:257)
      at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:966)
      at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2262)
      at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2211)
      at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2200)
      at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
      at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:777)
      at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
      at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
      ... 113 more
    Caused by: com.mongodb.spark.exceptions.MongoTypeConversionException: Cannot cast STRING into a NullType (value: BsonString{value='10203701901867242'})
      at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:200)
      at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
      at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
      at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
      at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
      at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
      at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
      at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
      at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$1.apply(MongoRelation.scala:58)
      at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$1.apply(MongoRelation.scala:58)
      at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
      at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
      at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
      at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
      at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149)
      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
      at org.apache.spark.scheduler.Task.run(Task.scala:123)
      at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
      at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
      at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
...
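
This is the kind of cast I had in mind, roughly (just a sketch: it walks over whatever columns Spark inferred as NullType and casts them to StringType; I'm not even sure it gets a chance to run before the connector's own conversion fails during the scan):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{NullType, StringType}

// Cast every column whose inferred type is NullType to StringType,
// so string values like '10203701901867242' have a type they fit into.
def castNullTypesToString(df: DataFrame): DataFrame =
  df.schema.fields.filter(_.dataType == NullType).foldLeft(df) { (acc, field) =>
    acc.withColumn(field.name, acc.col(field.name).cast(StringType))
  }

val fixedDF = castNullTypesToString(mongoDF)
fixedDF.repartition(1).write.json(path.format(TABLE))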