I am getting an "org.bson.BsonInvalidOperationException: Invalid state INITIAL" error when reading a (Py)Spark DataFrame from MongoDB with the DataFrame connector:
df = (spark.read.format("com.mongodb.spark.sql")
      .option("uri", config.uri)
      .option("database", config.DB)
      .option("collection", config.collection)
      .load())
df.count()
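
For context, when no schema is passed, the connector infers one by sampling documents from the collection. Below is a minimal sketch of supplying an explicit schema instead; the field names here are hypothetical placeholders, not the real layout of my collection:

from pyspark.sql.types import StructType, StructField, StringType

# Hypothetical fields for illustration only; replace with the
# collection's actual layout.
schema = StructType([
    StructField("_id", StringType(), True),
    StructField("name", StringType(), True),
])

df_explicit = (spark.read.format("com.mongodb.spark.sql")
               .schema(schema)  # bypasses sampling-based schema inference
               .option("uri", config.uri)
               .option("database", config.DB)
               .option("collection", config.collection)
               .load())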
Specs:
Spark version: 2.2.1 (Scala 2.11)
bson-3.6.0.jar
mongo-spark-connector_2.11-2.2.1.jar
mongo-java-driver-3.4.2.jar
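
In case the jar wiring matters: a sketch (not necessarily my exact setup) of building the session with the connector resolved from Maven via spark.jars.packages, which pulls in matching bson and mongo-java-driver versions transitively instead of hand-picked jar files:

from pyspark.sql import SparkSession

# Sketch: let Spark resolve the connector and its matching bson /
# mongo-java-driver dependencies from Maven rather than supplying
# individual jars by hand.
spark = (SparkSession.builder
         .appName("mongo-read")
         .config("spark.jars.packages",
                 "org.mongodb.spark:mongo-spark-connector_2.11:2.2.1")
         .getOrCreate())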
When I call the count action (or any other action) on the DataFrame, I get the following error:
Py4JJavaError: An error occurred while calling o60.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1, localhost, executor driver): org.bson.BsonInvalidOperationException: Invalid state INITIAL
at org.bson.json.StrictCharacterStreamJsonWriter.checkPreconditions(StrictCharacterStreamJsonWriter.java:352)
at org.bson.json.StrictCharacterStreamJsonWriter.writeNull(StrictCharacterStreamJsonWriter.java:183)
at org.bson.json.JsonNullConverter.convert(JsonNullConverter.java:25)
at org.bson.json.JsonNullConverter.convert(JsonNullConverter.java:22)
at org.bson.json.JsonWriter.doWriteNull(JsonWriter.java:204)
at org.bson.AbstractBsonWriter.writeNull(AbstractBsonWriter.java:556)
at org.bson.codecs.BsonNullCodec.encode(BsonNullCodec.java:38)
at org.bson.codecs.BsonNullCodec.encode(BsonNullCodec.java:28)
at org.bson.codecs.EncoderContext.encodeWithChildContext(EncoderContext.java:91)
at org.bson.codecs.BsonValueCodec.encode(BsonValueCodec.java:62)
at com.mongodb.spark.sql.BsonValueToJson$.apply(BsonValueToJson.scala:29)
at com.mongodb.spark.sql.MapFunctions$.bsonValueToString(MapFunctions.scala:103)
at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:78)
at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.hasNext(InMemoryRelation.scala:133)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:371)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1055)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2435)
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2434)
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2434)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.bson.BsonInvalidOperationException: Invalid state INITIAL
at org.bson.json.StrictCharacterStreamJsonWriter.checkPreconditions(StrictCharacterStreamJsonWriter.java:352)
at org.bson.json.StrictCharacterStreamJsonWriter.writeNull(StrictCharacterStreamJsonWriter.java:183)
at org.bson.json.JsonNullConverter.convert(JsonNullConverter.java:25)
at org.bson.json.JsonNullConverter.convert(JsonNullConverter.java:22)
at org.bson.json.JsonWriter.doWriteNull(JsonWriter.java:204)
at org.bson.AbstractBsonWriter.writeNull(AbstractBsonWriter.java:556)
at org.bson.codecs.BsonNullCodec.encode(BsonNullCodec.java:38)
at org.bson.codecs.BsonNullCodec.encode(BsonNullCodec.java:28)
at org.bson.codecs.EncoderContext.encodeWithChildContext(EncoderContext.java:91)
at org.bson.codecs.BsonValueCodec.encode(BsonValueCodec.java:62)
at com.mongodb.spark.sql.BsonValueToJson$.apply(BsonValueToJson.scala:29)
at com.mongodb.spark.sql.MapFunctions$.bsonValueToString(MapFunctions.scala:103)
at com.mongodb.spark.sql.MapFunctions$.com$mongodb$spark$sql$MapFunctions$$convertToDataType(MapFunctions.scala:78)
at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:39)
at com.mongodb.spark.sql.MapFunctions$$anonfun$3.apply(MapFunctions.scala:37)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at com.mongodb.spark.sql.MapFunctions$.documentToRow(MapFunctions.scala:37)
at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
at com.mongodb.spark.sql.MongoRelation$$anonfun$buildScan$2.apply(MongoRelation.scala:45)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.hasNext(InMemoryRelation.scala:133)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:371)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1055)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more