Меня всегда приветствует предупреждение java.lang.ArrayIndexOutOfBoundsException: 0 при чтении объекта JSON в DataFrame, хотя, похоже, это не влияет на результат. Просто интересно: я что-то здесь не так делаю? (PySpark версии 1.6.0.)
Заранее благодарю за помощь.
>>> d = {"id": 1, "json":[{"col1":1, "col2":2}, {"col1":2, "col2":4}]}
>>> x = sqlContext.read.json(sc.parallelize([d,d,d,d,d]))
>>> x.printSchema()
root
|-- id: long (nullable = true)
|-- json: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- col1: long (nullable = true)
| | |-- col2: long (nullable = true)
>>> x.show()
18/12/20 13:25:43 WARN util.ExecutionListenerManager: Error executing query execution listener
java.lang.ArrayIndexOutOfBoundsException: 0
at org.apache.spark.sql.query.analysis.QueryAnalysis$$anonfun$getInputMetadata$2.apply(QueryAnalysis.scala:121)
at org.apache.spark.sql.query.analysis.QueryAnalysis$$anonfun$getInputMetadata$2.apply(QueryAnalysis.scala:108)
at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:111)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.generic.TraversableForwarder$class.foreach(TraversableForwarder.scala:32)
at scala.collection.mutable.ListBuffer.foreach(ListBuffer.scala:45)
at org.apache.spark.sql.util.ExecutionListenerManager.org$apache$spark$sql$util$ExecutionListenerManager$$withErrorHandling(QueryExecutionListener.scala:119)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply$mcV$sp(QueryExecutionListener.scala:99)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:99)
at org.apache.spark.sql.util.ExecutionListenerManager$$anonfun$onSuccess$1.apply(QueryExecutionListener.scala:99)
at org.apache.spark.sql.util.ExecutionListenerManager.readLock(QueryExecutionListener.scala:132)
at org.apache.spark.sql.util.ExecutionListenerManager.onSuccess(QueryExecutionListener.scala:98)
at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2116)
at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1389)
at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1471)
at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:184)
at sun.reflect.GeneratedMethodAccessor35.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
+---+--------------+
| id| json|
+---+--------------+
| 1|[[1,2], [2,4]]|
| 1|[[1,2], [2,4]]|
| 1|[[1,2], [2,4]]|
| 1|[[1,2], [2,4]]|
| 1|[[1,2], [2,4]]|
+---+--------------+