У меня появляется следующее StackOverflowError
всплывающее в журналах
java.lang.StackOverflowError
at scala.collection.generic.GenTraversableFactory$GenericCanBuildFrom.apply(GenTraversableFactory.scala:57)
at scala.collection.generic.GenTraversableFactory$GenericCanBuildFrom.apply(GenTraversableFactory.scala:52)
at scala.collection.TraversableLike$class.builder$1(TraversableLike.scala:229)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:233)
at scala.collection.immutable.List.map(List.scala:296)
at org.apache.spark.sql.catalyst.plans.logical.Project.output(basicLogicalOperators.scala:52)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitUnaryNode(SizeInBytesOnlyStatsPlanVisitor.scala:36)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:136)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlanVisitor$class.visit(LogicalPlanVisitor.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visit(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:33)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$class.stats(LogicalPlanStats.scala:33)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.stats(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitUnaryNode(SizeInBytesOnlyStatsPlanVisitor.scala:39)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:136)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlanVisitor$class.visit(LogicalPlanVisitor.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visit(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:33)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$class.stats(LogicalPlanStats.scala:33)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.stats(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitUnaryNode(SizeInBytesOnlyStatsPlanVisitor.scala:39)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:136)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlanVisitor$class.visit(LogicalPlanVisitor.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visit(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:33)
at scala.Option.getOrElse(Option.scala:121)
....
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$class.stats(LogicalPlanStats.scala:33)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.stats(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitUnaryNode(SizeInBytesOnlyStatsPlanVisitor.scala:39)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:136)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlanVisitor$class.visit(LogicalPlanVisitor.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visit(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:33)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$class.stats(LogicalPlanStats.scala:33)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.stats(LogicalPlan.scala:29)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitUnaryNode(SizeInBytesOnlyStatsPlanVisitor.scala:39)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:136)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visitProject(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlanVisitor$class.visit(LogicalPlanVisitor.scala:37)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.SizeInBytesOnlyStatsPlanVisitor$.visit(SizeInBytesOnlyStatsPlanVisitor.scala:27)
at org.apache.spark.sql.catalyst.plans.logical.statsEstimation.LogicalPlanStats$$anonfun$stats$1.apply(LogicalPlanStats.scala:37)
После копания я обнаружил, что это вызвано во время звонка на dataFrame.persist(DEFAULT_STORAGE_LEVEL)
Я подозреваюэто из-за слишком большого количества вызовов (которые в основном разбивают массив на множество столбцов):
df.withColumn(colname, col(arrayColumn)(colindex))
Может ли кто-нибудь предоставить указатели на то, что может быть причиной этой ошибки?