Я знаю, что это, возможно, задавали и раньше, но я спрашиваю об этом, потому что я не уверен, является ли проблема такой же или нет.Дело в том, что я использую spark-sql, и я сначала создаю таблицу вроде:
sqlContext = HiveContext(sc)
sqlContext.sql("""drop table if exists test_table""")
sqlContext.sql(""" create external table test_table
.
.
.
.
.
.)
partitioned by('column_name' datatype)
stored as textfile
location '/home/..../test_table'
""")
Эта таблица имеет как 400-500 столбцов или даже больше, чем
Затем я вставляю перезапись, получая данные из нескольких огромных огромных таблиц, используя объединение, например
sqlContext.sql("""
insert overwrite table table_name
partition(`column_name`)
select
col1,
col2,
col3,
..
..
from table1
left join ... table2 on ...
left join ... table3
left join ... tale_4
union all
select col1,
col2,
..
..
..
from table5
left join.. ... table6
.
.
.
.union all
from table19
left join tabl18 ...
""")
.*
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 590.9 in stage 67.0 (TID 25051) on #####, executor 3: java.io.IOException (Bad connect ack with firstBadLink as *****:1004) [duplicate 15]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 590.10 in stage 67.0 (TID 25161, *.com, executor 3, partition 590,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 531.10 in stage 67.0 (TID 25162, *.com, executor 13, partition 531,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 431.8 in stage 67.0 (TID 25066) on ***, executor 13: java.io.IOException (Bad connect ack with firstBadLink as *******:1004) [duplicate 25]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 431.9 in stage 67.0 (TID 25163, ****, executor 13, partition 431,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 443.9 in stage 67.0 (TID 25076) on ****, executor 13: java.io.IOException (Bad connect ack with firstBadLink as *****:1004) [duplicate 24]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 774.9 in stage 67.0 (TID 25058) on ****, executor 3: java.io.IOException (Bad connect ack with firstBadLink as *****:1004) [duplicate 9]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 774.10 in stage 67.0 (TID 25164, ****, executor 15, partition 774,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 790.9 in stage 67.0 (TID 25053) on ****, executor 3: java.io.IOException (Bad connect ack with firstBadLink as ******:1004) [duplicate 16]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 790.10 in stage 67.0 (TID 25165, ****, executor 15, partition 790,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 574.9 in stage 67.0 (TID 25061) on ****, executor 15: java.io.IOException (Bad connect ack with firstBadLink as *****:1004) [duplicate 17]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 574.10 in stage 67.0 (TID 25166, ****, executor 3, partition 574,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 433.9 in stage 67.0 (TID 25167, ****, executor 14, partition 433,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 419.9 in stage 67.0 (TID 25075) on ****, executor 14: java.io.IOException (Bad connect ack with firstBadLink as *****:1004) [duplicate 26]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Lost task 474.9 in stage 67.0 (TID 25054) on ****, executor 15: java.io.IOException (Bad connect ack with firstBadLink as ****:1004) [duplicate 10]
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 474.10 in stage 67.0 (TID 25168, ****, executor 3, partition 474,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 INFO scheduler.TaskSetManager: Starting task 436.10 in stage 67.0 (TID 25169, ****, executor 19, partition 436,NODE_LOCAL, 2348 bytes)
18/09/26 22:18:57 WARN scheduler.TaskSetManager: Lost task 411.8 in stage 67.0 (TID 25056, ****, executor 19): java.io.IOException: Bad connect ack with firstBadLink as ****:1004
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.createBlockOutputStream(DFSOutputStream.java:1643)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1541)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:683)
File "/opt/cloudera/parcels/CDH-5.9.0-1.cdh5.9.0.p0.23/lib/spark/python/lib/pyspark.zip/pyspark/sql/context.py", line 580, in sql
return DataFrame(self._ssql_ctx.sql(sqlQuery), self)
File "/opt/cloudera/parcels/CDH-5.9.0-1.cdh5.9.0.p0.23/lib/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/cloudera/parcels/CDH-5.9.0-1.cdh5.9.0.p0.23/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco
18/09/26 22:19:09 WARN scheduler.TaskSetManager: Lost task 1210.4 in stage 67.0 (TID 25307, ****.com, executor 8): TaskKilled (killed intentionally)
return f(*a, **kw)
File "/opt/cloudera/parcels/CDH-5.9.0-1.cdh5.9.0.p0.23/lib/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
format(target_id, ".", name), value)
18/09/26 22:19:09 WARN scheduler.TaskSetManager: Lost task 449.12 in stage 67.0 (TID 25300, ***.com, executor 14): TaskKilled (killed intentionally)
Py4JJavaError: An error occurred while calling o61.sql.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 403 in stage 67.0 failed 14 times, most recent failure: Lost task 403.13 in stage 67.0 (TID 25227, *******, executor 7): java.io.IOException: Bad connect ack with firstBadLink as ******:1004
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.createBlockOutputStream(DFSOutputStream.java:1643)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1541)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:683)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1590)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1844)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1857)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1934)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.saveAsHiveFile(InsertIntoHiveTable.scala:84)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.sideEffectResult$lzycompute(InsertIntoHiveTable.scala:201)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.sideEffectResult(InsertIntoHiveTable.scala:127)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.doExecute(InsertIntoHiveTable.scala:276)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:145)
at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:130)
at org.apache.spark.sql.DataFrame$.apply(DataFrame.scala:52)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:817)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: Bad connect ack with firstBadLink as ******:1004
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.createBlockOutputStream(DFSOutputStream.java:1643)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1541)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:683)