Я использую Spark 2.3.1 на ноутбуке Zeppelin.Я создаю фрейм данных, загружая его из Hive.Вот как создается фрейм данных:
val df = hive.executeQuery("select trim(a_vno) as dst, trim(s_vno) as src, share, administrator, account, all_shares from ebyn.babs_edges_2016 where (share <> 0 or administrator <> 0 or account <> 0 or all_shares <> 0 ) and trim(date) = '201601'")
Когда я вызываю
df.show
, он показывает первые 20 строк.Но когда я звоню
df.where("src = 'XXXXX' and dst = 'YYYYY'").show
Это выдает следующую ошибку:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 303.0 failed 4 times, most recent failure: Lost task 3.3 in stage 303.0 (TID 10797, analitik10.host, executor 96): org.apache.spark.util.TaskCompletionListenerException: null
at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:139)
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:117)
at org.apache.spark.scheduler.Task.run(Task.scala:125)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3273)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
at org.apache.spark.sql.Dataset.show(Dataset.scala:723)
at org.apache.spark.sql.Dataset.show(Dataset.scala:682)
at org.apache.spark.sql.Dataset.show(Dataset.scala:691)
... 56 elided
Caused by: org.apache.spark.util.TaskCompletionListenerException: null
at org.apache.spark.TaskContextImpl.invokeListeners(TaskContextImpl.scala:139)
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:117)
at org.apache.spark.scheduler.Task.run(Task.scala:125)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
... 3 more
Вот свойства таблицы Hive:
CREATE TABLE `EBYN.BABS_EDGES_2016 `(
`date` string,
`a_vno` string,
`s_vno` string,
`amount` double,
`number` int,
`share` int,
`share_ratio` int,
`administrator` int,
`account` int,
`all_sharelik` int)
COMMENT 'Imported by sqoop on 2018/10/17 14:53:12'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='',
'line.delim'='\n',
'serialization.format'='')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://ggmprod/warehouse/tablespace/managed/hive/ebyn.db/babs_edges_2016 '
TBLPROPERTIES (
'bucketing_version'='2',
'last_modified_by'='hadoop_etluser',
'last_modified_time'='1539867401',
'transactional'='true',
'transactional_properties'='insert_only',
В чем причина того, что этопоказывает фрейм данных, но не работает, когда вызывается для отфильтрованного фрейма данных?