Я обновил нашу систему HDP (Hortonworks Data Platform) с 2.5.2 до новейших версий (2.6.5). Все отлично работало, кроме Spark 2. Когда я выполнял этот запрос (сложный запрос) на Spark 2.3 (текущая версия spark на HDP 2.6.5):
EXPLAIN SELECT COUNT(*) FROM A JOIN B ...
Потребовалось 10 секунд, чтобы выполнить.
Однако когда я запустил его на старой системе - Spark 2.1.5 (версия spark на HDP 2.5.2), потребовалось всего 1-2 секунды, чтобы выполнить объяснение.
Вот мой план:
== Parsed Logical Plan ==
'Project ['count(1) AS total_record#10326]
+- 'SubqueryAlias __auto_generated_subquery_name
+- 'Project [unresolvedalias(1, None)]
+- 'Filter ((1 = 1) && 'co.companyid IN (130,131,132,133,134,135,136,137,138,11,140,142,143,144,34,41,42,43,52,54,55,56,57,58,60,63,65,67,70,72,75,77,78,79,80,90,91,101,102,103,104,105,106,107,108,109,110,112,115))
+- 'Join RightOuter, ('s.companyid = 'co.companyid)
:- 'SubqueryAlias s
: +- 'Aggregate ['companyid], ['companyid, 'SUM('paid_goals) AS paid_goals#10296, 'SUM('paid_conv_items) AS paid_conv_items#10297, 'SUM('imp) AS imp#10298, 'round('SUM('paid_conv_value), 2) AS paid_conv_value#10299, 'round('SUM('cost), 2) AS cost#10300, 'SUM('paid_conv) AS paid_conv#10301, 'SUM('clicks) AS clicks#10302, 'round((('sum('s.paid_conv_value) - 'sum('s.Cost)) - 'SUM('s.paid_refundvalue)), 2) AS roi#10303, 'round(('Sum('s.all_conv_value) / 'Sum('s.all_conv)), 2) AS avg_all_conv_value#10304, 'round((('Sum('s.Clicks) / 'Sum('s.imp)) * 100), 2) AS ctr#10305, 'round(('SUM('s.all_conv_value) - 'SUM('s.paid_conv_value)), 2) AS other_conv_value#10306, ('SUM('s.all_conv_items) - 'SUM('s.paid_conv_items)) AS other_conv_items#10307, 'SUM('s.all_conv_items) AS all_conv_items#10308, 'SUM('s.all_conv) AS all_conv#10309, ('SUM('s.all_goals) - 'SUM('s.paid_goals)) AS other_goals#10310, 'round(('Sum('s.paid_conv_value) / 'Sum('s.paid_conv)), 2) AS avg_paid_conv_value#10311, 'SUM('s.all_goals) AS all_goals#10312, 'SUM('s.all_refundquantity) AS all_refundquantity#10313, 'SUM('paid_refundquantity) AS paid_refundquantity#10314, ('SUM('s.all_refundquantity) - 'SUM('s.paid_refundquantity)) AS other_refundquantity#10315, 'round('SUM('s.all_refundvalue), 2) AS all_refundvalue#10316, 'round('SUM('paid_refundvalue), 2) AS paid_refundvalue#10317, 'round(('SUM('s.all_refundvalue) - 'SUM('s.paid_refundvalue)), 2) AS other_refundvalue#10318, ... 7 more fields]
: +- 'SubqueryAlias s
: +- 'Aggregate ['s.companyid], ['s.companyid, 'sum('ifnull('cost, 0)) AS cost#10279, 'sum('ifnull('impressions, 0)) AS imp#10280, 'sum('ifnull('clicks, 0)) AS clicks#10281, 'sum('quantitybydate) AS all_conv_items#10282, 'sum('conversionsbydate) AS all_conv#10283, 'SUM('goalbydate) AS all_goals#10284, 'Sum('revenuebydate) AS all_conv_value#10285, 'sum('refundquantitybydate) AS all_refundquantity#10286, 'SUM('refundvaluebydate) AS all_refundvalue#10287, 'Sum('refundsbydate) AS all_refunds#10288, 'sum('if((('type = 1) && ('accountid > 0)), 'quantitybydate, 0)) AS paid_conv_items#10289, 'sum('if((('type = 1) && ('accountid > 0)), 'conversionsbydate, 0)) AS paid_conv#10290, 'sum('if((('type = 1) && ('accountid > 0)), 'goalbydate, 0)) AS paid_goals#10291, 'sum('if((('type = 1) && ('accountid > 0)), 'revenuebydate, 0)) AS paid_conv_value#10292, 'sum('if((('type = 1) && ('accountid > 0)), 'refundsbydate, 0)) AS paid_refunds#10293, 'sum('if((('type = 1) && ('accountid > 0)), 'refundvaluebydate, 0)) AS paid_refundvalue#10294, 'sum('if((('type = 1) && ('accountid > 0)), 'refundquantitybydate, 0)) AS paid_refundquantity#10295]
: +- 'Filter ((('datestats >= 1528156800) && ('datestats <= 1528934400)) && ('s.siteid IN (1026,1003,1160,1029,1159,1032,1033,1035,1038,1005,1040,1041,1042,1044,1006,1047,1049,1050,1051,1002,1008,1156,1031,1071,1073,1011,1076,1080,1004,1081,1082,1088,1090,1091,1092,1093,1094,1095,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1018,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1013,1001,1146,1020,1021) && 'type_report IN (1,2)))
: +- 'SubqueryAlias s
: +- 'UnresolvedRelation `global_summary_partitioned_20180101_20181231_cache`
+- 'SubqueryAlias co
+- 'UnresolvedRelation `mtx_system`.`tbl_um_company_orc`
== Analyzed Logical Plan ==
total_record: bigint
Aggregate [count(1) AS total_record#10326L]
+- SubqueryAlias __auto_generated_subquery_name
+- Project [1 AS 1#10410]
+- Filter ((1 = 1) && cast(companyid#10328L as bigint) IN (cast(130 as bigint),cast(131 as bigint),cast(132 as bigint),cast(133 as bigint),cast(134 as bigint),cast(135 as bigint),cast(136 as bigint),cast(137 as bigint),cast(138 as bigint),cast(11 as bigint),cast(140 as bigint),cast(142 as bigint),cast(143 as bigint),cast(144 as bigint),cast(34 as bigint),cast(41 as bigint),cast(42 as bigint),cast(43 as bigint),cast(52 as bigint),cast(54 as bigint),cast(55 as bigint),cast(56 as bigint),cast(57 as bigint),cast(58 as bigint),cast(60 as bigint),cast(63 as bigint),cast(65 as bigint),cast(67 as bigint),cast(70 as bigint),cast(72 as bigint),cast(75 as bigint),cast(77 as bigint),cast(78 as bigint),cast(79 as bigint),cast(80 as bigint),cast(90 as bigint),cast(91 as bigint),cast(101 as bigint),cast(102 as bigint),cast(103 as bigint),cast(104 as bigint),cast(105 as bigint),cast(106 as bigint),cast(107 as bigint),cast(108 as bigint),cast(109 as bigint),cast(110 as bigint),cast(112 as bigint),cast(115 as bigint)))
+- Join RightOuter, (companyid#6925L = companyid#10328L)
:- SubqueryAlias s
: +- Aggregate [companyid#6925L], [companyid#6925L, sum(paid_goals#10291L) AS paid_goals#10296L, sum(paid_conv_items#10289L) AS paid_conv_items#10297L, sum(imp#10280L) AS imp#10298L, round(sum(paid_conv_value#10292), 2) AS paid_conv_value#10299, round(sum(cost#10279), 2) AS cost#10300, sum(paid_conv#10290L) AS paid_conv#10301L, sum(clicks#10281L) AS clicks#10302L, round(((sum(paid_conv_value#10292) - sum(Cost#10279)) - sum(paid_refundvalue#10294)), 2) AS roi#10303, round((sum(all_conv_value#10285) / cast(sum(all_conv#10283L) as double)), 2) AS avg_all_conv_value#10304, round(((cast(sum(Clicks#10281L) as double) / cast(sum(imp#10280L) as double)) * cast(100 as double)), 2) AS ctr#10305, round((sum(all_conv_value#10285) - sum(paid_conv_value#10292)), 2) AS other_conv_value#10306, (sum(all_conv_items#10282L) - sum(paid_conv_items#10289L)) AS other_conv_items#10307L, sum(all_conv_items#10282L) AS all_conv_items#10308L, sum(all_conv#10283L) AS all_conv#10309L, (sum(all_goals#10284L) - sum(paid_goals#10291L)) AS other_goals#10310L, round((sum(paid_conv_value#10292) / cast(sum(paid_conv#10290L) as double)), 2) AS avg_paid_conv_value#10311, sum(all_goals#10284L) AS all_goals#10312L, sum(all_refundquantity#10286L) AS all_refundquantity#10313L, sum(paid_refundquantity#10295L) AS paid_refundquantity#10314L, (sum(all_refundquantity#10286L) - sum(paid_refundquantity#10295L)) AS other_refundquantity#10315L, round(sum(all_refundvalue#10287), 2) AS all_refundvalue#10316, round(sum(paid_refundvalue#10294), 2) AS paid_refundvalue#10317, round((sum(all_refundvalue#10287) - sum(paid_refundvalue#10294)), 2) AS other_refundvalue#10318, ... 7 more fields]
: +- SubqueryAlias s
: +- Aggregate [companyid#6925L], [companyid#6925L, sum(ifnull(cost#6950, 0)) AS cost#10279, sum(ifnull(impressions#6951L, 0)) AS imp#10280L, sum(ifnull(clicks#6958L, 0)) AS clicks#10281L, sum(quantitybydate#6949L) AS all_conv_items#10282L, sum(conversionsbydate#6962L) AS all_conv#10283L, sum(goalbydate#6956L) AS all_goals#10284L, sum(revenuebydate#6960) AS all_conv_value#10285, sum(refundquantitybydate#6966L) AS all_refundquantity#10286L, sum(refundvaluebydate#6967) AS all_refundvalue#10287, sum(refundsbydate#6968L) AS all_refunds#10288L, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) quantitybydate#6949L else cast(0 as bigint)) AS paid_conv_items#10289L, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) conversionsbydate#6962L else cast(0 as bigint)) AS paid_conv#10290L, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) goalbydate#6956L else cast(0 as bigint)) AS paid_goals#10291L, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) revenuebydate#6960 else cast(0 as double)) AS paid_conv_value#10292, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) refundsbydate#6968L else cast(0 as bigint)) AS paid_refunds#10293L, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) refundvaluebydate#6967 else cast(0 as double)) AS paid_refundvalue#10294, sum(if (((type#6938 = 1) && (accountid#6937L > cast(0 as bigint)))) refundquantitybydate#6966L else cast(0 as bigint)) AS paid_refundquantity#10295L]
: +- Filter (((datestats#6931L >= cast(1528156800 as bigint)) && (datestats#6931L <= cast(1528934400 as bigint))) && (siteid#6935 IN (1026,1003,1160,1029,1159,1032,1033,1035,1038,1005,1040,1041,1042,1044,1006,1047,1049,1050,1051,1002,1008,1156,1031,1071,1073,1011,1076,1080,1004,1081,1082,1088,1090,1091,1092,1093,1094,1095,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1018,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1013,1001,1146,1020,1021) && type_report#6932 IN (1,2)))
: +- SubqueryAlias s
: +- SubqueryAlias global_summary_partitioned_20180101_20181231_cache
: +- RepartitionByExpression [partition_key#6974, type#6938], 9
: +- Project [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields]
: +- SubqueryAlias global_summary_partitioned_20180101_20181231
: +- HiveTableRelation `metrixa_global_database`.`global_summary_partitioned_20180101_20181231`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields]
+- SubqueryAlias co
+- SubqueryAlias tbl_um_company_orc
+- HiveTableRelation `mtx_system`.`tbl_um_company_orc`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [companyid#10328L, name#10329, officelocation#10330, address#10331, city#10332, countrycode#10333, countrycodes#10334, contactemail#10335, contactphone#10336, contactmobile#10337, level#10338, status#10339, optimisationstatus#10340, issoftdelete#10341, userid#10342, sitenumber#10343, accountnumber#10344]
== Optimized Logical Plan ==
Aggregate [count(1) AS total_record#10326L]
+- Project
+- Join RightOuter, (companyid#6925L = companyid#10328L)
:- Aggregate [companyid#6925L], [companyid#6925L]
: +- Aggregate [companyid#6925L], [companyid#6925L]
: +- Project [companyid#6925L]
: +- Filter ((((isnotnull(datestats#6931L) && (datestats#6931L >= 1528156800)) && (datestats#6931L <= 1528934400)) && siteid#6935 INSET (1105,1031,1110,1005,1073,1041,1137,1095,1001,1132,1013,1100,1040,1109,1136,1020,1051,1104,1047,1091,1021,1042,1032,1159,1128,1106,1111,1006,1138,1160,1082,1099,1131,1050,1002,1035,1071,1026,1135,1146,1103,1139,1107,1092,1090,1011,1080,1029,1044,1102,1076,1134,1081,1018,1049,1003,1156,1098,1038,1130,1094,1140,1008,1108,1101,1133,1033,1093,1129,1004,1088)) && type_report#6932 IN (1,2))
: +- InMemoryRelation [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas), `global_summary_partitioned_20180101_20181231_cache`
: +- Exchange hashpartitioning(partition_key#6974, type#6938, 9)
: +- HiveTableScan [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields], HiveTableRelation `metrixa_global_database`.`global_summary_partitioned_20180101_20181231`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields]
+- Project [companyid#10328L]
+- Filter companyid#10328L INSET (138,101,115,56,142,42,52,110,57,78,106,132,133,60,102,70,137,65,109,77,41,134,105,34,54,144,103,140,91,108,130,135,80,112,63,67,11,72,143,43,104,55,75,58,107,136,79,131,90)
+- HiveTableRelation `mtx_system`.`tbl_um_company_orc`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [companyid#10328L, name#10329, officelocation#10330, address#10331, city#10332, countrycode#10333, countrycodes#10334, contactemail#10335, contactphone#10336, contactmobile#10337, level#10338, status#10339, optimisationstatus#10340, issoftdelete#10341, userid#10342, sitenumber#10343, accountnumber#10344]
== Physical Plan ==
*(6) HashAggregate(keys=[], functions=[count(1)], output=[total_record#10326L])
+- Exchange SinglePartition
+- *(5) HashAggregate(keys=[], functions=[partial_count(1)], output=[count#10668L])
+- *(5) Project
+- SortMergeJoin [companyid#6925L], [companyid#10328L], RightOuter
:- *(2) Sort [companyid#6925L ASC NULLS FIRST], false, 0
: +- *(2) HashAggregate(keys=[companyid#6925L], functions=[], output=[companyid#6925L])
: +- *(2) HashAggregate(keys=[companyid#6925L], functions=[], output=[companyid#6925L])
: +- *(2) HashAggregate(keys=[companyid#6925L], functions=[], output=[companyid#6925L])
: +- Exchange hashpartitioning(companyid#6925L, 9)
: +- *(1) HashAggregate(keys=[companyid#6925L], functions=[], output=[companyid#6925L])
: +- *(1) Project [companyid#6925L]
: +- *(1) Filter ((((isnotnull(datestats#6931L) && (datestats#6931L >= 1528156800)) && (datestats#6931L <= 1528934400)) && siteid#6935 INSET (1105,1031,1110,1005,1073,1041,1137,1095,1001,1132,1013,1100,1040,1109,1136,1020,1051,1104,1047,1091,1021,1042,1032,1159,1128,1106,1111,1006,1138,1160,1082,1099,1131,1050,1002,1035,1071,1026,1135,1146,1103,1139,1107,1092,1090,1011,1080,1029,1044,1102,1076,1134,1081,1018,1049,1003,1156,1098,1038,1130,1094,1140,1008,1108,1101,1133,1033,1093,1129,1004,1088)) && type_report#6932 IN (1,2))
: +- InMemoryTableScan [companyid#6925L, datestats#6931L, siteid#6935, type_report#6932], [isnotnull(datestats#6931L), (datestats#6931L >= 1528156800), (datestats#6931L <= 1528934400), siteid#6935 INSET (1105,1031,1110,1005,1073,1041,1137,1095,1001,1132,1013,1100,1040,1109,1136,1020,1051,1104,1047,1091,1021,1042,1032,1159,1128,1106,1111,1006,1138,1160,1082,1099,1131,1050,1002,1035,1071,1026,1135,1146,1103,1139,1107,1092,1090,1011,1080,1029,1044,1102,1076,1134,1081,1018,1049,1003,1156,1098,1038,1130,1094,1140,1008,1108,1101,1133,1033,1093,1129,1004,1088), type_report#6932 IN (1,2)]
: +- InMemoryRelation [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields], true, 10000, StorageLevel(disk, memory, deserialized, 1 replicas), `global_summary_partitioned_20180101_20181231_cache`
: +- Exchange hashpartitioning(partition_key#6974, type#6938, 9)
: +- HiveTableScan [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields], HiveTableRelation `metrixa_global_database`.`global_summary_partitioned_20180101_20181231`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [networkid#6924, companyid#6925L, clicktime#6926L, countryid#6927, languageid#6928, sourceurlid#6929, regionid#6930, datestats#6931L, type_report#6932, cityid#6933, ismobiledevice#6934, siteid#6935, affiliateid#6936, accountid#6937L, type#6938, deviceid#6939, countrycode#6940, countryname#6941, regionname#6942, regioncode#6943, language#6944, city#6945, conversions#6946L, uniquerevenuebydate#6947, ... 27 more fields]
+- *(4) Sort [companyid#10328L ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(companyid#10328L, 9)
+- *(3) Filter companyid#10328L INSET (138,101,115,56,142,42,52,110,57,78,106,132,133,60,102,70,137,65,109,77,41,134,105,34,54,144,103,140,91,108,130,135,80,112,63,67,11,72,143,43,104,55,75,58,107,136,79,131,90)
+- HiveTableScan [companyid#10328L], HiveTableRelation `mtx_system`.`tbl_um_company_orc`, org.apache.hadoop.hive.ql.io.orc.OrcSerde, [companyid#10328L, name#10329, officelocation#10330, address#10331, city#10332, countrycode#10333, countrycodes#10334, contactemail#10335, contactphone#10336, contactmobile#10337, level#10338, status#10339, optimisationstatus#10340, issoftdelete#10341, userid#10342, sitenumber#10343, accountnumber#10344]
Кто-нибудь может дать мне несколько идей?
Спасибо