Мне нужно выполнять много заданий, которые содержат множество операций агрегации и соединения (COALESCE, GROUP BY, LEFT JOIN) и проверок условий (if, case, isnull) над несколькими таблицами с более чем 1G строк в Impala. Наш кластер справляется с этими заданиями, но мне кажется, что они занимают слишком много времени: на выполнение некоторых запросов может уходить 5–10 часов.
Мне интересно, какой в целом лучший способ оптимизировать такой тяжёлый, долго выполняющийся запрос?
Типичный запрос похож на приведённый ниже, но меня интересуют именно общие рекомендации, поэтому запрос размещён здесь только для справки.
Буду рад любым идеям и советам.
-- Picks a pmaw value for each driver row with a three-step fallback and returns
-- the distinct combinations of the selected columns:
--   1. w  : pmaw_sheet row matching on date_pull + region key + product key
--   2. w2 : 'MMA pmaw' / 'BMMA pmaw' row of pmaw_sheet for the same date_pull and region
--   3. w3 : max(pmaw) of the 'MMA pmaw' / 'BMMA pmaw' rows for the same date_pull, over all regions
-- NOTE(review): the inline view i also projects id_acct, id_cust and acct_presence,
-- which are not selected here; the outer select list looks abbreviated -- confirm the
-- intended columns, since they determine what "distinct" collapses.
select distinct
    coalesce(w.pmaw, w2.pmaw, w3.pmaw) as pmaw
    -- labels the first join (in coalesce order) that supplied pmaw; the else branch
    -- also covers rows where none of the three joins matched
    ,case
        when w.pmaw is not null then 'pmaw_sheet'
        when w2.pmaw is not null then 'gap_fill_by_mma_in_same_region'
        else 'gap_fill_by_max_mma_of_all_regions'
     end as pmaw_source
    ,case when w.pmaw is not null then 1 else 0 end as is_actual_pmaw
    ,i.date_pull -- partitioned col needs to be last in select
from (
    -- one row per fact_driver row (times any join fan-out), enriched with the
    -- region/product lookup keys and the MMA/BMMA line of business used for gap fill
    select
        dr.date_pull
        ,dr.id_acct
        ,dr.id_cust
        ,dr.acct_presence
        ,pas_rg.leaflkp as dim_pas_region_key
        ,ah_prd.LeafLkp as dim_ah_product_key
        -- 'PER' maps to 'MMA pmaw'; anything else, including no match at all, maps to 'BMMA pmaw'
        ,case
            when coalesce(ah_prd.L4_Stype, pas_prd.l3_category_cd) = 'PER' then 'MMA pmaw'
            else 'BMMA pmaw'
         end as pmaw_mma_lob
    from pasle.${VERSION}_fact_driver dr -- fact_driver: 985164794 rows
    left join pasle.${VERSION}_ah01 ah -- ah01: 894956463 rows
        on dr.date_pull = ah.date_pull
        and dr.id_acct = ah.id_acct
    left join pasle.${VERSION}_pas pas -- pas: 502785263 rows
        on pas.date_pull = dr.date_pull
        and pas.acct_num = dr.id_acct
    left join pasle.${VERSION}_dim_ah_product ah_prd -- 1241 rows
        -- isnull(x, '') on both sides lets NULL (and '') product codes match each other
        on isnull(ah.prod_code_1, '') = isnull(ah_prd.l1_class, '')
        and isnull(ah.prod_code_2, '') = isnull(ah_prd.l2_service, '')
        and isnull(ah.prod_code_3, '') = isnull(ah_prd.l3_ptype, '')
        and isnull(ah.prod_code_4, '') = isnull(ah_prd.l4_stype, '')
        -- remove leading zeroes in prod_code_5 for values that are all numeric to match the dimension table values.
        and isnull(
                case
                    when regexp_like(ah.prod_code_5, '^[0-9]+$') then cast(cast(ah.prod_code_5 as int) as string)
                    else ah.prod_code_5
                end
                ,''
            ) = isnull(ah_prd.appl_product_type, '')
    left join pasle.${VERSION}_dim_pas_product pas_prd -- dim_pas_product: 326 rows
        on pas_prd.product_cd = pas.product_cd
    left join pasle.${VERSION}_dim_pas_region pas_rg -- dim_pas_region: 23 rows
        on cast(pas_rg.bank_cd as string) = pas.bank_cd
        and cast(pas_rg.region_cd as string) = pas.region_cd
) i
-- pmaw straight from pmaw sheet
left join pasle.${VERSION}_pmaw_sheet w -- pmaw_sheet: 315795
    on w.date_pull = i.date_pull
    and w.dim_pas_region_key = i.dim_pas_region_key
    and w.dim_ah_product_key = i.dim_ah_product_key
-- pmaw gap fill case 1 : when an account matches to pmaw sheet by region but not by product, take MMA/BMMA pmaw.
-- the following combinations from acct_presence=both will fall into this category :
--CHECKING DDA
--Retirement IRA
--Retirement QP
--Retirement SAV
--All PAS accounts fall into this category. match on region but not product
left join pasle.${VERSION}_pmaw_sheet w2 -- pmaw_sheet: 315795
    on w2.pmawproduct in ('MMA pmaw', 'BMMA pmaw')
    and w2.date_pull = i.date_pull
    and w2.dim_pas_region_key = i.dim_pas_region_key
    and w2.pmawproduct = i.pmaw_mma_lob
-- pmaw gap fill case 2 : for all AH_only accounts, that don't match product or region, take the max of pmaw.
-- all accounts in ah_only where ah_product_class in (Investment,Loan,Other,Unmapped) and not match to region, we will take max(pmaw) of MMA product of corresponding lob.
left join (
    select
        date_pull
        ,pmawproduct
        ,max(pmaw) as pmaw
    from pasle.${VERSION}_pmaw_sheet -- pmaw_sheet: 315795
    where pmawproduct in ('MMA pmaw', 'BMMA pmaw')
    group by
        date_pull
        ,pmawproduct
) w3
    on w3.date_pull = i.date_pull
    and w3.pmawproduct = i.pmaw_mma_lob