Мы столкнулись с проблемой большого количества мелких файлов в партициях таблицы Hive: она приводит к неравномерному распределению данных и снижению производительности.
Не могли бы вы подсказать, как избежать этих проблем?

![enter image description here](https://i.stack.imgur.com/E0XFL.png)
Код Hive:
-- Session settings for the dynamic-partition load that follows.

-- Allow partition values (dt, hr) to be taken from the query result.
SET hive.exec.dynamic.partition = TRUE;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Container sizes and JVM heaps for map and reduce tasks.
-- Each heap (4608 MB) is kept below its container size (5120 MB).
SET mapreduce.map.memory.mb = 5120;
SET mapreduce.reduce.memory.mb = 5120;
SET mapreduce.map.java.opts = -Xmx4608m;
-- Fixed: the property name was misspelled as mapreduce.redu.java.opts,
-- so the reducer heap setting was never applied.
SET mapreduce.reduce.java.opts = -Xmx4608m;

-- Snappy-compressed output.
SET parquet.compression = SNAPPY;
SET hive.exec.compress.output = true;
-- Fixed: the property name contained a non-ASCII word in place of
-- the compression segment, so the codec setting was never applied.
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type = BLOCK;

-- Small-file merging.
-- NOTE(review): hive.merge.mapfiles covers map-only jobs. If this load ever
-- gains a reduce stage, hive.merge.mapredfiles is the matching switch - confirm.
SET hive.merge.mapfiles = true;
-- NOTE(review): the merge step is only triggered when the average output file
-- size is below this threshold (about 1 MB here), which looks low for a
-- small-files problem - confirm the intended value.
SET hive.merge.smallfiles.avgsize = 1000000;
SET hive.merge.size.per.task = 128000000;
-- Overwrites tn_1, using dynamic partitions (dt, hr), from
-- dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud.
--
-- The inner query (tab1) renames the source columns to the target names,
-- adds MD5 hashes of imsi, msisdn, imei and client_ip, and derives the ECF_*
-- audit columns. The outer query projects those columns positionally, with the
-- dynamic partition columns dt and hr last.
--
-- NOTE(review): the original text of this statement was corrupted (translated
-- identifiers, stray line-number fragments, truncated lines). It has been
-- reconstructed from the mirrored inner and outer column lists. Entries marked
-- TODO(review) below could not be recovered from the text and must be checked
-- against the source and target table definitions before running.
--
-- NOTE(review): this settings-plus-query combination is map-only, so file
-- count per partition follows the number of mappers. Adding DISTRIBUTE BY on
-- the partition columns is a common way to reduce small files - evaluate
-- against the skew of dt and hr before adopting.
INSERT OVERWRITE TABLE tn_1 PARTITION (dt, hr)
SELECT
    tab1.proc_seq_no,
    tab1.accs_pont_nm,
    tab1.mobl_cnty_cd,
    tab1.mobl_ntwk_cd,
    tab1.host_url_id,
    tab1.refr_url_id,
    tab1.user_agnt_id,
    tab1.qry_domn_nm,
    tab1.rtsp_user_agnt_id,
    tab1.smtp_frst_err_cd,
    tab1.pop3_frst_fail_resp_cd,
    tab1.qsms_appl_stck_nm,
    tab1.qsms_appl_fmly_nm,
    tab1.imsi_id,
    tab1.imsi_id_pii_md5,
    tab1.msisdn_id,
    tab1.msisdn_id_pii_md5,
    tab1.imei_id,
    tab1.imei_id_pii_md5,
    tab1.ntwk_tech_type_cd,
    tab1.locl_ts,
    tab1.utc_ts,
    tab1.caus_cd,
    tab1.frst_url_id,
    tab1.rtsp_url_id,
    tab1.dvic_modl_nm,
    tab1.dvic_mnfc_nm,
    tab1.clnt_ip_id,
    tab1.clnt_ip_id_pii_md5,
    tab1.serv_ip_id,
    tab1.subs_type_cd,
    tab1.rslv_ip_id,
    tab1.min_flow_strt_ts,
    tab1.max_flow_end_ts,
    tab1.tcp_mdte_cnnc_qty,
    tab1.tcp_mdte_rtt_qty,
    -- TODO(review): upld_totl_time_qty is inferred as the counterpart of
    -- dwld_totl_time_qty - the original entry was truncated.
    tab1.upld_totl_time_qty,
    tab1.dwld_totl_time_qty,
    tab1.upld_totl_byte_qty,
    tab1.dwld_totl_byte_qty,
    tab1.upld_medn_rtr_qty,
    tab1.dwld_medn_rtr_qty,
    tab1.http_1xx_resp_qty,
    tab1.http_2xx_resp_qty,
    tab1.http_3xx_resp_qty,
    tab1.http_4xx_resp_qty,
    tab1.http_5xx_resp_qty,
    tab1.rtsp_1xx_resp_qty,
    tab1.rtsp_2xx_resp_qty,
    tab1.rtsp_3xx_resp_qty,
    tab1.rtsp_4xx_resp_qty,
    tab1.rtsp_5xx_resp_qty,
    tab1.ftp_file_trnf_qty,
    tab1.imap_mail_qty,
    tab1.smtp_recv_qty,
    tab1.pop3_mail_qty,
    tab1.tran_qty,
    tab1.upld_medn_tput_qty,
    tab1.dwld_medn_tput_qty,
    tab1.ECF_XTRC_TS,
    tab1.ECF_DELT_FLG,
    tab1.ECF_NSRT_PROC_RUN_NO,
    tab1.ECF_SRCE_SYST_CD,
    tab1.ECF_OPEN_TS,
    tab1.ECF_CLSE_TS,
    tab1.ECF_NSRT_TS,
    tab1.dt,
    tab1.hr
FROM (
    SELECT
        proc_seq_nr AS proc_seq_no,
        apn AS accs_pont_nm,
        mcc AS mobl_cnty_cd,
        mnc AS mobl_ntwk_cd,
        host AS host_url_id,
        -- TODO(review): referer, query_name and cause_code (further down) were
        -- restored from translated words - confirm the exact column names.
        referer AS refr_url_id,
        ua AS user_agnt_id,
        query_name AS qry_domn_nm,
        rtsp_ua AS rtsp_user_agnt_id,
        smtp_first_err_code AS smtp_frst_err_cd,
        -- TODO(review): the next two source names are inferred from the naming
        -- pattern of their neighbours - the original line was truncated.
        pop3_first_fail_resp_code AS pop3_frst_fail_resp_cd,
        qosmos_app_stack AS qsms_appl_stck_nm,
        qosmos_app_family AS qsms_appl_fmly_nm,
        imsi AS imsi_id,
        MD5(imsi) AS imsi_id_PII_MD5,
        msisdn AS msisdn_id,
        MD5(msisdn) AS msisdn_id_PII_MD5,
        imei AS imei_id,
        MD5(imei) AS imei_id_PII_MD5,
        -- TODO(review): source column name for ntwk_tech_type_cd was lost - the
        -- name below is a placeholder.
        network_tech_type AS ntwk_tech_type_cd,
        CAST(local_timestamp AS TIMESTAMP) AS locl_ts,
        CAST(utc_timestamp AS TIMESTAMP) AS utc_ts,
        cause_code AS caus_cd,
        url AS frst_url_id,
        rtsp_url AS rtsp_url_id,
        device_model AS dvic_modl_nm,
        device_manufacturer AS dvic_mnfc_nm,
        client_ip AS clnt_ip_id,
        MD5(client_ip) AS clnt_ip_id_PII_MD5,
        server_ip AS serv_ip_id,
        subscriber_type AS subs_type_cd,
        resolved_ip_add AS rslv_ip_id,
        min_flow_start AS min_flow_strt_ts,
        max_flow_end AS max_flow_end_ts,
        tcp_med_conn_time AS tcp_mdte_cnnc_qty,
        tcp_med_rtt AS tcp_mdte_rtt_qty,
        -- TODO(review): source column names for the two transfer-time columns
        -- were lost - the names below are placeholders.
        data_total_up_time AS upld_totl_time_qty,
        data_total_down_time AS dwld_totl_time_qty,
        data_total_up_vol AS upld_totl_byte_qty,
        data_total_down_vol AS dwld_totl_byte_qty,
        data_med_up_rtr_cnt AS upld_medn_rtr_qty,
        data_med_down_rtr_cnt AS dwld_medn_rtr_qty,
        http_1xx_resp AS http_1xx_resp_qty,
        http_2xx_resp AS http_2xx_resp_qty,
        http_3xx_resp AS http_3xx_resp_qty,
        http_4xx_resp AS http_4xx_resp_qty,
        http_5xx_resp AS http_5xx_resp_qty,
        rtsp_1xx_resp AS rtsp_1xx_resp_qty,
        rtsp_2xx_resp AS rtsp_2xx_resp_qty,
        rtsp_3xx_resp AS rtsp_3xx_resp_qty,
        rtsp_4xx_resp AS rtsp_4xx_resp_qty,
        rtsp_5xx_resp AS rtsp_5xx_resp_qty,
        ftp_file_transfer AS ftp_file_trnf_qty,
        imap_mail_count AS imap_mail_qty,
        smtp_receivers AS smtp_recv_qty,
        pop3_mail_count AS pop3_mail_qty,
        -- TODO(review): source column name for tran_qty was lost - the name
        -- below is a placeholder.
        transaction_count AS tran_qty,
        -- TODO(review): inferred as the counterpart of data_med_down_throughput.
        data_med_up_throughput AS upld_medn_tput_qty,
        data_med_down_throughput AS dwld_medn_tput_qty,
        -- Builds a timestamp for the start of the partition hour from dt
        -- (read as yyyymmdd) and the first two characters of hr.
        CAST(
            CONCAT(
                SUBSTR(DT, 1, 4), '-', SUBSTR(DT, 5, 2), '-', SUBSTR(DT, 7, 2),
                ' ', SUBSTR(HR, 1, 2), ':00:00'
            ) AS TIMESTAMP
        ) AS ECF_XTRC_TS,
        'N' AS ECF_DELT_FLG,
        CAST('1000' AS DECIMAL) AS ECF_NSRT_PROC_RUN_NO,
        -- NOTE(review): padding spaces inside the original literal were treated
        -- as corruption and dropped - confirm the expected value.
        'NDC' AS ECF_SRCE_SYST_CD,
        CURRENT_TIMESTAMP() AS ECF_OPEN_TS,
        -- TODO(review): the original ECF_CLSE_TS expression was lost. NULL is a
        -- neutral placeholder - replace with the real close-timestamp value.
        CAST(NULL AS TIMESTAMP) AS ECF_CLSE_TS,
        CURRENT_TIMESTAMP() AS ECF_NSRT_TS,
        DT,
        HR
    FROM dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud
) tab1