Приведённые ниже шаги могут помочь:
используйте hivexmlserde из Maven https://mvnrepository.com/artifact/com.ibm.spss.hive.serde2.xml/hivexmlserde
скопируйте входной XML-файл в каталог HDFS, на который указывает внешняя таблица Hive
# Create the table's HDFS directory first (no-op if it already exists), so that
# emp.xml is placed inside it rather than being written as a file named "employee".
hadoop fs -mkdir -p /stackoverflow/data/hive/dwh/employee
# Upload the local XML input file into that directory.
hadoop fs -copyFromLocal emp.xml /stackoverflow/data/hive/dwh/employee
добавьте JAR-файл XML SerDe в консоли Hive, создайте таблицу с помощью DDL и выполните запрос, как показано ниже.
-- Register the XML SerDe jar for the current Hive session.
ADD JAR file:///home/sathya/Downloads/hivexmlserde-1.0.5.3.jar;
Added [file:///home/sathya/Downloads/hivexmlserde-1.0.5.3.jar] to class path
Added resources: [file:///home/sathya/Downloads/hivexmlserde-1.0.5.3.jar]
-- External Hive table over the XML files stored under the HDFS LOCATION below.
-- Records are delimited by the xmlinput.start / xmlinput.end markers
-- (<employment ... </employment>); each column is filled from the XPath
-- expression given in its column.xpath.<column> SerDe property.
-- OUTPUTFORMAT is IgnoreKeyTextOutputFormat, the text output format that the
-- hivexmlserde documentation pairs with XmlInputFormat; a Parquet output format
-- would make any INSERT write Parquet files into a directory that is read back
-- as XML text.
-- NOTE(review): hiredDate is NULL in both sample result rows; verify that
-- /employment/hiredDate/TillNo matches the actual XML structure -- TODO confirm.
CREATE EXTERNAL TABLE employee (
    `employer_name` string,
    `occupation`    string,
    `hiredDate`     string,
    `fileDate`      string,
    `effDate`       string
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    "column.xpath.employer_name"="/employment/employer/unparsed/text()",
    "column.xpath.occupation"="/employment/occupation/text()",
    "column.xpath.hiredDate"="/employment/hiredDate/TillNo/text()",
    "column.xpath.fileDate"="/employment/fileDate/text()",
    "column.xpath.effDate"="/employment/effDate/text()"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION 'hdfs://localhost:9000/stackoverflow/data/hive/dwh/employee'
TBLPROPERTIES (
    "xmlinput.start"="<employment",
    "xmlinput.end"="</employment>"
);
-- Verify the XML mapping: list every column explicitly, in DDL order.
SELECT
    employer_name,
    occupation,
    hiredDate,
    fileDate,
    effDate
FROM employee;
EMPLOYER-2 NULL NULL 2020-07-21 2020-07-21
EMPLOYER-1 NURSE NULL 2015-08-07 2015-08-07