在 Hive 中使用 XmlSerDe 解析 XML 文件时出现问题
下面是我的输入XML 在 Hive 中使用 XmlSerDe 解析 XML 文件时出现问题,xml,hadoop,hive,hiveql,bigdata,Xml,Hadoop,Hive,Hiveql,Bigdata,下面是我的输入XML <entity> <link idType="ProviderId">AEY000977645</link> <link idType="PAID">000977645</link> <link idType="PID">AEY</link> <message reason="Not Currently In TMS Database" status="Unmappa
<entity>
<link idType="ProviderId">AEY000977645</link>
<link idType="PAID">000977645</link>
<link idType="PID">AEY</link>
<message reason="Not Currently In TMS Database" status="Unmappable"/>
</entity>
AEY000977645
000977645
AEY
我需要解析这些数据,并使用 Hive XmlSerDe 创建一个包含 4 列(ProviderID、PAID、PID、message_reason)的 Hive 表。因为所有的值都在同名的 link 标记内,只能靠 idType 属性区分,所以很难解析数据。下面是我的建表 DDL
-- Question DDL (as posted by the asker): external Hive table over XML files
-- under /input/, read through the IBM XmlSerDe / XmlInputFormat classes.
-- Each column is populated from the XPath given in its "column.xpath.<name>"
-- SerDe property.
--
-- NOTE(review): provider_id, paid and pid all use the same XPath,
-- /entity/link/@idType. That expression selects the idType attribute of the
-- <link> elements, not the text of one particular <link>, so the three
-- columns cannot end up with the three different values the asker wants.
-- This is the open problem of the question; the statement is kept as posted.
CREATE EXTERNAL TABLE xml_testing
(
provider_id String,
paid String,
pid String,
message_reason String
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
"column.xpath.provider_id"="/entity/link/@idType", -- asker: not sure what XPath to give here
"column.xpath.paid"="/entity/link/@idType", -- asker: not sure what XPath to give here
"column.xpath.pid"="/entity/link/@idType", -- asker: not sure what XPath to give here
"column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
-- Start/end tags handed to XmlInputFormat; they match the <entity> element
-- of the sample input.
TBLPROPERTIES (
"xmlinput.start"="<entity>",
"xmlinput.end"="</entity>"
);
-- Question DDL, restored from a machine-translated copy in which the SQL
-- keywords, identifiers and quotes had been translated/garbled and the
-- xmlinput.start / xmlinput.end values had been stripped.
--
-- External Hive table over XML files under /input/, read through the IBM
-- XmlSerDe / XmlInputFormat classes. Each column is populated from the XPath
-- given in its "column.xpath.<name>" SerDe property.
--
-- NOTE(review): this is the asker's original attempt. provider_id, paid and
-- pid all use /entity/link/@idType, which selects the idType attribute of the
-- <link> elements rather than the text of one specific <link>; this is the
-- problem the question asks about.
CREATE EXTERNAL TABLE xml_testing
(
    provider_id    STRING,
    paid           STRING,
    pid            STRING,
    message_reason STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    -- asker: not sure what XPath to give for the three <link> columns
    "column.xpath.provider_id"="/entity/link/@idType",
    "column.xpath.paid"="/entity/link/@idType",
    "column.xpath.pid"="/entity/link/@idType",
    "column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
-- Start/end tags handed to XmlInputFormat; they match the <entity> element
-- of the sample input.
TBLPROPERTIES (
    "xmlinput.start"="<entity>",
    "xmlinput.end"="</entity>"
);
有人能帮我吗?回答:按属性值选择元素,例如 /entity/link[@idType='ProviderId']/text()
-- Answer DDL, restored from a machine-translated copy in which the SQL
-- keywords, identifiers and quotes had been translated/garbled, the
-- xmlinput.start / xmlinput.end values had been stripped, and the idType
-- literals had been lowercased.
--
-- External Hive table over XML files under /input/, read through the IBM
-- XmlSerDe / XmlInputFormat classes. Each <link> column selects the one
-- <link> element whose idType attribute equals the given literal and takes
-- its text content.
CREATE EXTERNAL TABLE xml_testing
(
    provider_id    STRING,
    paid           STRING,
    pid            STRING,
    message_reason STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    -- XPath string comparison is case-sensitive: the literals must match the
    -- idType values in the input exactly ('ProviderId', 'PAID', 'PID').
    "column.xpath.provider_id"="/entity/link[@idType='ProviderId']/text()",
    "column.xpath.paid"="/entity/link[@idType='PAID']/text()",
    "column.xpath.pid"="/entity/link[@idType='PID']/text()",
    -- message has no text content; the value is read from its reason attribute.
    "column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
-- Start/end tags handed to XmlInputFormat; they match the <entity> element
-- of the sample input.
TBLPROPERTIES (
    "xmlinput.start"="<entity>",
    "xmlinput.end"="</entity>"
);
与其定义三列,不如将一列定义为映射
-- External Hive table over XML files under /input/, read through the IBM
-- XmlSerDe / XmlInputFormat classes. Every column is filled from the XPath
-- given in its "column.xpath.<name>" SerDe property.
CREATE EXTERNAL TABLE xml_testing (
    provider_id    STRING,
    paid           STRING,
    pid            STRING,
    message_reason STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    -- Pick the <link> whose idType attribute equals the literal, then take
    -- its text content.
    "column.xpath.provider_id"="/entity/link[@idType='ProviderId']/text()",
    "column.xpath.paid"="/entity/link[@idType='PAID']/text()",
    "column.xpath.pid"="/entity/link[@idType='PID']/text()",
    -- The message value is read from the reason attribute.
    "column.xpath.message_reason"="/entity/message/@reason"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/input/'
-- Start/end tags handed to XmlInputFormat; they match the <entity> element
-- of the sample input.
TBLPROPERTIES (
    "xmlinput.start"="<entity>",
    "xmlinput.end"="</entity>"
);