Hive XML SerDe:为键值对创建表(CREATE TABLE)
XML 中包含键值对组合。我们试图将 id 拆分为列名,将 value 拆分为各列对应的数据。标签:xml、hadoop、xpath、hive、hive-serde。示例 XML 如下:
`<CT> <items> <item> <field> <id>Column1</id> <value>25672</value> </field>
<field> <id>Column2</id> <value>FGE</value> </field> <field>
<id>Column3</id> <value>Florence to Venice</value> </field> </item>
</items>
</CT>`
我们尝试使用Map来提取键值对,但没有得到期望的结果
-- Attempt from the question: expose every /CT/items/item/field element of a
-- <CT> document as one entry of an array of maps, using the XML SerDe map
-- specification "#id->#value".
--
-- xmlinput.start / xmlinput.end delimit one XML record for XmlInputFormat;
-- they are set to the <CT> root element of the sample document (they were
-- empty strings before, which gives the input format nothing to split on).
--
-- NOTE(review): the output reported for this table is
--   [{"field":"Column125672"}, ...]
-- i.e. the <id> and <value> texts come back concatenated under the key
-- "field", so this map specification does not yield id -> value entries.
-- Presumably the specification cannot address child elements; confirm
-- against the SerDe documentation before relying on it.
CREATE EXTERNAL TABLE dev.reference_test(
    PM_SubCollection array<map<string,string>>
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES(
    "column.xpath.PM_SubCollection"="/CT/items/item/field",
    "xml.map.specification.id"="#id->#value"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/dev/reference_test'
TBLPROPERTIES (
    "xmlinput.start"="<CT>",
    "xmlinput.end"="</CT>"
);
Output:
'[{"field":"Column125672"},{"field":"Column2FGE"},{"field":"Column3Florence to Venice"}]'
CREATE EXTERNAL TABLE dev.reference_test(
PM_SubCollection array<map<string,string>>
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES(
"column.xpath.PM_SubCollection"="/CT/items/item/field",
"xml.map.specification.id"="#id->#value"
)
STORED AS
INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION '/dev/reference_test'
TBLPROPERTIES (
"xmlinput.start"="",
"xmlinput.end"=""
);
输出:
[{"field":"Column125672"},{"field":"Column2FGE"},{"field":"Column3Florence to Venice"}]
任何建议都会有所帮助。请参见:
如果还需要从下面的 XML 中同时提取 messageID 以及各字段的 id 和 value,该怎么做?
<?xml version="1.0" encoding="UTF-8"?>
<CT>
<messageID>11736</messageID>
<items>
<item>
<field>
<id>Column1</id>
<value>25672</value>
</field>
<field>
<id>Column2</id>
<value>FGE</value>
</field>
<field>
<id>Column3</id>
<value>Florence to Venice</value>
</field>
</item>
</items>
</CT>
这是做不到的:列必须事先已知。感谢您的回复。如果我们事先知道这些列,并创建包含预期列的表,能否把 value 部分映射到相应的列?
<?xml version="1.0" encoding="UTF-8"?>
<CT>
<messageID>11736</messageID>
<items>
<item>
<field>
<id>Column1</id>
<value>25672</value>
</field>
<field>
<id>Column2</id>
<value>FGE</value>
</field>
<field>
<id>Column3</id>
<value>Florence to Venice</value>
</field>
</item>
</items>
</CT>
-- Start from a clean slate so the script can be re-run.
DROP TABLE IF EXISTS xml_42a;

-- One row per <CT>...</CT> record:
--   message_id  text of /CT/messageID
--   fields      one struct per /CT/items/item/field element, each wrapping
--               the element's <id> and <value> children
CREATE TABLE xml_42a (
    message_id STRING,
    fields     ARRAY<STRUCT<field:STRUCT<id:STRING,value:STRING>>>
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    "column.xpath.message_id"="/CT/messageID/text()",
    "column.xpath.fields"="/CT/items/item/field"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
TBLPROPERTIES (
    "xmlinput.start"="<CT>",
    "xmlinput.end"="</CT>"
);
-- Replace the table's contents with the sample document from the local filesystem.
LOAD DATA LOCAL INPATH '/Users/dvasilen/Misc/XML/42a.xml'
OVERWRITE INTO TABLE xml_42a;

-- Ad-hoc check of the parsed rows.
SELECT *
FROM xml_42a;
hive>
> select * from xml_42a;
OK
11736 [{"field":{"id":"Column1","value":"25672"}},{"field":{"id":"Column2","value":"FGE"}},{"field":{"id":"Column3","value":"Florence to Venice"}}]
Time taken: 0.08 seconds, Fetched: 1 row(s)