Python:将 pandas DataFrame 转换为分层 XML
将 CSV 读取为 DataFrame,然后使用 lxml 库将其转换为 XML。这是我第一次处理 XML,似乎只取得了部分成功。非常感谢您的帮助。用于创建 DataFrame 的 CSV 文件:Python:将 pandas DataFrame 转换为分层 XML(标签:python, xml, pandas, lxml)。将 CSV 读取为 DataFrame,然后使用 lxml 库将其转换为 XML。这是我第一次处理 XML,似乎只取得了部分成功。非常感谢您的帮助。用于创建 DataFrame 的 CSV 文件: import lxml.etree as etree  import pandas as pd  import json  # 读取csv文件  dfc = pd.read_csv('test_data_txlife.csv').fillna('NA')  ## 删除带有注释的行  # dfc = dfc[~dfc['Element'].str.contains("<cyfunction")].fillna('')  您已经有了一个很好的开始!我认……
import lxml.etree as etree
import pandas as pd
import json
# 读取csv文件
dfc = pd.read_csv('test_data_txlife.csv').fillna('NA')
##删除带有注释的行
# dfc = dfc[~dfc['Element'].str.contains("<cyfunction")].fillna('')
您已经有了一个很好的开始!我认为最简单的办法是逐段检查代码,解释其中需要调整的地方,并提出一些改进建议:
读取和清理数据
# 读取csv文件
dfc = pd.read_csv('test_data_txlife.csv').fillna('NA')
## 删除带有注释的行
# dfc = dfc[~dfc['Element'].str.contains("<cyfunction")].fillna('')
非常感谢您的帮助。通过这个很棒的回答,我学到了一些新东西。
不客气!
Parent,Element,Text,Attribute
,TXLife,"
",{'Version': '2.25.00'}
TXLife,UserAuthRequest,"
",{}
UserAuthRequest,UserLoginName,*****,{}
UserAuthRequest,UserPswd,"
",{}
UserPswd,CryptType,None,{}
UserPswd,Pswd,****,{}
TXLife,TXLifeRequest,"
",{'PrimaryObjectID': 'Policy_1'}
TXLifeRequest,TransRefGUID,706D67C1-CC4D-11CF-91FB444554540000,{}
TXLifeRequest,TransType,Holding Change,{'tc': '502'}
TXLifeRequest,TransExeDate,2006-11-19,{}
TXLifeRequest,TransExeTime,13:15:33-07:00,{}
TXLifeRequest,ChangeSubType,"
",{}
ChangeSubType,ChangeTC,Change Participant,{'tc': '9'}
TXLifeRequest,OLifE,"
",{}
OLifE,Holding,"
",{'id': 'Policy_1'}
Holding,HoldingTypeCode,Policy,{'tc': '2'}
Holding,Policy,"
",{}
Policy,PolNumber,1234567,{}
Policy,LineOfBusiness,Annuity,{'tc': '2'}
Policy,Annuity,,{}
OLifE,Party,"
",{'id': 'Beneficiary_1'}
Party,PartyTypeCode,Organization,{'tc': '2'}
Party,FullName,The Smith Trust,{}
Party,Organization,"
",{}
Organization,OrgForm,Trust,{'tc': '16'}
Organization,DBA,The Smith Trust,{}
OLifE,Relation,"
","{'id': 'Relation_1', 'OriginatingObjectID': 'Policy_1', 'RelatedObjectID': 'Beneficiary_1'}"
Relation,OriginatingObjectType,Holding,{'tc': '4'}
Relation,RelatedObjectType,Party,{'tc': '6'}
Relation,RelationRoleCode,Primary Beneficiary,{'tc': '34'}
Relation,BeneficiaryDesignation,Named,{'tc': '1'}
import lxml.etree as etree
import pandas as pd
import json
# Question's original attempt: build an XML tree from the Parent/Element/Text/Attribute
# rows of the CSV and print it. Kept as posted; the notes below explain why the
# printed tree comes out flat instead of nested.

# Read the csv file; every missing cell is replaced by the string 'NA'.
dfc = pd.read_csv('test_data_txlife.csv') .fillna('NA')
# # Remove rows with comments
# dfc = dfc[~dfc['Element'].str.contains("<cyfunction")].fillna('')
# Swap single quotes for double quotes so that json.loads() in the loop below
# can parse the Attribute cells.
dfc['Attribute'] = dfc['Attribute'].apply(lambda x: x.replace("'", '"'))
# Add the root element for xml.
# NOTE: only the first row's Element value is used; its Attribute value is not
# passed, so the root is created without attributes.
root = etree.Element(dfc['Element'][0])
tree = root.getroottree()
# Walk every row after the first one.
for prnt, elem, txt, attr in dfc[['Parent', 'Element', 'Text', 'Attribute']][1:].values:
    # Convert attributes to json (dictionary)
    attrib = json.loads(attr)
    # list(root) = root.getchildren()
    # The list of root's direct children is turned into its string form and split
    # on spaces; the membership test below looks for `prnt` among those tokens.
    # NOTE(review): this relies on the element repr having the form
    # "<Element TAG at 0x...>" - confirm for the lxml version in use.
    children = [item for item in str(list(root)).split(' ')]
    # Second space-separated token of the root's string form (compared with `prnt` below).
    rootstring = str(root).split(' ')[1]
    # If the parent is root then add the element as child (appears to work?)
    # NOTE: neither `attrib` nor `txt` is applied in this branch.
    if prnt == str(root).split(' ')[1]:
        parent = etree.SubElement(root, elem)
    # If the parent is not root but is one of its children then add the elements to the parent
    # NOTE: the new element is attached to `parent`, i.e. the element created by the
    # most recent run of the first branch, not to the element named by `prnt`.
    # The chained assignment binds both `child` and the new element's .text to `txt`,
    # so `child` ends up holding the text, not the element.
    # `parent` is only assigned in the first branch, so it must have run at least once.
    elif not prnt == rootstring and prnt in children:
        child = etree.SubElement(parent, elem, attrib).text = txt
    # If the parent is not in root's descendants then add the children to the parents
    # NOTE: each entry of the list built here is itself a list (the result of
    # split), so the string `prnt` is never "in" it and this condition is always
    # true. Every row that reaches this branch is therefore attached to the same
    # `parent` as above.
    elif not prnt in [str(item).split(' ') for item in root.iterdescendants()]:
        child = etree.SubElement(parent, elem, attrib).text = txt
print(etree.tostring(tree, pretty_print=True).decode())
<TXLife>
<UserAuthRequest>
<UserLoginName>*****</UserLoginName>
<UserPswd>
</UserPswd>
<CryptType>None</CryptType>
<Pswd>xxxxxx</Pswd>
</UserAuthRequest>
<TXLifeRequest>
<TransRefGUID>706D67C1-CC4D-11CF-91FB444554540000</TransRefGUID>
<TransType tc="502">Holding Change</TransType>
<TransExeDate>11/19/2006</TransExeDate>
<TransExeTime>13:15:33-07:00</TransExeTime>
<ChangeSubType>
</ChangeSubType>
<ChangeTC tc="9">Change Participant</ChangeTC>
<OLifE>
</OLifE>
<Holding id="Policy_1">
</Holding>
<HoldingTypeCode tc="2">Policy</HoldingTypeCode>
<Policy>
</Policy>
<PolNumber>1234567</PolNumber>
<LineOfBusiness tc="2">Annuity</LineOfBusiness>
<Annuity>NA</Annuity>
<Party id="Beneficiary_1">
</Party>
<PartyTypeCode tc="2">Organization</PartyTypeCode>
<FullName>The Smith Trust</FullName>
<Organization>
</Organization>
<OrgForm tc="16">Trust</OrgForm>
<DBA>The Smith Trust</DBA>
<Relation OriginatingObjectID="Policy_1" RelatedObjectID="Beneficiary_1" id="Relation_1">
</Relation>
<OriginatingObjectType tc="4">Holding</OriginatingObjectType>
<RelatedObjectType tc="6">Party</RelatedObjectType>
<RelationRoleCode tc="34">Primary Beneficiary</RelationRoleCode>
<BeneficiaryDesignation tc="1">Named</BeneficiaryDesignation>
</TXLifeRequest>
</TXLife>
<TXLife Version="2.25.00">
<UserAuthRequest>
<UserLoginName>*****</UserLoginName>
<UserPswd>
<CryptType>None</CryptType>
<Pswd>****</Pswd>
</UserPswd>
</UserAuthRequest>
<TXLifeRequest PrimaryObjectID="Policy_1">
<TransRefGUID>706D67C1-CC4D-11CF-91FB444554540000</TransRefGUID>
<TransType tc="502">Holding Change</TransType>
<TransExeDate>2006-11-19</TransExeDate>
<TransExeTime>13:15:33-07:00</TransExeTime>
<ChangeSubType>
<ChangeTC tc="9">Change Participant</ChangeTC>
</ChangeSubType>
<OLifE>
<Holding id="Policy_1">
<HoldingTypeCode tc="2">Policy</HoldingTypeCode>
<Policy>
<PolNumber>1234567</PolNumber>
<LineOfBusiness tc="2">Annuity</LineOfBusiness>
<Annuity></Annuity>
</Policy>
</Holding>
<Party id="Beneficiary_1">
<PartyTypeCode tc="2">Organization</PartyTypeCode>
<FullName>The Smith Trust</FullName>
<Organization>
<OrgForm tc="16">Trust</OrgForm>
<DBA>The Smith Trust</DBA>
</Organization>
</Party>
<Relation id="Relation_1" OriginatingObjectID="Policy_1" RelatedObjectID="Beneficiary_1">
<OriginatingObjectType tc="4">Holding</OriginatingObjectType>
<RelatedObjectType tc="6">Party</RelatedObjectType>
<RelationRoleCode tc="34">Primary Beneficiary</RelationRoleCode>
<BeneficiaryDesignation tc="1">Named</BeneficiaryDesignation>
</Relation>
</OLifE>
</TXLifeRequest>
</TXLife>
import lxml.etree as etree
import pandas as pd
import json
def parse_attributes(raw):
    """Convert one CSV 'Attribute' cell into a dict of XML attributes.

    The cell holds a Python dict literal such as ``{'tc': '2'}``. It is parsed
    with ``ast.literal_eval`` rather than by swapping quote characters and
    calling ``json.loads``, so values containing an apostrophe still parse.

    Args:
        raw: Cell content as a string; an empty/blank cell means "no attributes".

    Returns:
        A dict mapping attribute names to string values.

    Raises:
        ValueError: If the cell is not a dict literal.
    """
    import ast  # local import: only this helper needs it

    raw = raw.strip()
    if not raw:
        return {}
    try:
        attributes = ast.literal_eval(raw)
    except (SyntaxError, ValueError) as err:
        raise ValueError("Cannot parse attribute cell: {!r}".format(raw)) from err
    if not isinstance(attributes, dict):
        raise ValueError("Attribute cell is not a dict literal: {!r}".format(raw))
    # XML attribute names and values must be strings.
    return {str(key): str(value) for key, value in attributes.items()}


def clean_text(raw):
    """Normalise one CSV 'Text' cell into an element's text value.

    Args:
        raw: Cell content as a string.

    Returns:
        ``""`` for an empty cell (so the element is written with an explicit
        open/close tag pair), ``None`` for a whitespace-only cell (container
        rows in the CSV hold just a newline), otherwise the stripped text.
    """
    if raw == "":
        return ""
    return raw.strip() or None


def build_tree(frame):
    """Build the XML tree described by a Parent/Element/Text/Attribute frame.

    The first row describes the root element (its Text cell is not used).
    Every later row is attached to the most recently created element whose tag
    equals the row's Parent value, so repeated tags receive their own children
    as long as children follow their parent in the CSV.

    Args:
        frame: DataFrame with string columns 'Parent', 'Element', 'Text'
            and 'Attribute'.

    Returns:
        The root element of the new tree.

    Raises:
        ValueError: If the frame has no rows, or a row names a parent that
            has not been created yet.
    """
    # Select the columns by name so the result does not depend on column order.
    rows = frame[['Parent', 'Element', 'Text', 'Attribute']].itertuples(
        index=False, name=None
    )
    first = next(rows, None)
    if first is None:
        raise ValueError("CSV contains no rows")
    _, root_tag, _, root_attr = first

    # Add the root element for xml, including its attributes.
    root = etree.Element(root_tag, parse_attributes(root_attr))

    # Most recently created element for each tag name.
    latest = {root.tag: root}
    for parent_tag, tag, text, attr in rows:
        parent = latest.get(parent_tag)
        # Compare with None explicitly: the truth value of an element is not
        # a reliable "found" test.
        if parent is None:
            raise ValueError(
                "Parent {!r} of element {!r} has not been defined".format(parent_tag, tag)
            )
        child = etree.SubElement(parent, tag, parse_attributes(attr))
        child.text = clean_text(text)
        latest[tag] = child
    return root


def main():
    """Read the CSV file, convert it to XML and print the result."""
    # Read every cell as a plain string and keep empty cells as "". Without
    # this, empty cells become NaN and literal values such as "NA" may be
    # treated as missing, which previously required a fragile ">NA<" text
    # replacement on the serialized XML.
    dfc = pd.read_csv('test_data_txlife.csv', dtype=str, keep_default_na=False)
    root = build_tree(dfc)
    xml_string = etree.tostring(root, pretty_print=True).decode()
    print(xml_string)


if __name__ == "__main__":
    main()