如何在Python(XML模式)中读取/解析.xls文件
如何在python中读取此结构如何在Python(XML模式)中读取/解析.xls文件,python,xml,pandas,xls,Python,Xml,Pandas,Xls,如何在python中读取此结构 <?xml version="1.0" encoding="ISO-8859-1"?> -<Workbook xmlns:html="http://www.w3.org/TR/REC-html40" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:msxsl="urn:schemas-micro
<?xml version="1.0" encoding="ISO-8859-1"?>
<!--
  SpreadsheetML workbook containing a single worksheet named "MSFRE".
  The leading "-" outline markers found in the pasted version (presumably copied from a
  collapsible tree view) have been removed: text before the root element makes the
  document not well-formed, so an XML parser rejects it.
  Cells carry an explicit ss:Index attribute. The data rows below contain no cell with
  ss:Index="7" (the "REF" header column), so cells should be looked up by ss:Index
  rather than by their position inside the Row.
-->
<Workbook xmlns:html="http://www.w3.org/TR/REC-html40"
          xmlns:x="urn:schemas-microsoft-com:office:excel"
          xmlns:msxsl="urn:schemas-microsoft-com:xslt"
          xmlns:o="urn:schemas-microsoft-com:office:office"
          xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xmlns="urn:schemas-microsoft-com:office:spreadsheet">
  <Styles>
    <Style ss:ID="VIEW">
      <Font ss:Bold="1"/>
    </Style>
    <Style ss:ID="HEADER">
      <Font ss:Bold="1" ss:Color="#FFFFFF"/>
      <Interior ss:Color="#666699" ss:Pattern="Solid"/>
    </Style>
    <Style ss:ID="DOUBLE_0">
      <NumberFormat ss:Format="0"/>
    </Style>
    <Style ss:ID="DOUBLE_2">
      <NumberFormat ss:Format="0.00"/>
    </Style>
    <Style ss:ID="DOUBLE_3">
      <NumberFormat ss:Format="0.000"/>
    </Style>
    <Style ss:ID="DOUBLE_4">
      <NumberFormat ss:Format="0.0000"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_2">
      <NumberFormat ss:Format="0.00%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_3">
      <NumberFormat ss:Format="0.000%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_4">
      <NumberFormat ss:Format="0.0000%"/>
    </Style>
    <Style ss:ID="PERCENT_FIXED_5">
      <NumberFormat ss:Format="0.00000%"/>
    </Style>
    <Style ss:ID="DATE">
      <NumberFormat ss:Format="yyyy\-mm\-dd;@"/>
    </Style>
    <ss:Style ss:ID="STRING"/>
  </Styles>
  <Worksheet ss:Name="MSFRE">
    <Table x:FullRows="1" x:FullColumns="1">
      <!-- View title row -->
      <Row>
        <Cell ss:StyleID="VIEW" ss:Index="1">
          <Data ss:Type="String">Geo</Data>
        </Cell>
      </Row>
      <!-- Empty merged header band spanning columns 3 to 8 -->
      <Row>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER" ss:Index="3" ss:MergeAcross="5">
          <Data ss:Type="String"/>
        </Cell>
      </Row>
      <!-- Column headers (columns 3 to 8) -->
      <Row>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER"/>
        <Cell ss:StyleID="HEADER" ss:Index="3">
          <Data ss:Type="String">Holding Date</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="4">
          <Data ss:Type="String">Fund code</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="5">
          <Data ss:Type="String">Fund name</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="6">
          <Data ss:Type="String">PTF</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="7">
          <Data ss:Type="String">REF</Data>
        </Cell>
        <Cell ss:StyleID="HEADER" ss:Index="8">
          <Data ss:Type="String">PTF-REF</Data>
        </Cell>
      </Row>
      <!-- Data rows: no cell with ss:Index="7" is present in any of them -->
      <Row>
        <Cell ss:Index="1">
          <Data ss:Type="String">Geo</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.18545250736645816</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.18545250736645816</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Asia ex-Japan</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">9.356235001537855E-4</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">9.356235001537855E-4</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">EMU</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.10654090959320628</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.10654090959320628</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Emerging Countries</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.00294017805163712</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.00294017805163712</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Europe ex-EMU</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.02354783768818136</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.02354783768818136</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">Japan</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.005898729959204227</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.005898729959204227</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:Index="2">
          <Data ss:Type="String">North America</Data>
        </Cell>
        <Cell ss:StyleID="DATE" ss:Index="3">
          <Data ss:Type="DateTime">2020-09-10T00:00:00</Data>
        </Cell>
        <Cell ss:Index="4">
          <Data ss:Type="String">PF39594</Data>
        </Cell>
        <Cell ss:Index="5">
          <Data ss:Type="String">MSFRE</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="6">
          <Data ss:Type="Number">0.044037874185699856</Data>
        </Cell>
        <Cell ss:StyleID="PERCENT_FIXED_2" ss:Index="8">
          <Data ss:Type="Number">0.044037874185699856</Data>
        </Cell>
      </Row>
    </Table>
  </Worksheet>
</Workbook>
return (pd.DataFrame(data))
如果某个单元格索引为空或缺失(null),该如何处理?(在本例中,索引为 7 的单元格并不总是存在。)Win32 库在这种情况下帮不上忙:我尝试过,但它的性能不足以处理数千个文件。
请尝试使用 "Index" 属性(ss:Index),而不是 Cell 元素在行中的位置序号:
# The workbook binds both its default namespace and the "ss" prefix to the same URI,
# so register that URI twice: "doc" is used for element names, "ss" for attribute names.
ns = dict.fromkeys(("doc", "ss"), "urn:schemas-microsoft-com:office:spreadsheet")
# Select the Cell by the value of its ss:Index attribute ("7") rather than by its
# position in the row, then descend to its Data child before extracting the value.
index_7_data = node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns)
getvalueofnode(index_7_data)
同样的方法也可以用于其他单元格。
此代码会尝试查找具有给定 Index 属性值的 Cell 元素。如果未找到,函数 getvalueofnode() 将返回 None。
# The workbook binds both its default namespace and the "ss" prefix to the same URI,
# so register that URI twice: "doc" is used for element names, "ss" for attribute names.
ns = dict.fromkeys(("doc", "ss"), "urn:schemas-microsoft-com:office:spreadsheet")
# Select the Cell by the value of its ss:Index attribute ("7") rather than by its
# position in the row, then descend to its Data child before extracting the value.
index_7_data = node.find('doc:Cell[@ss:Index="7"]/doc:Data', ns)
getvalueofnode(index_7_data)