Python 3.x Python3中的XML解析_Python 3.x_Xml Parsing_Elementtree

Python 3.x Python3中的XML解析

python-3.x

Python 3.x Python3中的XML解析,python-3.x,xml-parsing,elementtree,Python 3.x,Xml Parsing,Elementtree,我有一个类似这样的xml文件，它是下面DF中的一个系列 userid | fid | response ----------------------- 1125 | 58940 | xml1 3344 | 47839 | xml2 3455 | 12335 | xml3 响应列包含如下xml文件 HTTP/1.1 100 Continue HTTP/1.1 200 OK Expires: 0 Buffer: false Pragma: No-cache Cache-Control:

我有一些类似下面这样的 xml 文档，它们保存在下面这个 DataFrame（DF）的一列（Series）中：

 userid | fid | response
 -----------------------
 1125 | 58940 | xml1
 3344 | 47839 | xml2
 3455 | 12335 | xml3

响应列包含如下xml文件

HTTP/1.1 100 Continue

HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT

<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>

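HTTP/1.1 100 Continue
HTTP/1.1 200 OK
Expires: 0
Buffer: false
Pragma: No-cache
Cache-Control: no-cache
Server: IBM_CICS_Transaction_Server/4.1.0(zOS)
Connection: close
Content-Type: text/html
Content-Length: 33842
Date: Sat, 02 Aug 2014 09:27:02 GMT
<?xml version="1.0" encoding="UTF-8"?><creditBureau xmlns="http://www.transunion.com/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber>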

这只是整个文档的一部分。我必须解析这个大xml并将其转换为json。我遇到的第一个问题是解析这个文件。我当前的代码如下所示：

 import pandas as pd
 import re

 # Load the pipe-separated export and keep the three named columns.
 raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
 df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
 # Dump the third field ('response') of the row labelled 0 to a scratch file.
 file = open("testfile.txt", "w")
 file.write(df.loc[0][2])
 file.close()

 # Adding Root Element
 with open("testfile.txt") as f:
     file = f.read()
     file_list = file.split('\n')
 # Overwrite the 13th line with '<root>', append the closing tag and keep
 # everything from '<root>' onwards (this drops the lines before it).
 # NOTE(review): with the sample response shown above, index 12 is the blank
 # line right before the '<?xml ...?>' line, so '<root>' ends up in front of
 # the XML declaration -- confirm against the real data.
 file_list[12] = '<root>'
 file_list.append('</root>')
 start = file_list.index('<root>')
 new_list = file_list[start:]

 # Converting to String (joining with '' also removes the line breaks)
 str1 = ''.join(new_list)
 # NOTE(review): this handle is never closed, so the text may still be
 # buffered when ET.parse() reads the file below.
 f = open("tocng.xml","w")
 f.write(str1)

 # Parsing xml
 import xml.etree.ElementTree as ET
 tree = ET.parse('tocng.xml')
 ### Gives an error:XML or text declaration not at start of entity: line 1, column 6
 # (column 6 equals len('<root>'), i.e. the position right after the inserted
 # root tag)

import pandas as pd
import re
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])
file = open("testfile.txt", "w")
file.write(df.loc[0][2])
file.close()
#Adding Root Element
with open("testfile.txt") as f:
    file = f.read()
    file_list = file.split('\n')
file_list[12] = '<root>'
file_list.append('</root>')
start = file_list.index('<root>')
new_list = file_list[start:]
#Converting to String
str1 = ''.join(new_list)
f = open("tocng.xml","w")
f.write(str1)
#parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')
### Gives an error:XML or text declaration not at start of entity: line 1, column 6

我不明白这里有什么问题。

初始版本

基于新的单行 xml。如果 xml 文件中还有更多变体，可能需要对代码进行相应调整。

import pandas as pd
import re

# Load the pipe-separated export and keep the three named columns.
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the raw response of the first row to a scratch file.
# The cell is selected by label: chained positional access (df.loc[0][2]) on
# a string-labelled row is deprecated in recent pandas releases.
# UTF-8 is used explicitly because the declaration-less XML written further
# down is parsed as UTF-8, whatever the platform default encoding is.
with open("testfile.txt", "w", encoding="utf-8") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt", encoding="utf-8") as f:
    file = f.read()

# Drop everything in front of the <?xml declaration (the HTTP headers).
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Break after every '>' so the leading/trailing tags become separate items,
# then split and discard empty items.
file = file.strip().replace('>', '>\n')
file_list = [item for item in file.split('\n') if item != '']

# Remove known xml declarations. The emptiness checks keep an empty response
# from raising IndexError.
if file_list and file_list[0].startswith('<?xml'):
    del file_list[0]
if file_list and file_list[0].startswith('<creditBureau'):
    del file_list[0]
    # The opening tag is gone, so its closing tag has to go as well;
    # otherwise a complete document ends in '</creditBureau></root>' and
    # is no longer well-formed.
    if file_list and file_list[-1].strip() == '</creditBureau>':
        del file_list[-1]

# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')

# Converting to String
str1 = ''.join(file_list)
print(str1)  ## See output in my answer
with open("tocng.xml", "w", encoding="utf-8") as f:
    f.write(str1)

# Parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

在结尾添加了美化打印（pretty print）。该步骤是可选的，仅用于查看结果。

已对Stackoverflow进行了回答：===================================================================================================@AnupYadav我已对其进行了检查。这个解决方案行不通。它仍然给了我同样的错误。你把所有的HTML头都写到

testfile.txt

文件中了吗？首先为什么会有HTTP头？在第三列中是否确实存在包含完整HTTP响应的CSV文件？因为这是一个非常不寻常的格式选择，程序员通常不会选择。你是从外部获得的还是你自己创造的？如果是你自己创造的，为什么？这么多问题…您编辑的单行xml缺少结束标记

。我会基于这个 xml 添加一个答案。@Tomalak 我刚刚收到了超过 200 万个类似的文件。我不知道它们来自哪里，只需要把所有内容都存入 MongoDB。我在这里试图做的是解析

whateverFormat.xml

并将其转换为json。

with open("testfile.txt") as f: file = f.read() file_list = file.split('\n') file_list[13] = '<root>' file_list.append('</root>') start = file_list.index('<root>') new_list = file_list[start:] new_list

输出：

[''，]

if file\u-list[13]使其成为根，然后查看新的_list@KaranGupta，新列表打印：

[''response'，'2.9'，'Credit Report Example'，'Z'，''']

这很令人惊讶，因为我打印的是

['''''，]

。所以，我做了

文件列表。插入（13'，）

。如果我使用它，我会在

之后得到

项。如果你得到了那个项目，它可以使xml变形，因为它应该在根上，不应该有双引号。我插入了新行以使其可读。我正在编辑这个问题，为您提供确切的xml。

import pandas as pd
import re

# Read the pipe-separated export and keep the three named columns.
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep = '|')
df = pd.DataFrame(raw_data, columns = ['userid', 'fid', 'response'])

# Write the response of the first row to a scratch file.
with open("testfile.txt", "w") as dump:
    dump.write(df.loc[0][2])

# Read it back and cut away everything in front of the <?xml declaration.
with open("testfile.txt") as dump:
    text = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', dump.read(), flags=re.S)

# Turn the second line into the root tag, close it at the end and keep only
# the part starting at the root tag.
lines = text.split('\n')
lines[1] = '<root>'
lines.append('</root>')
xml_text = ''.join(lines[lines.index('<root>'):])

# The with-block closes the handle, so ET can read the finished file.
with open("tocng.xml","w") as out:
    out.write(xml_text)

# Parse the rewritten document.
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

import pandas as pd
import re

# Load the pipe-separated export and keep the three named columns.
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the raw response of the first row to a scratch file.
# The cell is selected by label: chained positional access (df.loc[0][2]) on
# a string-labelled row is deprecated in recent pandas releases.
# UTF-8 is used explicitly because the declaration-less XML written further
# down is parsed as UTF-8, whatever the platform default encoding is.
with open("testfile.txt", "w", encoding="utf-8") as f:
    f.write(df.loc[0, 'response'])

# Adding Root Element
with open("testfile.txt", encoding="utf-8") as f:
    file = f.read()

# Drop everything in front of the <?xml declaration (the HTTP headers).
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)
# Break after every '>' so the leading/trailing tags become separate items,
# then split and discard empty items.
file = file.strip().replace('>', '>\n')
file_list = [item for item in file.split('\n') if item != '']

# Remove known xml declarations. The emptiness checks keep an empty response
# from raising IndexError.
if file_list and file_list[0].startswith('<?xml'):
    del file_list[0]
if file_list and file_list[0].startswith('<creditBureau'):
    del file_list[0]
    # The opening tag is gone, so its closing tag has to go as well;
    # otherwise a complete document ends in '</creditBureau></root>' and
    # is no longer well-formed.
    if file_list and file_list[-1].strip() == '</creditBureau>':
        del file_list[-1]

# Add root tags.
file_list.insert(0, '<root>')
file_list.append('</root>')

# Converting to String
str1 = ''.join(file_list)
print(str1)  ## See output in my answer
with open("tocng.xml", "w", encoding="utf-8") as f:
    f.write(str1)

# Parsing xml
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

<root><document>response</document><version>2.9</version><transactionControl><userRefNumber>Credit Report Example</userRefNumber></transactionControl></root>

    if file_list[0][:13] == '<creditBureau':
        del file_list[0]

import pandas as pd
import re


def customize_xml(content, mode=0):
    """Rewrite the ``<?xml?><creditBureau>...</creditBureau>`` wrapper.

    The xml declaration, the opening ``<creditBureau ...>`` tag, the inner
    markup and the closing ``</creditBureau>`` tag are captured as groups
    1-4 and re-assembled according to *mode*:

    ==== =====================================================
    mode result
    ==== =====================================================
    -1   *content* unchanged (``None`` behaves the same)
    0    ``<creditBureau>...</creditBureau>``
    1    ``<root><creditBureau>...</creditBureau></root>``
    2    ``<?xml?><root><creditBureau>...</creditBureau></root>``
    3    ``<root>...</root>``
    4    ``<?xml?><root>...</root>``
    ==== =====================================================

    The declaration must be immediately followed by the ``<creditBureau``
    tag; content that does not match is returned unchanged.

    Args:
        content: The xml text to rewrite.
        mode: Index of the rewrite template, or -1/None for no change.

    Returns:
        The rewritten xml text.

    Raises:
        IndexError: If *mode* is not -1, None or a valid template index.
    """
    # No modification.
    if mode in (-1, None):
        return content

    # Replacement templates, indexed by mode.
    templates = (
        r'\2\3\4',                 # 0. <cB>...</cB>
        r'<root>\2\3\4</root>',    # 1. <root><cB>...</cB></root>
        r'\1<root>\2\3\4</root>',  # 2. <?xml?><root><cB>...</cB></root>
        r'<root>\3</root>',        # 3. <root>...</root>
        r'\1<root>\3</root>',      # 4. <?xml?><root>...</root>
    )

    # Reject out-of-range modes explicitly. Without this check a negative
    # mode such as -2 would silently pick a template counted from the end.
    if not 0 <= mode < len(templates):
        raise IndexError(
            'mode must be -1, None or 0..%d, got %r'
            % (len(templates) - 1, mode)
        )

    # Groups are marked as \1 \2 ... to use in the templates above.
    return re.sub(r'(<\?xml.+?\?>)'      # \1
                  '(<creditBureau.*?>)'  # \2
                  '(.+?)'                # \3
                  '(</creditBureau>)',   # \4
                  templates[mode], content, flags=re.S)


# Load the pipe-separated export and keep the three named columns.
raw_data = pd.read_csv('C:\\Users\\Desktop\\xml.csv', sep='|')
df = pd.DataFrame(raw_data, columns=['userid', 'fid', 'response'])

# Dump the raw response of the first row to a scratch file.
# The cell is selected by label: chained positional access (df.loc[0][2]) on
# a string-labelled row is deprecated in recent pandas releases.
# UTF-8 is used explicitly because mode 3 below drops the xml declaration,
# and declaration-less XML is parsed as UTF-8 whatever the platform default.
with open("testfile.txt", "w", encoding="utf-8") as f:
    f.write(df.loc[0, 'response'])

with open("testfile.txt", encoding="utf-8") as f:
    file = f.read()

# Remove characters up to <?xml tag.
file = re.sub(r'\A.*(<\?xml.*)\Z', r'\1', file, flags=re.S)

# Make xml one single line if not already.
file = file.replace('\n', '')

# Mode 3: replace the <creditBureau> wrapper (and the declaration) by <root>.
file = customize_xml(file, 3)

# Write customized xml.
with open("tocng.xml", "w", encoding="utf-8") as f:
    f.write(file)

# Parsing xml.
import xml.etree.ElementTree as ET
tree = ET.parse('tocng.xml')

# Print pretty xml from xml string (optional, only for inspecting the result).
from xml.dom import minidom
pretty_xml = minidom.parseString(file).toprettyxml(indent="    ")
print(pretty_xml)