在python中加载包含非ascii字符的xml_Python_Python 2.7

在python中加载包含非ascii字符的xml

python python-2.7

在python中加载包含非ascii字符的xml,python,python-2.7,Python,Python 2.7,我试图用以下代码解析Python2.7中的xml #!/usr/bin/env python # -*- coding: utf-8 -*- import xml.etree.ElementTree as ET import sys, json txtfile='game_file.txt' def jd(payload): return json.dumps(payload, sort_keys=True, indent=4) def parse_demo_txt(demofi

我试图用以下代码解析Python2.7中的xml

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET
import sys, json

# Path of the input file handed to parse_demo_txt() at the bottom of the script.
txtfile='game_file.txt'

def jd(payload):
    """Return payload as pretty-printed JSON text (sorted keys, 4-space indent)."""
    formatting = dict(indent=4, sort_keys=True)
    return json.dumps(payload, **formatting)

def parse_demo_txt(demofile):
    """Collect per-player statistics from an XML demo file.

    demofile is handed to ElementTree as its ``file`` argument.  For every
    ``player`` element in the tree, the first occurrence of each ``name``
    wins: its ``deaths``, ``spree`` and ``frags`` attribute values are stored
    (unconverted) under the keys ``death``, ``win`` and ``totalscore``.

    Returns a dict keyed by player name.
    """
    document = ET.ElementTree(file=demofile)
    stats_by_player = {}
    for element in document.iter('player'):
        attrs = element.attrib
        nickname = attrs['name']
        if nickname in stats_by_player:
            # Only the first entry seen for a given player is kept.
            continue
        stats_by_player[nickname] = {
            'death': attrs['deaths'],
            'win': attrs['spree'],
            'totalscore': attrs['frags'],
        }
    return stats_by_player

# Parse the input file; the returned scores dict is not used here.
parse_demo_txt(txtfile)

源文件包含一些非ASCII字符，这些字符导致了如下错误：

$ python parse_xml.py
Traceback (most recent call last):
  File "parse_xml.py", line 38, in <module>
    parse_demo_xml(xmlfile)
  File "parse_xml.py", line 18, in parse_demo_xml
    tree = ET.ElementTree(file=xmlfile)
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 611, in __init__
    self.parse(file)
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 656, in parse
    parser.feed(data)
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1653, in feed
    self._raiseerror(v)
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1517, in _raiseerror
    raise err
xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 5, column 14

$python parse_xml.py
回溯（最近一次呼叫最后一次）：
文件“parse_xml.py”，第38行，在
解析xml（xml文件）
parse_demo_xml中第18行的文件“parse_xml.py”
tree=ET.ElementTree（file=xmlfile）
文件“/usr/lib/python2.7/xml/etree/ElementTree.py”，第611行，在__
self.parse（文件）
文件“/usr/lib/python2.7/xml/etree/ElementTree.py”，第656行，在parse中
提要（数据）
文件“/usr/lib/python2.7/xml/etree/ElementTree.py”，第1653行，在提要中
自我提升错误（v）
文件“/usr/lib/python2.7/xml/etree/ElementTree.py”，第1517行，存在错误
提出错误
xml.etree.ElementTree.ParseError:格式不正确（无效令牌）：第5行第14列

实际上，第5行第14列处确实有一些非ASCII字符。除了先解析此文件并转换这些有问题的字符之外，还有其他方法（仅使用纯ElementTree）可以解决此问题吗？

您得到的错误不是编码错误，而是XML错误。XML确实支持非ASCII字符（幸好如此！），因此问题不在于文档中含有非ASCII字符，而在于在一个声明为ASCII编码的文档中含有非ASCII字符（编码声明位于文件开头的 `<?xml ... ?>` 行中）。

如果您知道此文档的实际编码，只需修改

编码

声明即可。否则，请把这个XML文件发回给创建它的人，并要求对方提供一个有效的XML文件……最坏的情况下，您可以尝试使用

chardet

或

UnicodeDammit

来尝试猜测编码，但请记住，这仍然是一个胡乱猜测。

好吧，我最后编写了自己的函数，从原始文件中删除不需要的字符，然后将其传递给XML解析器

def normalize_player(demofile):
    ''' Rewrite demofile in place with normalized player names.

        Every line that contains both '<player' and 'name' has the text
        captured by the pattern name="(.*)" team passed through
        normalize_nickname() and substituted back into the line.  All lines
        are written to demofile + '_normalized', which then replaces the
        original file.
    '''
    normalized_path = demofile + '_normalized'
    with open(normalized_path, 'w') as normalized_file:
        with open(demofile, 'r') as inputfile:
            for line in inputfile:
                if '<player' in line and 'name' in line:
                    plname = ''.join(re.findall('name="(.*)" team', line))
                    # Nothing captured (no match, or an empty name): leave the
                    # line alone.  str.replace('', x) would otherwise insert x
                    # between every character of the line.
                    if plname:
                        line = line.replace(plname, normalize_nickname(plname))
                normalized_file.write(line)
    # Swap the normalized copy in for the original file.
    remove(demofile)
    move(normalized_path, demofile)

最后，generate_translation_table() 函数如下所示：

def generate_translation_table():
    ''' Build the character-code normalization table.

        Python implementation of
        https://github.com/deurk/mvdsv/blob/master/src/common.c#L1717

        Returns a dict that maps every code 0..255 to the code of its
        replacement character.
    '''
    high = 128  # offset between a code and its twin in the upper half
    table = {}

    def mirror(code, target):
        # Give a code and its upper-half twin the same replacement.
        table[code] = target
        table[code + high] = target

    # Defaults: codes below 32 become '#', codes 32..127 map to themselves;
    # the upper half starts out as a copy of the lower half.
    for code in range(32):
        mirror(code, ord('#'))
    for code in range(32, high):
        mirror(code, code)

    # Special cases: codes 10 and 13 are kept as they are (lower half only).
    for code in (10, 13):
        table[code] = code

    # Dots.
    for code in (5, 14, 15, 28, 46):
        mirror(code, ord('.'))

    # Numbers: codes 18..27 become the digits '0'..'9'.
    for digit, code in enumerate(range(18, 28)):
        mirror(code, ord('0') + digit)

    # Brackets.
    mirror(16, ord('['))
    mirror(17, ord(']'))
    mirror(29, ord('('))
    mirror(31, ord(')'))
    table[128] = ord('(')
    table[130] = ord(')')

    # Arrows: 127 (left arrow) becomes '>', 141 (right arrow) becomes '<'.
    table[127] = ord('>')
    table[141] = ord('<')

    # Equals sign.
    mirror(30, ord('='))
    table[129] = ord('=')

    return table

def generate_translation_table（）：
“”的python实现https://github.com/deurk/mvdsv/blob/master/src/common.c#L1717 '''
ascii_表={}
#一些基本变换
对于范围（0,32）内的i：
ascii_表[i]=35#'#'
ascii_表[i+128]=35#'#'
对于范围（32128）内的i：
ascii_表[i]=i
ascii_表[i+128]=i
#特例
ascii_表[10]=10
ascii_表[13]=13
#圆点
ascii_表[5]=46#'。'
ascii_表[14]=46#'。'
ascii_表[15]=46#'。'
ascii_表[28]=46#'。'
ascii_表[46]=46#'。'
ascii_表[5+128]=46#'
ascii_表[14+128]=46#'
ascii#U表格[15+128]=46#'
ascii_表[28+128]=46#'
ascii_表[46+128]=46#'
#数字
对于范围（18,28）内的i：
ascii_表[i]=i+30
ascii_表[i+128]=i+30
#括号
ascii_表[16]=91#'['
ascii_表[16+128]=91#'['
ascii_表[17]=93#']'
ascii_表[17+128]=93#']
ascii_表[29]=40#'（'
ascii_表[29+128]=40#'（'
ascii_表[128]=40#'（'
ascii_表[31]=41#'）'
ascii_表[31+128]=41#'）
ascii_表[130]=41#'）'
#左箭头
ascii_表[127]=62#'>'
#右箭头
ascii_table[141]=60#'<'

标头中根本没有编码声明。添加 encoding="UTF-8" 并没有改变任何事情。

添加 encoding="UTF-8" 仅在文件确实是UTF-8编码时才有用。如果没有XML文件，任何其他尝试都只是浪费时间，您需要提供一个。没有它，您的代码在这里会被认为是离题的。另外，在提出多余的问题之前，请务必先对您遇到的错误做一些研究。谢谢。

以上代码工作正常。唯一缺少的部分是 game_file.txt，但我不知道如何将其附加到这里并保持原始编码。我可以把它贴在别的地方，但在这里放链接可能不受欢迎。此外，我已经通过编写一个单独的函数解决了遇到的问题，我将在一分钟内与大家分享。

很高兴您找到了解决方案，但这不是重点。对于一个致力于提供问答知识库的网站来说，写出草率、不完整的问题，然后让人去猜测而不是分析和回答，是没有帮助的。这就是您的问题必须包含MCVE的原因。
def generate_translation_table():
    ''' Return a dict mapping each code 0..255 to its replacement code.

        Python implementation of
        https://github.com/deurk/mvdsv/blob/master/src/common.c#L1717
    '''
    table = {}

    # Basic transformation: codes below 32 become '#' (35), codes 32..127
    # stay themselves, and code + 128 always gets the same value as code.
    for low in range(128):
        base = low if low >= 32 else 35
        table[low] = base
        table[low + 128] = base

    # Replacements that apply to a code and to the same code + 128.
    mirrored = (
        ('.', (5, 14, 15, 28, 46)),     # dots
        ('[', (16,)),                   # brackets
        (']', (17,)),
        ('(', (29,)),
        (')', (31,)),
        ('=', (30,)),                   # equals sign
    )
    for char, codes in mirrored:
        for code in codes:
            table[code] = table[code + 128] = ord(char)

    # Numbers: codes 18..27 (and their + 128 twins) become '0'..'9'.
    for offset in range(10):
        table[18 + offset] = table[18 + offset + 128] = 48 + offset

    # Replacements that apply to a single code only.
    table.update({
        10: 10,             # special case, kept as is
        13: 13,             # special case, kept as is
        128: ord('('),
        130: ord(')'),
        127: ord('>'),      # left arrow
        141: ord('<'),      # right arrow
        129: ord('='),
    })

    return table