Python 3.x 如何在Python中高效地解析Html？_Python 3.x_Parsing

Python 3.x 如何在Python中高效地解析Html？

python-3.x parsing

Python 3.x 如何在Python中解析高效的Html？,python-3.x,parsing,Python 3.x,Parsing,我将在没有外部libarry的情况下高效地解析Html代码我已经准备好用for进行测试，并检查了它是哪个符号。是这样的: list=[] html=“”你好“”“ m=0 对于html格式的文件：如果a==“”： m=0 list.append（[]）其他：列表[-1]=a 打印（列表）代码在50KB的文件上运行得非常慢。我可以推荐从下面所示的简单HTML解析器开始吗？它使用Python附带的标准库，没有外部依赖项。您可能需要根据需要修改和扩展它，但它提供了一个基本的domapi，

我想在不使用外部库（library）的情况下高效地解析Html代码

我已经尝试用 for 循环逐个字符地检查它是哪个符号。代码是这样的:

list = []
html = """你好"""
m = 0
for a in html:
    if a == "":
        m = 0
        list.append([])
    else:
        list[-1] = a
print(list)

代码在50KB的文件上运行得非常慢。

我可以建议您从下面所示的简单HTML解析器入手吗？它只使用Python自带的标准库，没有外部依赖项。您可能需要根据需要修改和扩展它，但它提供了一个基本的 DOM API，这应该是一个很好的起点。该代码适用于它所针对的简单情况；但是，根据您的需要，您可能需要添加更多功能来实现您的最终目标。

#! /usr/bin/env python3
import html.parser
import pprint
import xml.dom.minidom


def main():
    """Parse a sample HTML page and print several views of the resulting DOM.

    The demo pretty-prints the tree, looks elements up by tag name, reads
    attributes, searches by attribute with ``find`` and finally prints every
    text fragment returned by ``get_text``.
    """
    # noinspection PyPep8
    document = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
'''
    # Build the DOM from the sample markup.
    builder = DocumentParser()
    builder.feed(document)
    builder.close()
    root = builder.document.documentElement
    root.normalize()
    print(root.toprettyxml())

    # Inspect the first <title> element.
    title_node = root.getElementsByTagName('title')[0]
    print(title_node.toxml())
    print(title_node.tagName)
    print(title_node.firstChild.data)
    print(title_node.parentNode.tagName)

    # Inspect the first <p> element.
    paragraph = root.getElementsByTagName('p')[0]
    print(paragraph.toxml())
    print(paragraph.getAttribute('class'))

    # Inspect the <a> elements, both by tag name and by attribute search.
    anchors = root.getElementsByTagName('a')
    print(anchors[0].toxml())
    anchor_markup = [anchor.toxml() for anchor in anchors]
    pprint.pprint(anchor_markup)
    link3_markup = [hit.toxml() for hit in find(root, id='link3')]
    pprint.pprint(link3_markup)
    for anchor in anchors:
        print(anchor.getAttribute('href'))

    # Dump every text fragment, one per line.
    print(*get_text(root), sep='\n')


class DocumentParser(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from the HTML fed to the parser.

    ``document`` is the minidom document being populated and ``focus`` is the
    node that currently receives new children. ``focus`` starts out as the
    document itself, moves down into each opened element and moves back up
    when the element is closed.
    """

    # Elements that never have content or an end tag in HTML. They must not
    # take the focus, otherwise everything after e.g. ``<br>`` would be nested
    # inside it.
    _VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    # noinspection SpellCheckingInspection
    def __init__(self, *, convert_charrefs=True):
        """Create the parser together with an empty document to fill."""
        super().__init__(convert_charrefs=convert_charrefs)
        self.document = self.focus = xml.dom.minidom.DOMImplementation() \
            .createDocument(None, None, None)

    @property
    def document_has_focus(self):
        """Return True when no element is currently open."""
        return self.document is self.focus

    def handle_starttag(self, tag, attrs):
        """Append a new element under the focus and, unless void, enter it.

        :param tag: name of the element to create
        :param attrs: iterable of ``(name, value)`` attribute pairs
        """
        element = self.document.createElement(tag)
        for name, value in attrs:
            # A valueless attribute (``<input disabled>``) is reported with a
            # value of None, which minidom cannot serialise; store '' instead.
            element.setAttribute(name, '' if value is None else value)
        self.focus.appendChild(element)
        if tag not in self._VOID_ELEMENTS:
            self.focus = element

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Open elements nested inside it are closed implicitly. An end tag
        that matches no open element is ignored instead of walking past the
        root onto the document node, which has no ``tagName``.

        :param tag: name of the element being closed
        """
        node = self.focus
        while node is not self.document and node.tagName != tag:
            node = node.parentNode
        if node is self.document:
            return  # stray end tag: nothing to close
        self.focus = node.parentNode

    def handle_data(self, data):
        """Append stripped text to the focus.

        Whitespace-only data and data seen while no element is open are
        dropped.

        :param data: text found between tags
        """
        if not self.document_has_focus and not data.isspace():
            self.focus.appendChild(self.document.createTextNode(data.strip()))

    def error(self, message):
        """Raise ``RuntimeError`` carrying ``message``."""
        raise RuntimeError(message)

    def close(self):
        """Finish parsing and implicitly close every element still open."""
        super().close()
        while not self.document_has_focus:
            self.focus = self.focus.parentNode


def find(element, **kwargs):
    """Yield ``element`` and its descendants whose attributes match.

    Nodes are visited depth-first, parents before children. A node is
    yielded when it exposes ``getAttribute`` and, for every keyword argument,
    ``getAttribute(name)`` equals the given value. Nodes without
    ``getAttribute`` are never yielded but their children are still searched.

    :param element: node at which the search starts
    :param kwargs: attribute names mapped to the values they must have
    """
    accessor = getattr(element, 'getAttribute', None)
    if accessor:
        for wanted_name, wanted_value in kwargs.items():
            if not accessor(wanted_name) == wanted_value:
                break
        else:
            # No requirement failed (this includes "no requirements at all").
            yield element
    for sub_node in element.childNodes:
        for match in find(sub_node, **kwargs):
            yield match


def get_nodes_by_type(node, node_type):
    """Yield ``node`` and every descendant whose ``nodeType`` is ``node_type``.

    The tree is walked depth-first, with a parent reported before its
    children.

    :param node: node at which the walk starts
    :param node_type: value compared against each node's ``nodeType``
    """
    is_wanted = node.nodeType == node_type
    if is_wanted:
        yield node
    for sub_node in node.childNodes:
        for hit in get_nodes_by_type(sub_node, node_type):
            yield hit


def get_text(node):
    """Return a generator over the ``data`` of every text node under ``node``.

    Text nodes are selected with ``get_nodes_by_type`` using
    ``node.TEXT_NODE`` as the type, so ``node`` itself is included when it is
    a text node.

    :param node: node at which the search starts
    """
    text_nodes = get_nodes_by_type(node, node.TEXT_NODE)
    return (text_node.data for text_node in text_nodes)


# Run the demonstration only when this file is executed as a script.
if __name__ == '__main__':
    main()

#/usr/bin/env蟒蛇3
导入html.parser
导入pprint
导入xml.dom.minidom
def main（）：
#无检查PyPep8
文件=“”
睡鼠的故事
睡鼠的故事
从前有三个小姐妹；他们的名字是
,
和
;
他们住在井底

'''
parser=DocumentParser（）
提要（文档）
parser.close（）
模型=parser.document.documentElement
model.normalize（）
打印（model.toprettyxml（））
first_title=model.getElementsByTagName（'title'）[0]
打印（第一个标题.toxml（））
打印（第一个标题标记名）
打印（第一个标题.firstChild.data）
打印（第一个标题.parentNode.tagName）
first_p=model.getElementsByTagName（'p'）[0]
打印（first_p.toxml（））
打印（第一个属性（'class'））
all_a=model.getElementsByTagName（'a'）
打印（所有[0].toxml（））
pprint.pprint（[element.toxml（）表示所有元素中的元素]）
pprint.pprint（[element.toxml（）用于find（model，id='link3'）中的元素）
对于所有_a中的元素：
打印（element.getAttribute（'href'））
打印（*获取文本（模型），sep='\n'）
类DocumentParser（html.parser.HTMLParser）：
#无检查拼写检查检查
def uuu init uuuu（self，*，convert_charrefs=True）：
super（）
self.document=self.focus=xml.dom.minidom.DOMImplementation（）\
.createDocument（无，无，无）
@财产
def文档有焦点（自身）：
返回self.document是self.focus
def句柄\u开始标记（自身、标记、属性）：
element=self.document.createElement（标记）
对于名称，属性中的值：
element.setAttribute（名称、值）
self.focus.appendChild（元素）
self.focus=元素
def handle_endtag（self，tag）：
而self.focus.tagName！=标签：
self.focus=self.focus.parentNode
self.focus=self.focus.parentNode
def句柄_数据（自身、数据）：
如果不是self.document_有_焦点而不是data.isspace（）：
self.focus.appendChild（self.document.createTextNode（data.strip（）））
def错误（自我，消息）：
引发运行时错误（消息）
def关闭（自我）：
super（）.close（）
虽然不是self.document\u有\u焦点：
self.focus=self.focus.parentNode
def查找（元素，**kwargs）：
get_attribute=getattr（元素“getAttribute”，无）
如果获取_属性和\
全部（获取属性（键）=键的值，kwargs.items（）中的值）：
屈服要素
对于element.childNodes中的子节点：
从查找中获得的收益（子项，**kwargs）
def按节点类型获取节点（节点，节点类型）：
如果node.nodeType==节点类型：
屈服点
对于node.childNodes中的子节点：
按节点类型（子节点、节点类型）获取节点的产量
def get_文本（节点）：
返回（按类型获取节点中节点的node.data（node，node.TEXT\u node））
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu'：
main（）

您到底想实现什么目标？请添加更多描述，例如您希望得到什么输出？您遇到错误了吗？是哪一个？请提供完整的堆栈跟踪！您是想把空列表追加到列表中吗？为什么不能使用外部库呢？我不确定您想做什么，但您可以使用标准库模块

html.parser

解析HTML。