Python 从lxml.html节点获取文本长度并截断_Python_Html_Xml_Lxml_Truncate

Python 从lxml.html节点获取文本长度并截断

python html xml

Python 从lxml.html节点获取文本长度并截断,python,html,xml,lxml,truncate,Python,Html,Xml,Lxml,Truncate,在parse_html（）中，我试图解析每个元素的文本，并获取每个元素中文本的len（）。我想生成一个脚本，解析每个元素中的文本长度，当累积文本长度达到一个设置的大小参数时，它会截断文档中的其余文本。我的问题在child.text/tag1.text/tag2.text/tag3.text中。len（）似乎不在处理这些问题。有没有一种方法可以提取这些文本字符串的数字长度 import sys import imaplib import getpass import email import em

在 parse_html() 中，我试图解析每个元素的文本，并用 len() 获取每个元素中文本的长度。我想编写一个脚本，逐个累计各元素的文本长度，当累计长度达到设定的大小参数时，截断文档中其余的文本。我的问题出在 child.text / tag1.text / tag2.text / tag3.text 上：len() 似乎无法处理这些值。有没有办法获取这些文本字符串的长度？

import sys
import imaplib
import getpass
import email
import email.header
import datetime
from bs4 import BeautifulSoup 
import re
from lxml import etree, html
from io import StringIO, BytesIO
from lxml.html.clean import clean_html, Cleaner, word_break
from lxml.etree import HTML 
from lxml.html import HtmlMixin 

# Account name passed to M.login() by the script at the bottom of the file.
EMAIL_ACCOUNT = "sample@gmai.com"
# Mailbox folder passed to M.select() before the messages are processed.
EMAIL_FOLDER = "INBOX"


def process_mailbox(M):
    """
    Print headers for the messages in the selected folder and dump the
    bodies of the first multipart message to disk.

    Every message id returned by ``M.search(None, "ALL")`` is fetched and its
    subject, raw ``Date`` header and decoded body are printed.  For a
    multipart message the top-level parts are scanned: a ``text/plain`` part
    is written to ``email.txt``, a ``text/html`` part to ``email.html``, and
    a part carrying a ``Content-Disposition`` header is saved under its base
    file name unless a file of that name already exists.  Processing stops
    at the first multipart message; single-part messages additionally get
    their date printed in local time.

    M -- connection object offering ``search`` and ``fetch`` the way
         ``imaplib.IMAP4`` does, with a mailbox already selected.

    Returns ``None`` when the search or a fetch fails or when no multipart
    message is found, ``0`` right after an attachment has been written, and
    otherwise the stripped HTML body of the first multipart message (the
    stripped plain-text body when it has no HTML part, ``None`` when it has
    neither).

    NOTE: written for Python 2 (``unicode`` and byte-string payloads).
    """
    # Local imports: the module header imports neither of these explicitly.
    import email.utils
    import os

    rv, data = M.search(None, "ALL")
    if rv != 'OK':
        print("No messages found!")
        return
    for num in data[0].split():
        rv, msg_data = M.fetch(num, '(RFC822)')
        if rv != 'OK':
            print("ERROR getting message %s" % num)
            return

        msg = email.message_from_string(msg_data[0][1])
        raw_subject, subject_charset = email.header.decode_header(msg['Subject'])[0]
        # Decode with the charset announced in the header; a bare unicode()
        # call raises UnicodeDecodeError on any non-ASCII subject.
        subject = unicode(raw_subject, subject_charset or 'ascii', 'replace')
        body = msg.get_payload(decode=True)
        print('Message %s: %s' % (num, subject))
        print('Raw Date: %s' % msg['Date'])
        print('Body: %s' % body)

        if msg.is_multipart():
            # Named *_body so the module-level ``html`` import is not shadowed.
            html_body = None
            text_body = None
            print("Checking for html or text")
            for part in msg.get_payload():
                content_type = part.get_content_type()
                if content_type == 'text/plain':
                    text_body = _part_as_utf8(part)
                    with open('email.txt', 'w') as f:
                        f.write(text_body)
                if content_type == 'text/html':
                    html_body = _part_as_utf8(part)
                    with open('email.html', 'w') as f:
                        f.write(html_body)
                if part.get('Content-Disposition') is None:
                    continue

                filename = part.get_filename()
                if not filename:
                    # A disposition header without a file name: nothing to save.
                    continue
                # The name comes from the e-mail itself; keep only its last
                # component so it cannot point outside the working directory.
                filename = os.path.basename(filename)
                if filename and not os.path.isfile(filename):
                    with open(filename, 'wb') as fp:
                        fp.write(part.get_payload(decode=True))
                    return 0

            if html_body is not None:
                return html_body.strip()
            if text_body is not None:
                return text_body.strip()
            return None
        # Now convert to local date-time
        date_tuple = email.utils.parsedate_tz(msg['Date'])
        if date_tuple:
            local_date = datetime.datetime.fromtimestamp(
                email.utils.mktime_tz(date_tuple))
            print("Local Date: %s" %
                  local_date.strftime("%a, %d %b %Y %H:%M:%S"))


def _part_as_utf8(part):
    # Decode one MIME part's payload with its declared charset (UTF-8 when
    # none is declared) and return it re-encoded as a UTF-8 byte string.
    charset = part.get_content_charset() or 'utf-8'
    payload = part.get_payload(decode=True)
    return unicode(payload, str(charset), "ignore").encode('utf8', 'replace')

def parse_html(path="email.html", max_depth=4):
    """Print the tag, attributes and text of the elements of an HTML file.

    The file is parsed with lxml's HTML parser.  The number of children of
    the root element is printed first; then every element below the root is
    visited in document order, down to ``max_depth`` levels, and its
    ``tag``, ``attrib`` and ``text`` are printed on separate lines.

    path      -- file to parse (defaults to the ``email.html`` dump).
    max_depth -- how many levels below the root element are printed.

    Returns ``None``; the result is the printed output only.
    """
    tree = etree.parse(path, etree.HTMLParser())
    root = tree.getroot()
    print(len(root))
    for child in root:
        _print_subtree(child, max_depth)


def _print_subtree(node, levels):
    # Print *node*, then recurse into its children until *levels* is used up
    # (the node itself counts as one level).
    if levels < 1:
        return
    print(node.tag)
    print(node.attrib)
    # NOTE: .text is None for an element without leading text, which is why
    # len(node.text) fails with "object of type 'NoneType' has no len()";
    # use len(node.text or '') when a length is needed.
    print(node.text)
    for sub in node:
        _print_subtree(sub, levels - 1)

# Script entry: log in to Gmail over IMAP/SSL (the password is prompted for
# interactively), list the mailboxes, then process EMAIL_FOLDER.
M = imaplib.IMAP4_SSL('imap.gmail.com')

try:
    rv, data = M.login(EMAIL_ACCOUNT, getpass.getpass())
except imaplib.IMAP4.error:
    print("LOGIN FAILED!!! ")
    sys.exit(1)

# From here on the session is authenticated: make sure it is always logged
# out, even when one of the processing steps raises.
try:
    print("%s %s" % (rv, data))

    rv, mailboxes = M.list()
    if rv == 'OK':
        print("Mailboxes:")
        print(mailboxes)

    rv, data = M.select(EMAIL_FOLDER)
    if rv == 'OK':
        print("Processing mailbox...\n")
        try:
            process_mailbox(M)
            parse_html()
        finally:
            # close() is only issued after a successful select().
            M.close()
    else:
        print("ERROR: Unable to open mailbox  %s" % rv)
finally:
    M.logout()

这是我尝试对这些 .text 值调用 len() 时遇到的错误（完整的错误信息见下文的 TypeError）。

另外，如果您知道如何使用lxml.html进行截断，我希望您能给我指出正确的方向。

谢谢。

也许有更高效的方法，但下面的做法可以得到正确的结果。我必须先把每个子节点序列化为字符串，然后去掉该字符串中的 HTML 标签。我使用了函数 stringify_children() 和 strip_tags()（它们的原始出处链接在本页面中已丢失），代码如下：

TypeError: object of type 'NoneType' has no len()

def stringify_children(node):
    """Return the content of *node* as a single text string.

    The result is the node's leading text, followed by the serialized markup
    of every child (each including the tail text that follows it), followed
    by the node's own tail text.  Missing (``None``) texts and tails are
    skipped.

    node -- an lxml element.
    """
    from lxml.etree import tostring

    parts = [node.text]
    # tostring() already emits a child's own text, its descendants and its
    # tail, so adding c.text / c.tail separately would count them twice.
    # encoding='unicode' makes it return text instead of bytes, which is what
    # ''.join() needs on Python 3.
    parts.extend(tostring(c, encoding='unicode') for c in node)
    parts.append(node.tail)
    # filter removes possible Nones in texts and tails
    return ''.join(filter(None, parts))


try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class MLStripper(HTMLParser):
    """HTML parser that collects only the text found between tags."""

    def __init__(self):
        # Run the base initializer (it calls reset() itself): Python 3's
        # parser keeps state that is only set up there.  Called explicitly
        # because HTMLParser is an old-style class on Python 2.
        HTMLParser.__init__(self)
        # Text chunks in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, joined into one string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text of the markup string *html* with all tags removed."""
    stripper = MLStripper()
    stripper.feed(html)
    # Flush what the parser is still buffering: trailing text that might be
    # the start of an entity (e.g. "AT&T") is only reported once the parser
    # is told that the input has ended.
    stripper.close()
    return stripper.get_data()

# Usage example: parse the markup held in ``docstring`` and print, for each
# direct child of the root, the length of its text once the tags are removed.
tree = etree.fromstring(docstring)

for elt in tree.iterchildren():
    childtext = strip_tags(stringify_children(elt))
    print(len(childtext))