使用python仅访问电子邮件正文_Python_Email

使用python仅访问电子邮件正文

python email

使用python仅访问电子邮件正文,python,email,Python,Email,我正在尝试只访问电子邮件的正文，并已使用电子邮件模块和email.getpayload（）函数成功地获取了此内容。唯一的问题是，似乎有“垃圾短信”出现，而且它的格式、数量和内容似乎有所不同，这取决于发送电子邮件的人。有没有办法只访问电子邮件的真实内容？谢谢这就是我所拥有的： msg = email.message_from_bytes(data[0][1]) body = '' if msg.is_multipart(): for part in msg.wal

我正在尝试只获取电子邮件的正文，并且已经使用 email 模块和 get_payload() 函数成功取到了内容。唯一的问题是，结果中似乎夹杂着“垃圾文本”，而且其格式、数量和内容会因发件人不同而有所变化。有没有办法只获取电子邮件的真实内容？谢谢

这就是我所拥有的：

# Asker's attempt: collect the body text of a fetched message into `body`.
# NOTE(review): `data` is presumably the result of an IMAP fetch call; it is
# not defined in this snippet -- confirm against the caller.
# NOTE(review): message_from_bytes() exists only on Python 3, where
# get_payload(decode=True) returns bytes; str() of bytes yields the literal
# "b'...'" repr (with escaped newlines), which shows up as junk in `body`.
msg = email.message_from_bytes(data[0][1])
    body = ''
    if msg.is_multipart():
        # NOTE(review): walk() already visits every part and subpart of the
        # message tree, so descending manually below appends nested parts
        # more than once.
        for part in msg.walk():
            if part.is_multipart():
                for subpart in part.get_payload():
                    if subpart.is_multipart():
                        for subsubpart in subpart.get_payload():
                            # get_payload(decode=True) returns None for a
                            # multipart part, so a deeper nesting level would
                            # add the string 'None' here.
                            body = body +     str(subsubpart.get_payload(decode=True)) + '\n'
                    else:
                        body = body + str(subpart.get_payload(decode=True)) + '\n'
            else:
                # Leaf part: text, HTML and attachments are all appended
                # alike, whatever their content type.
                body = body + str(part.get_payload(decode=True)) + '\n'

    else:
        # Single-part message: the payload is the whole body.
        body = body + str(msg.get_payload(decode=True)) + '\n'

这个程序是我用mailbox模块为自己的邮件组编写的，所以才会这么复杂。它从未让我失望过，输出里也从来没有垃圾内容。若消息是多部分的，则输出字典将包含一个键“files”（一个子字典），其中包含所有被提取出来的、既非纯文本也非html的文件的文件名。这是一种提取附件和其他二进制数据的方法。您可以在pullout()中修改这一行为，或者只修改file_exists()和save_file()的行为。construct_name()使用消息id和多部分消息中的文件名（如果有）来构造文件名。这里可能存在一些错误，因为它原本是用来处理mailbox.Message的，而不是email.Message。但这就是我目前所有的代码了，在你提供更多信息之前，我没法告诉你更多。也许你只是忘记了decode=True，所以得到了一些rfc822的垃圾内容。

你说的“垃圾文本”具体指什么？你的电子邮件是多部分邮件吗？也就是说，它是否带有附件、图像、自定义背景、不常见的字体或类似的东西？您是否使用了m.get_payload(decode=True)？谢谢，我会尝试decode=True，但我认为这一点以及缺少对多部分（multipart）邮件的处理都给我带来了麻烦！我曾尝试使用decode=True，但每次它只返回None，从来得不到任何有用的内容。请检查msg.is_multipart()是否为True，因为如果是，那么msg.get_payload(0)将给出第0部分中的所有内容。你试过我的代码了吗？如果我的代码也给出奇怪的结果，那就有大问题了。您要读取的邮件来自哪种邮箱？你能把你的部分代码贴在这里吗？我想问题在于发件人各不相同，我会把我现有的代码都贴出来！我编辑了我的代码。抽了点时间添加了注释，所以现在它更容易理解了。从本质上讲，您的代码应该能产生正常的输出，尽管按您现在的写法它并没有做到。请参阅我的代码，进入嵌套的多部分是通过递归完成的。首先处理非多部分邮件的情况，直接使用普通的get_payload(decode=True)，然后对所有部分递归调用此函数。



from email import message_from_file
import os

# Directory where extracted attachments are stored.
# file_exists() and save_file() join every file name onto this path:
path = "./msgfiles"

# To have attachments extracted into memory instead of onto disk,
# change the behaviour of the 2 following functions:

def file_exists (f):
    """Tells whether a file named f is already present in the attachment directory.
    f -- File name, relative to the module-level path
    Returns bool
    """
    target = os.path.join(path, f)
    return os.path.exists(target)

def save_file (fn, cont):
    """Writes cont to the file fn inside the attachment directory.
    fn   -- File name, relative to the module-level path
    cont -- Binary content to write
    The attachment directory is created first if it does not exist yet.
    """
    # Nothing else creates the attachment directory, so the very first
    # attachment would otherwise fail with "No such file or directory".
    if not os.path.isdir(path):
        os.makedirs(path)
    # The with-block closes the file even when write() raises.
    with open(os.path.join(path, fn), "wb") as out:
        out.write(cont)

def construct_name (id, fn):
    """Constructs a file name out of a message ID and a packed file name.
    id -- Message ID or any other key string
    fn -- File name as found in the message
    Returns the first two dot-separated pieces of id glued together
    (or the whole id when it contains no dot), then ".", then the
    base name of fn.
    """
    pieces = id.split(".")
    # A key without any dot used to raise IndexError on pieces[1].
    prefix = pieces[0] + pieces[1] if len(pieces) > 1 else pieces[0]
    # fn comes straight from the e-mail, so drop any directory components
    # (both separator styles) to keep "../x" from escaping the target directory.
    base = fn.replace("\\", "/").rsplit("/", 1)[-1]
    return prefix + "." + base

def disqo (s):
    """Strips surrounding whitespace and one pair of matching quotes.
    s -- String, possibly wrapped in '...' or "..."
    Returns the unwrapped string, or the stripped string if it is not quoted.
    """
    stripped = s.strip()
    for quote in ("'", '"'):
        if stripped.startswith(quote) and stripped.endswith(quote):
            return stripped[1:-1]
    return stripped

def disgra (s):
    """Strips surrounding whitespace and one enclosing <...> pair.
    s -- HTML-like tag, e-mail address or e-mail ID
    Returns the text between < and >, or the stripped string if it is not wrapped.
    """
    trimmed = s.strip()
    wrapped = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if wrapped else trimmed

def _payload_text (part):
    """Returns the decoded payload of a non-multipart part as a native string."""
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    if isinstance(raw, str):
        # Python 2: the decoded payload already is a native string.
        return raw
    # Python 3: the decoded payload is bytes and cannot be added to str,
    # so decode it with the charset the part declares.
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, "replace")
    except LookupError:
        # The header names a charset Python does not know.
        return raw.decode("utf-8", "replace")

def pullout (m, key):
    """Extracts content from an e-mail message.
    This works for multipart and nested multipart messages too.
    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string), used to build attachment names
    Returns tuple(Text, Html, Files, Parts)
    Text  -- All text/plain content from all parts, concatenated.
    Html  -- All text/html content from all parts, concatenated.
    Files -- Dictionary mapping each file name found in the message to a
             tuple (name it was saved under, Content-ID or None).
    Parts -- Number of non-multipart parts in the message.
    Attachments are written with save_file() unless file_exists() reports
    that they were extracted before.
    """
    if m.is_multipart():
        # The payload of a multipart message is a list of Message objects;
        # each one goes back through pullout() and the results are merged.
        texts = []
        htmls = []
        Files = {}
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            texts.append(t)
            htmls.append(h)
            Files.update(f)
            Parts += p
        return "".join(texts), "".join(htmls), Files, Parts

    fn = m.get_filename()
    if fn: # It's an attachment
        cfn = construct_name(key, fn)
        if not file_exists(cfn):
            save_file(cfn, m.get_payload(decode=True) or b"")
        return "", "", {fn: (cfn, None)}, 1

    # Not an attachment!
    # See where this belongs. Text, Html or some other data:
    ctype = m.get_content_type()
    if ctype == "text/plain":
        return _payload_text(m), "", {}, 1
    if ctype == "text/html":
        return "", _payload_text(m), {}, 1

    # Something else!
    # This is some packed file whose name is contained in the content-type
    # header instead of the content-disposition header.
    header = m.get("content-type") or ""
    try:
        cid = disgra(m.get("content-id"))
    except AttributeError:
        # No Content-ID header (get() returned None) or a non-string value.
        cid = None
    start = header.find("name=")
    if start == -1:
        # No file name available, so there is nothing to save.
        return "", "", {}, 1
    end = header.find(";", start)
    if end == -1:
        end = None
    fn = disqo(header[start + len("name="):end])
    cfn = construct_name(key, fn)
    if not file_exists(cfn):
        save_file(cfn, m.get_payload(decode=True) or b"")
    return "", "", {fn: (cfn, cid)}, 1

def extract (msgfile, key):
    """Extracts all data from e-mail, including From, To, etc., and returns it as a dictionary.
    msgfile -- A file-like readable object, opened in text or binary mode
    key     -- Some ID string for that particular Message. Can be a file name or anything.
    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    Key files will be present only when message contained binary files.
    For more see __doc__ for pullout() and caption() functions.
    """
    # Imported here because the file-level import only brings in message_from_file.
    import email

    raw = msgfile.read()
    if isinstance(raw, bytes) and bytes is not str:
        # Python 3 binary stream: message_from_file() only accepts text
        # streams there, so the bytes parser has to be used instead.
        m = email.message_from_bytes(raw)
    else:
        # Text stream, or any Python 2 stream (where bytes is str).
        m = email.message_from_string(raw)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
        "text": Text.strip(), "html": Html.strip(), "parts": Parts}
    if Files:
        msg["files"] = Files
    return msg

def caption (origin):
    """Extracts: To, From, Subject and Date from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date)
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    def header (name):
        # The "in" test works on Message objects in Python 2 and 3 alike,
        # whereas has_key() is gone from Python 3.
        if name in origin:
            return origin[name].strip()
        return ""

    return header("from"), header("to"), header("subject"), header("date")

Usage:
# The with-block closes the file even if extract() raises, and the
# call form of print is valid on both Python 2 and Python 3.
with open("message.eml", "rb") as f:
    print(extract(f, f.name))