Python 我可以只提取eml文件的正文文本吗？_Python_Python 2.7_Web Scraping_Beautifulsoup_Email Parsing

Python 我可以只提取eml文件的正文文本吗？

python python-2.7 web-scraping

Python 我可以只提取eml文件的正文文本吗？,python,python-2.7,web-scraping,beautifulsoup,email-parsing,Python,Python 2.7,Web Scraping,Beautifulsoup,Email Parsing,我只需要提取eml文件正文中的文本，但我的代码一直在给我一些代码文本和Outlook中存在的文件夹。我正在使用Python 2.7和BeautifulSoup。我的代码是： import email from email import message_from_file import os import bs4 as bs import re # Path to directory where attachments will be stored: path = "D:\Pytho

我只需要提取eml文件正文中的文本，但我的代码输出中总是混入一些代码（标记）文本以及Outlook中存在的文件夹名称。我使用的是Python 2.7和BeautifulSoup。我的代码如下：

import email
from email import message_from_file
import os
import bs4 as bs
import re

# Path to directory where attachments will be stored.
# Raw string: the backslash must stay a literal path separator and never be
# read as the start of an escape sequence.
path = r"D:\Python"

# To have attachments extracted into memory, change behaviour of 2 following functions:

def file_exists (f):
    """Tells whether a file named f is already present in the attachments directory (path)."""
    candidate = os.path.join(path, f)
    return os.path.exists(candidate)

def save_file (fn, cont):
    """Saves cont to a file fn inside the attachments directory.

    fn   -- File name; it is joined onto the module-level path.
    cont -- Content to write; the file is opened in binary mode ("wb").
    """
    # The with-block closes the handle even when write() raises, and the
    # local name no longer shadows the built-in "file".
    with open(os.path.join(path, fn), "wb") as out:
        out.write(cont)

def construct_name (id, fn):
    """Constructs a file name out of a message ID and a packed file name.

    id -- Message ID / key string. Its first two dot-separated pieces are
          glued together to form the prefix ("a.b.c" -> "ab").
    fn -- Name of the packed file.
    Returns the prefix and fn joined by a single dot.
    """
    # Slicing (instead of indexing [0] and [1]) keeps a key that contains no
    # dot at all from raising IndexError; such a key is used whole.
    prefix = "".join(id.split(".")[:2])
    return prefix + "." + fn

def disqo (s):
    """Trims surrounding whitespace, then drops one enclosing pair of single or double quotes."""
    trimmed = s.strip()
    # Single quotes are checked first, then double quotes.
    for quote in ("'", '"'):
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            return trimmed[1:-1]
    return trimmed

def disgra (s):
    """Removes < and > from HTML-like tag or e-mail address or e-mail ID."""
    s = s.strip()
    if s.startswith("<") and s.endswith(">"): return s[1:-1]
    return s

def pullout (m, key):
    """Extracts content from an e-mail message.
    This works for multipart and nested multipart messages too.
    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)
    Returns tuple(Text, Html, Files, Parts)
    Text  -- All text from all text/plain parts.
    Html  -- All HTML from all text/html parts.
    Files -- Dictionary mapping each packed file name to a tuple
             (name it is stored under, content-id or None).
    Parts -- Number of non-multipart parts found in the message.
    Packed files are written through save_file() unless file_exists() says
    a file with the constructed name is already there.
    """
    Text = ""
    Html = ""
    Files = {}

    if m.is_multipart():
        # The payload of a multipart message is a list of sub-messages, so it
        # is walked directly instead of indexing until an exception occurs.
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            Text += t; Html += h; Files.update(f); Parts += p
        return Text, Html, Files, Parts

    fn = m.get_filename()
    cid = None
    if not fn:
        # Not an attachment!
        # See where this belongs. Text, Html or some other data:
        cp = m.get_content_type()
        if cp == "text/plain":
            # A part without any payload decodes to None; treat it as empty.
            Text += m.get_payload(decode=True) or ""
            return Text, Html, Files, 1
        if cp == "text/html":
            Html += m.get_payload(decode=True) or ""
            return Text, Html, Files, 1
        # Something else!
        # This is some packed file and its name is contained in the
        # content-type header instead of the content-disposition header.
        header = m.get("content-type") or ""
        o = header.find("name=")
        if o == -1:
            return Text, Html, Files, 1
        ox = header.find(";", o)
        if ox == -1:
            ox = None
        fn = disqo(header[o + 5:ox])
        # Content-ID is optional; only strip the angle brackets when present.
        cid = m.get("content-id")
        if cid is not None:
            cid = disgra(cid)

    cfn = construct_name(key, fn)
    Files[fn] = (cfn, cid)
    # A file extracted on an earlier run is not written again.
    if not file_exists(cfn):
        save_file(cfn, m.get_payload(decode=True))
    return Text, Html, Files, 1

def extract (msgfile, key):
    """Parses an e-mail from a file-like object and gathers its pieces into one dictionary.
    msgfile -- A file-like readable object holding the message
    key     -- Some ID string for that particular message, handed on to pullout()
    Returns dict()
    Keys: subject, from, to, date, body, text, html, parts[, files]
    Key files is added only when pullout() reported at least one packed file.
    See the docstrings of caption() and pullout() for the individual values.
    """
    message = message_from_file(msgfile)
    sender, recipient, subject, date, body = caption(message)
    text, html, files, parts = pullout(message, key)
    result = {
        "subject": subject,
        "from": sender,
        "to": recipient,
        "date": date,
        "body": body,
        "text": text.strip(),
        "html": html.strip(),
        "parts": parts,
    }
    if files:
        result["files"] = files
    return result

def caption (origin):
    """Extracts: From, To, Subject, Date and Body headers from email.Message() or mailbox.Message()
    origin -- Message() object
    Returns tuple(From, To, Subject, Date, Body)
    Every value is stripped of surrounding whitespace.
    If message doesn't contain one/more of them, the empty strings will be returned.
    """
    def header (name):
        # .get() returns None for a missing header, which maps to "".
        value = origin.get(name)
        if value is None:
            return ""
        return value.strip()

    From = header("from")
    To = header("to")
    Subject = header("subject")
    Date = header("date")
    # A "body" header used to be written into To, clobbering the recipient
    # and leaving Body always empty; it now lands in Body.
    Body = header("body")
    return From, To, Subject, Date, Body

导入电子邮件
从电子邮件从文件导入消息
导入操作系统
将bs4作为bs导入
进口稀土
#存储附件的目录的路径：
path=“D:\Python”
#要将附件提取到内存中，请更改以下两个函数的行为：
def文件_存在（f）：
“”“检查提取的文件是否在之前提取。”“”
返回os.path.exists（os.path.join（path，f））
def保存_文件（fn，cont）：
“”“将cont保存到文件fn”“”
file=open（os.path.join（path，fn），“wb”）
文件写入（续）
file.close（）文件
def构造名称（id，fn）：
“”“用消息ID和压缩文件名构造文件名”“”
id=id.split（“.”）
id=id[0]+id[1]
返回id+“+fn”
def disqo（s）：
“”“删除双引号或单引号。”“”
s=s.条带（）
如果s.startswith（“”）和s.endswith（“”）：返回s[1:-1]
如果s.startswith（“”）和s.endswith（““”）：返回s[1:-1]
返回s
def disgra（s）：
“”“从类似HTML的标记、电子邮件地址或电子邮件ID中删除<和>。”“”
s=s.条带（）
如果s.startswith（“”）：返回s[1:-1]
返回s
def拉出（m，钥匙）：
“”“从电子邮件中提取内容。
这也适用于多部分和嵌套的多部分消息。
m—email.Message（）或mailbox.Message（）
key——初始消息ID（某些字符串）
返回元组（文本、Html、文件、部分）
文本--来自所有部分的所有文本。
Html——来自所有部分的所有Html
文件——将提取的文件映射到它所属的消息ID的字典。
Parts--原始消息中的部分数。
"""
Html=“”
Text=“”
文件={}
零件=0
如果不是，则m.为多部分（）
如果m.get_filename（）：#它是一个附件
fn=m.get_filename（）
cfn=构造名称（键，fn）
文件[fn]=（cfn，无）
如果文件_存在（cfn）：返回文本、Html、文件、1
保存_文件（cfn，m.get_有效载荷（decode=True））
返回文本、Html、文件、1
#不是附件！
#看看这个属于哪里。文本、Html或其他一些数据：
cp=m.获取内容类型（）
如果cp==“text/plain”：text+=m.get\u有效载荷（decode=True）
elif cp==“text/html”：html+=m.get\u有效载荷（decode=True）
其他：
#还有别的！
#提取消息ID和文件名（如果有）：
#这是一些压缩文件，其名称包含在内容类型头中
#而不是显式地显示内容处置头
cp=m.get（“内容类型”）
try:id=disgra（m.get（“内容id”））
除外：id=None
#查找文件名：
o=cp.find（“name=”）
如果o==-1：返回文本、Html、文件、1
ox=cp.find（“；”，o）
如果ox==-1:ox=None
o+=5；fn=cp[o:ox]
fn=disqo（fn）
cfn=构造名称（键，fn）
文件[fn]=（cfn，id）
如果文件_存在（cfn）：返回文本、Html、文件、1
保存_文件（cfn，m.get_有效载荷（decode=True））
返回文本、Html、文件、1
#这是一条多部分消息。
#因此，我们对它进行迭代，并为每个部分递归调用pullout（）。
y=0
而1：
#如果我们无法获得有效载荷，就意味着我们到达了终点：
尝试：
pl=m.get_有效载荷（y）
除了：休息
#pl是一个新的消息对象，返回到pullout
t、 h，f，p=拔出（pl，键）
Text+=t；Html+=h；文件更新（f）；零件+=p
y+=1
返回文本、Html、文件、部件
def提取（msgfile，密钥）：
“”“从电子邮件中提取所有数据，包括发件人、收件人等，并将其作为字典返回。
msgfile——类似文件的可读对象
键——特定消息的某个ID字符串。可以是文件名或任何内容。
返回dict（）
关键字：从，到，主题，日期，文本，html，部分[，文件]
只有当消息包含二进制文件时，密钥文件才会出现。
有关更多信息，请参见_udoc _;了解pullout（）和caption（）函数。
"""
m=来自_文件（msgfile）的消息_
从，到，主题，日期，正文=标题（m）
文本、Html、文件、部件=拉出（m、键）
Text=Text.strip（）；Html=Html.strip（）
msg={“subject”：subject，“from”：from，“to”：to，“date”：date，“body”：body，
“text”：text，“html”：html，“parts”：parts}
如果文件：msg[“文件”]=文件
返回消息
def标题（来源）：
“”“从email.Message（）或mailbox.Message（）提取：收件人、发件人、主题和日期”
origin--Message（）对象
返回元组（从、到、主题、日期）
如果消息不包含一个或多个字符串，则返回空字符串。
"""
Date=“”
如果origin.has_key（“date”）：date=origin[“date”].strip（）
From=“”
如果origin.has_键（“from”）：from=origin[“from”].strip（）
To=“”
如果origin.has_key（“to”）：to=origin[“to”].strip（）
Subject=“”
如果origin.has_key（“subject”）：subject=origin[“subject”].strip（）
如果origin.has_key（“body”）：To=origin[“body”].strip（）
Body=“”
从，到，主题，日期，正文

以下是我使用BeautifulSoup（bs）所做的尝试：


"""
# Usage:
f = open("e.eml", "rb")
soup = bs.BeautifulSoup(f,'lxml')
for url in soup.find_all('body'):
    print url.get('href')
f.close()
"""

# NOTE(review): this hands the raw .eml (MIME headers and any encoded parts
# included) to the HTML parser, so non-body text can end up in the output.
# Parsing the message first, e.g. with extract() above, and giving only its
# "html" value to BeautifulSoup would presumably avoid that -- confirm.
with open("e.eml", "rb") as f:
    soup = bs.BeautifulSoup(f, 'lxml')

# Print each paragraph's own text. Calling soup.get_text() inside the loop
# printed the text of the whole document once per <p> tag.
for paragraph in soup.find_all('p'):
    print(paragraph.get_text())


"""def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
    
f = open("e.eml", "rb")
soup = bs.BeautifulSoup(f,'lxml')
for goal in soup.find_all():
    body = soup.find('body')
    the_contents_of_body_without_body_tags = body.findChildren()
    soup.get_text()
    print the_contents_of_body_without_body_tags
f.close()
"""

"""f = open("e.eml", "rb")
message = email.message_from_file(open('e.eml'))
text = Text.text(message)
#print extract(f,f.name)
f.close()
"""


"""
# Usage:
f = open("e.eml", "rb")
soup = bs.BeautifulSoup(f,'lxml')
for url in soup.find_all('body'):
    print url.get('href')
f.close()
"""
f = open("e.eml", "rb")
soup = bs.BeautifulSoup(f,'lxml')
for body in soup.find_all('p'):
    text = soup.get_text()
    print text
f.close()
“”“def cleanhtml（原始html）：
cleanr=re.compile（“”）
cleantext=re.sub（cleanr'，原始html）
返回干净文本
f=开放（“e.eml”、“rb”）
汤=bs.BeautifulSoup（f，'lxml'）
为汤中的目标。查找所有（）
body=soup.find（'body'）
不带\u body\u标记的\u body\u的\u contents\u=body.findChildren（）
汤，吃吧