Python 3.x:按关键词抓取 Gmail 邮件并下载附件
我有一个关于抓取 Gmail 的问题。题目要求:候选人需要从 Gmail 中提取与金融交易相关的信息,这些信息可能是发票、订阅通知、账单等。我们希望你连接到 Gmail 帐户,收集或提取发票、订阅、未来账单的数据。你可以在邮件中搜索“即将到来的发票”、“订阅”或“发票”等字样,然后提取金额、日期、附件(如有)等所有详细信息。我必须收集这些信息并保存所有附件。有什么具体而简单的方法吗?标签:python-3.x, smtp, gmail, imap。我的代码:
import email
import email.errors
import email.header
import getpass
import imaplib
import json
import os
import re
import sys
class GmailFinin():
    """Interactive command-line tool that exports a Gmail mailbox over IMAP.

    Instantiating the class runs the whole workflow: prompt for credentials,
    log in to ``imap.gmail.com``, select the inbox, ask for confirmation and
    then write every matching message to ``<destFolder>/<uid>/<uid>.json``
    with its attachments stored in the same directory.

    NOTE: messages are fetched with ``RFC822`` from a read-write mailbox,
    which per the IMAP specification marks them as read on the server.
    """

    def helloWorld(self):
        """Print the greeting banner."""
        print("\nHello I'm here to help you")

    def initializeVariables(self):
        """Reset all instance state to its pre-login defaults."""
        self.usr = ""
        self.pwd = ""
        # No connection yet; _disconnect() relies on None meaning "never opened".
        self.mail = None
        self.mailbox = ""
        self.mailCount = 0
        self.destFolder = ""
        self.data = []
        self.ids = []
        self.idsList = []

    def getLogin(self):
        """Prompt for the account name and (without echo) the password."""
        print("\nPlease enter your Gmail login details below.")
        self.usr = input("Email: ")
        self.pwd = getpass.getpass("Enter your password --> ")

    def attemptLogin(self):
        """Open the IMAP connection and authenticate.

        Returns:
            True when the login succeeded (the destination folder has then
            been asked for and normalised to end with ``/``), False when the
            server rejected the credentials.
        """
        self.mail = imaplib.IMAP4_SSL("imap.gmail.com", 993)
        try:
            # login() signals failure by raising, not by returning a falsy
            # value, so the failure path has to be an exception handler.
            self.mail.login(self.usr, self.pwd)
        except imaplib.IMAP4.error as err:
            print("\nLogon FAILED")
            print(err)
            return False
        print("\nLogon SUCCESSFUL")
        self.destFolder = input("\nPlease choose a destination folder in the form of /Users/username/dest/ (do not forget trailing slash!): ")
        if not self.destFolder.endswith("/"):
            self.destFolder += "/"
        return True

    def checkIfUsersWantsToContinue(self):
        """Report the message count and ask for confirmation.

        Returns:
            True only when the answer starts with ``y`` (case-insensitive).
        """
        print("\nWe have found "+str(self.mailCount)+" emails in the mailbox "+self.mailbox+".")
        answer = input("Do you wish to continue extracting all the emails into "+self.destFolder+"? (y/N) ")
        return answer.lower().strip()[:1] == "y"

    def selectMailbox(self):
        """Select the inbox and record how many messages it holds.

        Returns:
            True when the mailbox was selected and contains at least one
            message, False otherwise.
        """
        self.mailbox = "Inbox"
        status, response = self.mail.select(self.mailbox)
        if status != "OK":
            # On failure the payload is an error text, not a count.
            self.mailCount = 0
            return False
        self.mailCount = int(response[0].decode("utf-8"))
        return self.mailCount > 0

    @staticmethod
    def _buildSearchCriteria(keywords=None):
        """Build an IMAP SEARCH expression matching any of *keywords*.

        Each keyword becomes a ``TEXT "<keyword>"`` key (header or body
        match); several keywords are combined with nested prefix ``OR``.
        Returns ``"ALL"`` when no usable keyword is given.

        Raises:
            ValueError: a keyword contains non-ASCII characters. imaplib
                sends commands ASCII-encoded by default, so such a term
                would need a CHARSET-qualified search this helper does not
                issue.
        """
        if isinstance(keywords, str):
            keywords = [keywords]
        terms = []
        for keyword in keywords or ():
            word = keyword.strip()
            if not word:
                continue
            try:
                word.encode("ascii")
            except UnicodeEncodeError as err:
                raise ValueError(
                    "search keywords must be ASCII: " + repr(keyword)
                ) from err
            # Backslash and double quote are the two characters that must be
            # escaped inside an IMAP quoted string.
            escaped = word.replace("\\", "\\\\").replace('"', '\\"')
            terms.append('TEXT "' + escaped + '"')
        if not terms:
            return "ALL"
        # OR takes exactly two search keys, so fold right-to-left:
        # OR a OR b c
        criteria = terms[-1]
        for term in reversed(terms[:-1]):
            criteria = "OR " + term + " " + criteria
        return criteria

    def searchThroughMailbox(self, keywords=None):
        """Search the selected mailbox and store the matching message numbers.

        Args:
            keywords: optional iterable of ASCII words or phrases, e.g.
                ``["invoice", "bill", "subscription"]``. A message matches
                when any of them occurs in its headers or body. With the
                default ``None`` every message is selected.

        The raw response is kept in ``self.data``, the id string in
        ``self.ids`` and the individual ids in ``self.idsList``.
        """
        criteria = self._buildSearchCriteria(keywords)
        status, self.data = self.mail.search(None, criteria)
        if status != "OK" or not self.data or self.data[0] is None:
            self.ids = b""
            self.idsList = []
            return
        self.ids = self.data[0]
        self.idsList = self.ids.split()

    @staticmethod
    def _headerText(value):
        """Return a header value as readable text, or None when absent.

        RFC 2047 encoded words (``=?utf-8?b?...?=``) are decoded; a value
        that cannot be decoded is returned in its raw string form.
        """
        if value is None:
            return None
        try:
            return str(email.header.make_header(email.header.decode_header(value)))
        except (email.errors.HeaderParseError, LookupError, UnicodeError):
            return str(value)

    @staticmethod
    def _partText(part):
        """Return the text of a non-multipart part.

        The content-transfer-encoding is undone and the bytes are decoded
        with the declared charset (UTF-8 when none is declared or the
        declared one is unknown); undecodable bytes are replaced.
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            return payload.decode("utf-8", errors="replace")

    @staticmethod
    def _safeFilename(name):
        """Reduce an attachment name from an email to a bare file name.

        The name is attacker-controlled, so directory components (with
        either slash style) are dropped to keep the file inside the
        message's own folder.
        """
        base = os.path.basename(str(name).replace("\\", "/"))
        base = base.replace("\x00", "").strip()
        if base in ("", ".", ".."):
            return "attachment"
        return base

    @staticmethod
    def _extractUid(msgNum, response):
        """Return the UID reported in a FETCH response as text.

        The UID is looked up by name in the response metadata instead of by
        position. When the response carries no UID, ``seq-<number>`` built
        from the message sequence number is returned instead.
        """
        meta = b" ".join(
            item[0] if isinstance(item, tuple) else item
            for item in response
            if isinstance(item, (tuple, bytes))
        )
        match = re.search(rb"\bUID (\d+)", meta)
        if match:
            return match.group(1).decode("ascii")
        seq = msgNum.decode("ascii") if isinstance(msgNum, bytes) else str(msgNum)
        return "seq-" + seq

    def _saveAttachment(self, msgDir, filename, part):
        """Write one attachment part into *msgDir*; return the path or None."""
        content = part.get_payload(decode=True)
        if content is None:
            return None
        attchFilePath = os.path.join(
            msgDir, self._safeFilename(self._headerText(filename))
        )
        print(attchFilePath)
        with open(attchFilePath, "wb") as f:
            f.write(content)
        return attchFilePath

    def parseEmails(self):
        """Fetch every message in ``self.idsList`` and write it to disk.

        For each message a directory ``<destFolder>/<uid>/`` is created
        holding ``<uid>.json`` (keys ``subject``, ``from``, ``date`` and,
        when a text body exists, ``body``) plus every attachment. Messages
        whose fetch fails are reported and skipped.
        """
        for msgNum in list(self.idsList):
            status, response = self.mail.fetch(msgNum, '(UID RFC822)')
            # The message itself arrives as a (metadata, literal) tuple; other
            # list items are trailing metadata fragments.
            envelope = next(
                (item for item in (response or ()) if isinstance(item, tuple)),
                None,
            )
            if status != "OK" or envelope is None:
                print("\nSkipping message " + repr(msgNum) + ": fetch failed")
                continue
            uid = self._extractUid(msgNum, response)

            # Parse the raw bytes directly: guessing a charset for the whole
            # message first can corrupt 8-bit or binary parts.
            msg = email.message_from_bytes(envelope[1])

            # A fresh dict per message, so a message without a text body does
            # not inherit the body of the previous one.
            jsonOutput = {
                'subject': self._headerText(msg['subject']),
                'from': self._headerText(msg['from']),
                'date': self._headerText(msg['date']),
            }

            msgDir = os.path.join(self.destFolder, uid)
            os.makedirs(msgDir, exist_ok=True)

            if msg.is_multipart():
                for part in msg.walk():
                    if part.is_multipart():
                        # Containers only group other parts.
                        continue
                    disposition = part.get_content_disposition()
                    if (
                        part.get_content_type() == "text/plain"
                        and disposition != "attachment"
                        and 'body' not in jsonOutput
                    ):
                        # First inline text part wins, so neither a text
                        # attachment nor a nested forwarded message replaces
                        # the real body.
                        jsonOutput['body'] = self._partText(part)
                    attchName = part.get_filename()
                    if disposition is not None and attchName:
                        self._saveAttachment(msgDir, attchName, part)
            else:
                jsonOutput['body'] = self._partText(msg)

            emailInfoFilePath = os.path.join(msgDir, uid + ".json")
            print(emailInfoFilePath)
            with open(emailInfoFilePath, "w", encoding="utf-8") as f:
                json.dump(jsonOutput, f)

    def _disconnect(self):
        """Log out of the IMAP server if a connection was opened."""
        if self.mail is None:
            return
        try:
            self.mail.logout()
        except (imaplib.IMAP4.error, OSError):
            # Best-effort cleanup: the session is being abandoned anyway.
            pass

    def __init__(self):
        """Run the complete interactive export workflow.

        Exits the interpreter via ``sys.exit()`` when the login fails, the
        mailbox is empty or the user declines; the IMAP connection is logged
        out in every case.
        """
        self.initializeVariables()
        self.helloWorld()
        self.getLogin()
        try:
            if not self.attemptLogin():
                sys.exit()
            if not self.selectMailbox():
                sys.exit()
            if not self.checkIfUsersWantsToContinue():
                sys.exit()
            self.searchThroughMailbox()
            self.parseEmails()
        finally:
            self._disconnect()
# Script entry point: constructing the object runs the interactive export.
if __name__ == "__main__":
    run = GmailFinin()
我已经尝试使用下面的搜索,但我认为这不是最优的,因为它只在主题中搜索;另外,如何为一个关键词列表添加多个 OR 条件?
type, self.data = self.mail.search(None, '(OR TEXT "bill" SUBJECT "bill")')
评论:应更新问题,使其包含期望的行为、具体的问题或错误,以及重现问题所需的最短代码。您当前的解决方案存在什么问题?——问题是如何只获取包含财务相关特定词语(如发票、账单、金额等)的邮件。目前我可以读取所有邮件和附件,但无法只搜索与财务关键词相关的电子邮件。——您似乎正在通过 SMTP 服务器进行搜索。我认为邮件服务器无法搜索,您可能不得不在扫描完所有电子邮件之后在本地解析。你有没有试过用一个启用了两步验证(2FA)的 Gmail 帐户来测试?——我是新手,我找到了关于抓取邮件的不错的文档,于是研究了它并写出了我的代码。但现在我需要只获取包含以下关键词列表中词语的电子邮件,比如
[“发票”, “账单”, “订阅提醒”]
,并从中获取总金额。如果你有更好的解决办法,我很乐意向你学习。