I began wasting money, having spam sent in their name… Thanks a lot for your answers. I am now using multiprocessing instead of threading, and it improved performance a lot; I read that Python's threading should be avoided. I will try the trick Phil H described in point 5. A sketch of the multiprocessing variant follows the code below.
import sqlite3 as sql
import urllib2
import re
import lxml.html as lxml
import time
import threading


def getUrls(start):
    """Collect all links from the page at `start`, skipping images and YouTube."""
    skip = ('.jpg', '.JPG', '.ico', '.png', '.jpeg', '.gif', 'youtube.com')
    urls = []
    try:
        dom = lxml.parse(start).getroot()
        dom.make_links_absolute()

        # iterlinks() yields (element, attribute, link, pos) tuples;
        # url[2] is the link itself.
        for url in dom.iterlinks():
            if not any(pattern in url[2] for pattern in skip):
                urls.append(url[2])
    except Exception:
        # Unreachable or malformed pages are simply skipped.
        pass

    return urls

def getURLContent(urlAdresse):
    """Fetch a page and return its HTML; return an empty page on any error."""
    try:
        url = urllib2.urlopen(urlAdresse)
        text = url.read()
        url.close()
        return text
    except Exception:
        return '<html></html>'

def harvestEmail(url):
    text = getURLContent(url)

    # Crude pattern for anything that looks like an email address;
    # a raw string avoids escaping surprises in the regex.
    emails = re.findall(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', text)

    if emails:
        if saveEmail(emails[0]) == 1:
            print emails[0]

def saveUrl(url):
    """Insert `url` into url.db if it is new; return 1 if inserted, else 0."""
    connection = sql.connect('url.db')
    url = (url, )

    with connection:
        cursor = connection.cursor()
        cursor.execute('SELECT COUNT(*) FROM urladressen WHERE adresse = ?', url)
        data = cursor.fetchone()
        if data[0] == 0:
            cursor.execute('INSERT INTO urladressen VALUES(NULL, ?)', url)
            return 1
    return 0

def saveEmail(email):
    """Insert `email` into emails.db if it is new; return 1 if inserted, else 0."""
    connection = sql.connect('emails.db')
    email = (email, )

    with connection:
        cursor = connection.cursor()
        cursor.execute('SELECT COUNT(*) FROM addresse WHERE email = ?', email)
        data = cursor.fetchone()
        if data[0] == 0:
            cursor.execute('INSERT INTO addresse VALUES(NULL, ?)', email)
            return 1
    return 0

def searchrun(urls):
    """Crawl from the seed list: save each new URL, harvest an email from
    its page, and append the links found there to the work list."""
    # Use an explicit work list instead of removing from and reassigning
    # the list being iterated over, which skips elements.
    while urls:
        url = urls.pop(0)
        if saveUrl(url) == 1:
            #time.sleep(0.6)
            harvestEmail(url)
            print url
            urls.extend(getUrls(url))

urls1 = getUrls('http://www.google.de/#hl=de&tbo=d&output=search&sclient=psy-ab&q=DVD')
urls2 = getUrls('http://www.google.de/#hl=de&tbo=d&output=search&sclient=psy-ab&q=Jolie')
urls3 = getUrls('http://www.finanzen.net')
urls4 = getUrls('http://www.google.de/#hl=de&tbo=d&output=search&sclient=psy-ab&q=Party')
urls5 = getUrls('http://www.google.de/#hl=de&tbo=d&output=search&sclient=psy-ab&q=Games')
urls6 = getUrls('http://www.spiegel.de')
urls7 = getUrls('http://www.kicker.de/')
urls8 = getUrls('http://www.chessbase.com')
urls9 = getUrls('http://www.nba.com')
urls10 = getUrls('http://www.nfl.com')


try:
    threads = []
    urls = (urls1, urls2, urls3, urls4, urls5, urls6, urls7, urls8, urls9, urls10)

    for urlList in urls:
        # Thread.start() returns None, so keep the Thread object itself
        # and start it separately; otherwise join() below fails.
        thread = threading.Thread(target=searchrun, args=(urlList, ))
        thread.start()
        threads.append(thread)
    print threading.activeCount()
    for thread in threads:
        thread.join()
except RuntimeError as error:
    print error
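
For reference, a minimal sketch of the multiprocessing variant mentioned above, assuming the same searchrun() and the urls tuple of seed lists; multiprocessing sidesteps the GIL that limits threads, though concurrent writers to the same SQLite file can still hit database locks:

from multiprocessing import Pool

if __name__ == '__main__':
    # One worker process per seed list; Pool.map blocks until all finish.
    pool = Pool(processes=len(urls))
    pool.map(searchrun, urls)
    pool.close()
    pool.join()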