Python multi-threaded HTTP crawler - closing connections and hanging of the program

This crawler is written in Python and, from an input list of domains, dumps several parameters per domain into a JSON output file.

My question:

Do I need to close the HTTP connection in each thread? My input data is about 5 million items. It starts out processing around 50 iterations per second, but after a while it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this the code or a network limitation? I suspect the software, because when I restart it, it starts again at high speed (around 50 iterations per second).

Any hints on how to improve the code below are welcome, especially ones that increase speed and crawl throughput.

The problem code:

from threading import Thread
import re
import sys
import urllib2
import pprint
from tqdm import tqdm

import lxml.html

from Queue import Queue

from geoip import geolite2
import pycountry

from tld import get_tld


resfile = open("out.txt",'a')



concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
     try:
       response = urllib2.urlopen("http://"+ourl)
       peer = response.fp._sock.fp._sock.getpeername()
       ip = peer[0]
       header = response.info()
       html = response.read()
       html_element = lxml.html.fromstring(html)
       generator = html_element.xpath("//meta[@name='generator']/@content")
       try:
         match = geolite2.lookup(ip)
         if match is not None:
           country= match.country
           try:

             c=pycountry.countries.lookup(country)
             country=c.name
           except:
             country=""

       except:
         country=""
       try:
         res=get_tld("http://www"+ourl, as_object=True)
         tld=res.suffix
       except:
         tld=""

       try:
         match = re.search(r'[\w\.-]+@[\w\.-]+', html)
         email=match.group(0)
       except:
         email=""

       try:
           item= generator[0]
           val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
       except:
           val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"

       return val
     except Exception as e:
       #print "error"+str(e)
       pass

def doSomethingWithResult(status):
    if status:
      resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt",'w')
        status.write(str(url.strip()))   
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:

Closing the socket and file descriptor makes it work better; it no longer seems to hang after a while. Performance is 50 requests/sec on a home laptop and about 100 requests/sec on a VPS.

from threading import Thread
import httplib, sys
import re
import urllib2
import pprint
from tqdm import tqdm

import lxml.html

from Queue import Queue

from geoip import geolite2
import pycountry

from tld import get_tld
import json



resfile = open("out.txt",'a')



concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
     try:
       response = urllib2.urlopen("http://"+ourl)
       # grab the underlying socket (via private attributes) so it can be
       # closed explicitly once the body has been read
       realsock = response.fp._sock.fp._sock
       peer = realsock.getpeername()
       ip = peer[0]
       header = response.info()
       html = response.read()
       # close the socket and the response explicitly instead of waiting
       # for garbage collection
       realsock.close()
       response.close()

       html_element = lxml.html.fromstring(html)
       generator = html_element.xpath("//meta[@name='generator']/@content")
       try:
         match = geolite2.lookup(ip)
         if match is not None:
           country= match.country
           try:

             c=pycountry.countries.lookup(country)
             country=c.name
           except:
             country=""

       except:
         country=""
       try:
         res=get_tld("http://www"+ourl, as_object=True)
         tld=res.suffix
       except:
         tld=""

       try:
         match = re.search(r'[\w\.-]+@[\w\.-]+', html)
         email=match.group(0)
       except:
         email=""

       try:
           item= generator[0]
           val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
       except:
           val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"

       return val
     except Exception as e:
       print "error"+str(e)
       pass

def doSomethingWithResult(status):
    if status:
      resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        # record the last URL handed to the queue; the with-block closes
        # the handle each iteration instead of leaking it
        with open("status.txt", 'w') as status:
            status.write(url.strip())
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

The handles will be garbage-collected automatically, but you are better off closing them yourself, especially when you are doing this in a tight loop.
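
For illustration only, here is a minimal sketch (not the poster's code) of closing the handle deterministically with contextlib.closing, which calls close() on the response even if reading it raises, without reaching into private _sock attributes:

import contextlib
import urllib2

def fetch(ourl):
    # closing() guarantees response.close() runs when the block exits,
    # releasing the underlying socket as soon as we are done with it
    with contextlib.closing(urllib2.urlopen("http://" + ourl)) as response:
        return response.read()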


You also asked for suggestions for improvement. A big one is to stop using
urllib2
and start using
requests
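
As a rough sketch of what that switch could look like (the timeout value below is an illustrative assumption, not from the original post), requests pools TCP connections per host when you use a Session:

import requests

session = requests.Session()  # reuses TCP connections to the same host

def fetch(ourl):
    try:
        # a timeout keeps one unresponsive host from stalling a worker thread
        r = session.get("http://" + ourl, timeout=10)
        return r.text
    except requests.RequestException:
        return None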

There are many possible reasons why the crawl rate drops:

1.) Take care not to crawl lots of data from the same domain. Some web servers are configured to allow only one parallel connection per IP address.

2.) Try to send random browser-like HTTP headers (User-Agent, Referer, etc.) to get past a web server's scraping protection, if one is set up (see the sketch after this list).

3.) Use a mature (parallel) HTTP library such as pycurl or requests. They will certainly perform faster.
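
A minimal sketch of point 2.), assuming a hypothetical pool of User-Agent strings (the header values below are illustrative, not from the answer):

import random
import requests

# illustrative pool; a real crawler would rotate many more values
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9)",
]

def browser_like_headers():
    # vary the User-Agent per request so requests do not all look identical
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "http://www.google.com",
        "Accept": "text/html,application/xhtml+xml",
    }

response = requests.get("http://example.com", headers=browser_like_headers(), timeout=10)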

Thanks!!!! I closed the socket and descriptor and now it works much better. I will look into requests... Thanks!!! I picked the other answer because it addresses my question more directly. I wish I could accept two answers, because your answer was also good.