Critique this Python code (crawler with a thread pool)
How good is this Python code? (critique wanted) There is a bug in it: sometimes the script prints "ALL WAIT - CAN FINISH!" and then freezes (nothing further happens), and I cannot find out why. Site crawler with a thread pool:
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread

W_WAIT = 1
W_WORK = 0

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, pool, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()
        self.pool = pool
        self.state = None

    def is_wait(self):
        return self.state == W_WAIT

    def run(self):
        while True:
            # if all workers wait - time to exit
            print "CHECK WAIT: !!! ", self.pool.is_all_wait()
            if self.pool.is_all_wait():
                print "ALL WAIT - CAN FINISH!"
                return
            try:
                func, args, kargs = self.tasks.get(timeout=3)
            except Empty:
                print "task wait timeout"
                continue

            self.state = W_WORK
            print "START !!! in thread %s" % str(self)
            #print args
            try: func(*args, **kargs)
            except Exception, e: print e
            print "!!! STOP in thread %s", str(self)
            self.tasks.task_done()
            self.state = W_WAIT
            #threads can fast empty it!
            #if self.tasks.qsize() == 0:
            #    print "QUIT!!!!!!"
            #    break

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        #self.tasks = Queue(num_threads)
        self.tasks = Queue()
        self.workers = []
        for _ in range(num_threads):
            self.workers.append(Worker(self, self.tasks))

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

    def is_all_wait(self):
        for w in self.workers:
            if not w.is_wait():
                return False
        return True

visited = set()
queue = Queue()
external_links_set = set()
internal_links_set = set()
external_links = 0

def process(pool, host, url):
    try:
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        internal_links_set.add(href)
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
        else:
            pass

def start(host, charset):
    pool = ThreadPool(20)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_completion()

start('evgenm.com', 'utf8')
Thanks for the help! I came up with a new implementation; what can you say about this code #2?

=============================== try #2 ===============================
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread

W_STOP = 1

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, pool, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.pool = pool
        self.state = None
        self.start()

    def stop(self):
        self.state = W_STOP

    def run(self):
        while True:
            if self.state == W_STOP:
                print "\ncalled stop"
                break
            try:
                func, args, kargs = self.tasks.get(timeout=3)
            except Empty:
                continue
            print "\n***START*** %s" % str(self)
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            print "\n***STOP*** %s", str(self)
            self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        #self.tasks = Queue(num_threads)
        self.tasks = Queue()
        self.workers = []
        for _ in range(num_threads):
            self.workers.append(Worker(self, self.tasks))

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

    def stop_threads(self):
        for w in self.workers:
            w.stop()

    def wait_stop(self):
        self.wait_completion()
        self.stop_threads()

visited = set()
queue = Queue()
external_links_set = set()
internal_links_set = set()
external_links = 0

def process(pool, host, url):
    try:
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        try:
            href = link['href']
        except KeyError:
            continue
        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue
        internal_links_set.add(href)
        if href not in visited:
            visited.add(href)
            pool.add_task(process, pool, host, href)
        else:
            pass

def start(host, charset):
    pool = ThreadPool(20)
    pool.add_task(process, pool, host, 'http://%s/' % (host))
    pool.wait_stop()

start('evgenm.com', 'utf8')
I have only basic Python knowledge, but aren't threads in Python useless? I have seen plenty of articles criticizing the global interpreter lock.

You are sharing state between threads (i.e., is_all_wait) without any synchronization. Besides, the fact that all the threads are "waiting" is not a reliable indicator that the queue is empty (for example, they could all be in the middle of getting a task). I suspect that sometimes threads exit before the queue is really empty. If that happens often enough, you will be left with tasks in the queue but no threads to run them, so queue.join() will wait forever.
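The hang comes down to how Queue.join() works: it returns when every put() has been matched by a task_done(), not when the queue is empty. A small standalone snippet, not part of the crawler, that illustrates this (unfinished_tasks is the internal counter that join() waits on):

from Queue import Queue

q = Queue()
q.put('a')
q.put('b')

q.get()
q.task_done()               # first item fully processed
q.get()                     # second item taken, but task_done() is never called

print q.empty()             # True  - the queue looks drained...
print q.unfinished_tasks    # 1     - ...but q.join() here would block forever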
My recommendations would be:
- get rid of is_all_wait: it is not a reliable indicator of state
- get rid of the worker state: it is not really necessary
- rely on queue.join() to tell you when everything has been processed
- if you need to kill the threads (for example, because this is part of a larger, long-running program), do so after queue.join()
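A minimal, untested sketch of that shape, reusing the question's Python 2 modules (Queue, threading): the workers keep no wait/work state, Queue.join() is the only completion signal, and shutdown is done afterwards by putting one sentinel per worker on the queue.

from Queue import Queue
from threading import Thread

STOP = object()   # sentinel: one per worker is queued at shutdown time

class Worker(Thread):
    """Runs tasks from the queue; no wait/work state is tracked."""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        while True:
            item = self.tasks.get()        # block until a task or sentinel arrives
            try:
                if item is STOP:
                    return                 # asked to shut down
                func, args, kargs = item
                try:
                    func(*args, **kargs)
                except Exception, e:
                    print e
            finally:
                self.tasks.task_done()     # every get() is balanced, so join() stays reliable

class ThreadPool:
    def __init__(self, num_threads):
        self.tasks = Queue()
        self.workers = [Worker(self.tasks) for _ in range(num_threads)]

    def add_task(self, func, *args, **kargs):
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        self.tasks.join()                  # returns only when every queued task is done

    def stop(self):
        for _ in self.workers:
            self.tasks.put(STOP)           # wake each worker and tell it to exit
        for w in self.workers:
            w.join()

    def wait_stop(self):
        self.wait_completion()
        self.stop()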
Oh man, what a softball. I think I will restrain the humorous impulse, but...

Of course you are right ) from your point of view ) As you said, "threads in Python are not useless" ;-)

The GIL is specific to CPython. Jython, for example, uses Java synchronization to make the interpreter thread-safe. Besides, the GIL has a smaller performance impact on IO-bound tasks than on CPU-bound ones, and code in C extensions can release the GIL when it does not need to access Python data structures. So for most purposes it is really not a big deal.

Thanks for the help, I will implement #2; what do you think?

Yes, that looks reasonable. Overall the code looks quite clean and easy to follow.
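On the point above about IO-bound work, a toy timing sketch: fake_fetch is a hypothetical stand-in for urlopen(url).read(), and time.sleep(), like real socket IO, releases the GIL while it waits, so the threaded version overlaps the waits even under the GIL.

import time
from threading import Thread

def fake_fetch(url):
    time.sleep(0.5)          # stand-in for a blocking network call

urls = ['http://example.com/%d' % i for i in range(8)]

start = time.time()
for u in urls:
    fake_fetch(u)
print "serial:   %.1f s" % (time.time() - start)    # roughly 4.0 s

start = time.time()
threads = [Thread(target=fake_fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()
print "threaded: %.1f s" % (time.time() - start)    # roughly 0.5 s, the waits overlap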