
Critique this Python code (web crawler with a thread pool)

Tags: python, multithreading, web-crawler, pool

How good is this Python code? (I would like a critique.) There is a bug in it: sometimes the script prints "ALL WAIT - CAN FINISH!" and then freezes (nothing else happens...), but I cannot find out why this occurs.

Site crawler with a thread pool:

import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread

W_WAIT = 1
W_WORK = 0

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, pool, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.pool = pool
        self.state = None
        # start the thread only after all attributes are set,
        # so run() never sees a half-initialized object
        self.start()

    def is_wait(self):
        return self.state == W_WAIT


    def run(self):
        while True:
            #if all workers wait - time to exit
            print "CHECK WAIT: !!! ",self.pool.is_all_wait()
            if self.pool.is_all_wait():
                print "ALL WAIT - CAN FINISH!"
                return
            try:
                func, args, kargs = self.tasks.get(timeout=3)
            except Empty:
                print "task wait timeout"
                continue

            self.state = W_WORK
            print "START !!! in thread %s" % str(self)
            #print args

            try: func(*args, **kargs)
            except Exception, e: print e
            print "!!! STOP in thread %s", str(self)
            self.tasks.task_done()
            self.state = W_WAIT
            #threads can fast empty it!
            #if self.tasks.qsize() == 0:
            #    print "QUIT!!!!!!"
            #    break

class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        #self.tasks = Queue(num_threads)
        self.tasks = Queue()
        self.workers = []
        for _ in range(num_threads): 
            self.workers.append(Worker(self,self.tasks))


    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

    def is_all_wait(self):
        for w in self.workers:
            if not w.is_wait():
                return False
        return True

visited = set()
queue = Queue()
external_links_set = set()
internal_links_set = set()
external_links = 0

def process(pool,host,url):

    try:

        content = urlopen(url).read()
    except UnicodeDecodeError:
        return


    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        try:
            href = link['href']
        except KeyError:
            continue


        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue

        internal_links_set.add(href)


        if href not in visited:
            visited.add(href)
            pool.add_task(process,pool,host,href)

        else:
            pass

def start(host,charset):
    pool = ThreadPool(20)
    pool.add_task(process,pool,host,'http://%s/' % (host))
    pool.wait_completion()

start('evgenm.com','utf8') 
Thank you for the help! I came up with a new implementation. What can you say about this code #2?

import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread

W_STOP = 1

class Worker(Thread):
    """Thread executing tasks from a given tasks queue"""
    def __init__(self, pool, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.pool = pool
        self.state = None
        self.start()



    def stop(self):
        self.state = W_STOP

    def run(self):
        while True:
            if self.state == W_STOP:
                print "\ncalled stop"
                break
            try:
                func, args, kargs = self.tasks.get(timeout=3)
            except Empty:
                continue
            print "\n***START*** %s" % str(self)
            try: 
                func(*args, **kargs)
            except Exception, e: 
                print e
            print "\n***STOP*** %s", str(self)
            self.tasks.task_done()



class ThreadPool:
    """Pool of threads consuming tasks from a queue"""
    def __init__(self, num_threads):
        #self.tasks = Queue(num_threads)
        self.tasks = Queue()
        self.workers = []
        for _ in range(num_threads): 
            self.workers.append(Worker(self,self.tasks))


    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue"""
        self.tasks.join()

    def stop_threads(self):
        for w in self.workers:
            w.stop()

    def wait_stop(self):
        self.wait_completion()
        self.stop_threads()



visited = set()
queue = Queue()
external_links_set = set()
internal_links_set = set()
external_links = 0

def process(pool,host,url):
    try:
        content = urlopen(url).read()
    except UnicodeDecodeError:
        return

    for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
        try:
            href = link['href']
        except KeyError:
            continue

        if not href.startswith('http://'):
            href = 'http://%s%s' % (host, href)
        if not href.startswith('http://%s%s' % (host, '/')):
            continue

        internal_links_set.add(href)

        if href not in visited:
            visited.add(href)
            pool.add_task(process,pool,host,href)
        else:
            pass

def start(host,charset):
    pool = ThreadPool(20)
    pool.add_task(process,pool,host,'http://%s/' % (host))
    pool.wait_stop()

start('evgenm.com','utf8')

I have basic Python knowledge, but aren't threads in Python useless? I have seen plenty of articles criticizing the Global Interpreter Lock.

You are sharing state between threads (i.e., in is_all_wait) without synchronization. Besides, the fact that all threads are "waiting" is not a reliable indicator that the queue is empty (for example, they could all be in the middle of getting a task). I suspect that sometimes the threads quit before the queue is really empty. If that happens often enough, you will be left with tasks in the queue but no threads to run them, so queue.join() will wait forever.
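
To make the race concrete, here is one possible interleaving with two workers (illustrative only):

# Worker A                               Worker B
# finishes a task, state = W_WAIT        finishes a task, state = W_WAIT
#                                        (that task queued new work via add_task)
# loops, is_all_wait() -> True           loops, is_all_wait() -> True
# prints "ALL WAIT - CAN FINISH!"        prints "ALL WAIT - CAN FINISH!"
# returns                                returns
# result: work is still in the queue, no worker is left to run it,
# and tasks.join() in wait_completion() blocks forever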

My suggestions:

  • Get rid of is_wait; it is not a reliable indicator.
  • Get rid of the worker state; it is not really necessary.
  • Rely on queue.join() to let you know when everything has been processed.
  • If you need to kill the threads (for example, because this is part of a larger, long-running program), do so after queue.join(); see the sketch after this list.
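
For illustration, here is a minimal sketch of the pool with the state flags removed. This is only my sketch, not a drop-in replacement; it assumes daemon worker threads are acceptable, so the process can exit after join() without an explicit stop:

from Queue import Queue
from threading import Thread

class Worker(Thread):
    """Thread executing tasks from a given tasks queue."""
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True          # dies together with the main program
        self.start()

    def run(self):
        while True:
            # block until a task arrives; no timeouts, no state flags
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception, e:
                print e
            finally:
                self.tasks.task_done()

class ThreadPool:
    """Pool of threads consuming tasks from a queue."""
    def __init__(self, num_threads):
        self.tasks = Queue()
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Add a task to the queue"""
        self.tasks.put((func, args, kargs))

    def wait_completion(self):
        # returns only when task_done() has been called for every
        # queued item, so no "are all workers waiting" guesswork
        self.tasks.join()

Because the workers are daemon threads, the program simply exits after wait_completion(); if a graceful shutdown is needed instead, push one sentinel per worker onto the queue after join().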
Oh man, what a softball. I think I will restrain the comedic impulse, but... of course you are right ;-) As you say, "threads in Python are not useless". The GIL is specific to CPython: Jython, for example, uses Java synchronization to make the interpreter thread-safe. Also, the GIL hurts IO-bound tasks less than CPU-bound ones. Finally, code in C extensions can release the GIL when it does not need to touch Python data structures. So for most purposes it is really not a big deal.

Thanks for the help, I will implement #2; what do you think?

Yes, that looks reasonable. Overall, the code looks very clean and easy to understand.
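
To illustrate the point above about IO-bound work, here is a toy timing sketch (my example, reusing the host from the question; actual timings will vary):

import time
from urllib import urlopen
from threading import Thread

def fetch(url):
    # urlopen releases the GIL while blocked on the socket,
    # so the threads overlap their network waits
    urlopen(url).read()

threads = [Thread(target=fetch, args=('http://evgenm.com/',))
           for _ in range(4)]
t0 = time.time()
for t in threads:
    t.start()
for t in threads:
    t.join()
print "4 threaded fetches took %.2f s" % (time.time() - t0)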