PHP pthreads: from a "Thread" to a "Pool"

I'm playing around with pthreads 3.1.6-dev and PHP 7.1. My goal is to build a small web crawler.

The planned workflow is: put one URL into the pool (probably the homepage), then a crawler (extending Thread) searches that URL for all links. After some light filtering, the crawler should add all new links to the pool (external links are not added). Alternatively, the crawler hands the new URLs to "something else" that adds them to the pool.

This process should continue until no new URLs can be found.

My problem is that I haven't found a working solution. My current draft looks like this: the crawler extracts the URLs and puts them into the pool itself. For that, every Worker holds a reference to the pool, so a crawler task can access the pool object through its Worker.

The problem with this approach: if a "late" task adds new work to the pool, the newly submitted task is never executed.

Some demo code:
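A minimal sketch of that draft: the WebWorker (extends Worker) and Webwork (extends Threaded) class names match the output log below; the random resubmission is only a stand-in for real link discovery.

<?php

class WebPool extends Pool
{
    public function __construct(int $size, string $class)
    {
        // pass the pool itself into every worker's constructor
        parent::__construct($size, $class, [$this]);
    }
}

class WebWorker extends Worker
{
    public $pool; // reference back to the pool, reachable from tasks

    public function __construct(Pool $pool)
    {
        $this->pool = $pool;
        echo "Create a new worker\n";
    }

    public function run()
    {
        echo 'Worker ', $this->getThreadId(), ": WebWorker::run\n";
    }
}

class Webwork extends Threaded
{
    public function run()
    {
        $id = $this->worker->getThreadId();
        echo "Webwork from Worker {$id}\n";

        if (mt_rand(0, 2) === 0) { // stand-in for "found a new link"
            echo "Webwork {$id} add new Webwork\n";
            $this->worker->pool->submit(new Webwork());
        }
    }
}

$pool = new WebPool(3, WebWorker::class);

for ($i = 0; $i < 3; ++$i) {
    $pool->submit(new Webwork());
}

$pool->shutdown(); // work submitted by a "late" task may never run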

An example result:

Create a new worker
Worker 139878744053504: WebWorker::run
Webwork from Worker 139878744053504
Create a new worker
Worker 139878731872000: WebWorker::run
Webwork from Worker 139878731872000
Webwork from Worker 139878731872000
Webwork 139878731872000 add new Webwork
Webwork from Worker 139878744053504
Create a new worker
Worker 139878719289088: WebWorker::run
Webwork from Worker 139878719289088
Webwork 139878719289088 add new Webwork

Can anyone tell me the best practice for solving this problem?

The problem is that you're relying on the garbage collector to block the main thread, when you should really be blocking on your own condition. Overriding the default garbage collector in pthreads v3 isn't really necessary, except in niche circumstances where you don't want a task to be collected as soon as it has finished executing.
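For reference, a custom collector in pthreads v3 would look roughly like the sketch below. This is illustrative only and not needed for the solution here; the Collectable hint and isGarbage() are the pthreads v3 collection API, and the loop assumes $pool is an already-running Pool.

<?php
// Illustrative only: pthreads v3 lets you pass your own collector to
// Pool::collect(). Returning false keeps a finished task alive (for
// example to read a result from it later); returning true lets it be
// freed. collect() returns the number of tasks not yet collected.
while ($pool->collect(function (Collectable $task) {
    return $task->isGarbage(); // the default condition, made explicit
})) {
    usleep(1000); // back off between collection passes
}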

One possible solution to your problem is to use a link counter variable that is incremented for every new link found (that needs to be crawled) and decremented for every link that has been crawled. Once this variable reaches 0, you can assume the website has been fully crawled, and the thread pool can therefore be shut down safely.

Here is that solution in code:

<?php

class WebsiteCrawler extends Worker
{
    public $pool; // expose the pool to our LinkCrawler tasks

    public function __construct(Pool $pool)
    {
        $this->pool = $pool;
    }
}

class LinkCrawler extends Threaded
{
    private $link;
    public static $domain;

    public function __construct(string $link)
    {
        $this->link = $link;
        WebCrawlerPool::$links[] = $link;         // record the link as seen
        ++WebCrawlerPool::$linksRemaining->count;  // one more link left to crawl
        var_dump($link); // for debugging, just to show that it is collecting links
    }

    public function run()
    {
        $content = file_get_contents($this->link);
        $domain = preg_quote(self::$domain);

        preg_match_all("~href=\"(.+?{$domain}.+?)\"~", $content, $matches); // naive regex to fetch links

        $links = $matches[1];

        foreach ($links as $link) {
            if (count(WebCrawlerPool::$links) > 9) { // stop at 10 links (for example purposes...)
                break;
            }
            if (!in_array($link, get_object_vars(WebCrawlerPool::$links), true)) { // skip links we have already seen
                $this->worker->pool->submit(new LinkCrawler($link));
            }
        }

        --WebCrawlerPool::$linksRemaining->count; // this link has been crawled
    }
}

class WebCrawlerPool extends Pool
{
    public static $linksRemaining;
    public static $links;

    public function __construct(int $size, string $class, array $ctor = [])
    {
        parent::__construct($size, $class, [$this]); // every worker receives the pool itself
        self::$links = new Threaded();
        self::$linksRemaining = new Threaded();
        self::$linksRemaining->count = 0;
    }
}

LinkCrawler::$domain = 'php.net';

$pool = new WebCrawlerPool(8, 'WebsiteCrawler');
$pool->submit(new LinkCrawler('http://php.net/')); // kick things off

while (WebCrawlerPool::$linksRemaining->count !== 0); // block the main thread until the counter hits zero

$pool->shutdown();

print_r(WebCrawlerPool::$links);
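A note on the blocking condition at the end: the empty while loop spins on $linksRemaining->count and will keep one CPU core busy until the crawl finishes. Putting a usleep(1000) in the loop body is an easy way to back off; alternatively, Threaded objects support synchronized(), wait() and notify(), so the main thread could sleep until the final task notifies it that the counter reached zero. Either variant is a tweak on top of the counter idea, not a change to it.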
