Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/295.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 使用QWebPage刮取多个URL_Python_Web Scraping_Pyqt_Pyqt4_Qwebpage - Fatal编程技术网

Python 使用QWebPage刮取多个URL

Python 使用QWebPage刮取多个URL,python,web-scraping,pyqt,pyqt4,qwebpage,Python,Web Scraping,Pyqt,Pyqt4,Qwebpage,我使用Qt的QWebPage来呈现一个页面,该页面使用javascript动态更新其内容,因此只下载页面静态版本(如urllib2)的库将无法工作 我的问题是,当我呈现第二个页面时,大约99%的时间程序只是崩溃。在其他时候,它会工作三次,然后崩溃。我也得到了一些断层,但都是随机的 我猜我用来渲染的对象没有被正确删除,所以尝试重用它可能会给我自己带来一些问题。我看了所有的地方,似乎没有人真的有同样的问题 这是我正在使用的代码。该程序从steam的社区市场下载网页,这样我就可以创建所有项目的数据库。

我使用Qt的QWebPage来呈现一个页面,该页面使用javascript动态更新其内容,因此只下载页面静态版本(如urllib2)的库将无法工作

我的问题是,当我呈现第二个页面时,大约99%的时间程序只是崩溃。在其他时候,它会工作三次,然后崩溃。我也得到了一些断层,但都是随机的

我猜我用来渲染的对象没有被正确删除,所以尝试重用它可能会给我自己带来一些问题。我看了所有的地方,似乎没有人真的有同样的问题

这是我正在使用的代码。该程序从steam的社区市场下载网页,这样我就可以创建所有项目的数据库。我需要多次调用
getItemsFromPage
函数来获取所有项目,因为它们被分成多个页面(显示X数量中的1-10个结果)

一次调用
getItemsFromPage
就可以了。后来的电话给了我麻烦。程序的输出通常是

Updating market items to dota2.csv ...
Page 1
Page 2

然后它崩溃了。它应该有700多页

您的程序的问题是,您正试图用获取的每个url创建一个新的QApplication

相反,只应创建一个QApplication和一个网页。网页可以使用其
loadFinished
信号创建一个内部循环,在处理完每个url后获取一个新的url。自定义html处理可以通过将用户定义的插槽连接到一个信号来添加,该信号在html文本和url可用时发出。下面的脚本(用于PyQt5和PyQt4)展示了如何实现这一点

下面是一些演示如何使用WebPage类的示例:

用法

def my_html_processor(html, url):
    print('loaded: [%d chars] %s' % (len(html), url))

import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
        return True

    def processCurrentPage(self, html):
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
        return True

    def processCurrentPage(self):
        self.htmlReady.emit(
            self.mainFrame().toHtml(), self.mainFrame().url().toString())
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
PyQt5网页

def my_html_processor(html, url):
    print('loaded: [%d chars] %s' % (len(html), url))

import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
        return True

    def processCurrentPage(self, html):
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
        return True

    def processCurrentPage(self):
        self.htmlReady.emit(
            self.mainFrame().toHtml(), self.mainFrame().url().toString())
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
PyQt4网页

def my_html_processor(html, url):
    print('loaded: [%d chars] %s' % (len(html), url))

import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
        return True

    def processCurrentPage(self, html):
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
        return True

    def processCurrentPage(self):
        self.htmlReady.emit(
            self.mainFrame().toHtml(), self.mainFrame().url().toString())
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
这一定是此错误(现已修复):