Python 使用PyQt4从URL下载数千个PDF

Python 使用PyQt4从URL下载数千个PDF,python,pdf,pyqt4,Python,Pdf,Pyqt4,我正试图从一个给定的网站下载数千个PDF文件。然而,由于某些原因,它甚至无法下载100个PDF。我不知道为什么。代码如下: #!/usr/bin/env python import time from pyPdf import PdfFileWriter, PdfFileReader import StringIO from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import letter from xhtml2p

我正试图从一个给定的网站下载数千个PDF文件。然而,由于某些原因,它甚至不会下载100个PDF。我不知道为什么。代码如下:

#!/usr/bin/env python
import time
from pyPdf import PdfFileWriter, PdfFileReader
import StringIO
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from xhtml2pdf import pisa
import sys 
from PyQt4.QtCore import *
from PyQt4.QtGui import*
from PyQt4.QtWebKit import *

class Foo(QWidget):
    """Render a list of web pages to PDF files using a pool of QWebView objects.

    Pins are read from ``input.txt``, one per line. Every view in the pool
    handles one page at a time: when its ``loadFinished`` signal fires, the
    page it has just loaded is printed to a PDF and the next unprocessed pin
    is handed to that same view.
    """

    # Upper bound on the number of QWebView instances created.
    POOL_SIZE = 100

    def __init__(self, parent=None):
        """Read the pins, build the view pool and dispatch the first pins.

        :param parent: optional parent widget, forwarded to QWidget.
        """
        super(Foo, self).__init__(parent)

        # Index into self.params of the most recently dispatched pin.
        self.count = -1
        # Close the file deterministically and drop blank lines (for example
        # the empty string a trailing newline would otherwise produce), so
        # that no request is made with an empty pin.
        with open("input.txt", "r") as text_file:
            self.params = [line.strip() for line in text_file if line.strip()]
        self.url = 'http://www.asdfasdfasdf.com/Property.aspx?mode=details&pin={0}'

        self.gridLayout = QGridLayout(self)

        # Translates "some view finished loading" into "view number i
        # finished loading" for on_mapper_mapped().
        self.mapper = QSignalMapper(self)
        self.mapper.mapped.connect(self.on_mapper_mapped)

        # Explicit references to the parentless views so they are not
        # garbage-collected while loads are still in flight.
        self.grabbers = []
        # View index -> pin of the page that view is currently loading.
        self.pending = {}

        # Never create more views than there are pins to process.
        for i in range(min(self.POOL_SIZE, len(self.params))):
            grabber = QWebView()
            grabber.loadFinished.connect(self.mapper.map)
            self.mapper.setMapping(grabber, i)
            self.grabbers.append(grabber)

            # Nothing has been loaded yet; emit the signal once by hand so
            # the view is handed its first pin.
            grabber.loadFinished.emit(True)

    @pyqtSlot(int)
    def on_mapper_mapped(self, gNumber):
        """Print the page view ``gNumber`` just loaded, then load the next pin.

        :param gNumber: index of the view, as registered with the mapper.
        """
        opener = self.mapper.mapping(gNumber)

        # Print the finished page *before* starting the next load. Printing
        # after load() has been called again would render a page that is no
        # longer (or not yet) fully loaded.
        finished_pin = self.pending.pop(gNumber, None)
        if finished_pin is not None:
            self._print_to_pdf(opener, finished_pin)

        self.count += 1
        if self.count < len(self.params):
            gParam = self.params[self.count]
            # Record the pin before load() so the bookkeeping is in place
            # even if loadFinished is delivered re-entrantly.
            self.pending[gNumber] = gParam
            opener.load(QUrl(self.url.format(gParam)))
            print(str(self.count) + ' of ' + str(len(self.params)))

    def _print_to_pdf(self, view, pin):
        """Print the page currently held by *view* to ``<formatted pin>.pdf``.

        NOTE(review): the success flag of loadFinished is not forwarded by
        the signal mapper, so a page whose load failed is printed as well.
        """
        # A fresh printer per page: one output file name per QPrinter.
        printer = QPrinter()
        printer.setPageSize(QPrinter.A4)
        printer.setOutputFormat(QPrinter.PdfFormat)
        # Split the pin into dash-separated groups of 2-2-3-3-4 characters.
        pid_string = '-'.join(
            (pin[:2], pin[2:4], pin[4:7], pin[7:10], pin[10:14])
        )
        printer.setOutputFileName(pid_string + '.pdf')
        view.print_(printer)



if __name__ == "__main__":
    import sys

    app = QApplication(sys.argv)
    # Keep a reference so the Foo instance stays alive while the event loop
    # is running.
    main = Foo()
    #main.show()
    # Run the event loop and use its return code as the process exit status.
    # The original referenced sys.exit without calling it.
    sys.exit(app.exec_())
#/usr/bin/env python
导入时间
从pyPdf导入PdfileWriter、PdfileReader
导入StringIO
从reportlab.pdfgen导入画布
从reportlab.lib.pagesizes导入信函
从xhtml2pdf导入pisa
导入系统
从PyQt4.QtCore导入*
从PyQt4.QtGui导入*
从PyQt4.QtWebKit导入*
Foo类(QWidget):
def uuu init uuu(self,parent=None):
超级(Foo,self)。\uuuuu初始化\uuuuuuu(父级)
self.count=-1
text_file=open(“input.txt”、“r”)
self.params=text_file.read().split('\n')
self.urlhttp://www.asdfasdfasdf.com/Property.aspx?mode=details&pin={0}'
self.gridLayout=QGridLayout(self)
#self.tabWidget=QTabWidget(self)
#self.gridLayout.addWidget(self.tabWidget,0,0,1,1)
self.mapper=QSignalMapper(self)
self.mapper.mapped.connect(self.on\u mapper\u mapped)
对于范围(100)内的i:
grabber=QWebView()
grabber.loadFinished.connect(self.mapper.map)
self.mapper.setMapping(grabber,i)
#self.tabWidget.addTab(抓取器,“opener{0}.”格式(str(i)))
grabber.loadFinished.emit(真)
@pyqtSlot(int)
映射器上的定义映射(self,gNumber):
self.count+=1
如果self.count
理想情况下,我也想添加一个页脚,但如果我尝试这样做,它就会出错。input.txt中有100个数字(仅用于测试,我需要它能处理85000个)。它适用于非常小的数量,如5或10,但不适用于100。QWebView实例的数量是否有限制?为了让它正常工作,我是否应该自己管理这些实例?

当我运行100 PDF的代码时,它会打印出20 PDF。如果我将范围更改为小于100的值,它将有重复的PDF,但肯定会超过20

此外,我还发现了以下错误:

QPainter::begin(): Returned false


我认为,既然您正在尝试创建一个“PyQt应用程序”,那么您应该首先利用它提供的功能

试着研究一下 QNetworkAccessManager,用它来管理您的所有请求以及其他相关事项

它甚至会替您处理所需的线程和并行问题,下载时也不会冻结您的应用程序

仔细看看我的代码。它并不能完全满足您的需要,但对于您需要设置的内容来说,它是一个非常好的过滤示例

# Subclass QNetworkAccessManager Here
from PyQt5.QtCore import QByteArray
from PyQt5.QtCore import QFile, pyqtSlot
from PyQt5.QtCore import QIODevice
from PyQt5.QtCore import QUrl
from PyQt5.QtCore import pyqtSignal
from PyQt5.QtNetwork import QNetworkAccessManager
from PyQt5.QtNetwork import QNetworkRequest


class NetworkAccessManager(QNetworkAccessManager):
    """QNetworkAccessManager that downloads one resource at a time to disk.

    request_image() starts a download; incoming data is accumulated in
    ``message_buffer`` and, once the reply finishes, written to
    ``resources/browser_images/image_required_browser``.
    """

    # Emitted after a finished download has been written to disk.
    signal_add_image = pyqtSignal()
    # Class-level defaults; request_image() sets these per instance.
    dialog = None
    download_finished = False
    message_buffer = None
    reply = None

    def __init__(self):
        QNetworkAccessManager.__init__(self)
        # request_image() and del_reply() call deleteLater() on self.reply
        # unconditionally, so start with a reply object (for an empty URL)
        # rather than None.
        self.reply = self.get(QNetworkRequest(QUrl("")))

    @pyqtSlot()
    def slot_finished(self):
        """Write the buffered data to disk and close the dialog.

        Connected to the ``finished`` signal of the current reply.
        """
        image_file = QFile("resources/browser_images/image_required_browser")
        # WriteOnly truncates an existing file. ReadWrite does not, so a
        # download shorter than the previous one would leave stale bytes at
        # the end of the file.
        if image_file.open(QIODevice.WriteOnly):
            image_file.write(self.message_buffer)
            image_file.close()
            self.signal_add_image.emit()
        # If the file cannot be opened the data is dropped and the signal is
        # not emitted; the download is still marked as finished below.
        self.download_finished = True
        if self.dialog is not None:
            self.dialog.close()

    @pyqtSlot()
    def slot_read_data(self):
        """Append newly available data to the buffer.

        Connected to the ``readyRead`` signal of the current reply.
        """
        self.message_buffer += self.reply.readAll()

    def request_image(self, url, progress_bar, dialog):
        """Start downloading *url*, replacing any previous reply.

        :param url: address to fetch; wrapped in a QUrl.
        :param progress_bar: slot or callable connected to the reply's
            ``downloadProgress`` signal.
        :param dialog: object whose ``close()`` is called when the download
            finishes.
        """
        self.reply.deleteLater()
        self.download_finished = False
        self.dialog = dialog
        self.message_buffer = QByteArray()
        req = QNetworkRequest(QUrl(url))
        req.setRawHeader(
            b'User-Agent',
            b'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        )
        self.reply = self.get(req)
        self.reply.readyRead.connect(self.slot_read_data)
        self.reply.finished.connect(self.slot_finished)
        self.reply.downloadProgress.connect(progress_bar)

    def get_reply(self):
        """Return the current reply object."""
        return self.reply

    def done(self):
        """Return True once slot_finished() has run for the current download."""
        return self.download_finished

    def set_reply(self, reply):
        """Replace the current reply object with *reply*."""
        self.reply = reply

    def del_reply(self):
        """Schedule the current reply object for deletion."""
        self.reply.deleteLater()

# Module-level instance, created when this module is imported.
NETWORK_ACCESS_MANAGER = NetworkAccessManager()

希望它能给您带来一些启示 :)

当它“不适用于大数据”时会发生什么?消息?症状?还有,可能会遇到服务器端节流/反垃圾邮件。您检查过了吗?