在python中并行地从url下载文件_Python

在python中并行地从url下载文件

python

在python中并行地从url下载文件,python,Python,我在数据库中有一些链接，我想并行下载。我试着连续做，但花了太多时间。我有大约1877个链接我尝试了并行运行下载的代码，但它抛出了一个错误：失败：“tuple”对象没有属性“read” #!/usr/bin/env python import urllib from stream import ThreadPool URLs = [ 'http://www.cnn.com/', 'http://www.bbc.co.uk/', 'http://www.economist.com/

我在数据库中有一些链接，我想并行下载。我试着连续做，但花了太多时间。我有大约1877个链接

我尝试了并行运行下载的代码，但它抛出了一个错误：失败：“tuple”对象没有属性“read”

#!/usr/bin/env python

import urllib
from stream import ThreadPool

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

def retrieve(urls):
    for url in urls:
    print url,' '
    res = urllib.urlretrieve(url).read()
    yield url, res

if __name__ == '__main__':
    # Feed the URL list into a stream ThreadPool that runs retrieve() with a
    # pool size of 7.
    retrieved = URLs >> ThreadPool(retrieve, poolsize=7)
    # Iterating the pool gives the (url, content) pairs produced by retrieve().
    for link, body in retrieved:
        print('%r is %d bytes' % (link, len(body)))
    # Items that raised are read from the pool's ``failure`` attribute.
    for link, error in retrieved.failure:
        print('%r failed: %s' % (link, error))

我也试过这个：

import urllib
import tldextract
from multiprocessing.pool import ThreadPool

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
   'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]


def dwld(url):
    """Download *url* and save the body to a file named after its domain.

    The file name is the ``domain`` field of ``tldextract.extract(url)``.
    Any error raised while opening or reading the URL propagates to the
    caller.  Returns *url*.
    """
    print(url)
    body = urllib.urlopen(url).read()
    parts = tldextract.extract(url)
    with open(parts.domain, 'wb') as out:
        out.write(body)
    return url

# Run dwld over every URL on a pool of four worker threads.  map() blocks
# until all items are done and re-raises an exception raised by a worker.
pool = ThreadPool(4)
pool.map(dwld, URLs)

这给了我如下回溯：

Traceback (most recent call last):
  File "dwld_thread.py", line 26, in <module>
    pool.map(dwld, URLs)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 148, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/multiprocessing/pool.py", line 422, in get
    raise self._value
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
from threading import *
from time import sleep
# if Python2:
import urllib
# if Python3:
# import urllib.request

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

class worker(Thread):
    """Thread that downloads one URL and stores the body in a local file.

    The thread starts itself as soon as it is constructed, so creating an
    instance is enough to kick off the download.
    """

    def __init__(self, link):
        """Remember *link* and start the download thread immediately."""
        Thread.__init__(self)
        self.link = link
        self.start()

    def _output_name(self):
        """Return a file name derived from ``self.link``.

        A URL contains characters such as ``/`` and ``:`` that cannot be part
        of a file name, so every character other than a letter, a digit,
        ``.``, ``-`` or ``_`` is replaced by ``_``.
        """
        return ''.join(
            ch if ch.isalnum() or ch in '.-_' else '_' for ch in self.link
        )

    def run(self):
        """Fetch ``self.link`` and write the response body to disk.

        A failed download is reported on stdout instead of ending the thread
        with an unhandled exception.
        """
        # Resolve urlopen here so the class works on Python 2 and Python 3.
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        try:
            # Use this thread's own link, not a module-level loop variable
            # that the main thread keeps rebinding.
            response = urlopen(self.link)
            try:
                res = response.read()
            finally:
                response.close()
        except IOError as exc:
            print('%r failed: %s' % (self.link, exc))
            return
        # 'wb', not 'rb': the file is created and written, and the payload
        # is raw bytes.
        with open(self._output_name(), 'wb') as fh:
            fh.write(res)

# Start one worker per URL while keeping at most about 500 threads alive.
# ``enumerate`` here is threading.enumerate (brought in by the star import),
# which lists the threads that are currently alive -- not the builtin.
started = []
for url in URLs:  # the list defined above is named URLs, not urls
    while len(enumerate()) > 500:
        sleep(0.25)
    started.append(worker(url))

# Wait for every download thread to finish.
for thread in started:
    thread.join()

从时间上导入睡眠
#如果Python2：
导入URL库
#如果Python3：
#导入urllib.request
URL=[
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
类工作者（线程）：
定义初始化（自我，链接）：
线程。\uuuu初始化\uuuuu（自）
self.link=link
self.start（）
def运行（自）：
#如果Python2：
res=urllib.urlopen（url.read（）#如@DhruvPathak所述
#如果Python3：
#res=urllib.request.urlopen（url.read（））
打开（url，'rb'）作为fh：
write（res）#将获取的数据存储在名为
对于url中的url：
而len（enumerate（））>500：
睡眠（0.25）
工作者（url）
而len（enumerate（））>1：
睡眠（0.25）#等待所有线程完成

从线程导入*
从时间上导入睡眠
#如果Python2：
导入URL库
#如果Python3：
#导入urllib.request
URL=[
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
类工作者（线程）：
定义初始化（自我，链接）：
线程。\uuuu初始化\uuuuu（自）
self.link=link
self.start（）
def运行（自）：
#如果Python2：
res=urllib.urlopen（url.read（）#如@DhruvPathak所述
#如果Python3：
#res=urllib.request.urlopen（url.read（））
打开（url，'rb'）作为fh：
write（res）#将获取的数据存储在名为
对于url中的url：
而len（enumerate（））>500：
睡眠（0.25）
工作者（url）
而len（enumerate（））>1：
睡眠（0.25）#等待所有线程完成

从线程导入*
从时间上导入睡眠
#如果Python2：
导入URL库
#如果Python3：
#导入urllib.request
URL=[
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
类工作者（线程）：
定义初始化（自我，链接）：
线程。\uuuu初始化\uuuuu（自）
self.link=link
self.start（）
def运行（自）：
#如果Python2：
res=urllib.urlopen（url.read（）#如@DhruvPathak所述
#如果Python3：
#res=urllib.request.urlopen（url.read（））
打开（url，'rb'）作为fh：
write（res）#将获取的数据存储在名为
对于url中的url：
而len（enumerate（））>500：
睡眠（0.25）
工作者（url）
而len（enumerate（））>1：
睡眠（0.25）#等待所有线程完成

从线程导入*
从时间上导入睡眠
#如果Python2：
导入URL库
#如果Python3：
#导入urllib.request
URL=[
'http://www.cnn.com/',
'http://www.bbc.co.uk/',
'http://www.economist.com/',
'http://nonexistant.website.at.baddomain/',
'http://slashdot.org/',
'http://reddit.com/',
'http://news.ycombinator.com/'
]
类工作者（线程）：
定义初始化（自我，链接）：
线程。\uuuu初始化\uuuuu（自）
self.link=link
self.start（）
def运行（自）：
#如果Python2：
res=urllib.urlopen（url.read（）#如@DhruvPathak所述
#如果Python3：
#res=urllib.request.urlopen（url.read（））
打开（url，'rb'）作为fh：
write（res）#将获取的数据存储在名为
对于url中的url：
而len（enumerate（））>500：
睡眠（0.25）
工作者（url）
而len（enumerate（））>1：
睡眠（0.25）#等待所有线程完成

urllib.urlretrieve(url).read()

应该是

urllib.urlopen(url).read()

urllib.urlretrieve（url）.read（）

应该是

urllib.urlopen（url）.read（）

应该是

urllib.urlopen（url）.read（）

应该是

urllib.urlopen（url）.read（）

应该是

urllib.urlopen（url）.read（）

我不知道您使用的 stream.ThreadPool 是什么，也不知道它的 API 是怎样的……但问题很明显：

res = urllib.urlretrieve(url).read()

如果您查看 urllib.urlretrieve 的文档，就会看到：

返回一个元组（filename，headers），其中filename是可以在其中找到对象的本地文件名

显然，你不能对元组调用 read。如果要使用此旧版 API 下载到本地文件，然后再读取该文件，可以这样做：

但是为什么要这样做呢？只需使用 urllib2.urlopen，它“返回一个带有两个附加方法的类似文件的对象”，因此您只需对其调用 read，就不会创建临时文件，也不必使用一个多年无人维护、设计得不太合理的旧函数。

不过，Python 的标准库中内置了一个很好用的 ThreadPoolExecutor（位于 concurrent.futures 模块）。如果你看文档给出的第一个示例，它正是你想要做的事情。

不幸的是，您使用的是 Python 2.x，它没有 concurrent.futures 模块；后面使用 import futures 的示例用的是它的向后移植包。
# urlretrieve stores the download in a local file and returns its name along
# with the headers.  Read the file back in binary mode so the downloaded
# bytes are not altered by text-mode handling.
filename, headers = urllib.urlretrieve(url)
with open(filename, 'rb') as f:
    res = f.read()

import futures
import urllib2

URLs = [
  'http://www.cnn.com/',
  'http://www.bbc.co.uk/',
  'http://www.economist.com/',
  'http://nonexistant.website.at.baddomain/',
  'http://slashdot.org/',
  'http://reddit.com/',
  'http://news.ycombinator.com/'
 ]

def load_url(url):
    """Return the body of *url* as read from ``urllib2.urlopen``.

    The response is closed before returning, so the connection is not left
    open until the object happens to be garbage-collected.  Any exception
    raised by ``urllib2.urlopen`` or ``read`` propagates to the caller.
    """
    # Imported here so the function does not depend on a new file-level import.
    from contextlib import closing

    with closing(urllib2.urlopen(url)) as response:
        return response.read()

if __name__ == '__main__':
    with futures.ThreadPoolExecutor(max_workers=7) as executor:
        # Map each submitted future back to the URL it is downloading.
        fmap = {}
        for link in URLs:
            fmap[executor.submit(load_url, link)] = link
        # Report each download as soon as its future completes.
        for done in futures.as_completed(fmap):
            link = fmap[done]
            try:
                body = done.result()
            except Exception as err:
                print('%r failed: %s' % (link, err))
            else:
                print('%r is %d bytes' % (link, len(body)))

#! /usr/bin/env python

# -*- coding: utf-8 -*-


import sys
import urllib
from multiprocessing import Pool

import os

POOL = 8
PDFS_DOWNLOAD_DIR = 'pdfs'
PDF_LINKS = sys.argv[1]


class DownloadFiles(object):
    """Download every link listed in the ``PDF_LINKS`` file.

    Files are written into the ``PDFS_DOWNLOAD_DIR`` directory, using
    ``POOL`` concurrent workers.
    """

    def __init__(self):
        """Read the link list, then make sure the download directory exists."""
        self.pdf_links = self.read_links_from_file()
        self.create_download_dir()

    def create_download_dir(self):
        """Create ``PDFS_DOWNLOAD_DIR`` if needed.

        Exits the program with an error message when the directory cannot be
        created.
        """
        if os.path.isdir(PDFS_DOWNLOAD_DIR):
            return
        try:
            os.makedirs(PDFS_DOWNLOAD_DIR)
        except OSError as e:
            # os.makedirs reports failures as OSError.  Only give up if the
            # directory really is missing (it may have been created by
            # someone else in the meantime).
            if not os.path.isdir(PDFS_DOWNLOAD_DIR):
                sys.exit('Cannot create directory "{path}": {error}'.format(
                    path=PDFS_DOWNLOAD_DIR,
                    error=e
                ))

    def read_links_from_file(self):
        """Return the distinct, non-empty lines of the ``PDF_LINKS`` file.

        Lines are stripped of surrounding whitespace; duplicates are dropped
        while the order of first appearance is kept.  Exits the program with
        an error message when the file cannot be read.
        """
        try:
            with open(PDF_LINKS, 'r') as f:
                stripped = [line.strip() for line in f]
        except IOError as e:
            sys.exit('Cannot read link file "{path}": {error}'.format(
                path=PDF_LINKS,
                error=e
            ))

        links = []
        seen = set()
        for link in stripped:
            # Skip blank lines: they are not links and would break get_file.
            if link and link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def get_file(self, link):
        """Download *link* into ``PDFS_DOWNLOAD_DIR``.

        The local file name is the second-to-last ``/``-separated component
        of *link*.  Returns True on success and False when the link has no
        such component or the download fails, so that one bad link does not
        abort the remaining downloads.
        """
        parts = link.split('/')
        if len(parts) < 2:
            print('Skipping malformed link --> "{link}"'.format(link=link))
            return False
        filename = parts[-2]

        print('Downloading file --> "{filename}"'.format(
            filename=filename
        ))

        try:
            urllib.urlretrieve(
                link,
                filename=os.path.join(PDFS_DOWNLOAD_DIR, filename)
            )
        except IOError as e:
            print('Failed to download "{link}": {error}'.format(
                link=link,
                error=e
            ))
            return False
        return True

    def download(self):
        """Download all links concurrently and print a summary."""
        # A thread pool is used instead of a process pool: the work is
        # I/O-bound, and a process pool has to pickle the bound method
        # self.get_file, which Python 2 cannot do.
        from multiprocessing.pool import ThreadPool

        pool = ThreadPool(POOL)
        try:
            results = pool.map(self.get_file, self.pdf_links)
        finally:
            pool.close()
            pool.join()

        failed = results.count(False)
        if failed:
            print('\n{failed} of {total} downloads failed.\n'.format(
                failed=failed,
                total=len(results)
            ))
        else:
            print('\nSuccessfully downloaded files from given source!\n')


# Run the download only when executed as a script.  Without this guard,
# importing the module (which multiprocessing does in child processes on
# platforms that spawn rather than fork) would start the download again.
if __name__ == '__main__':
    d = DownloadFiles()
    d.download()