
Python: Images downloaded from scrapy are smaller than expected (JPEGs) or unreadable (TIFFs)


I'm not sure how best to phrase this question. I'm new to both Python and Scrapy.

Essentially, the files I download with my scrapy script don't match the files I download manually. All of the files (even the smallest JPEG images) come out smaller in size. When I open the images in Photoshop, the format of the "tif" files isn't recognized; the JPEGs open fine. In addition, the files I download manually come down as grayscale, while the ones my scrapy script pulls are RGB.

As far as I can tell, the documentation is basically all there is on handling images with scrapy, but it does mention that it uses the Pillow library for processing.

My thinking is that, by default, it does something to resize and/or limit the size of the downloaded images, but I don't know what that might be or how to disable it. I want to download the images as-is, that is, with as little (read: no) processing as possible.

In case it helps, the relevant files are below. For brevity I've left out some of my spider's code; the omitted parts only deal with scraping metadata such as titles and reference numbers.

items.py

import scrapy

class FsaImageData(scrapy.Item):
    title = scrapy.Field()
    digital_id = scrapy.Field()
    source_url = scrapy.Field()
    project = scrapy.Field()
    call_nums = scrapy.Field()
    next_url = scrapy.Field()
    image_sizes = scrapy.Field()
    image_names = scrapy.Field()

    # fields also used to download.
    image_urls = scrapy.Field()
    image = scrapy.Field()
pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class GetFsaImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # request each image URL, carrying its intended filename in the request meta
        for i, url in enumerate(item['image_urls']):
            image_name = item['image_names'][i]
            yield scrapy.Request(url, meta={'image_name': image_name})

    def file_path(self, request, response=None, info=None):
        # save under the original filename rather than the default SHA1-hash path
        return request.meta['image_name']
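Note that this override only changes where and under what name the files are saved; the downloaded bytes themselves still go through ImagesPipeline's default processing, which turns out to be the root of the problem (see the answer at the end of this post).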
settings.py

BOT_NAME = 'LOC_FSA_1935'

SPIDER_MODULES = ['LOC_FSA_1935.spiders']
NEWSPIDER_MODULE = 'LOC_FSA_1935.spiders'

# Files Pipeline:
ITEM_PIPELINES = {'LOC_FSA_1935.pipelines.GetFsaImagesPipeline':1}
IMAGES_STORE = '/Volumes/FSA_IMAGES/1935/'

# Probably just for testing for now:
IMAGES_EXPIRES = 0

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# AUTOTHROTTLE (BE NICE!!)
AUTOTHROTTLE_ENABLED = True
spider.py

import scrapy
from LOC_FSA_1935.items import FsaImageData
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin


class FSA_1935_Spider(scrapy.Spider):

    name = "fsa1935"
    start_urls = [ 'http://www.loc.gov/pictures/' ]

    custom_settings = {
            'FEED_FORMAT':'csv',
            # TODO: include below in FINAL version of spider!
             #'LOG_FILE':'.fsa1935.log',
             #'LOG_STDOUT':'True',
            }


    def parse(self, response):
        # navigate to search results page 1
        results = BeautifulSoup(response.text, 'lxml').find('div',
              class_='results_item ')
        return scrapy.Request(url=urljoin(response.url, results.a['href']),
                callback=self.parseFirst )


    def parseFirst(self, response):
        # navigate to the first image returned by the FSA search
        detail = BeautifulSoup(response.text, 'lxml').find('a',
                text='View Larger').parent
        return scrapy.Request(url=urljoin(response.url, detail.a['href']),
                callback=self.parsePage )


    def parsePage(self, response):
        # pull metadata and image_urls for each page entry in the search,
        # pass url to next entry in search to next iteration
        data = FsaImageData()
        ex_msg = ('EXCEPTION: Unable to gather {} for {}.'
                     '\n\tException Type: {}:{}')

        soup = BeautifulSoup(response.text, "lxml")
        # get digital_id, project, & source_url
        description = soup.find('div', {'id':'description'} )
        if description is not None:
            # get image_urls, _sizes, and _names:
            img_urls = []
            img_sizes = []
            img_names = []
            for img in description.find_all(
                    'a', text=re.compile(r'JPEG|TIFF \([0-9.a-zA-Z]*\)')):
                img_urls.append(urljoin( response.url, img['href']))
                img_sizes.append(img.get_text())
                img_names.append(img['href'].split('/')[-1])

            data['image_urls'] = img_urls
            data['image_sizes'] = img_sizes
            data['image_names'] = img_names
        else:
            print( 'WARNING: Item description does not exist!' )
        # scrape image_data:
        yield data
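
For reference (not from the original post), a project laid out like this would normally be run from the project root with scrapy's command-line tool, for example scrapy crawl fsa1935 -o fsa1935_metadata.csv; the output filename here is purely illustrative.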

Looks like I solved my own problem! I dug through the source code of ImagesPipeline and found that, by default, scrapy calls a method named convert_image inside get_images. convert_image is the problem, because it converts the file type and color space of anything that isn't a JPEG or BMP image.

I rewrote get_images to handle the TIFF and JPEG formats I'm interested in:

# needed at the top of pipelines.py for this override:
from io import BytesIO

from PIL import Image

def get_images(self, response, request, info):
    path = self.file_path(request, response=response, info=info)
    image = Image.open(BytesIO(response.body))
    buf = BytesIO()

    # save TIFFs as TIFF (keeping their tags) and everything else as JPEG,
    # instead of letting convert_image re-encode and convert to RGB
    ext = response.url.split('.')[-1]
    if ext == 'tif':
        exif = image.tag_v2
        image.save(buf, 'TIFF', tiffinfo=exif)
    else:
        image.save(buf, 'JPEG')

    yield path, image, buf
Hope this helps someone down the road!
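
As an editorial aside, not part of the original answer: if the goal is to store the downloaded bytes completely untouched, a pipeline built on scrapy's FilesPipeline (rather than ImagesPipeline) avoids Pillow altogether. A minimal sketch, reusing the same image_urls / image_names item fields; the class name RawFsaFilesPipeline is hypothetical:

import scrapy
from scrapy.pipelines.files import FilesPipeline

class RawFsaFilesPipeline(FilesPipeline):

    def get_media_requests(self, item, info):
        # request each file, carrying its intended filename in the request meta
        for i, url in enumerate(item['image_urls']):
            yield scrapy.Request(url, meta={'image_name': item['image_names'][i]})

    def file_path(self, request, response=None, info=None):
        # save under the original filename; the response body is written as-is
        return request.meta['image_name']

With this approach the storage directory is configured through FILES_STORE instead of IMAGES_STORE.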