Python: images downloaded with Scrapy are smaller than expected (JPEGs) or unreadable (TIFFs)
I'm not sure how best to phrase this question. I'm new to both Python and Scrapy. Essentially, the files downloaded by my Scrapy script don't match the ones I download manually. Every file, even the smallest JPEG, comes out reduced in size, and Photoshop doesn't recognize the format of the ".tif" files (the JPEGs open fine). On top of that, the files I download manually are grayscale, while the ones my Scrapy script pulls are RGB. As far as I can tell, the documentation is pretty much all there is on handling images with Scrapy, but it does mention that the processing is done with the Pillow library. My guess is that, by default, it does something to resize the images and/or limit the size of the downloads, but I don't know what that might be or how to turn it off. I want to download the images as-is, with as little (read: no) processing as possible. In case it helps, the relevant files are below. For brevity I've omitted some of my spider's code; the omitted parts only scrape metadata such as titles and reference numbers.
items.py
import scrapy

class FsaImageData(scrapy.Item):
    title = scrapy.Field()
    digital_id = scrapy.Field()
    source_url = scrapy.Field()
    project = scrapy.Field()
    call_nums = scrapy.Field()
    next_url = scrapy.Field()
    image_sizes = scrapy.Field()
    image_names = scrapy.Field()
    # fields also used to download.
    image_urls = scrapy.Field()
    image = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class GetFsaImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for i, url in enumerate(item['image_urls']):
            image_name = item['image_names'][i]
            yield scrapy.Request(url, meta={'image_name': image_name})

    def file_path(self, request, response=None, info=None):
        return request.meta['image_name']
settings.py
BOT_NAME = 'LOC_FSA_1935'
SPIDER_MODULES = ['LOC_FSA_1935.spiders']
NEWSPIDER_MODULE = 'LOC_FSA_1935.spiders'
# Images pipeline:
ITEM_PIPELINES = {'LOC_FSA_1935.pipelines.GetFsaImagesPipeline': 1}
IMAGES_STORE = '/Volumes/FSA_IMAGES/1935/'
# Probably just for testing for now:
IMAGES_EXPIRES = 0
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# AUTOTHROTTLE (BE NICE!!)
AUTOTHROTTLE_ENABLED = True
spider.py
import scrapy
from LOC_FSA_1935.items import FsaImageData
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin

class FSA_1935_Spider(scrapy.Spider):
    name = "fsa1935"
    start_urls = ['http://www.loc.gov/pictures/']
    custom_settings = {
        'FEED_FORMAT': 'csv',
        # TODO: include below in FINAL version of spider!
        #'LOG_FILE': '.fsa1935.log',
        #'LOG_STDOUT': 'True',
    }

    def parse(self, response):
        # navigate to search results page 1
        results = BeautifulSoup(response.text, 'lxml').find(
            'div', class_='results_item ')
        return scrapy.Request(url=urljoin(response.url, results.a['href']),
                              callback=self.parseFirst)

    def parseFirst(self, response):
        # navigate to the first image returned by the FSA search
        detail = BeautifulSoup(response.text, 'lxml').find(
            'a', text='View Larger').parent
        return scrapy.Request(url=urljoin(response.url, detail.a['href']),
                              callback=self.parsePage)

    def parsePage(self, response):
        # pull metadata and image_urls for each page entry in the search,
        # pass url to next entry in search to next iteration
        data = FsaImageData()
        ex_msg = ('EXCEPTION: Unable to gather {} for {}.'
                  '\n\tException Type: {}:{}')
        soup = BeautifulSoup(response.text, "lxml")
        # get digital_id, project, & source_url
        description = soup.find('div', {'id': 'description'})
        if description is not None:
            # get image_urls, _sizes, and _names:
            img_urls = []
            img_sizes = []
            img_names = []
            for img in description.find_all(
                    'a', text=re.compile(r'JPEG|TIFF \([0-9.a-zA-Z]*\)')):
                img_urls.append(urljoin(response.url, img['href']))
                img_sizes.append(img.get_text())
                img_names.append(img['href'].split('/')[-1])
            data['image_urls'] = img_urls
            data['image_sizes'] = img_sizes
            data['image_names'] = img_names
        else:
            print('WARNING: Item description does not exist!')
        # scrape image_data:
        yield data
It looks like I solved my own problem! I dug through the source code for ImagesPipeline and found that, by default, Scrapy calls a method named convert_image from inside get_images. convert_image was the problem: it converts the file type and color space of anything that isn't already a JPEG or BMP, which is why the JPEGs were re-compressed, the grayscale originals came back as RGB, and the ".tif" files were actually JPEG data that Photoshop couldn't parse. I rewrote get_images to handle the TIFF and JPEG formats I'm interested in (this sits in GetFsaImagesPipeline in pipelines.py and needs from io import BytesIO and from PIL import Image at the top of the file):
def get_images(self, response, request, info):
    # Bypass the default convert_image() call and re-save the
    # downloaded bytes in their original format instead.
    path = self.file_path(request, response=response, info=info)
    image = Image.open(BytesIO(response.body))
    buf = BytesIO()
    ext = response.url.split('.')[-1]
    if ext == 'tif':
        # Carry the original TIFF tags (resolution, compression,
        # color interpretation, ...) over to the re-saved file.
        exif = image.tag_v2
        image.save(buf, 'TIFF', tiffinfo=exif)
    else:
        image.save(buf, 'JPEG')
    yield path, image, buf
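For reference, the default convert_image that this override sidesteps looks roughly like the sketch below. This is a paraphrase from memory of the Scrapy source, not the verbatim code (check your installed version), but it shows why all three symptoms appear: every non-RGB image is converted to RGB, and everything is re-encoded as JPEG regardless of its original format.

# Paraphrased sketch of ImagesPipeline.convert_image -- not the
# verbatim Scrapy source; check the code of your installed version.
def convert_image(self, image, size=None):
    if image.mode != 'RGB':
        # grayscale, palette, RGBA, ... all get converted here
        image = image.convert('RGB')
    if size:
        # only taken when generating thumbnails (IMAGES_THUMBS)
        image = image.copy()
        image.thumbnail(size, Image.ANTIALIAS)
    buf = BytesIO()
    image.save(buf, 'JPEG')  # re-encoded as JPEG no matter what came in
    return image, buf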
Hopefully this helps someone else down the line!
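A note for anyone who wants zero processing: another option is to build on Scrapy's plain FilesPipeline instead of ImagesPipeline, since it writes the raw response body to disk without ever touching Pillow. A minimal sketch follows (the class name is made up; set FILES_STORE in settings.py in place of IMAGES_STORE and point ITEM_PIPELINES at this class):

import scrapy
from scrapy.pipelines.files import FilesPipeline

class FsaRawFilesPipeline(FilesPipeline):
    # Hypothetical variant of GetFsaImagesPipeline built on FilesPipeline:
    # the downloaded bytes are stored untouched -- no re-encoding, no
    # RGB conversion, no size filtering.

    def get_media_requests(self, item, info):
        for i, url in enumerate(item['image_urls']):
            # carry the desired filename along with each request
            yield scrapy.Request(url,
                                 meta={'image_name': item['image_names'][i]})

    def file_path(self, request, response=None, info=None):
        return request.meta['image_name']

The trade-off is losing the image-specific features (thumbnail generation, minimum-size filtering), which is exactly the point here.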