Python 如何通过Scrapy收集jpeg
我想用 Scrapy 收集偶像照片，收集的网页是一个新闻图集页面。我写了蜘蛛（save_gradol.py）、管道（pipelines.py）和设置（settings.py），然后尝试运行 [sudo scrapy crawl save_gradol]，但它既不爬行，也不收集照片。
请帮我解决这个问题。

回答：你可以用最简单的方法完成：
import requests
from tqdm import tqdm

# The article hosts photos named 001l.jpg .. 026l.jpg; download each one
# and save it locally as 01.jpg .. 26.jpg.
NUMBER_OF_PHOTOS = 26
URL_TEMPLATE = 'https://news.mynavi.jp/article/20191229-947707/images/{:03}l.jpg'

for i in tqdm(range(1, NUMBER_OF_PHOTOS + 1)):
    image_url = URL_TEMPLATE.format(i)
    try:
        # Timeout so one stalled request cannot hang the whole loop.
        response = requests.get(image_url, timeout=10)
    except requests.RequestException:
        # Only swallow network-level errors (the original bare `except`
        # also ate KeyboardInterrupt/SystemExit); skip and keep going.
        continue
    if response.status_code == 200:
        with open('{:02}.jpg'.format(i), 'wb') as f:
            f.write(response.content)
享受。您可以用最简单的方法完成:
import requests
from tqdm import tqdm

# Fetch the 26 article photos (001l.jpg .. 026l.jpg) and write them to
# 01.jpg .. 26.jpg in the current directory.
number_of_photos = 26

for i in tqdm(range(1, number_of_photos + 1)):
    image_url = ('https://news.mynavi.jp/article/20191229-947707/'
                 'images/{:03}l.jpg').format(i)
    try:
        # A timeout keeps a single dead connection from blocking the run.
        response = requests.get(image_url, timeout=10)
    except requests.RequestException:
        # Catch only requests' own errors instead of a bare `except`,
        # which would also hide KeyboardInterrupt; skip this image.
        continue
    if response.status_code == 200:
        with open('{:02}.jpg'.format(i), 'wb') as f:
            f.write(response.content)
享受。简单的解决方案
import os,io,sys,re
from simplified_scrapy.core.utils import md5
from simplified_scrapy.spider import Spider, SimplifiedDoc
class ImageSpider(Spider):
    """Crawl the mynavi.jp article and save every image it links to.

    Images are written to ``images/<md5(url)>.jpg``; non-image responses
    are handed back to the simplified_scrapy framework for link extraction.
    """

    name = 'mynavi.jp'
    allowed_domains = ['news.mynavi.jp/']
    start_urls = ['https://news.mynavi.jp/article/20191229-947707/']
    # refresh_urls = True  # For debugging: re-crawl start_urls on every run.

    def __init__(self):
        Spider.__init__(self, self.name)  # required by simplified_scrapy
        # Make sure the output directory exists before any image arrives.
        if not os.path.exists('images/'):
            os.mkdir('images/')

    def afterResponse(self, response, url, error=None, extra=None):
        """Save image responses to disk; defer everything else to the framework."""
        try:
            if sys.version_info.major == 2:
                maintype = response.headers.maintype
            else:
                maintype = response.info().get('Content-Type')
            if response.code == 200 and maintype and 'image' in maintype:
                # Name files by URL hash so re-crawls overwrite, not duplicate.
                # `with` closes the handle even if write() raises (the
                # original leaked the handle and shadowed builtin `file`).
                with io.open('images/' + md5(url) + '.jpg', 'wb') as img:
                    img.write(response.read())
                return None  # handled here; nothing for the framework to parse
            # Not an image: let the framework process the page normally.
            return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        """Queue every embedded image URL and every <a href> found on the page."""
        doc = SimplifiedDoc(html)
        # Image URLs are embedded as "url":"..." JSON fragments in the page.
        matches = re.compile(u'"url":"[^"]*"').findall(doc.html)
        if matches:
            # Strip the literal '"url":"' prefix (7 chars) and the trailing
            # quote. The original sliced by the regex pattern's length (13),
            # which cut 6 characters off every URL.
            prefix_len = len('"url":"')
            image_urls = [{'url': doc.absoluteUrl(url['url'], m[prefix_len:-1])}
                          for m in matches]
            self.saveUrl(image_urls)
        links = doc.listA(url=url['url'])
        if links:
            self.saveUrl(links)
        return True
# Launch the crawl: startThread drives the spider's start_urls through
# afterResponse/extract until the URL queue is exhausted.
from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(ImageSpider())
你可以在这里找到simplified_scrapy的例子[1]
[1]: https://github.com/yiyedata/simplified-scrapy-demo

使用 simplified_scrapy 的解决方案：
import os,io,sys,re
from simplified_scrapy.core.utils import md5
from simplified_scrapy.spider import Spider, SimplifiedDoc
class ImageSpider(Spider):
    """Spider for the mynavi.jp photo article: download every linked image.

    Image bodies are stored as ``images/<md5(url)>.jpg``; any non-image
    response is returned to the simplified_scrapy framework unchanged.
    """

    name = 'mynavi.jp'
    allowed_domains = ['news.mynavi.jp/']
    start_urls = ['https://news.mynavi.jp/article/20191229-947707/']
    # refresh_urls = True  # Debug aid: forces start_urls to be crawled again.

    def __init__(self):
        Spider.__init__(self, self.name)  # base-class init is mandatory
        # Create the download directory up front.
        if not os.path.exists('images/'):
            os.mkdir('images/')

    def afterResponse(self, response, url, error=None, extra=None):
        """Persist image responses; pass other responses to the framework."""
        try:
            if sys.version_info.major == 2:
                maintype = response.headers.maintype
            else:
                maintype = response.info().get('Content-Type')
            if response.code == 200 and maintype and 'image' in maintype:
                # md5(url) gives a stable, collision-resistant file name.
                # Context manager guarantees the handle is closed (original
                # code shadowed builtin `file` and could leak it on error).
                with io.open('images/' + md5(url) + '.jpg', 'wb') as out:
                    out.write(response.read())
                return None  # image fully handled here
            # Non-image content: let the framework parse it for links.
            return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        """Collect image URLs from JSON fragments plus all anchor links."""
        doc = SimplifiedDoc(html)
        # The page embeds image locations as "url":"..." JSON snippets.
        found = re.compile(u'"url":"[^"]*"').findall(doc.html)
        if found:
            # Drop the '"url":"' prefix (7 chars) and the closing quote.
            # Bug fix: the original used the regex pattern's length (13)
            # as the offset, truncating 6 characters from each URL.
            offset = len('"url":"')
            self.saveUrl([{'url': doc.absoluteUrl(url['url'], m[offset:-1])}
                          for m in found])
        anchors = doc.listA(url=url['url'])
        if anchors:
            self.saveUrl(anchors)
        return True
# Kick off the crawl loop with this spider instance; runs until the
# framework's URL queue is empty.
from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(ImageSpider())
你可以在这里找到simplified_scrapy的例子[1]
[1]: https://github.com/yiyedata/simplified-scrapy-demo

提问者：感谢您的解决方案。我会尽快尝试理解你的代码，我想写出像你一样好的代码。谢谢！
回答者：我很高兴能帮助你。你可以在这里获得 simplified_scrapy 的示例：
import requests
from tqdm import tqdm

# Download the article's 26 photos (001l.jpg .. 026l.jpg) into the
# working directory as 01.jpg .. 26.jpg.
PHOTO_COUNT = 26
BASE = 'https://news.mynavi.jp/article/20191229-947707/images/'

for i in tqdm(range(1, PHOTO_COUNT + 1)):
    image_url = BASE + '{:03}l.jpg'.format(i)
    try:
        # Bounded wait: a hung server must not stall the whole download.
        response = requests.get(image_url, timeout=10)
    except requests.RequestException:
        # Narrow handler (the original bare `except: pass` also hid
        # KeyboardInterrupt/SystemExit); move on to the next photo.
        continue
    if response.status_code == 200:
        with open('{:02}.jpg'.format(i), 'wb') as f:
            f.write(response.content)
import os,io,sys,re
from simplified_scrapy.core.utils import md5
from simplified_scrapy.spider import Spider, SimplifiedDoc
class ImageSpider(Spider):
    """simplified_scrapy spider that saves all images from the mynavi article.

    Each image response is written to ``images/<md5(url)>.jpg``; everything
    else is delegated back to the framework's default handling.
    """

    name = 'mynavi.jp'
    allowed_domains = ['news.mynavi.jp/']
    start_urls = ['https://news.mynavi.jp/article/20191229-947707/']
    # refresh_urls = True  # For debugging only: re-crawl start_urls each run.

    def __init__(self):
        Spider.__init__(self, self.name)  # framework requires this call
        # Guarantee the target directory exists before downloads begin.
        if not os.path.exists('images/'):
            os.mkdir('images/')

    def afterResponse(self, response, url, error=None, extra=None):
        """Write image payloads to disk; return other responses to the framework."""
        try:
            if sys.version_info.major == 2:
                maintype = response.headers.maintype
            else:
                maintype = response.info().get('Content-Type')
            if response.code == 200 and maintype and 'image' in maintype:
                # Hash-based names make repeated crawls idempotent. Using
                # `with` ensures the file closes even on a failed write
                # (original shadowed builtin `file` and never used `with`).
                with io.open('images/' + md5(url) + '.jpg', 'wb') as img:
                    img.write(response.read())
                return None  # consumed: no further framework processing
            # Leave non-image responses to the framework.
            return Spider.afterResponse(self, response, url, error)
        except Exception as err:
            print(err)

    def extract(self, url, html, models, modelNames):
        """Enqueue image URLs from JSON fragments and every page anchor."""
        doc = SimplifiedDoc(html)
        # Image addresses appear as "url":"..." fragments in the page source.
        hits = re.compile(u'"url":"[^"]*"').findall(doc.html)
        if hits:
            # Remove the '"url":"' prefix (7 chars) and trailing quote.
            # Fixes the original's slice by the regex pattern length (13),
            # which lopped 6 characters off every extracted URL.
            cut = len('"url":"')
            self.saveUrl([{'url': doc.absoluteUrl(url['url'], h[cut:-1])}
                          for h in hits])
        page_links = doc.listA(url=url['url'])
        if page_links:
            self.saveUrl(page_links)
        return True
# Entry point: hand the spider to the framework's crawl driver, which
# fetches start_urls and feeds responses through afterResponse/extract.
from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(ImageSpider())