Python: How to collect JPEGs with Scrapy


I want to collect idol photos with Scrapy.

The page to collect from is https://news.mynavi.jp/article/20191229-947707/.

I wrote a spider

(save_gradol.py)

I also wrote a pipeline

(pipelines.py)

I also wrote the settings

(settings.py)

Then I tried to crawl with [sudo scrapy crawl save_gradol], but it does not crawl and does not collect any photos.

Please help me solve this problem.
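
(A minimal sketch, for reference, of what such a spider and settings could look like using Scrapy's built-in ImagesPipeline. Only the spider name save_gradol and the article URL come from the question and the answers below; the img selector and the storage path are assumptions, and ImagesPipeline also requires Pillow to be installed.)

# save_gradol.py - hypothetical minimal spider; not the asker's original file
import scrapy

class SaveGradolSpider(scrapy.Spider):
    name = 'save_gradol'
    start_urls = ['https://news.mynavi.jp/article/20191229-947707/']

    def parse(self, response):
        # Scrapy's ImagesPipeline downloads every URL listed in 'image_urls'
        yield {
            'image_urls': [response.urljoin(src)
                           for src in response.css('img::attr(src)').getall()]
        }

# settings.py - the two settings the built-in ImagesPipeline needs
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'  # downloaded files are saved under this directory

With that in place, running scrapy crawl save_gradol from the project directory (sudo is not needed) should save the JPEGs under IMAGES_STORE.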

You can do it in the simplest way:

import requests
from tqdm import tqdm

number_of_photos = 26

for i in tqdm(range(1, number_of_photos + 1)):
    # The article's images follow a predictable numbered URL pattern.
    image_url = 'https://news.mynavi.jp/article/20191229-947707/images/{:03}l.jpg'.format(i)
    try:
        response = requests.get(image_url)
    except requests.RequestException:
        continue  # skip images that fail to download
    if response.status_code == 200:
        with open('{:02}.jpg'.format(i), 'wb') as f:
            f.write(response.content)

Note that this hard-codes the article's image URL pattern ({:03}l.jpg), so it only works as long as that pattern holds. Enjoy.


A simple solution using simplified_scrapy:

import os, io, sys, re
from simplified_scrapy.core.utils import md5
from simplified_scrapy.spider import Spider, SimplifiedDoc

class ImageSpider(Spider):
  name = 'mynavi.jp'
  allowed_domains = ['news.mynavi.jp/']
  start_urls = ['https://news.mynavi.jp/article/20191229-947707/']
  # refresh_urls = True # For debugging. If refresh_urls = True, start_urls will be crawled again.

  def __init__(self):
    Spider.__init__(self, self.name)  # necessary
    if not os.path.exists('images/'):
      os.mkdir('images/')

  def afterResponse(self, response, url, error=None, extra=None):
    try:
      if sys.version_info.major == 2: maintype = response.headers.maintype
      else: maintype = response.info().get('Content-Type')
      # Save the response body if the Content-Type says it is an image.
      if response.code == 200 and maintype and maintype.find('image') >= 0:
        name = 'images/' + md5(url) + '.jpg'
        file = io.open(name, 'wb')
        file.write(response.read())
        file.close()
        return None
      else:  # If it's not an image, hand it back to the framework
        return Spider.afterResponse(self, response, url, error)
    except Exception as err:
      print(err)

  def extract(self, url, html, models, modelNames):
    doc = SimplifiedDoc(html)
    # Pull image URLs out of embedded JSON ("url":"...") and queue them.
    urls = re.compile(u'"url":"[^"]*"').findall(doc.html)
    if urls:
      # Strip the '"url":"' prefix and the trailing quote from each match.
      urls = [{'url': doc.absoluteUrl(url['url'], u[len('"url":"'):-1])} for u in urls]
      self.saveUrl(urls)

    # Also queue every link on the page so the crawl can continue.
    urls = doc.listA(url=url['url'])
    if urls:
      self.saveUrl(urls)

    return True

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(ImageSpider())
You can find simplified_scrapy examples here [1].
[1]: https://github.com/yiyedata/simplified-scrapy-demo


Thank you for the solution. I will try to understand your code as soon as I can; I want to write code as good as yours. Thanks.

I'm glad I could help you. You can get simplified_scrapy examples here: https://github.com/yiyedata/simplified-scrapy-demo