Scraping paper files from the ICML proceedings with Python and Scrapy
I want to use Scrapy to crawl the papers from the ICML proceedings. My code is:
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.item import Item, Field


class PapercrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    pdf = Field()
    sup = Field()
spider.py
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector

from PaperCrawler.items import PapercrawlerItem


class PaperCrawler(Spider):
    name = "PaperCrawler"
    allowed_domains = ["proceedings.mlr.press"]
    start_urls = ["http://proceedings.mlr.press/v97/", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[2]')
        titles = Selector(response).xpath('//*[@id="content"]/div/div[2]/p[1]')
        pdfs = Selector(response).xpath('//*[@id="content"]/div/div[2]/p[3]/a[2]')
        sups = Selector(response).xpath('//*[@id="content"]/div/div[2]/p[3]/a[3]')
        for title, pdf, sup in zip(titles, pdfs, sups):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = pdf.xpath('@href').extract()[0]
            item['sup'] = sup.xpath('@href').extract()[0]
            yield item
However, it only returns the contents of the first paper. I want to crawl all of the papers at that link. How can I fix it? The current output is:
[
{"title": "AReS and MaRS Adversarial and MMD-Minimizing Regression for SDEs", "pdf": "http://proceedings.mlr.press/v97/abbati19a/abbati19a.pdf", "sup": "http://proceedings.mlr.press/v97/abbati19a/abbati19a-supp.pdf"}
]
The problem is in div/div[2]. The crawler does not iterate because you pinned the selector to one specific div by index, so it matches only a single node. Instead, select the divs by class, div[@class="paper"] in this case, and the code works.
Here is the corrected code:
class PaperCrawler(Spider):
    name = "PaperCrawler"
    allowed_domains = ["proceedings.mlr.press"]
    start_urls = ["http://proceedings.mlr.press/v97/", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        titles = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]/p[1]')
        pdfs = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]/p[3]/a[2]')
        sups = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]/p[3]/a[3]')
        for title, pdf, sup in zip(titles, pdfs, sups):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = pdf.xpath('@href').extract()[0]
            item['sup'] = sup.xpath('@href').extract()[0]
            yield item
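To try it out, you can run the spider from the project root and use Scrapy's built-in feed export to write the items to a file (assuming the project and spider are both named PaperCrawler, as above):

scrapy crawl PaperCrawler -o papers.json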
Thanks, but I found that it only returns papers with a non-empty sup (papers that have a supplementary PDF). How can I return all of the entries on the page?
You can fix that by iterating over the papers and checking the length of sup:
class PaperCrawler(Spider):
    name = "PaperCrawler"
    allowed_domains = ["proceedings.mlr.press"]
    start_urls = ["http://proceedings.mlr.press/v97/", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        for paper in papers:
            item = PapercrawlerItem()
            item['title'] = paper.xpath('p[1]/text()').extract()[0]
            item['pdf'] = paper.xpath('p[3]/a[2]/@href').extract()[0]
            # The third link is optional: it is either a supplementary PDF
            # or a link to code on GitHub. Keep it only when it exists and
            # is not a GitHub URL; otherwise store an empty string.
            _sup_data = paper.xpath('p[3]/a[3]/@href').extract()
            item['sup'] = '' if len(_sup_data) == 0 else (_sup_data[0] if 'github' not in _sup_data[0] else '')
            yield item
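As an aside, if you are on Scrapy 1.8 or newer, the same parse method can be written more compactly with the .get()/.getall() selector shortcuts; a minimal sketch under that assumption:

    def parse(self, response):
        # response.xpath() replaces Selector(response).xpath(), and
        # .get(default='') replaces the manual length check on extract()
        for paper in response.xpath('//*[@id="content"]/div/div[@class="paper"]'):
            item = PapercrawlerItem()
            item['title'] = paper.xpath('p[1]/text()').get()
            item['pdf'] = paper.xpath('p[3]/a[2]/@href').get()
            sup = paper.xpath('p[3]/a[3]/@href').get(default='')
            item['sup'] = '' if 'github' in sup else sup
            yield item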