Python Scrapy FormRequest参数不工作,但显示所有结果
我正在使用Scrapy FormRequest刮取此网页。我的代码如下。参数Python Scrapy FormRequest参数不工作,但显示所有结果,python,web-scraping,scrapy,Python,Web Scraping,Scrapy,我正在使用Scrapy FormRequest刮取此网页。我的代码如下。参数\u pp\u hinamewithab和\u pp\u pinamewithpua在response.text中应仅返回1个结果,而在HTML代码中返回所有结果。参数显然不起作用,但我看不出有任何问题 def start_requests(self): params = { 'keyword': '', 'source': 'sharepoint', 'type'
参数 `_pp_hiname`(值为 `ab`)和 `_pp_piname`(值为 `pua`)应使 `response.text` 中仅返回 1 个结果,但返回的 HTML 代码中却包含所有结果。参数显然没有生效,但我看不出有任何问题。
def start_requests(self):
    """Send one POST to the grid endpoint with every search field in the form body.

    All filter fields (including ``_pp_hiname`` and ``_pp_piname``) travel in
    the POST body together with the grid ``name``; nothing is placed in the
    URL query string. The response is handed to ``parse_item``.
    """
    form_fields = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp',
    }
    # The header marks the request as an XMLHttpRequest (AJAX-style) call.
    grid_request = scrapy.FormRequest(
        'https://researchgrant.gov.sg/eservices/mvcgrid',
        callback=self.parse_item,
        method='POST',
        formdata=form_fields,
        headers={'X-Requested-With': 'XMLHttpRequest'},
    )
    yield grid_request
def parse_item(self, response):
    """Print the raw text of the grid response to stdout."""
    body_text = response.text
    print(body_text)
应仅为1个条目:
但很明显,它显示了所有条目:
最新更新:
class ToScrapeCSSSpiderSG(scrapy.Spider):
    """Walk the awarded-projects grid page by page and visit every project link.

    The grid endpoint is queried with a POST whose body carries only the grid
    ``name``; the search filters are sent in the URL query string.
    """

    name = "toscrapesg-css"

    # Endpoint that returns the grid markup.
    search_url = 'https://researchgrant.gov.sg/eservices/mvcgrid'

    # Fields sent in the POST body.
    params = {
        'name': 'advancesearchawardedprojectsp',
    }

    # Fields sent in the URL query string; ``page`` is the first page requested.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
        'name': 'advancesearchawardedprojectsp',
    }

    def _grid_request(self, page):
        """Build the POST request for grid page number *page*.

        The page number is also stored in ``meta`` so ``parse_item`` can work
        out the next page without mutating the shared class-level ``args``.
        """
        query = urllib.parse.urlencode(dict(self.args, page=page))
        return scrapy.FormRequest(
            self.search_url + '?' + query,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def start_requests(self):
        """Request the first grid page (the ``page`` value configured in ``args``)."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Yield one request per project link, then the request for the next page.

        Rows without a link in their first cell are skipped. The next page is
        requested only when the response contains a "Next page" anchor with an
        ``onclick`` attribute.
        """
        for container in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
            for row in container.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
                link = row.xpath('td[1]/a/@href').extract_first()
                if not link:
                    # A request cannot be built from a missing href.
                    continue
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        if response.xpath('//a[@aria-label="Next page"]/@onclick').get():
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Print the text nodes of the project-title span on a project page."""
        text = response.xpath('//span[contains(@id,"ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_lblProjectTitle")]/text()').extract()
        print(text)
控制台消息:
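它只在 POST 正文中发送 `Name=advancesearchawardedprojectsp`,其他参数应作为查询字符串放在 URL 中。
所以 URL 应该是 `https://researchgrant.gov.sg/eservices/mvcgrid?keyword=&source=sharepoint&type=project&status=open&page=1&_pp_projectstatus=&_pp_hiname=ab&_pp_piname=pua&_pp_source=&_pp_details=`
您可以使用 `urllib.parse.urlencode(args)` 来生成这个查询字符串。
这样做之后,我只得到了一个结果。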
import urllib.parse
def start_requests(self):
    """Request the grid with the filters in the URL and only ``name`` in the POST body.

    The search filters are URL-encoded into the query string, while the POST
    body carries just the grid name. The response is handed to ``parse_item``.
    """
    body_fields = {
        'name': 'advancesearchawardedprojectsp',
    }
    query_fields = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': '1',
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': 'pua',
        '_pp_source': '',
        '_pp_details': '',
    }
    query_string = urllib.parse.urlencode(query_fields)
    target = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + query_string
    grid_request = scrapy.FormRequest(
        target,
        callback=self.parse_item,
        method='POST',
        formdata=body_fields,
        headers={'X-Requested-With': 'XMLHttpRequest'},
    )
    yield grid_request
编辑:加载下一页的示例——通过检查 `Next page` 按钮来决定何时停止。
编辑:现在它可以把结果保存到 CSV 文件中。
import scrapy
import urllib.parse
class MySpider(scrapy.Spider):
    """Crawl the awarded-projects grid and yield one item per project page.

    The grid endpoint is queried with a POST whose body carries only the grid
    ``name``; the search filters are sent in the URL query string. Each row
    link is followed to a project page, from which ``parse_product`` builds a
    dict item.
    """

    name = 'myspider'

    # Endpoint that returns the grid markup.
    search_url = 'https://researchgrant.gov.sg/eservices/mvcgrid'

    # Fields sent in the POST body.
    params = {
        'name': 'advancesearchawardedprojectsp',
    }

    # Fields sent in the URL query string; ``page`` is the first page requested.
    args = {
        'keyword': '',
        'source': 'sharepoint',
        'type': 'project',
        'status': 'open',
        'page': 1,
        '_pp_projectstatus': '',
        '_pp_hiname': 'ab',
        '_pp_piname': '',
        '_pp_source': '',
        '_pp_details': '',
    }

    # Common prefix of the span ids read on a project page.
    _span_id_prefix = 'ctl00_ctl47_g_b43c0a74_fae0_498f_b75e_c103772db011_ctl00_'

    # (item key, span id suffix) pairs; the order fixes the item's key order.
    _product_fields = (
        ('id', 'lblProjIdExt'),
        ('title', 'lblProjectTitle'),
        ('pi', 'lblLeadPIName'),
        ('hi', 'lblHostInstName'),
        ('date', 'dtPickerStartDate'),
    )

    def _grid_request(self, page):
        """Build the POST request for grid page number *page*.

        The page number is also stored in ``meta`` so ``parse_item`` can work
        out the next page without mutating the shared class-level ``args``.
        """
        query = urllib.parse.urlencode(dict(self.args, page=page))
        return scrapy.FormRequest(
            self.search_url + '?' + query,
            callback=self.parse_item,
            method='POST',
            formdata=self.params,
            headers={'X-Requested-With': 'XMLHttpRequest'},
            meta={'page': page},
        )

    def start_requests(self):
        """Request the first grid page (the ``page`` value configured in ``args``)."""
        yield self._grid_request(self.args['page'])

    def parse_item(self, response):
        """Yield one request per project link, then the request for the next page.

        Rows without a link are skipped. The next page is requested only when
        the response contains a "Next page" anchor with an ``onclick``
        attribute.
        """
        for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
            link = row.xpath('.//a/@href').get()
            if not link:
                # A request cannot be built from a missing href.
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

        if response.xpath('//a[@aria-label="Next page"]/@onclick').get():
            current_page = response.meta.get('page', self.args['page'])
            yield self._grid_request(current_page + 1)

    def parse_product(self, response):
        """Yield a dict with the id, title, pi, hi and date of a project page.

        Each value is the first text node of the matching span, or ``None``
        when the span is absent.
        """
        item = {}
        for key, suffix in self._product_fields:
            xpath = '//span[@id="' + self._span_id_prefix + suffix + '"]/text()'
            item[key] = response.xpath(xpath).get()
        yield item
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess

# Crawler settings: browser-like user agent plus a feed export to a file.
crawler_settings = {
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',      # feed format: csv, json or xml
    'FEED_URI': 'output.csv',  # destination file of the feed
}

c = CrawlerProcess(crawler_settings)
c.crawl(MySpider)
c.start()
通常情况下,页面上的表单会重新加载整个页面——这就是您得到的结果。只有当页面使用 JavaScript 时,才能只重新加载页面的一部分,而 Scrapy 无法运行 JavaScript。在浏览器中,我看到它在 POST 正文中只发送 `Name=advancesearchawardedprojectsp`,其余的都是作为 URL 中的参数发送的——而您把所有参数都放在了 POST 正文中——这可能会造成差异;这段对话已经结束了。谢谢你一直以来的帮助!终于有点把整件事搞对了!