Python:在 Scrapy 中抓取/使用 Cookie
我正试图抓取这个网站上的数据: 但数据存储在此 url: 每次访问时,令牌都会更改。我不知道如何从第二个 url 中提取数据。Python:在 Scrapy 中抓取/使用 Cookie,python,python-3.x,web-scraping,scrapy,Python,Python 3.x,Web Scraping,Scrapy,我正试图抓取这个网站上的数据: 但数据存储在此 url: 每次访问时,令牌都会更改。我不知道如何从第二个 url 中提取数据。 如果我没有理解错的话,希望这能帮到你。请试试下面的代码: import scrapy import json from ..items import JsonscrapyItem class PatelcoSpider(scrapy.Spider): name = 'patelco' allowed_domains = ['patelco.org'] sta
如果我没有理解错的话,希望这能帮到你。请试试下面的代码:
import scrapy
import json
from ..items import JsonscrapyItem
class PatelcoSpider(scrapy.Spider):
    """Request the scrapingclub ``detail_cookie`` exercise page and print two XPath results.

    The request is sent with browser-like XHR headers and a hard-coded
    ``token`` cookie. ``parse`` prints the ``href`` of the first link and the
    text of the second paragraph found at fixed absolute XPaths.
    """

    name = 'patelco'
    # Must match the host actually requested below; with the previous value
    # ('patelco.org') Scrapy's offsite filtering can drop the request.
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_cookie/']

    def start_requests(self):
        """Yield the single start request with custom headers and the token in ``meta``."""
        # NOTE(review): the token below is hard-coded in both the cookie header
        # and ``params``; the site reportedly issues a new token on every
        # visit, so this value is likely stale -- confirm before relying on it.
        headers = {
            'authority': 'scrapingclub.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://scrapingclub.com/exercise/detail_cookie/',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': '__cfduid=dadbc0660498959aca2ad988813bede171600011839; _ga=GA1.2.1127524216.1600011846; _gid=GA1.2.685369613.1600011846; _gat_gtag_UA_39890589_8=1; token=14Z022XFYS',
        }
        params = (
            ('token', '14Z022XFYS'),
        )
        # ``params`` travels in ``meta`` only; ``parse`` does not read it back.
        yield scrapy.Request(
            url=self.start_urls[0],
            meta={'data': params},
            headers=headers,
            callback=self.parse,
        )

    def parse(self, response):
        """Print the link href(s) and paragraph text found at the fixed XPaths."""
        # Absolute XPaths: any change in the page layout yields empty lists.
        hreflink = response.xpath('/html/body/div/div/div[2]/div/div[2]/p[1]/a/@href').extract()
        data = response.xpath('/html/body/div/div/div[2]/div/div[2]/p[2]/text()').extract()
        print('\n', hreflink, '\n')
        print('\n', data, '\n')
谢谢你的回答,这对我很有帮助,但我仍然很困惑:
import scrapy
import json
import re
class GoldSpider(scrapy.Spider):
    """Scrape the JSON behind the scrapingclub ``detail_cookie`` exercise.

    Flow: load the HTML page, read the per-visit ``token`` from its
    ``Set-Cookie`` response headers, then request the AJAX endpoint with that
    token both as a query parameter and as a cookie, and decode the JSON reply.
    """

    name = 'gold'
    # Without a start URL ``parse`` is never invoked; this is the page that
    # hands out the token (same URL as the ``referer`` header below).
    start_urls = ['https://scrapingclub.com/exercise/detail_cookie/']

    # Matches a Set-Cookie header whose cookie name is exactly ``token``;
    # the value ends at the first ';' or at end of string.
    _token_re = re.compile(r'token=([^;]+)')

    def parse(self, response):
        """Extract the token from the page response and request the JSON endpoint."""
        token = self._extract_token(response)
        if token is None:
            self.logger.error('No token cookie found in response from %s', response.url)
            return
        header = {
            'authority': 'scrapingclub.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://scrapingclub.com/exercise/detail_cookie/',
            'accept-language': 'en-US,en;q=0.9',
        }
        # Only the freshly issued token is sent explicitly. The previous
        # placeholder values for ``_ga``/``_gid`` and the hard-coded
        # ``__cfduid`` were dropped -- presumably the server does not need
        # them; TODO confirm against the live site.
        cookie = {'token': token}
        yield scrapy.Request(
            url='https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=' + str(token),
            cookies=cookie,
            headers=header,
            callback=self.parse_json,
        )

    def _extract_token(self, response):
        """Return the ``token`` cookie value from ``response``, or ``None`` if absent."""
        # ``getlist`` returns every Set-Cookie header; ``get`` would return a
        # single value and could miss the token when several cookies are set.
        for raw in response.headers.getlist('Set-Cookie'):
            found = self._token_re.match(raw.decode('utf-8'))
            if found:
                return found.group(1)
        return None

    def parse_json(self, response):
        """Decode the AJAX response body as JSON, print it, and emit it as an item."""
        data = json.loads(response.text)
        print(data)
        # Scrapy items must be dict-like; other JSON shapes are only printed.
        if isinstance(data, dict):
            yield data
The data is stored in https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=(the token)
import scrapy
import json
import re
class GoldSpider(scrapy.Spider):
    """Scrape the JSON behind the scrapingclub ``detail_cookie`` exercise.

    Flow: load the HTML page, read the per-visit ``token`` from its
    ``Set-Cookie`` response headers, then request the AJAX endpoint with that
    token both as a query parameter and as a cookie, and decode the JSON reply.
    """

    name = 'gold'
    # Without a start URL ``parse`` is never invoked; this is the page that
    # hands out the token (same URL as the ``referer`` header below).
    start_urls = ['https://scrapingclub.com/exercise/detail_cookie/']

    # Matches a Set-Cookie header whose cookie name is exactly ``token``;
    # the value ends at the first ';' or at end of string.
    _token_re = re.compile(r'token=([^;]+)')

    def parse(self, response):
        """Extract the token from the page response and request the JSON endpoint."""
        token = self._extract_token(response)
        if token is None:
            self.logger.error('No token cookie found in response from %s', response.url)
            return
        header = {
            'authority': 'scrapingclub.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://scrapingclub.com/exercise/detail_cookie/',
            'accept-language': 'en-US,en;q=0.9',
        }
        # Only the freshly issued token is sent explicitly. The previous
        # placeholder values for ``_ga``/``_gid`` and the hard-coded
        # ``__cfduid`` were dropped -- presumably the server does not need
        # them; TODO confirm against the live site.
        cookie = {'token': token}
        yield scrapy.Request(
            url='https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=' + str(token),
            cookies=cookie,
            headers=header,
            callback=self.parse_json,
        )

    def _extract_token(self, response):
        """Return the ``token`` cookie value from ``response``, or ``None`` if absent."""
        # ``getlist`` returns every Set-Cookie header; ``get`` would return a
        # single value and could miss the token when several cookies are set.
        for raw in response.headers.getlist('Set-Cookie'):
            found = self._token_re.match(raw.decode('utf-8'))
            if found:
                return found.group(1)
        return None

    def parse_json(self, response):
        """Decode the AJAX response body as JSON, print it, and emit it as an item."""
        data = json.loads(response.text)
        print(data)
        # Scrapy items must be dict-like; other JSON shapes are only printed.
        if isinstance(data, dict):
            yield data
The data is stored in https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=(the token)