Python 如何从URL中提取特定字符串
标签:python, scrapy
我试图从下面提到的URL中提取某些字符串。示例URL:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
我想摘录:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
等等。实际上我正在编写一个spider,需要从上面提到的URL中提取productCategory和productSubCategory。所以我想在parse方法中从response.url里提取这些字段。有人能帮帮我吗?
我的代码:
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------
class ESpider(CrawlSpider):
    """Spider for the ladyblush.com category listings.

    ``parse`` handles a listing page: it builds one ``EscraperItem`` per
    product tile and requests the product's own page.  ``parsePage2`` handles
    that product page, completes the item carried in ``request.meta`` and
    returns it.
    """

    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    # Listing pages that are crawled; each one is requested with ?p=1 .. ?p=99.
    _CATEGORY_PAGES = (
        'buy-sarees-online.html',
        'buy-ladies-suits-online.html',
        'buy-women-fashion-accessories.html',
        'buy-nightwear-lingerie-online.html',
        'buy-women-dress-online-skirts-suits-kurtis-tops.html',
        'buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html',
        'buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html',
        'buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html',
    )

    # Explicit loops (not a comprehension): a comprehension nested in a class
    # body cannot see other class-level names on Python 3.
    # Order is page-major: every category for p=1, then every category for p=2, ...
    URLSList = []
    for n in range(1, 100):
        for _page in _CATEGORY_PAGES:
            URLSList.append('http://www.ladyblush.com/' + _page + '?p=' + str(n))

    start_urls = URLSList

    def parse(self, response):
        """Yield one product-page ``Request`` per product tile on a listing page.

        Each request carries its partially filled item in ``meta['item']`` and
        is handled by ``parsePage2``.  Tiles without a link are skipped, since
        there is no product page to request for them.
        """
        hxs = HtmlXPathSelector(response)
        priceBox = './/div[@class="salePrice"]//div[@class="price-box"]'

        for site in hxs.select('//div[@class="third thumbnailSpillLarge"]'):
            hrefs = site.select('./a/@href').extract()
            if not hrefs:
                # Previously this raised IndexError and aborted the whole page.
                continue

            item = EscraperItem()
            item['currency'] = 'INR'
            # Category fields are placeholders; nothing is derived from the URL here.
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [hrefs[0].replace(" ", "%20")]

            productMRP = site.select(priceBox + '//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select(priceBox + '//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                # Discounted product: [old price, special price].
                # NOTE(review): index 1 is kept from the original; presumably
                # index 0 is a whitespace-only text node - confirm against the page.
                price = [productMRP[1].strip(), productPrice[1].strip()]
            else:
                price = site.select(priceBox + '//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price

            request = Request(item['productURL'][0], callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        """Complete the item from ``response.meta['item']`` with product-page data.

        Sets ``availability``, ``hasVariants``, ``image_urls`` and
        ``productDesc``, extends ``productImage`` with the "more-views" gallery
        entries, and returns the item.
        """
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        # Any text directly inside the add-to-cart container marks the product
        # as unavailable (presumably an out-of-stock notice - confirm).
        item['availability'] = not hxs.select('//div[@class="addtocart-container"]/div/text()').extract()
        # A "required" label is taken as the sign of selectable variants.
        item['hasVariants'] = bool(hxs.select('//label[@class="required"]/text()').extract())

        # De-duplicated listing images, taken before the gallery entries are appended.
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = (
            item['productImage']
            + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract()
            + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        )
        return item
#------------------------------------------------------------------------------
您可以在parse方法中通过 response.url 获取url。然后,您可以解析它来获取url路径:
import os
# Sample input: the final path segment of a listing URL, query string included.
test = 'buy-women-fashion-accessories.html?p=1'

# splitext cuts at the last dot, so the query string ends up in the extension half.
parts = os.path.splitext(test)
# -> ('buy-women-fashion-accessories', '.html?p=1')

# Tokenise the stem on hyphens and discard the leading "buy" token.
stem = parts[0]
stem.split('-')[1:]
# -> ['women', 'fashion', 'accessories']
不过,这是一个相当脆弱的解决方案。您确定这些数据没有存储在您正在解析的页面HTML中的某个位置吗?那样就不必从url中提取了。
可以使用urllib2和BeautifulSoup进行解析,也可以使用scrapy。
我已在问题中添加了我的代码。我想在parse方法中提取上面提到的细节。
我已经检查过,据我所知,对这个站点来说这是最好的方式。