Python newbie: how do I scrape multiple web pages starting from a single URL?
First, I am trying to scrape the fund codes, e.g. MGB_ and JAS_, from: “” Then, extract each fund's price data from: “+”MGB_” “+”JAS_” My code raises:
raise NotImplementedError
but I still don't know how to fix it:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem

class PruSpider(BaseSpider):
    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        funds_U = [x for x in funds_U if x != (u"#" and u"MMFU_U")]
        items = []
        for fund_U in funds_U:
            url = "http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U
            item = FundPriceItem()
            item['fund'] = fund_U
            item['data'] = hxs.select('//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
            items.append(item)
        return items
You should use Scrapy's `Request` for each fund in the loop:
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from fundPrice.items import FundPriceItem

class PruSpider(BaseSpider):
    name = "prufunds"
    allowed_domains = ["prudential.com.hk"]
    start_urls = ["http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=MMFU_U"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        funds_U = hxs.select('//table//table//table//table//select[@class="fundDropdown"]//option//@value').extract()
        funds_U = [x for x in funds_U if x != (u"#" and u"MMFU_U")]
        # schedule one request per fund; the fund code travels along in meta
        for fund_U in funds_U:
            yield Request(
                url="http://www.prudential.com.hk/PruServlet?module=fund&purpose=searchHistFund&fundCd=" + fund_U,
                callback=self.parse_fund,
                meta={'fund': fund_U})

    def parse_fund(self, response):
        hxs = HtmlXPathSelector(response)
        item = FundPriceItem()
        item['fund'] = response.meta['fund']
        item['data'] = hxs.select(
            '//table//table//table//table//td[@class="fundPriceCell1" or @class="fundPriceCell2"]//text()').extract()
        return item
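One aside, independent of the `Request` change: the filter `x != (u"#" and u"MMFU_U")` in both snippets likely does not do what it looks like. `and` between two non-empty strings evaluates to the second string, so `u"#"` is never filtered out. A minimal sketch of the behaviour and a fix:

```python
# `and` between two truthy strings returns the second operand, so the
# original condition only ever compares x against "MMFU_U".
values = ["#", "MMFU_U", "MGB_U", "JAS_U"]

buggy = [x for x in values if x != ("#" and "MMFU_U")]
print(buggy)   # ['#', 'MGB_U', 'JAS_U'] -- "#" was not filtered out

# A membership test excludes both placeholder values:
fixed = [x for x in values if x not in ("#", "MMFU_U")]
print(fixed)   # ['MGB_U', 'JAS_U']
```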
Hope that helps.
From `item['fund'] = response.meta['fund']`, the fund string will be e.g. MGB_, JAS_. How can I remove that suffix and keep it as MGB, JAS, etc.?

Something like `response.meta['fund'].split('_')[0]`. If it helped, please consider accepting the answer.
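The suffix-stripping suggestion can be sketched as follows (the sample codes here are illustrative, assuming they take the form `MGB_U` like the `MMFU_U` code in the start URL):

```python
# Assumed sample fund codes; str.split("_") keeps everything before
# the first underscore.
codes = ["MGB_U", "JAS_U", "MMFU_U"]
stripped = [c.split("_")[0] for c in codes]
print(stripped)  # ['MGB', 'JAS', 'MMFU']
```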