Python: how to combine scraped output


I'm new to this, and this is the most complex spider I've written so far:

import scrapy
from scrapy.selector import HtmlXPathSelector


class CocabotSpider(scrapy.Spider):
    name = 'cocabot'
    start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
    custom_settings = {
        'FEED_URI' : 'output/cocaoutput.json'
    }

    def parse(self, response):
        # follow links to concert pages
        for href in response.css("div.search-img a::attr(href)"):
            yield response.follow(href, self.parse_concert)

        # follow links to venue pages
        for href in response.css("span.venue-event a::attr(href)"):
            yield response.follow(href, self.parse_venue)

        # follow links to pagination pages
        for href in response.css("li a.next.page-numbers::attr(href)"):
            yield response.follow(href, self.parse)

    def parse_concert(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first()

        yield {
            'headliner' : extract_with_css("h1.p-ttl::text"),
            'venue' : extract_with_css("div.locatn div.a-block-ct div b::text"),
            'venue_address' : extract_with_css("div.locatn div.a-block-ct div p::text"),
            'venue_coca_url' : extract_with_css("span.venue-event a::attr(href)"),
            'event_url' : HtmlXPathSelector(response).select(
                "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href")\
                .extract_first(),
            'event_coca_url' : response.request.url,
            'date_time' : extract_with_css("ul.ind-time li::text"),
            'price' : extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
        }

    def parse_venue(self, response):
        yield {
            'venue_website' : HtmlXPathSelector(response).select(
                    "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href")\
                    .extract_first(),
        }
This gets all the data I want, but the problem is that the venue website data ends up in its own dictionary. Example output:

{"date_time": "Jun 18, 2018 at 08:00 am - 05:00 pm  (Mon)", "event_url": "http://www.music.fsu.edu/Quicklinks/Summer-Music-Camps/EXPLORE-OUR-14-CAMPS/Jazz-Ensemble-Camp-for-Middle-School", "venue_coca_url": null, "venue_address": "122 N. Copeland St., Tallahassee, FL 32304", "price": "Registration for camp is now open. You can register online or by mailing in a registration form. Day Camper Price: $311.00 (Includes tuition only. No housing or meals.) Night Camper Price: $501.00 \u2013 (Includes tuition and housing with three meals per day). A $100.00 non-refundable deposit is due at registration. Balance of camp fees are due by June 4.", "venue": "FSU College of Music", "headliner": "Jazz Ensemble Camp for Middle School", "event_coca_url": "https://www.tallahasseearts.org/event/jazz-ensemble-camp-for-middle-school-3/"},
{"venue_website": "http://www.makinglightproductions.org/"},
{"venue_website": "http://www.mfbooks.us/"},
{"venue_website": null},

How do I get the venue website data into my main parse_concert dictionary? I have tried using a follow statement inside the parse_concert function, and having parse_venue return instead of yield, but I just haven't been able to combine them correctly.

There are two ways to produce items that need data from multiple pages in Scrapy:

  • Request chaining
    Since you need more than one request to produce a single item, you have to chain the requests so that they happen in order and carry your data along (a newer-Scrapy variant using cb_kwargs is sketched right after this example):

    from scrapy import Request

    def parse_concert(self, response):
        concert = {'name': 'red hot chilly hotdogs'}
        venue_url = 'http://someplace.com'
        # carry the partial item along in meta; the explicit callback matters,
        # otherwise Scrapy would route the response to the default parse method
        yield Request(venue_url, meta={'item': concert}, callback=self.parse_venue)

    def parse_venue(self, response):
        item = response.meta['item']
        item['venue_name'] = 'someplace'
        yield item
        # {'name': 'red hot chilly hotdogs', 'venue_name': 'someplace'}
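
    In newer Scrapy (1.7+), cb_kwargs is the preferred way to pass data between callbacks instead of meta; a minimal sketch of the same chaining using the same placeholder names as above:

    from scrapy import Request

    def parse_concert(self, response):
        concert = {'name': 'red hot chilly hotdogs'}
        # cb_kwargs entries arrive as keyword arguments in the callback
        yield Request('http://someplace.com', callback=self.parse_venue,
                      cb_kwargs={'item': concert})

    def parse_venue(self, response, item):
        item['venue_name'] = 'someplace'
        yield item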
    
  • Post-processing merge
    The other solution is to yield both types of items asynchronously and combine them afterwards via a shared id (see the sketch after the merge script for what that id could be here):

    def parse_concert(self, response):
        concert = {'name': 'red hot chilly hotdogs', 'id': 1}
        yield concert
        yield Request(venue_url, callback=self.parse_venue)

    def parse_venue(self, response):
        item = {'venue_name': 'someplace', 'id': 1}
        yield item
    
    Then combine them with a separate post-processing script:

    import json

    with open('output.json') as f:
        data = json.load(f)

    combined = {}
    for item in data:
        if item['id'] in combined:
            # merge items that share an id into one dict
            combined[item['id']].update(item)
        else:
            combined[item['id']] = item

    with open('output_combined.json', 'w') as f:
        # dict_values is not JSON serializable in Python 3, so convert to a list
        json.dump(list(combined.values()), f)
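
    For this particular spider, the shared id could plausibly be the venue page URL: the concert page links to it, and the venue callback sees it as the requested URL. A rough sketch under the assumption that the link matches the requested URL exactly (no redirects), with placeholder values elsewhere:

    import scrapy

    def parse_concert(self, response):
        venue_url = response.css("span.venue-event a::attr(href)").extract_first()
        # the venue URL doubles as the shared id for the later merge
        yield {'headliner': response.css("h1.p-ttl::text").extract_first(),
               'id': venue_url}
        if venue_url:
            yield scrapy.Request(venue_url, callback=self.parse_venue)

    def parse_venue(self, response):
        # the same URL shows up here as the URL that was requested
        yield {'venue_website': '...', 'id': response.request.url}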
    

Here is what I ended up with, with the help of the answer above:

    import scrapy
    from scrapy.selector import HtmlXPathSelector
    
    
    class CocabotSpider(scrapy.Spider):
        name = 'cocabot'
        start_urls = ['https://www.tallahasseearts.org/event/?keyword&start_date&end_date&date_format=m-d-Y&term=400&event_location&save_lst_list&view']
        custom_settings = {
            'FEED_URI' : 'output/cocaoutput.json'
        }
    
        def parse(self, response):
            # follow links to concert pages
            for href in response.css("div.search-img a::attr(href)"):
                yield response.follow(href, self.parse_concert)
    
            # follow links to pagination pages
            for href in response.css("li a.next.page-numbers::attr(href)"):
                yield response.follow(href, self.parse)
    
        def parse_concert(self, response):
            def extract_with_css(query):
                return response.css(query).extract_first()
    
            concert = {
                'headliner' : extract_with_css("h1.p-ttl::text"),
                'venue' : extract_with_css("div.locatn div.a-block-ct div b::text"),
                'venue_address' : extract_with_css("div.locatn div.a-block-ct div p::text"),
                'venue_coca_url' : extract_with_css("span.venue-event a::attr(href)"),
                'event_url' : HtmlXPathSelector(response).select(
                    "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href")\
                    .extract_first(),
                'event_coca_url' : response.request.url,
                'date_time' : extract_with_css("ul.ind-time li::text"),
                'price' : extract_with_css("div.a-block-ct div.apl-internal-content p::text"),
            }
    
            venue_coca_url = concert['venue_coca_url']
            if venue_coca_url:
                yield scrapy.Request(venue_coca_url, meta={'item': concert}, callback=self.parse_venue)
            else:
                yield concert
    
        def parse_venue(self, response):
            item = response.meta['item']
            item['venue_website'] = HtmlXPathSelector(response).select(
                        "//div[@class='art-social-item']/a[contains(text(), 'Website')]/@href")\
                        .extract_first()
            yield item
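
    Note that HtmlXPathSelector is a long-deprecated alias; in current Scrapy the same lookup can be written directly on the response. A minimal sketch of the equivalent call:

    # modern equivalent of HtmlXPathSelector(response).select(...).extract_first()
    event_url = response.xpath(
        "//div[@class='a-block-ct']/p/a[contains(text(), 'Official Website')]/@href"
    ).extract_first()  # .get() is the newer spelling of .extract_first()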
    


Great answer! I did need to use callback=self.parse_venue in the scrapy.Request call, but I think I've got it working the way I want now. Thank you so much! Posting my modified spider in another answer, for what it's worth.