Scrapy scraped results not being written to CSV


I am scraping the following site:

hoping to collect all of the data for each person. That means following each district's link, then each job-title category within the district, and finally each employee. I suspect the problem may be my URL regexes, but I'm not sure. On each employee's page, I think I've identified the XPath correctly:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider2(CrawlSpider):
    #name of the spider
    name = 'stltoday'

    #list of allowed domains
    allowed_domains = ['graphics.stltoday.com']

    #starting url for scraping
    start_urls = ['https://graphics.stltoday.com/apps/payrolls/salaries/teachers']

    rules = [
        Rule(LinkExtractor(
            allow=['/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/[0-9]+/position/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
    ]

    #setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT' : "csv",
        'FEED_URI' : 'tmp/stltoday1.csv'
    }

    def parse_item(self, response):
        #Remove XML namespaces
        response.selector.remove_namespaces()
        url = response.url
        #Extract article information

        fullname = response.xpath('//p[@class="table__title"]./text()').extract_first()

        for row in response.xpath('//th[@scope="row"]'):
            yield {
                "url": url,
                "fullname": fullname,
                "district": row.xpath('./text()').extract_first(),
                "school": row.xpath('./following-sibling::*[1]/text()').extract_first(),
                "degree": row.xpath('./following-sibling::*[2]/text()').extract_first(),
                "salary": row.xpath('./following-sibling::*[3]/text()').extract_first(),
                "extcontractpay": row.xpath('./following-sibling::*[4]/text()').extract_first(),
                "extraduty": row.xpath('./following-sibling::*[5]/text()').extract_first(),
                "totalpay": row.xpath('./following-sibling::*[6]/text()').extract_first(),
                "yearsindistrict": row.xpath('./following-sibling::*[7]/text()').extract_first(),
                "yearsinmoschools": row.xpath('./following-sibling::*[8]/text()').extract_first(),
            }

        for item in zip(url, fullname, district, school, degree, salary, extcontractpay, extraduty, totalpay, yearsindistrict, yearsinmoschools):
            yield {
                'url': url,
                'fullname': fullname,
                'district': district,
                'school': school,
                'degree': degree,
                'salary': salary,
                'extcontractpay': extcontractpay,
                'extraduty': extraduty,
                'totalpay': totalpay,
                'yearsindistrict': yearsindistrict,
                'yearsinmoschools': yearsinmoschools
            }

The spider runs (for a few minutes before I pause it), but nothing is ever written to the .csv file.

So I went down a rabbit hole and rebuilt the spider as a basic Spider rather than a CrawlSpider. I never did figure out why the LinkExtractor rule set wasn't hitting the callback.
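One plausible explanation (an assumption on my part; the thread never pins it down): in a CrawlSpider, each extracted link is claimed by the first rule whose pattern matches it, and later rules never see that link. Because the first rule's pattern '/[0-9]+/$' also matches the tail of a detail URL such as '.../detail/12345/', that rule, which has no callback, swallows the detail links before the 'parse_item' rule can claim them. A minimal sketch of a reordering that avoids this, assuming the URL layout implied by the question's regexes:

    rules = [
        #most specific pattern first, so detail pages reach the callback
        Rule(LinkExtractor(
            allow=[r'/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
        Rule(LinkExtractor(
            allow=[r'/teachers/[0-9]+/position/[0-9]+/$']),
            follow=True),
        #anchored on /teachers/ so it cannot also match /detail/... URLs
        Rule(LinkExtractor(
            allow=[r'/teachers/[0-9]+/$']),
            follow=True),
    ]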

Anyway, I created a csv_exporter function to manage the output better, added it and its args to the settings, and voilà.

The spider runs through the site with the same logic as the "crawl" spider, though targeted at the URLs rather than a broad crawl: from 'parse' > 'parse_district' > 'parse_position' > and finally to 'parse_person', where the items you wish to scrape live.

Itemized the... items, lol.

Created a 'csv_exporter' module that you can call to adjust how the file is output, including setting the delimiter and the order of the fields to export.

Included the exporter in the settings.py file, where you can pass args to 'csv_exporter' to set the delimiter you wish to use and the export order of the fields (items).


Can we assume you're running the crawl with the '-t csv' flag? — Yes, I've tried running the crawl both with and without '-t csv'.

Do you see the items being generated in the terminal/cmd? — Just recreated your project; I see some XPath issues and will post an answer here shortly.

OK, fixed some of the XPath issues, but that isn't why the items weren't being logged. I should add that you may have to clean items containing "," by removing it, since the CSV export will see the comma and treat it as a delimiter. Or simply put a "\t" on each item and set the delimiter to tab... or really whatever you choose.
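A minimal sketch of those two options (the "clean" helper is hypothetical, not part of the answer's code below):

#in settings.py: switch the custom exporter to tab-delimited output
#(MyProjectCsvItemExporter below reads CSV_DELIMITER from the settings)
CSV_DELIMITER = '\t'

#or strip commas from scraped values before they reach the exporter
def clean(value):
    return value.replace(",", "") if value is not None else value

#e.g. inside parse_person:
#item["salary"] = clean(row.xpath('//th[contains(., "Salary")]/following-sibling::td/text()').extract_first())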
#stlSpider.py
import scrapy
from stltoday.items import StltodayItem

class StlspiderSpider(scrapy.Spider):
    name = 'stlSpider'
    allowed_domains = ['graphics.stltoday.com']
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/teachers/']

    def parse(self, response):
        #follow the link to each district's page
        for href in response.xpath("//th/a/@href").re(".*/teachers/[0-9]+/"):
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_district)

    def parse_district(self, response):
        #follow the link to each position category within the district
        for href in response.xpath("//th/a/@href").re(".*position.*"):
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_position)

    def parse_position(self, response):
        #follow the link to each individual employee
        for href in response.xpath("//td/a/@href").extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_person)

    def parse_person(self, response):
        #scrape the employee's detail page into a StltodayItem
        item = StltodayItem()
        name = response.xpath('//p[@class="table__title"]/text()').extract_first()
        row = response.xpath('//th[@scope="row"]')
        item["url"] = response.url
        item["fullname"] = name
        item["district"] = row.xpath('//th[contains(., "District")]/following-sibling::td/text()').extract_first()
        item["school"] = row.xpath('//th[contains(., "School")]/following-sibling::td/text()').extract_first()
        item["degree"] = row.xpath('//th[contains(., "Degree")]/following-sibling::td/text()').extract_first()
        item["salary"] = row.xpath('//th[contains(., "Salary")]/following-sibling::td/text()').extract_first()
        item["extcontractpay"] = row.xpath('//th[contains(., "Extended")]/following-sibling::td/text()').extract_first()
        item["extraduty"] = row.xpath('//th[contains(., "Extra")]/following-sibling::td/text()').extract_first()
        item["totalpay"] = row.xpath('//th[contains(., "Total")]/following-sibling::td/text()').extract_first()
        item["yearsindistrict"] = row.xpath('//th[contains(., "Years in district")]/following-sibling::td/text()').extract_first()
        item["yearsinmoschools"] = row.xpath('//th[contains(., "Years in MO")]/following-sibling::td/text()').extract_first()
        yield item
#items.py
import scrapy


class StltodayItem(scrapy.Item):
    url = scrapy.Field()
    fullname = scrapy.Field()
    district = scrapy.Field()
    school = scrapy.Field()
    degree = scrapy.Field()
    salary = scrapy.Field()
    extcontractpay = scrapy.Field()
    extraduty = scrapy.Field()
    totalpay = scrapy.Field()
    yearsindistrict = scrapy.Field()
    yearsinmoschools = scrapy.Field()
#csv_exporter.py
__author__ = 'Erick'
from scrapy.conf import settings
from scrapy.contrib.exporter import CsvItemExporter

class MyProjectCsvItemExporter(CsvItemExporter):

    def __init__(self, *args, **kwargs):
        delimiter = settings.get('CSV_DELIMITER', ',')
        kwargs['delimiter'] = delimiter

        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export

        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
#settings.py
BOT_NAME = 'stltoday'

SPIDER_MODULES = ['stltoday.spiders']
NEWSPIDER_MODULE = 'stltoday.spiders'
FEED_FORMAT = 'csv'
FEED_URI = 'tmp/stltoday1.csv'
FIELDS_TO_EXPORT = ["url", "fullname", "district", "school", "degree", "salary", "extcontractpay", "extraduty", "totalpay", "yearsindistrict", "yearsinmoschools"]
FEED_EXPORTERS = {
    'csv': 'stltoday.csv_exporter.MyProjectCsvItemExporter',
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stltoday (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
...
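Note for readers on current Scrapy releases: scrapy.conf and scrapy.contrib.exporter were later removed; the exporter now lives at scrapy.exporters.CsvItemExporter, and the column order can be set without any custom exporter via the built-in FEED_EXPORT_FIELDS setting. A sketch of the equivalent settings.py, assuming nothing else changes:

#settings.py on newer Scrapy, no custom exporter needed
FEED_FORMAT = 'csv'
FEED_URI = 'tmp/stltoday1.csv'
FEED_EXPORT_FIELDS = ["url", "fullname", "district", "school", "degree", "salary", "extcontractpay", "extraduty", "totalpay", "yearsindistrict", "yearsinmoschools"]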