Python: Scrapy script will not upload to s3 when using runner = CrawlerRunner(); it only works with process = CrawlerProcess()


I have a strange issue: my script uploads data to s3 without any problem when it uses process = CrawlerProcess() (example below), but when I try to run the script containing the various classes (12 in total) with runner = CrawlerRunner() (shown further down), nothing is sent to s3.

In addition, sometimes when I run process = CrawlerProcess() it only delivers to 11 of the 12 folders, and it almost never delivers to the top-level (first) folder.

The script runs 12 classes (a sample section is below), and each running class exports to its own folder on s3. Any idea what could be causing this?
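For context, this is roughly how I understand the settings reach the runner; as far as I know, a bare CrawlerRunner() is built from default settings and does not read settings.py on its own, so the AWS keys have to reach it explicitly (an alternative to embedding them in FEED_URI as I do below). The key values here are placeholders, not my real config:

from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

# Load the project settings (assuming a settings.py exists), then add
# the S3 credentials explicitly; both values below are placeholders.
settings = get_project_settings()
settings.set('AWS_ACCESS_KEY_ID', '<access_key>')
settings.set('AWS_SECRET_ACCESS_KEY', '<secret_key>')

runner = CrawlerRunner(settings)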


import scrapy
import boto3
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class JobsSpider1(scrapy.Spider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=%22owner+operator%22&l=atlanta"]
#
    custom_settings = {
        'FEED_FORMAT': 'jsonlines',
        'URI SCHEME':'s3',
        'FEED_URI':'s3://<key_data>@<bucket_name>/folder1/folder2/data-file-indeed-%(time)s.json',
        'FEED_EXPORT_ENCODING':'UTF',
        'FEED_TEMPDIR':'/home/user/Desktop/projects/test_folder/',
        'FEED_STORE_EMPTY' : 'True'
        }
#
    def parse(self, response):
        #jobs = response.xpath("//div[contains(@class,'jobsearch-SerpJobCard unifiedRow row result clickcard')]").getall()
        jobs = response.xpath('//div[@class="jobsearch-SerpJobCard unifiedRow row result"]')
#
        for job in jobs:
            #post_id = job.xpath(".//*[@class='title']/div/@id").getall()
            title = job.xpath(".//*[@class='title']/a/@title").get(default='not-found')
            location = job.xpath(".//*[@class='sjcl']/span/text()").get(default='not-found')
            location2 = job.xpath(".//div[@class='location accessible-contrast-color-location']/text()").get(default='not-found')
            #company = job.xpath("//span[@class='company']/text()").getall()
            posting_link = job.xpath('*//@href').get(default='not-found')
            posting_url = "https://indeed.com" + posting_link
            company = job.xpath(".//span[@class='company']//a/text()").get(default='not-found')
            company2 = job.xpath(".//span[@class='company']/text()").get(default='not-found')
#
#
#
            yield Request(posting_url, callback=self.parse_page, meta={'posting_url':posting_url, 'title':title, 'company':company, 'company2':company2, 'location':location, 'location2':location2})
#
        relative_next_url = response.xpath('//link[@rel="next"]/@href').get()
        absolute_next_url = "https://indeed.com" + relative_next_url
#
        yield Request(absolute_next_url, callback=self.parse)
#
    def parse_page(self, response):
        posting_url = response.meta.get('posting_url')
        company2 = response.meta.get('company2')
        title = response.meta.get('title')
        location = response.meta.get('location')
        company = response.meta.get('company')
        location2 = response.meta.get('location2')
        post_id = response.meta.get('post_id')
#
#
#
        job_descriptions_2=response.xpath('//*[@id="jobDescriptionText"]/*').get(default='not-found')
        job_descriptions=response.xpath('//*[@class="jobsearch-jobDescriptionText"]/*').get(default='not-found')
        job_descriptions_3=response.xpath('//*[@class="jobsearch-JobComponent-description  icl-u-xs-mt--md  "]/*').get(default='not-found')
        #job_descriptions="".join(line for line in response.xpath('//*[@class="jobsearch-jobDescriptionText"]/text()').extract()).strip()
        posted_on_date= response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/text()').get(default='not-found')
#
#
        yield{
        'posting_url':posting_url,
        'posted_on_date':posted_on_date,
        'title':title,
        'location':location,
        'location2':location2,
        'company':company,
        'company2':company2,
        'job_descriptions':job_descriptions,
        'job_descriptions_2':job_descriptions_2,
        'job_descriptions_3':job_descriptions_3
        }

################################################################
class JobsSpider2(scrapy.Spider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = ["https://www.indeed.com/jobs?q=%22owner+operator%22&l=augusta"]
#
    custom_settings = {
        'FEED_FORMAT': 'jsonlines',
        'URI SCHEME':'s3',
        'FEED_URI':'s3://<key_data>@<bucket_name>/folder1/folder2/data-file-indeed-%(time)s.json',
        #'FEED_URI':'augusta-indeed-%(time)s.json',
        #'FEED_EXPORT_ENCODING':'utf-8'
        'FEED_EXPORT_ENCODING':'UTF',
        'FEED_TEMPDIR':'/home/user/Desktop/projects/test_folder/',
        'FEED_STORE_EMPTY' : 'True'
        }
#
    def parse(self, response):
        #jobs = response.xpath("//div[contains(@class,'jobsearch-SerpJobCard unifiedRow row result clickcard')]").getall()
        jobs = response.xpath('//div[@class="jobsearch-SerpJobCard unifiedRow row result"]')
#
        for job in jobs:
            #post_id = job.xpath(".//*[@class='title']/div/@id").getall()
            title = job.xpath(".//*[@class='title']/a/@title").get(default='not-found')
            location = job.xpath(".//*[@class='sjcl']/span/text()").get(default='not-found')
            location2 = job.xpath(".//div[@class='location accessible-contrast-color-location']/text()").get(default='not-found')
            #company = job.xpath("//span[@class='company']/text()").getall()
            posting_link = job.xpath('*//@href').get(default='not-found')
            posting_url = "https://indeed.com" + posting_link
            company = job.xpath(".//span[@class='company']//a/text()").get(default='not-found')
            company2 = job.xpath(".//span[@class='company']/text()").get(default='not-found')
#
#
#
            yield Request(posting_url, callback=self.parse_page, meta={'posting_url':posting_url, 'title':title, 'company':company, 'company2':company2, 'location':location, 'location2':location2})
#
        relative_next_url = response.xpath('//link[@rel="next"]/@href').get()
        absolute_next_url = "https://indeed.com" + relative_next_url
#
        yield Request(absolute_next_url, callback=self.parse)
#
    def parse_page(self, response):
        posting_url = response.meta.get('posting_url')
        company2 = response.meta.get('company2')
        title = response.meta.get('title')
        location = response.meta.get('location')
        company = response.meta.get('company')
        location2 = response.meta.get('location2')
        post_id = response.meta.get('post_id')
#
#
#
        job_descriptions_2=response.xpath('//*[@id="jobDescriptionText"]/*').get(default='not-found')
        job_descriptions=response.xpath('//*[@class="jobsearch-jobDescriptionText"]/*').get(default='not-found')
        job_descriptions_3=response.xpath('//*[@class="jobsearch-JobComponent-description  icl-u-xs-mt--md  "]/*').get(default='not-found')
        #job_descriptions="".join(line for line in response.xpath('//*[@class="jobsearch-jobDescriptionText"]/text()').extract()).strip()
        posted_on_date= response.xpath('//*[@class="jobsearch-JobMetadataFooter"]/text()').get(default='not-found')
#
#
        yield{
        'posting_url':posting_url,
        'posted_on_date':posted_on_date,
        'title':title,
        'location':location,
        'location2':location2,
        'company':company,
        'company2':company2,
        'job_descriptions':job_descriptions,
        'job_descriptions_2':job_descriptions_2,
        'job_descriptions_3':job_descriptions_3
        }

#################################################################

This works:

process = CrawlerProcess()
#
process.crawl(JobsSpider1)
process.crawl(JobsSpider2)
#process.crawl(JobsSpider3)
#process.crawl(JobsSpider4)
#process.crawl(JobsSpider5)
#process.crawl(JobsSpider6)
#process.crawl(JobsSpider7)
#process.crawl(JobsSpider8)
#process.crawl(JobsSpider9)
#process.crawl(JobsSpider10)
#process.crawl(JobsSpider11)
#process.crawl(JobsSpider12)
#
process.start()
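Since boto3 is already imported, this is the quick sanity check I run to see what actually landed in the bucket (the bucket name and prefix are placeholders for my real values):

import boto3

# Hypothetical check: list the objects under one of the feed folders
# to confirm the uploads arrived; Bucket and Prefix are placeholders.
s3 = boto3.client('s3')
resp = s3.list_objects_v2(Bucket='<bucket_name>', Prefix='folder1/')
for obj in resp.get('Contents', []):
    print(obj['Key'], obj['Size'])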

#################################################
This does not work:

configure_logging()
runner = CrawlerRunner()
#
@defer.inlineCallbacks
def crawl():
    yield runner.crawl(JobsSpider1)
    yield runner.crawl(JobsSpider2)
    #yield runner.crawl(JobsSpider3)
    #yield runner.crawl(JobsSpider4)
    #yield runner.crawl(JobsSpider5)
    #yield runner.crawl(JobsSpider6)
    #yield runner.crawl(JobsSpider7)
    #yield runner.crawl(JobsSpider8)
    #yield runner.crawl(JobsSpider9)
    #yield runner.crawl(JobsSpider10)
    #yield runner.crawl(JobsSpider11)
    #yield runner.crawl(JobsSpider12)
#
    reactor.stop()
#
crawl()
reactor.run()  # the script will block here until the last crawl call is finished
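One variant I have been experimenting with (a sketch, not a confirmed fix): chain the crawl Deferreds and only stop the reactor from a callback on the last one, so that whatever the feed export does when a spider closes is not cut off by reactor.stop():

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

# Chain the crawls: returning a Deferred from a callback makes the
# chain wait for that crawl, and addBoth stops the reactor only once
# the final crawl (success or failure) has completed.
d = runner.crawl(JobsSpider1)
d.addCallback(lambda _: runner.crawl(JobsSpider2))
d.addBoth(lambda _: reactor.stop())
reactor.run()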



