Python Scrapy multiple spiders
I have a problem running multiple spiders with Scrapy. What I want to achieve: I have an engine running in the background that fetches tasks/jobs from a MySQL database. It queries the database every 15 seconds, and whenever there is a new job, Scrapy should process it.

So far the setup works fine. The last remaining problem is that my spiders (Scrapy CrawlSpiders) "stack up" on top of each other.

The entry point:
from twisted.internet import reactor, task

def schedule():
    #GetJob is my own module that polls the MySQL database for new jobs
    jobs = GetJob.Job()
    jobs.getJobs()

if __name__ == "__main__":
    #run schedule() every 15 seconds inside the Twisted reactor
    t = task.LoopingCall(schedule)
    t.start(15)
    reactor.run()
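For comparison, the pattern from the Scrapy docs for running several spiders in the same Twisted reactor keeps one CrawlerRunner and waits for all scheduled crawls via join(). This is only a minimal sketch, with SpiderOne and SpiderTwo as placeholder spider classes, not my real setup:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

#each crawl() call schedules one spider in the shared reactor
runner.crawl(SpiderOne)
runner.crawl(SpiderTwo)

#join() returns a Deferred that fires once all scheduled crawls are finished
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()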
After GetJob.Job() has fetched them, the jobs are processed here:
class ProcessJob():
    def processJob(self, job):
        #update job
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()

        #Start new crawler
        webspider = MySpider.MySpider(job)

        #Some settings
        ajaxSettings = CrawlerSettings.ajax_settings
        normalSettings = CrawlerSettings.normal_settings
        configure_logging()
        if job.max_pages != 0:
            ajaxSettings["CLOSESPIDER_PAGECOUNT"] = 0
            ajaxSettings["CLOSESPIDER_ITEMCOUNT"] = job.max_pages
            normalSettings["CLOSESPIDER_PAGECOUNT"] = 0
            normalSettings["CLOSESPIDER_ITEMCOUNT"] = job.max_pages

        #max connections
        concurrent_requests = int(job.max_pages / 20)
        if concurrent_requests < 1:
            concurrent_requests = 10
        if concurrent_requests > 500:
            concurrent_requests = 500
        ajaxSettings["CONCURRENT_REQUESTS"] = concurrent_requests
        normalSettings["CONCURRENT_REQUESTS"] = concurrent_requests

        #Ajax true or false
        if job.ajax == 1:
            runner = CrawlerRunner(ajaxSettings)
        else:
            runner = CrawlerRunner(normalSettings)

        d = runner.crawl(webspider, job=job)
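As far as I understand the CrawlerRunner API, crawl() is meant to be given the spider class (or a Crawler object) plus keyword arguments that Scrapy forwards to the spider's constructor, and it returns a Deferred for that single crawl. A minimal sketch of how I think it is supposed to be used (start_job and job_finished are just hypothetical names):

from scrapy.crawler import CrawlerRunner

def start_job(job, settings):
    #pass the spider class, not an instance; job=... ends up in MySpider.__init__
    runner = CrawlerRunner(settings)
    d = runner.crawl(MySpider.MySpider, job=job)

    def job_finished(result):
        #fires once this particular crawl has finished (or failed)
        print "Job %s finished" % job.id
        return result

    d.addBoth(job_finished)
    return d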
So after running two or three jobs, my spider_closed callback is called multiple times instead of just once, as expected.

What is going wrong here? The spider looks like this:
class MySpider(CrawlSpider):

    def __init__(self, job):
        #Get the hosts
        self.job = job
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        allowedDomainsPre = job.url.split(",")
        allowedDomains = []

        for domains in allowedDomainsPre:
            parsed_uri = urlparse(domains)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
            print domain
            allowedDomains.append(domain)

        self.allowed_domains = allowedDomains
        self.start_urls = allowedDomainsPre

        #Get job patterns
        jobPatterns = job.processing_patterns.split(",")
        allowedPatterns = []
        deniedPatterns = []
        for pattern in jobPatterns:
            if '-' in pattern:
                deniedPatterns.append(pattern.replace("-", ""))
            else:
                allowedPatterns.append(pattern)

        self._rules = [
            Rule(LinkExtractor(allow=(allowedPatterns), deny=(deniedPatterns)), callback=self.parse_items, follow=True)
        ]
        self.name = job.id
        self.settings = CrawlerSettings.normal_settings

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        itemCount = 0
        try:
            itemCount = stats["item_scraped_count"]
        except:
            print "Item count = zero"
        DoneJob.DoneJob().jobDone(self.job, itemCount)

    def parse_items(self, response):
        item = Item()
        #if the user wants a minimum description
        if self.job.min_description > 0:
            item['html'] = response.body
            item['url'] = response.url
            item['job_id'] = self.job.id
            soup = BeautifulSoup(response.body, 'html.parser')
            article = Document(soup.prettify()).summary()
            article_soup = BeautifulSoup(article)
            text = re.sub(' +', ' ', article_soup.get_text().rstrip())
            text_length = len(text.split(' '))
            if text_length > self.job.min_description:
                return item
        else:
            item['html'] = response.body
            item['url'] = response.url
            item['job'] = {}

            #Job
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description
            return item
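For reference, the Scrapy docs connect spider_closed through the crawler's own signal manager in from_crawler rather than through the global pydispatch dispatcher. A minimal sketch of that pattern, with SignalSpider as a placeholder spider name:

from scrapy import signals
from scrapy.spiders import CrawlSpider

class SignalSpider(CrawlSpider):
    name = 'signal_example'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SignalSpider, cls).from_crawler(crawler, *args, **kwargs)
        #connect only to this crawler's signals, not process-wide
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        #runs once when this spider finishes
        pass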