scrapy和mysql
我试图让scrapy将已爬网的数据插入mysql,我的代码爬网良好,并在缓冲区中收集数据,没有错误,但数据库从未更新。 “没有运气”,“没有错误” pipeline.pyscrapy和mysql,mysql,scrapy,Mysql,Scrapy,我试图让scrapy将已爬网的数据插入mysql,我的代码爬网良好,并在缓冲区中收集数据,没有错误,但数据库从未更新。 “没有运气”,“没有错误” pipeline.py from twisted.enterprise import adbapi import datetime import MySQLdb.cursors class SQLStorePipeline(object): def __init__(self): self.dbpool = adbapi.
from twisted.enterprise import adbapi
import datetime
import MySQLdb.cursors
class SQLStorePipeline(object):
def __init__(self):
self.dbpool = adbapi.ConnectionPool('MySQLdb', db='craigs',
user='bra', passwd='boobs', cursorclass=MySQLdb.cursors.DictCursor,
charset='utf8', use_unicode=True)
def process_item(self, items, spider):
# run db query in thread pool
query = self.dbpool.runInteraction(self._conditional_insert, items)
query.addErrback(self.handle_error)
return items
def _conditional_insert(self, tx, items):
# create record if doesn't exist.
# all this block run on it's own thread
tx.execute("select * from scraped where link = %s", (items['link'][0], ))
result = tx.fetchone()
if result:
log.msg("Item already stored in db: %s" % items, level=log.DEBUG)
else:
tx.execute(\
"insert into scraped (posting_id, email, location, text, title) "
"values (%s, %s, %s, %s, %s)",
(items['posting_id'][0],
items['email'][1],
items['location'][2],
items['text'][3],
items['title'][4],
)
)
log.msg("Item stored in db: %s" % items, level=log.DEBUG)
def handle_error(self, e):
log.err(e)
爬网代码
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigs.items import CraigsItem
class MySpider(CrawlSpider):
name = "craigs"
f = open("urls.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
rules = [Rule(SgmlLinkExtractor(restrict_xpaths=('/html/body/blockquote[3]/p/a',)), follow=True, callback='parse_profile')]
def parse_profile(self, response):
items = []
img = CraigsItem()
hxs = HtmlXPathSelector(response)
img['title'] = hxs.select('//h2[contains(@class, "postingtitle")]/text()').extract()
img['posting_id'] = hxs.select('//html/body/article/section/section[2]/div/p/text()').extract()
items.append(img)
return items[0]
return img[0]
设置.py
BOT_NAME = 'craigs'
BOT_VERSION = '1.0'
SPIDER_MODULES = ['craigs.spiders']
NEWSPIDER_MODULE = 'craigs.spiders'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
管道代码根本没有被调用的原因是它还没有被激活。此激活通过在settings.py中添加一个新节来完成,如所示。e、 g
此外,您的
parse_profile
函数应该只返回img
。如果单个响应页面将导致多个项目,则只需添加要返回的项目列表。在设置中激活管道,并使用收益率而不是返回。您应该提交当前事务,从而使更改永久化
那以后呢
tx.execute(\
"insert into scraped (posting_id, email, location, text, title) "
"values (%s, %s, %s, %s, %s)",
(items['posting_id'][0],
items['email'][1],
items['location'][2],
items['text'][3],
items['title'][4],
)
)
你必须
db.commit()
db
这里是
db = MySQLdb.connect(host="localhost",user = "root", passwd = "1234", db="database_name")
请试一试。试着在进程项目
和\u条件插入
函数中放入一条打印语句,看看它们是否被调用。另外,您的settings.py文件是什么样子的?settings.pyBOT_NAME='craigs'BOT_VERSION='1.0'SPIDER_MODULES=['craigs.SPIDER']NEWSPIDER_MODULE='craigs.SPIDER'USER_AGENT='%s/%s'(BOT_NAME,BOT_VERSION)pipelines.py中的misc print语句从不反映正在执行的代码。此答案是否有助于解决此问题?根据Akhter Wahab建议的编辑,编辑了答案。干杯,伙计!
db = MySQLdb.connect(host="localhost",user = "root", passwd = "1234", db="database_name")