Python 在scrapy中使用mysql插入数据
这是我的演示,我认为没有错误,但这段代码不能插入数据到mysql setting.py 这是设置代码:Python 在scrapy中使用mysql插入数据,python,twisted,scrapy,Python,Twisted,Scrapy,这是我的演示,我认为没有错误,但这段代码不能插入数据到mysql setting.py 这是设置代码: BOT_NAME = 'Scan' SPIDER_MODULES = ['scan.spiders'] #NEWSPIDER_MODULE = 'scan.spiders' ITEM_PIPELINES = ['scan.pipelines.MySQLStorePipeline'] 这是管道代码,我认为它不包含错误: 管道.py from scrapy import log from twi
BOT_NAME = 'Scan'
SPIDER_MODULES = ['scan.spiders']
#NEWSPIDER_MODULE = 'scan.spiders'
ITEM_PIPELINES = ['scan.pipelines.MySQLStorePipeline']
这是管道代码,我认为它不包含错误:
管道.py
from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.contrib.pipeline.images import ImagesPipeline
import datetime
import MySQLdb
import MySQLdb.cursors
class MySQLStorePipeline(object):
def __init__(self):
self.db = adbapi.ConnectionPool('MySQLdb',
db = 'spider',
host='localhost',
user = 'root',
passwd = '123456',
cursorclass = MySQLdb.cursors.DictCursor,
charset = 'utf8',
use_unicode = True
)
def process_item(self, item, spider):
query = self.db.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item
def _conditional_insert(self, tx, item):
if item.get('url'):
tx.execute(\
"insert into spider (url) "
"values (%s)",(item['link'])
)
#log.msg("Item stored in db: %s" % item, level=log.DEBUG)
def handle_error(self, e):
log.err(e)
这是蜘蛛模块,我想它没有包含错误
spider.py
# coding=utf-8
from urlparse import urljoin
import simplejson
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scan.items import ScanItem
class ScanSpider(CrawlSpider):
name = 'Scan'
allowed_domains = ["a.com"]
start_urls = [
"http://www.a.com",
]
rules = (
Rule(SgmlLinkExtractor(allow=(r'http://(.*?)'),deny_domains=(r'qq.com'))
),
Rule(SgmlLinkExtractor(allow=(r'http://www.a.com')), callback="parse_item"),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
items = []
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
items.append(item)
return items
SPIDER = ScanSpider()
目前,没有进一步澄清 我认为,由于预计刮片的返回是一个
项目
(其子类,或者甚至只是一个目录
——我承认我没有彻底检查代码,看看这应该/确实是如何工作的),那么:
看起来有点奇怪
不过,你需要多做一些分析,才能得到一个有意义的/正确的答案。你真的需要读出来,你缺少了很多基础知识
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
items = []
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
items.append(item)
return items
在您的parse_items
中,不需要在列表的items中添加item,您只需返回item
即可
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
item = ScanItem()
#item['title'] = hxs.select('//title/text()').extract()
item['url'] = hxs.select('//a[@href]').re('(\"http://(.*?)\")').extract()
return item
请记住,item['url']
有一个url的列表
在您的MySQLStorePipeline
def _conditional_insert(self, tx, item):
if item.get('url'):
tx.execute(\
"insert into spider (url) "
"values (%s)",(item['link'])
)
您试图在数据库中插入
item['link']
,而从未填充item['link']
,但只有item['url']
,您能提供一个更简洁的代码示例吗?当您运行此操作时,调试输出是什么?(我还建议缩小你的帖子范围……社区并不需要大部分代码)
def _conditional_insert(self, tx, item):
if item.get('url'):
tx.execute(\
"insert into spider (url) "
"values (%s)",(item['link'])
)