Warning: file_get_contents(/data/phpspider/zhask/data//catemap/7/sqlite/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python Scrapy-SQLalchemy外键未在SQLite中创建_Python_Sqlite_Sqlalchemy_Scrapy - Fatal编程技术网

Python Scrapy-SQLalchemy外键未在SQLite中创建

Python Scrapy-SQLalchemy外键未在SQLite中创建,python,sqlite,sqlalchemy,scrapy,Python,Sqlite,Sqlalchemy,Scrapy,我尝试使用itemLoader运行Scrapy来收集所有数据并将它们放入SQLite3。我成功地收集了我想要的所有信息,但是我无法使用back\u使用外键填充在我的ThreadInfo和PostInfo表中生成外键。我确实尝试过使用back\u ref,但它也不起作用。 我的剪贴画完成后,所有其他信息都被插入SQLite数据库 我的目标是让boardInfo、threadInfo、postInfo和authorInfo这四个表相互链接 boardInfo将与threadInfo建立一对多关系

我尝试使用itemLoader运行Scrapy来收集所有数据并将它们放入SQLite3。我成功地收集了我想要的所有信息,但是我无法使用
back\u使用外键填充
在我的ThreadInfo和PostInfo表中生成外键。我确实尝试过使用
back\u ref
,但它也不起作用。 我的剪贴画完成后,所有其他信息都被插入SQLite数据库

我的目标是让boardInfo、threadInfo、postInfo和authorInfo这四个表相互链接

  • boardInfo将与threadInfo建立一对多关系
  • threadInfo将与postInfo建立一对多关系
  • authorInfo将与threadInfo和
    建立一对多关系 postInfo
我使用DB Browser for SQLite,发现外键的值是
Null
。 我尝试查询该值(threadInfo.boardInfos_id),结果显示
None
。我花了很多天试图解决这个问题,并通读了文档,但无法解决这个问题

如何在threadInfo和postInfo表中生成外键

感谢您的指导和评论

这是我的模特

from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData
from sqlalchemy import Integer, String, Date, DateTime, Float, Boolean, Text
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from scrapy.utils.project import get_project_settings

Base = declarative_base()

def db_connect():
    '''
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    '''
    return create_engine(get_project_settings().get('CONNECTION_STRING'))

def create_table(engine):
    Base.metadata.create_all(engine)

class BoardInfo(Base): 
    __tablename__ = 'boardInfos'
    id = Column(Integer, primary_key=True)
    boardName = Column('boardName', String(100)) 
    threadInfosLink = relationship('ThreadInfo', back_populates='boardInfosLink') # One-to-Many with threadInfo

class ThreadInfo(Base):
    __tablename__ = 'threadInfos'
    id = Column(Integer, primary_key=True)
    threadTitle = Column('threadTitle', String())
    threadLink = Column('threadLink', String())
    threadAuthor = Column('threadAuthor', String())
    threadPost = Column('threadPost', Text())
    replyCount = Column('replyCount', Integer)
    readCount = Column('readCount', Integer)

    boardInfos_id = Column(Integer, ForeignKey('boardInfos.id')) # Many-to-One with boardInfo
    boardInfosLink = relationship('BoardInfo', back_populates='threadInfosLink') # Many-to-One with boardInfo

    postInfosLink = relationship('PostInfo', back_populates='threadInfosLink') # One-to-Many with postInfo
    
    authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
    authorInfosLink = relationship('AuthorInfo', back_populates='threadInfosLink') # Many-to-One with authorInfo

class PostInfo(Base):
    __tablename__ = 'postInfos'
    id = Column(Integer, primary_key=True)
    postOrder = Column('postOrder', Integer, nullable=True)
    postAuthor = Column('postAuthor', Text(), nullable=True)
    postContent = Column('postContent', Text(), nullable=True)
    postTimestamp = Column('postTimestamp', Text(), nullable=True)

    threadInfos_id = Column(Integer, ForeignKey('threadInfos.id')) # Many-to-One with threadInfo 
    threadInfosLink = relationship('ThreadInfo', back_populates='postInfosLink') # Many-to-One with threadInfo 
    
    authorInfos_id = Column(Integer, ForeignKey('authorInfos.id')) # Many-to-One with authorInfo
    authorInfosLink = relationship('AuthorInfo', back_populates='postInfosLink') # Many-to-One with authorInfo

class AuthorInfo(Base):
    __tablename__ = 'authorInfos'
    id = Column(Integer, primary_key=True)
    threadAuthor = Column('threadAuthor', String())

    postInfosLink = relationship('PostInfo', back_populates='authorInfosLink') # One-to-Many with postInfo
    threadInfosLink = relationship('ThreadInfo', back_populates='authorInfosLink') # One-to-Many with threadInfo
这是我的管道

from sqlalchemy import exists, event
from sqlalchemy.orm import sessionmaker
from scrapy.exceptions import DropItem
from .models import db_connect, create_table, BoardInfo, ThreadInfo, PostInfo, AuthorInfo
from sqlalchemy.engine import Engine
from sqlite3 import Connection as SQLite3Connection
import logging

@event.listens_for(Engine, "connect")
def _set_sqlite_pragma(dbapi_connection, connection_record):
    if isinstance(dbapi_connection, SQLite3Connection):
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON;")
        # print("@@@@@@@ PRAGMA prog is running!! @@@@@@")
        cursor.close()

class DuplicatesPipeline(object):

    def __init__(self):
        '''
        Initializes database connection and sessionmaker.
        Creates tables.
        '''
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)
        logging.info('****DuplicatesPipeline: database connected****')

    def process_item(self, item, spider):

        session = self.Session()
        
        exist_threadLink = session.query(exists().where(ThreadInfo.threadLink == item['threadLink'])).scalar()
        exist_thread_replyCount = session.query(ThreadInfo.replyCount).filter_by(threadLink = item['threadLink']).scalar()
        if exist_threadLink is True: # threadLink is in DB
            if exist_thread_replyCount < item['replyCount']: # check if replyCount is more?
                return item
                session.close()
            else:
                raise DropItem('Duplicated item found and replyCount is not changed')
                session.close()
        else: # New threadLink to be added to BoardPipeline
            return item
            session.close()

class BoardPipeline(object):
    def __init__(self):
        '''
        Initializes database connection and sessionmaker
        Creates tables
        '''
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        '''
        Save scraped info in the database
        This method is called for every item pipeline component
        '''

        session = self.Session()

        # Input info to boardInfos
        boardInfo = BoardInfo()
        boardInfo.boardName = item['boardName']
        
        # Input info to threadInfos
        threadInfo = ThreadInfo()
        threadInfo.threadTitle = item['threadTitle']
        threadInfo.threadLink = item['threadLink']
        threadInfo.threadAuthor = item['threadAuthor']
        threadInfo.threadPost = item['threadPost']
        threadInfo.replyCount = item['replyCount']
        threadInfo.readCount = item['readCount']

        # Input info to postInfos
        # Due to info is in list, so we have to loop and add it.
        for num in range(len(item['postOrder'])):
            postInfoNum = 'postInfo' + str(num)
            postInfoNum = PostInfo()
            postInfoNum.postOrder = item['postOrder'][num]
            postInfoNum.postAuthor = item['postAuthor'][num]
            postInfoNum.postContent = item['postContent'][num]
            postInfoNum.postTimestamp = item['postTimestamp'][num]
            session.add(postInfoNum)
        
        # Input info to authorInfo
        authorInfo = AuthorInfo()
        authorInfo.threadAuthor = item['threadAuthor'] 

        # check whether the boardName exists
        exist_boardName = session.query(exists().where(BoardInfo.boardName == item['boardName'])).scalar()
        if exist_boardName is False:  # the current boardName does not exists
            session.add(boardInfo)

        # check whether the threadAuthor exists
        exist_threadAuthor = session.query(exists().where(AuthorInfo.threadAuthor == item['threadAuthor'])).scalar()
        if exist_threadAuthor is False:  # the current threadAuthor does not exists
            session.add(authorInfo)

        try:
            session.add(threadInfo)
            session.commit()

        except:
            session.rollback()
            raise

        finally:
            session.close()

        return item
存在来自sqlalchemy导入的
,事件
从sqlalchemy.orm导入sessionmaker
从scrapy.exceptions导入DropItem
从.models导入数据库连接、创建表格、BoardInfo、ThreadInfo、PostInfo、AuthorInfo
从sqlalchemy.engine导入引擎
从sqlite3将连接导入为SQLite3Connection
导入日志记录
@事件。侦听(引擎,“连接”)
定义集sqlite pragma(dbapi连接、连接记录):
如果isinstance(dbapi_连接、SQLITE3连接):
cursor=dbapi_connection.cursor()
cursor.execute(“PRAGMA foreign_keys=ON;”)
#打印(“PRAGMA prog正在运行!!@@@@@@@@”)
cursor.close()
类重复SPIPELINE(对象):
定义初始化(自):
'''
初始化数据库连接和会话生成器。
创建表。
'''
引擎=数据库连接()
创建表格(引擎)
self.Session=sessionmaker(bind=engine)
logging.info('**duplicateSipeline:数据库已连接*****'))
def过程_项目(自身、项目、蜘蛛):
session=self.session()
exist_threadLink=session.query(exists().where(ThreadInfo.threadLink==item['threadLink'])).scalar()
exist\u thread\u replyCount=session.query(ThreadInfo.replyCount).filter\u by(threadLink=item['threadLink'])。scalar()
如果exist_threadLink为True:#threadLink以DB为单位
如果存在_线程_replyCount
从我看到的代码中,我觉得您并没有在任何地方设置
ThreadInfo.authorInfosLink
ThreadInfo.authorInfos\u id
(您所有的FK/关系都是如此)

对于要附加到ThreadInfo实例的相关对象,您需要创建它们,然后附加它们,如下所示:

#将信息输入到authorInfo
authorInfo=authorInfo()
authorInfo.threadAuthor=项['threadAuthor']
threadInfo.authorInfosLink=aut