Python 使用SQLAlchemy大容量插入数据帧_Python_Pandas_Sqlalchemy

Python 使用SQLAlchemy大容量插入数据帧

python pandas sqlalchemy

Python 使用SQLAlchemy大容量插入数据帧,python,pandas,sqlalchemy,Python,Pandas,Sqlalchemy,我有一些相当大的数据帧，我想使用新的批量SQL映射通过SQL Alchemy将它们上载到Microsoft SQL Server。pandas.to_sql方法虽然很好，但速度很慢我写代码有困难我希望能够向此函数传递一个我正在调用的数据帧表，一个我正在调用的模式名模式，以及一个我正在调用的表名名称。理想情况下，函数将1.）删除表（如果它已经存在）。2.）创建新表3.）创建映射器，4.）使用映射器和数据进行批量插入。我被困在第三部分了这是我的（公认的粗略）代码。我正在努力让mapper函数与

我有一些相当大的数据帧，我想使用新的批量SQL映射通过SQL Alchemy将它们上载到Microsoft SQL Server。pandas.to_sql方法虽然很好，但速度很慢

我写代码有困难

我希望能够向此函数传入一个数据帧（我称之为 `table`）、一个模式名（我称之为 `schema`）以及一个表名（我称之为 `name`）。理想情况下，函数将：1.）删除表（如果它已经存在）；2.）创建新表；3.）创建映射器；4.）使用映射器和数据进行批量插入。我卡在了第 3 步。

这是我的（公认的粗略）代码。我正在努力让mapper函数与我的主键一起工作。我真的不需要主键，但mapper函数需要它

谢谢你的见解

from sqlalchemy import create_engine, Table, Column, MetaData
from sqlalchemy.orm import mapper, create_session
from sqlalchemy.ext.declarative import declarative_base
from pandas.io.sql import SQLTable, SQLDatabase


def bulk_upload(table, schema, name):
    """Drop and recreate a table from a DataFrame, then bulk insert its rows.

    Args:
        table: pandas DataFrame to upload.
        schema: Name of the database schema that holds the target table.
        name: Name of the target table.
    """
    e = create_engine('mssql+pyodbc://MYDB')
    s = create_session(bind=e)
    m = MetaData(bind=e, reflect=True, schema=schema)

    # 1.) Drop the table if it already exists.
    t = Table(name, m)
    m.remove(t)
    t.drop(checkfirst=True)

    # 2.) Let pandas derive the column definitions, then create the table.
    sqld = SQLDatabase(e, schema=schema, meta=m)
    pandas_table = SQLTable(name, sqld, table)
    sqlt = pandas_table.table
    sqlt.metadata = m
    m.create_all(bind=e, tables=[sqlt])

    # 3.) Classical mapping needs a plain class (a declarative Base subclass
    # would demand __tablename__) and some primary key. The table itself has
    # none, so the index column(s) pandas added are named as the mapper's key.
    class MyClass(object):
        pass

    key_columns = [sqlt.c[label] for label in pandas_table.index]
    mapper(MyClass, sqlt, primary_key=key_columns)

    # 4.) The table contains the DataFrame index as regular column(s), so the
    # records must carry it too; rename the index levels to the labels pandas
    # chose for the table before turning them into columns.
    frame = table.copy()
    frame.index.names = pandas_table.index
    records = frame.reset_index().to_dict(orient='records')
    s.bulk_insert_mappings(MyClass, records)

我遇到了一个类似的问题，pd.to_sql花了数小时上传数据。下面的代码在几秒钟内插入了相同的数据

from sqlalchemy import create_engine
import psycopg2 as pg
#load python script that batch loads pandas df to sql
try:
    import cStringIO  # Python 2
except ImportError:
    # Python 3 has no cStringIO module; io.StringIO is its replacement.
    import io as cStringIO

address = 'postgresql://<username>:<pswd>@<host>:<port>/<database>'
engine = create_engine(address)
connection = engine.raw_connection()
cursor = connection.cursor()

#df is the dataframe containing an index and the columns "Event" and "Day"
#create Index column to use as primary key
df.reset_index(inplace=True)
df.rename(columns={'index':'Index'}, inplace =True)

#create the table but first drop if it already exists
#(no comma after the last column: a trailing comma is a SQL syntax error)
command = '''DROP TABLE IF EXISTS localytics_app2;
CREATE TABLE localytics_app2
(
"Index" serial primary key,
"Event" text,
"Day" timestamp without time zone
);'''
cursor.execute(command)
connection.commit()
cursor.close()

#stream the data using 'to_csv' and StringIO(); then use sql's 'copy_from' function
output = cStringIO.StringIO()
#ignore the index
df.to_csv(output, sep='\t', header=False, index=False)
#jump to start of stream
output.seek(0)
cur = connection.cursor()
try:
    #null values become ''
    cur.copy_from(output, 'localytics_app2', null="")
    connection.commit()
finally:
    #release the cursor and the connection even if the copy fails
    cur.close()
    connection.close()

从sqlalchemy导入创建引擎
将psycopg2导入为pg
#加载批加载到sql的python脚本
导入cStringIO
地址='postgresql://:@://'
引擎=创建引擎（地址）
连接=引擎。原始连接（）
cursor=connection.cursor（）
#df是包含索引和列“Event”和“Day”的数据帧
#创建用作主键的索引列
df.reset_索引（原地=真）
rename（列={'index'：'index'}，inplace=True）
#创建表，但如果表已存在，则首先删除它
命令=''如果存在localytics_app2，则删除表；
创建表localytics\u app2
(
“索引”串行主键，
“事件”文本，
不带时区的“日”时间戳，
);'''
cursor.execute（命令）
commit（）连接
#使用“to_csv”和StringIO（）流式传输数据；然后使用sql的“copy_from”函数
输出=cStringIO.StringIO（）
#忽略索引
df.to_csv（输出，sep='\t'，header=False，index=False）
#跳转到流的起点
输出搜索（0）
contents=output.getvalue（）
cur=connection.cursor（）
#空值变为“”
cur.copy_from（输出'localytics_app2'，null=”“）
commit（）连接
当前关闭（）

当时可能已经回答了这个问题，但我通过在这个网站上整理不同的答案并与SQLAlchemy的文档保持一致，找到了解决方案

表需要已经存在于db1中；索引设置为启用自动增量

当前类需要与CSV中导入的数据帧和db1中的表对齐

希望这能帮助那些来到这里、想快速把 pandas 和 SQLAlchemy 结合起来使用的人

from urllib import quote_plus as urlquote
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Numeric
from sqlalchemy.orm import sessionmaker
import pandas as pd


# Build the connection URL, then the engine and an open connection.
# The password is passed through urlquote because it may contain characters
# such as "/" that are special inside a URL.
_escaped_password = urlquote('weirdPassword*withsp€cialcharacters')
engine = create_engine('mysql://root:%s@localhost/db1' % _escaped_password, echo=False)
conn = engine.connect()

# Base class for the declarative model defined below.
Base = declarative_base()

#Declaration of the class in order to write into the database. This structure is standard and should align with SQLAlchemy's doc.
class Current(Base):
    """Declarative model mapped to the ``tableName`` table."""

    __tablename__ = 'tableName'

    # Integer primary key followed by the three data columns.
    id = Column(Integer, primary_key=True)
    Date = Column(String(500))
    Type = Column(String(500))
    Value = Column(Numeric())

    def __repr__(self):
        """Return a string listing the value of every column."""
        template = "(id='%s', Date='%s', Type='%s', Value='%s')"
        column_values = (self.id, self.Date, self.Type, self.Value)
        return template % column_values

# Set up of the table in db and the file to import
fileToRead = 'file.csv'
tableToWriteTo = 'tableName'

# Panda to create a lovely dataframe
df_to_be_written = pd.read_csv(fileToRead)
# orient='records' yields one dict per row, which is the parameter format
# accepted by a multi-row (executemany-style) insert.
listToWrite = df_to_be_written.to_dict(orient='records')

# Reflect only the target table; reflect=True on the MetaData would load the
# definition of every table in the database first.
metadata = sqlalchemy.schema.MetaData(bind=engine)
table = sqlalchemy.Table(tableToWriteTo, metadata, autoload=True)

# Insert the dataframe into the database in one bulk, inside an explicit
# transaction on the connection that actually runs the statement: committed on
# success, rolled back on error. No ORM session is involved in a Core insert.
try:
    # Skip the statement for an empty file rather than executing an INSERT
    # without any row parameters.
    if listToWrite:
        with conn.begin():
            conn.execute(table.insert(), listToWrite)
finally:
    # Close the connection
    conn.close()

由于这是一个 I/O 密集型的工作负载，您还可以借助 multiprocessing.dummy 使用 Python 的线程池并行写入。这让我的插入速度更快：

import math
from multiprocessing.dummy import Pool as ThreadPool

...

def insert_df(df, *args, **kwargs):
    """Write ``df`` with ``to_sql`` from several threads, one row range each.

    Args:
        df: DataFrame to write.
        *args: Positional arguments forwarded unchanged to ``to_sql`` for
            every chunk.
        **kwargs: Keyword arguments forwarded unchanged to ``to_sql`` for
            every chunk.
    """
    nworkers = 4
    nrows = df.shape[0]

    if nrows == 0:
        # Nothing to split; a single call still lets to_sql act on the
        # empty frame.
        df.to_sql(*args, **kwargs)
        return

    # Integer ceiling division: at most `nworkers` chunks, none of them empty,
    # and no float arithmetic (math.floor returns a float on Python 2).
    chunksize = -(-nrows // nworkers)
    chunks = [(start, min(start + chunksize, nrows))
              for start in range(0, nrows, chunksize)]

    def worker(chunk):
        i, j = chunk
        df.iloc[i:j, :].to_sql(*args, **kwargs)

    pool = ThreadPool(len(chunks))
    try:
        pool.map(worker, chunks)
    finally:
        # Always release the worker threads, even when a chunk fails.
        pool.close()
        pool.join()


....

insert_df(df, "foo_bar", engine, if_exists='append')

根据@ansonw的回答：

def to_sql(engine, df, table, if_exists='fail', sep='\t', encoding='utf8'):
    """Create ``table`` from ``df`` and load the rows with ``copy_from``.

    Args:
        engine: SQLAlchemy engine; its raw DBAPI cursor must provide
            ``copy_from`` (a psycopg2 feature).
        df: DataFrame to upload (the index is written as well).
        table: Name of the target table.
        if_exists: Passed to ``DataFrame.to_sql`` when creating the table.
        sep: Column separator used for the intermediate CSV stream.
        encoding: Encoding passed to ``DataFrame.to_csv``.
    """
    try:
        from cStringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    # Create Table (an empty slice carries the schema but no rows)
    df[:0].to_sql(table, engine, if_exists=if_exists)

    # Prepare data
    output = StringIO()
    df.to_csv(output, sep=sep, header=False, encoding=encoding)
    output.seek(0)

    # Insert data; the cursor and connection are released on every path and a
    # failed copy is rolled back instead of leaving the transaction open.
    connection = engine.raw_connection()
    try:
        cursor = connection.cursor()
        try:
            cursor.copy_from(output, table, sep=sep, null='')
            connection.commit()
        finally:
            cursor.close()
    except Exception:
        connection.rollback()
        raise
    finally:
        connection.close()

我在5秒内插入200000行，而不是4分钟

下面是我针对 Postgres 的解决方案：它根据 pandas 数据帧自动创建数据库表，并使用 Postgres 的 `COPY my_table FROM ...` 执行快速批量插入。

import io

import pandas as pd
from sqlalchemy import create_engine

def write_to_table(df, db_engine, schema, table_name, if_exists='fail'):
    """Create a table from ``df`` and fill it with PostgreSQL ``COPY``.

    Args:
        df: DataFrame to upload (the index is not written).
        db_engine: SQLAlchemy engine whose DBAPI cursor provides
            ``copy_expert`` (psycopg2).
        schema: Schema of the target table, or None to use the default.
        table_name: Name of the target table.
        if_exists: Passed to pandas' ``SQLTable`` when creating the table.
    """
    def quote_ident(identifier):
        # Double-quote the identifier so mixed-case or reserved names refer to
        # the same table that was created above.
        return '"%s"' % identifier.replace('"', '""')

    string_data_io = io.StringIO()
    # Write data rows only. Previously the header was written, stripped with
    # readline() and then skipped a second time by COPY's HEADER option, which
    # silently dropped the first data row.
    df.to_csv(string_data_io, sep='|', index=False, header=False)
    pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine, schema=schema)
    table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
                               index=False, if_exists=if_exists, schema=schema)
    table.create()
    string_data_io.seek(0)

    # Avoid producing "None.table" when no schema is given.
    name_parts = [table_name] if schema is None else [schema, table_name]
    qualified_name = '.'.join(quote_ident(part) for part in name_parts)
    copy_cmd = "COPY %s FROM STDIN DELIMITER '|' CSV" % qualified_name

    with db_engine.connect() as connection:
        with connection.connection.cursor() as cursor:
            cursor.copy_expert(copy_cmd, string_data_io)
        connection.connection.commit()

对于任何面临此问题且目标数据库为 Redshift 的人，请注意 Redshift 并未实现完整的 Postgres 命令集，因此使用 Postgres 的

COPY FROM

或

copy_from()

的一些答案将不起作用。

加速 Redshift 插入的解决方案是使用文件摄取或 Odo

参考资料：
关于 Odo
Odo 与 Redshift

Redshift COPY（从 S3 文件加载）

对我来说，使用 cx_Oracle 和 SQLAlchemy 连接到 Oracle 数据库的效果非常好

import time

import cx_Oracle
import pandas as pd
import sqlalchemy
from sqlalchemy import Column, String
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# credentials and target names (placeholder values)
username = "username"
password = "password"
connectStr = "connection:/string"  # NOTE(review): not referenced anywhere in the visible code
tableName = "tablename"

# start time; the elapsed time is printed at the end of the script
t0 = time.time()

# connection: build the Oracle DSN from host, port and service name
dsn = cx_Oracle.makedsn('host','port',service_name='servicename')

Base = declarative_base()

class LANDMANMINERAL(Base):
    """Declarative model for the ``tablename`` table; every column is a ``String(500)``."""
    __tablename__ = 'tablename'

    # DOCUMENTNUM is the primary key; the remaining columns are plain strings.
    DOCUMENTNUM = Column(String(500), primary_key=True)
    DOCUMENTTYPE = Column(String(500))
    FILENUM = Column(String(500))
    LEASEPAYOR = Column(String(500))
    LEASESTATUS = Column(String(500))
    PROSPECT = Column(String(500))
    SPLIT = Column(String(500))
    SPLITSTATUS = Column(String(500))
engine = create_engine('oracle+cx_oracle://%s:%s@%s' % (username, password, dsn))

# Bulk insertion
data = pd.read_csv('data.csv')
# one dict per CSV row; the keys must match the column names of the model
lists = data.to_dict(orient='records')

# Insert through the table the declarative model already defines. The earlier
# sqlalchemy.Table(..., autoreload=True) call passed an argument Table does not
# accept, and no reflection is needed when the columns are declared in code.
table = LANDMANMINERAL.__table__

conn = engine.connect()
try:
    # Skip the statement for an empty file rather than executing an INSERT
    # without any row parameters.
    if lists:
        # Explicit transaction on the connection that runs the insert:
        # committed on success, rolled back on error. An ORM session is not
        # involved in a Core insert, so none is created.
        with conn.begin():
            conn.execute(table.insert(), lists)
finally:
    conn.close()

print("time taken %8.8f seconds" % (time.time() - t0) )

对于像我这样试图实施上述解决方案的人：

pandas 0.24.0 的 to_sql 现在提供 chunksize 和 method='multi' 选项，可以进行批量插入…

下面是一个简单的方法

下载SQL数据库连接的驱动程序对于Linux和Mac OS：

对于Windows：

创建连接数据插入如果有很多记录

# limit based on sp_prepexec parameter count
# (computed from the frame that is actually written; max(1, ...) guards a
# frame without columns against ZeroDivisionError)
tsql_chunksize = 2097 // max(1, len(df.columns))
# cap at 1000 (limit for number of rows inserted by table-value constructor),
# and never go below 1 because to_sql rejects a chunksize of 0
tsql_chunksize = max(1, min(tsql_chunksize, 1000))
print(tsql_chunksize)


df.to_sql('table_name', con = engine, if_exists = 'append', index= False, chunksize=tsql_chunksize)

PS：您可以根据需要更改参数。pandas 0.25.1 的 to_sql 有一个参数可以执行多行插入，因此不再需要借助 SQLAlchemy 来解决此问题

调用

pandas.DataFrame.to_sql

时设置

method='multi'

在这个例子中，它是

df.to_sql(table, schema=schema, con=e, index=False, if_exists='replace', method='multi')

答案来源于文档

值得注意的是，我只用红移测试过这个。请让我知道它在其他数据库上的运行情况，以便我可以更新此答案。

您似乎正在自己重新实现

to_sql

功能，我怀疑这是否会更快。将数据写入SQL的瓶颈主要存在于python驱动程序中（

pyodbc

）。此外，

to_sql

不使用ORM，这被认为比核心sqlalchemy慢，即使在进一步使用大容量插入（）时，如果

to_sql

太慢，并且您无法改进它（例如通过调整连接参数、使用的驱动程序（例如pymssql）、internet速度、删除表上的约束等），另一种更快的方法是将数据写入csv，并将其加载到SQL表中。@joris谢谢。这里列出的“批量操作”似乎有点用词不当。我真正需要做的是将pandas数据文件输出到一个文本文件，然后像这样编写批量插入操作。。。是的，但这是为了提高sqlalchemy ORM的速度，它比核心sqlalchemy有更多的功能。但是pandas

to_sql

根本不使用ORM，正如我之前所说，实际上已经在进行大容量插入。@joris嗯，我之所以这样做是因为我可以在sql Server上运行\\fileserver\folder\doc.txt中的“大容量插入dbo.MyTable”，而且性能非常好。我的想法是，当大容量INSERT语句使用“VALUES”而不是“FROM”时，t

df.to_sql('Table_Name', con=engine, if_exists='append', index=False)


"""
if_exists: {'fail', 'replace', 'append'}, default 'fail'
     fail: If table exists, raise a ValueError.
     replace: If table exists, drop it, recreate it, and insert data.
     append: If table exists, insert data. Create if does not exist.
"""

# limit based on sp_prepexec parameter count
# (computed from the frame that is actually written; max(1, ...) guards a
# frame without columns against ZeroDivisionError)
tsql_chunksize = 2097 // max(1, len(df.columns))
# cap at 1000 (limit for number of rows inserted by table-value constructor),
# and never go below 1 because to_sql rejects a chunksize of 0
tsql_chunksize = max(1, min(tsql_chunksize, 1000))
print(tsql_chunksize)


df.to_sql('table_name', con = engine, if_exists = 'append', index= False, chunksize=tsql_chunksize)