PostgreSQL中的批存储过程

PostgreSQL中的批存储过程,postgresql,scala,stored-procedures,apache-spark,batch-processing,Postgresql,Scala,Stored Procedures,Apache Spark,Batch Processing,我需要将许多实体保存到数据库中。保存实体涉及将行添加到不同的表中,键是通过在一个表中插入一行自动生成的,用于将某些行插入另一个表中。这种逻辑使我创建并使用了一个存储过程。单独为每个实体调用此存储过程(即通过statement.execute(…)调用)可以很好地工作,除非需要保存数十亿个实体。所以我试着分批做这件事。但是,在批处理的情况下,批处理执行会导致抛出org.postgresql.util.PSQLException,并显示一条消息“当预期没有结果时返回了结果” 我的存储过程是这样的:

我需要将许多实体保存到数据库中。保存实体涉及将行添加到不同的表中,键是通过在一个表中插入一行自动生成的,用于将某些行插入另一个表中。这种逻辑使我创建并使用了一个存储过程。单独为每个实体调用此存储过程(即通过statement.execute(…)调用)可以很好地工作,除非需要保存数十亿个实体。所以我试着分批做这件事。但是,在批处理的情况下,批处理执行会导致抛出org.postgresql.util.PSQLException,并显示一条消息“当预期没有结果时返回了结果”

我的存储过程是这样的:

-- Records one occurrence of a sentence, creating the lookup rows it refers to when they
-- do not exist yet.
--
-- Parameters:
--   warcinfoID      value matched against warcinfo_id.warcinfo_id_value
--   recordID        value matched against record_id.record_id_value
--   sentence        text matched against sentence_text.sent_text
--   sent_timestamp  converted with TO_TIMESTAMP() before being stored
--                   (presumably seconds since the Unix epoch -- confirm against callers)
--   sect_ids        stored unchanged in sentence_occurrence.sect_ids
--
-- Returns nothing; the generated entry_id is captured in a local variable and discarded.
CREATE OR REPLACE FUNCTION insertSentence(warcinfoID varchar, recordID varchar, sentence varchar,
    sent_timestamp bigint, sect_ids smallint[]) RETURNS void AS $$
DECLARE
    warcinfoIdId integer := 0;
    recordIdId integer := 0;
    sentId integer := 0;
    id integer := 0;
BEGIN
    -- Resolve the surrogate key of the warcinfo id, inserting a new row when none exists.
    -- NOTE(review): unlike the sentence_text step below, an insert of the same value by a
    -- concurrent session is not handled here -- confirm whether that can happen.
    SELECT warcinfo_id_id INTO warcinfoIdId FROM warcinfo_id WHERE warcinfo_id_value = warcinfoID;
    IF NOT FOUND THEN
        INSERT INTO warcinfo_id (warcinfo_id_value) VALUES (warcinfoID)
            RETURNING warcinfo_id_id INTO STRICT warcinfoIdId;
    END IF;
    -- Same lookup-or-insert for the record id (same review note applies).
    SELECT record_id_id INTO recordIdId FROM record_id WHERE record_id_value = recordID;
    IF NOT FOUND THEN
        INSERT INTO record_id (record_id_value) VALUES (recordID)
            RETURNING record_id_id INTO STRICT recordIdId;
    END IF;
    -- Lookup-or-insert for the sentence text, repeated until the lookup succeeds: when the
    -- INSERT raises unique_violation the error is swallowed and the next iteration's SELECT
    -- is expected to find the row that caused it.
    LOOP
        -- NOTE(review): the md5() comparison presumably lets an expression index on
        -- md5(sent_text) be used, with the plain equality guarding against hash
        -- collisions -- confirm that such an index exists.
        SELECT sent_id INTO sentId FROM sentence_text
            WHERE md5(sent_text) = md5(sentence) AND sent_text = sentence;
        EXIT WHEN FOUND;
        BEGIN
            INSERT INTO sentence_text (sent_text) VALUES (sentence) RETURNING sent_id INTO STRICT sentId;
        EXCEPTION WHEN unique_violation THEN
            sentId := 0;
        END;
    END LOOP;
    -- Store the occurrence itself, referencing the three keys resolved above.
    INSERT INTO sentence_occurrence (warcinfo_id, record_id, sent_id, timestamp, sect_ids)
        VALUES (warcinfoIdId, recordIdId, sentId, TO_TIMESTAMP(sent_timestamp), sect_ids)
        RETURNING entry_id INTO STRICT id;
END;
$$ LANGUAGE plpgsql;
/**
 * Saves one partition of records by calling the `insertSentence` database function once
 * per record, sending the calls in JDBC batches of 1000.
 *
 * Tuple fields map positionally onto the function's arguments: warcinfo id, record id,
 * sentence text, timestamp and section ids.
 *
 * Every value is bound as a statement parameter rather than formatted into the SQL text,
 * so quotes in any field can no longer break or alter the statement.
 *
 * A `SQLException` is reported on stdout and ends processing of the partition; it is not
 * rethrown.
 *
 * NOTE(review): the batched statement is still a SELECT, so a driver that rejects result
 * sets inside a batch will still raise from `executeBatch()`.
 *
 * @param iterator records to store; traversed once
 */
def partition2DB(iterator: Iterator[(String, String, String, Long, Array[Int])]): Unit = {
  val batchSize = 1000

  Class.forName(driver)
  val conn = DriverManager.getConnection(connectionString)

  try {
    // The section ids are sent as an array literal such as "{1,2,3}" and cast on the server.
    val statement = conn.prepareStatement("select insertSentence(?, ?, ?, ?, ?::smallint[])")
    try {
      iterator.grouped(batchSize).foreach { chunk =>
        chunk.foreach { case (warcinfoId, recordId, sentence, timestamp, sectIds) =>
          statement.setString(1, warcinfoId)
          statement.setString(2, recordId)
          statement.setString(3, sentence)
          statement.setLong(4, timestamp)
          statement.setString(5, sectIds.mkString("{", ",", "}"))
          statement.addBatch()
        }
        statement.executeBatch()
      }
    } finally {
      statement.close()
    }
  } catch {
    case e: SQLException =>
      // getNextException is null when no chained exception is attached; fall back to e.
      println("exception caught: " + Option(e.getNextException).getOrElse(e))
  } finally {
    conn.close()
  }
}
Scala代码是这样的:

-- Records one occurrence of a sentence, creating the lookup rows it refers to when they
-- do not exist yet.
--
-- Parameters:
--   warcinfoID      value matched against warcinfo_id.warcinfo_id_value
--   recordID        value matched against record_id.record_id_value
--   sentence        text matched against sentence_text.sent_text
--   sent_timestamp  converted with TO_TIMESTAMP() before being stored
--                   (presumably seconds since the Unix epoch -- confirm against callers)
--   sect_ids        stored unchanged in sentence_occurrence.sect_ids
--
-- Returns nothing; the generated entry_id is captured in a local variable and discarded.
CREATE OR REPLACE FUNCTION insertSentence(warcinfoID varchar, recordID varchar, sentence varchar,
    sent_timestamp bigint, sect_ids smallint[]) RETURNS void AS $$
DECLARE
    warcinfoIdId integer := 0;
    recordIdId integer := 0;
    sentId integer := 0;
    id integer := 0;
BEGIN
    -- Resolve the surrogate key of the warcinfo id, inserting a new row when none exists.
    -- NOTE(review): unlike the sentence_text step below, an insert of the same value by a
    -- concurrent session is not handled here -- confirm whether that can happen.
    SELECT warcinfo_id_id INTO warcinfoIdId FROM warcinfo_id WHERE warcinfo_id_value = warcinfoID;
    IF NOT FOUND THEN
        INSERT INTO warcinfo_id (warcinfo_id_value) VALUES (warcinfoID)
            RETURNING warcinfo_id_id INTO STRICT warcinfoIdId;
    END IF;
    -- Same lookup-or-insert for the record id (same review note applies).
    SELECT record_id_id INTO recordIdId FROM record_id WHERE record_id_value = recordID;
    IF NOT FOUND THEN
        INSERT INTO record_id (record_id_value) VALUES (recordID)
            RETURNING record_id_id INTO STRICT recordIdId;
    END IF;
    -- Lookup-or-insert for the sentence text, repeated until the lookup succeeds: when the
    -- INSERT raises unique_violation the error is swallowed and the next iteration's SELECT
    -- is expected to find the row that caused it.
    LOOP
        -- NOTE(review): the md5() comparison presumably lets an expression index on
        -- md5(sent_text) be used, with the plain equality guarding against hash
        -- collisions -- confirm that such an index exists.
        SELECT sent_id INTO sentId FROM sentence_text
            WHERE md5(sent_text) = md5(sentence) AND sent_text = sentence;
        EXIT WHEN FOUND;
        BEGIN
            INSERT INTO sentence_text (sent_text) VALUES (sentence) RETURNING sent_id INTO STRICT sentId;
        EXCEPTION WHEN unique_violation THEN
            sentId := 0;
        END;
    END LOOP;
    -- Store the occurrence itself, referencing the three keys resolved above.
    INSERT INTO sentence_occurrence (warcinfo_id, record_id, sent_id, timestamp, sect_ids)
        VALUES (warcinfoIdId, recordIdId, sentId, TO_TIMESTAMP(sent_timestamp), sect_ids)
        RETURNING entry_id INTO STRICT id;
END;
$$ LANGUAGE plpgsql;
/**
 * Saves one partition of records by calling the `insertSentence` database function once
 * per record, sending the calls in JDBC batches of 1000.
 *
 * Tuple fields map positionally onto the function's arguments: warcinfo id, record id,
 * sentence text, timestamp and section ids.
 *
 * Every value is bound as a statement parameter rather than formatted into the SQL text,
 * so quotes in any field can no longer break or alter the statement.
 *
 * A `SQLException` is reported on stdout and ends processing of the partition; it is not
 * rethrown.
 *
 * NOTE(review): the batched statement is still a SELECT, so a driver that rejects result
 * sets inside a batch will still raise from `executeBatch()`.
 *
 * @param iterator records to store; traversed once
 */
def partition2DB(iterator: Iterator[(String, String, String, Long, Array[Int])]): Unit = {
  val batchSize = 1000

  Class.forName(driver)
  val conn = DriverManager.getConnection(connectionString)

  try {
    // The section ids are sent as an array literal such as "{1,2,3}" and cast on the server.
    val statement = conn.prepareStatement("select insertSentence(?, ?, ?, ?, ?::smallint[])")
    try {
      iterator.grouped(batchSize).foreach { chunk =>
        chunk.foreach { case (warcinfoId, recordId, sentence, timestamp, sectIds) =>
          statement.setString(1, warcinfoId)
          statement.setString(2, recordId)
          statement.setString(3, sentence)
          statement.setLong(4, timestamp)
          statement.setString(5, sectIds.mkString("{", ",", "}"))
          statement.addBatch()
        }
        statement.executeBatch()
      }
    } finally {
      statement.close()
    }
  } catch {
    case e: SQLException =>
      // getNextException is null when no chained exception is attached; fall back to e.
      println("exception caught: " + Option(e.getNextException).getOrElse(e))
  } finally {
    conn.close()
  }
}
奇怪的是,尽管statement.executeBatch()抛出了一个异常,但它在这之前保存了实体。因此,这种变通方法可以让事情顺利进行:

/**
 * Workaround variant: saves one partition of records through the `insertSentence`
 * database function in JDBC batches of 1000, deliberately ignoring a `SQLException`
 * raised by `executeBatch()` and carrying on with the next batch.
 *
 * Tuple fields map positionally onto the function's arguments: warcinfo id, record id,
 * sentence text, timestamp and section ids. All values are bound as statement parameters
 * instead of being formatted into the SQL text, so quotes in any field cannot break or
 * alter the statement.
 *
 * After a failed batch the statement is replaced by a fresh one and the old one is
 * closed, so failed batches no longer leak statements.
 *
 * Any other `SQLException` is reported on stdout and ends processing of the partition.
 *
 * @param iterator records to store; traversed once
 */
def partition2DB(iterator: Iterator[(String, String, String, Long, Array[Int])]): Unit = {
  val batchSize = 1000
  // The section ids are sent as an array literal such as "{1,2,3}" and cast on the server.
  val callSql = "select insertSentence(?, ?, ?, ?, ?::smallint[])"

  Class.forName(driver)
  val conn = DriverManager.getConnection(connectionString)

  try {
    var statement = conn.prepareStatement(callSql)

    // Sends the queued calls; a failure is intentionally ignored (that is the workaround)
    // and the statement is swapped for a fresh one before continuing.
    def flush(): Unit =
      try {
        statement.executeBatch()
      } catch {
        case _: SQLException =>
          val failed = statement
          statement = conn.prepareStatement(callSql)
          // Best effort: a close failure on the discarded statement must not abort the run.
          try failed.close() catch { case _: SQLException => () }
      }

    try {
      iterator.grouped(batchSize).foreach { chunk =>
        chunk.foreach { case (warcinfoId, recordId, sentence, timestamp, sectIds) =>
          statement.setString(1, warcinfoId)
          statement.setString(2, recordId)
          statement.setString(3, sentence)
          statement.setLong(4, timestamp)
          statement.setString(5, sectIds.mkString("{", ",", "}"))
          statement.addBatch()
        }
        flush()
      }
    } finally {
      statement.close()
    }
  } catch {
    case e: SQLException =>
      // getNextException is null when no chained exception is attached; fall back to e.
      println("exception caught: " + Option(e.getNextException).getOrElse(e))
  } finally {
    conn.close()
  }
}
然而,我不想依赖我目前使用的PostgreSQL的一个未记录的特性。 我看到其他人也遇到了这个问题:

有人能提出解决办法吗

奇怪的是,尽管statement.executeBatch()抛出了一个异常,但它在这之前保存了实体

这是因为您没有把批处理包装在事务中。JDBC规范没有明确说明:如果当前没有正在进行的事务,批处理是应该被隐式地包装在一个事务中,还是作为一条条独立的语句执行;规范也没有说明在发生错误之后,实现是否应该继续执行批处理中剩余的语句

要获得定义良好的行为(和更好的性能),请将批处理打包到事务中

不!远离键盘!拜托,你不是PHP程序员:p

您知道最好不要在SQL中插入字符串。不要那样做。使用
PreparedStatement
。除了更安全、更可靠之外,它还会更快,因为PgJDBC只需发送一条语句进行解析,然后重复使用它
PreparedStatement
非常适合在JDBC批处理中使用

现在,退一步

保存实体涉及将行添加到不同的表中,键是通过在一个表中插入一行自动生成的,用于将某些行插入另一个表中。这种逻辑使我创建并使用了一个存储过程

这是编写它的简单方法,但它的性能不会很好。你在不同的表上做了很多独立的操作、很多零碎的索引更新,等等。此外还有过程调用的开销、每个查询的开销,等等。pl/pgsql中的每个
BEGIN ... EXCEPTION ...
块也有不可忽视的开销

使用这种方法,您将遇到数十万或数百万行的问题,更不用说数十亿行了

关系数据库在集合中思考得最好。如果您真的要查看数十亿行,那么基于proc的方法将不起作用。您需要获取原始输入的批次,将它们插入临时表,然后使用一系列对临时数据的查询将它们插入目标表

您需要熟悉
INSERT INTO ... SELECT ...
UPDATE ... FROM ...
等。如果您使用的是PostgreSQL 9.5,您将受益于使用
INSERT ... ON CONFLICT ...
用于类似upsert的操作

这样想一段时间会很痛苦,但这是非常值得的,你不会相信当你在集合而不是单个项目中工作时的表现


我无法为您编写全部内容——您没有展示任何原始数据和模式,也没有解释细节。这没关系,因为那不是你的问题。不管怎么说,那样写起来太长了,而且Stack Overflow也不是一个替人写代码的网站。

好的,我去掉了存储过程,这样就不会再出现批处理失败的情况,也就不必依赖批处理失败时那种未记录的行为。 批处理现在封装在事务中,语句被PreparedStatement替换(事实上,它在这个脚本中并没有带来更好的速度)。 我使用了 INSERT INTO ... SELECT ... 和 INSERT ... ON CONFLICT ...,因此许多逻辑从存储过程转移到了SQL命令中

现在看起来是这样的:

/**
 * Saves one partition of records with plain SQL (no stored function), in batches of
 * `batchSize` records, each batch committed as a single transaction.
 *
 * For every record three `INSERT ... ON CONFLICT DO NOTHING` statements make sure the
 * warcinfo id, record id and sentence text rows exist, and a fourth statement inserts the
 * `sentence_occurrence` row, resolving the three keys through sub-selects.
 *
 * A batch that fails with a `SQLException` is rolled back and attempted again, up to
 * `nRetries` times in total. Before each retry the parameters still queued on the
 * statements are discarded, so records are not queued a second time. A batch that
 * never succeeds stays buffered and is sent again together with the records that follow.
 *
 * A `SQLException` raised outside the retry loop (for example while connecting) is
 * reported on stdout and ends processing of the partition; it is not rethrown.
 *
 * @param iterator records to store as (warcinfo id, record id, sentence text, timestamp,
 *                 section ids); traversed once
 */
def partition2DB(iterator: Iterator[(String, String, String, Long, Array[Short])]): Unit = {
  val batchSize = 1000
  val nRetries = 10

  // Binds one record to all four statements and queues it on each statement's batch.
  def updStatements(item: (String, String, String, Long, Array[Short]), c: Connection, statement1: PreparedStatement,
                    statement2: PreparedStatement, statement3: PreparedStatement, statement4: PreparedStatement): Unit = {
    // NOTE(review): 2712 looks like PostgreSQL's btree index row size limit, which is
    // measured in bytes while this truncates by characters -- confirm.
    val sentence = item._3.take(2712)
    statement1.setString(1, item._1)
    statement2.setString(1, item._2)
    statement3.setString(1, sentence)
    statement4.setString(1, item._1)
    statement4.setString(2, item._2)
    statement4.setString(3, sentence)
    statement4.setString(4, sentence)
    statement4.setLong(5, item._4)
    // Scala arrays are invariant, hence the cast: createArrayOf wants a Java Object[].
    statement4.setArray(6, c.createArrayOf("int4", item._5.map(s => Int.box(s.toInt)).asInstanceOf[Array[Object]]))
    statement1.addBatch()
    statement2.addBatch()
    statement3.addBatch()
    statement4.addBatch()
  }

  // Runs the queued batches in dependency order: lookup rows first, occurrences last.
  def executeStatements(statement1: PreparedStatement, statement2: PreparedStatement,
                        statement3: PreparedStatement, statement4: PreparedStatement): Unit = {
    statement1.executeBatch()
    statement2.executeBatch()
    statement3.executeBatch()
    statement4.executeBatch()
  }

  Class.forName(driver)

  try {
    // Connect inside the try so a failed connection is reported instead of leaving a null
    // connection to be closed afterwards.
    val conn = DriverManager.getConnection(connectionString)
    try {
      conn.setAutoCommit(false)
      val statement1 = conn.prepareStatement("INSERT INTO warcinfo_id (warcinfo_id_value) VALUES (?) ON CONFLICT (warcinfo_id_value) DO NOTHING;")
      val statement2 = conn.prepareStatement("INSERT INTO record_id (record_id_value) VALUES (?) ON CONFLICT (record_id_value) DO NOTHING;")
      val statement3 = conn.prepareStatement("INSERT INTO sentence_text (sent_text) VALUES (?) ON CONFLICT (sent_text) DO NOTHING;")
      val statement4 = conn.prepareStatement(
        """
          |INSERT INTO sentence_occurrence (warcinfo_id, record_id, sent_id, timestamp, sect_ids) VALUES (
          |    (SELECT warcinfo_id_id FROM warcinfo_id WHERE warcinfo_id_value = ?),
          |    (SELECT record_id_id FROM record_id WHERE record_id_value = ?),
          |    (SELECT sent_id FROM sentence_text WHERE md5(sent_text) = md5(?) AND sent_text = ?),
          |    TO_TIMESTAMP(?),
          |    ?
          |)
        """.stripMargin)
      val statements = Seq(statement1, statement2, statement3, statement4)
      var i = 0
      val batch = ListBuffer[(String, String, String, Long, Array[Short])]()

      // Sends the buffered records as one transaction, retrying after a rollback.
      def executeBatch(): Unit = {
        var attempts = 0
        var done = false
        while (!done && attempts < nRetries) {
          try {
            for (item <- batch) updStatements(item, conn, statement1, statement2, statement3, statement4)
            executeStatements(statement1, statement2, statement3, statement4)
            conn.commit()
            batch.clear()
            done = true
          } catch {
            case e: SQLException =>
              attempts += 1
              println("exception caught: " + Option(e.getNextException).getOrElse(e))
              // Statements that had not run yet still hold their queued parameters; drop
              // them, otherwise the retry would queue every record twice.
              statements.foreach(_.clearBatch())
              conn.rollback()
          }
        }
        if (!done) {
          println(s"giving up on ${batch.size} buffered records after $nRetries attempts")
        }
      }

      try {
        iterator.foreach { r =>
          i += 1
          batch += r
          if (i % batchSize == 0) {
            executeBatch()
          }
        }
        // Also covers records left over from a batch whose retries were exhausted.
        if (batch.nonEmpty) {
          executeBatch()
        }
      } finally {
        statements.foreach(_.close())
      }
    } finally {
      conn.close()
    }
  } catch {
    case e: SQLException => println("exception caught: " + e)
  }
}
在克雷格的评论后添加:

谢谢,克雷格。什么是对输入集的操作?你能发一个链接到一些例子吗


此外,我还有以下问题。如果两个批同时尝试在某个表中插入相同的记录,我会得到一个java.sql.BatchUpdateException,并显示如下消息:“错误:检测到死锁。详细信息:进程31959等待事务24298876上的ShareLock;被进程31955阻塞。进程31955等待事务24298877上的ShareLock;被进程31959阻塞。”对于这种情况,什么是正确的解决方案?我能想到的办法有:重试失败的批,直到成功或达到重试次数的上限;先存储重复的数据,然后使用SELECT DISTINCT…生成最终结果表;或者调整隔离级别(例如尝试“读取未提交”)。然而,所有这些似乎都是有风险的变通办法(可能达到重试次数上限、磁盘空间用完,或者数据库中出现错误数据)。

做得好。如果insert操作输入集,而不是一个接一个地调用,您将获得更大的改进,但这应该已经是一种改进。理想情况下,您应该使用PgJDBC的CopyManager加载临时表,然后处理临时表。