Regex Oracle-当只对不同的匹配感兴趣时,优化CLOB列上所有正则表达式匹配的循环
我正在创建一个存储过程,它在一个表(有数千行)上循环;每一行都有一个CLOB列,我希望从中获取某个正则表达式(例如“FNR”)的所有匹配项。此后,我想把每个不同的匹配项插入到一个新表中。单个CLOB列可能包含数千个匹配项,但通常在CLOB中重复出现的是同一个“FNR”——也就是说,不同的正则表达式匹配项要少得多,而这些才是我唯一感兴趣的。然而,我写的这个过程运行时间很长,我怀疑遍历所有匹配项的循环是最耗时的部分。标签:regex, oracle, performance, loops, distinct。我的程序如下所示:
-- Procedure as originally posted in the question: for every row of TABLE it
-- extracts each match of v_regex_fnr from the DOC CLOB and inserts the matches
-- accepted by CHECK_FNR into TABLE2, together with the row's ID.
-- NOTE(review): the block nesting is unbalanced as posted -- there is one
-- END LOOP more than there are LOOPs, and the outer BEGIN is never closed --
-- so this text does not compile as-is.
CREATE OR REPLACE PROCEDURE SP_MTV_FINN_FNR AS
BEGIN
DECLARE
-- Number of rows in TABLE; drives the outer loop.
v_n NUMBER;
-- Number of regex matches found in the current document.
v_cnt NUMBER;
-- ID of the row currently being processed.
v_mtrid NUMBER;
-- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
-- DOC value of the row currently being processed.
v_doc CLOB;
-- The j-th match extracted from v_doc.
v_fnr VARCHAR2(11);
BEGIN
-- Get all rows from table --
SELECT COUNT(*) INTO v_n FROM TABLE;
IF v_n > 0 THEN
-- Loop over all rows --
FOR i IN 1..v_n LOOP
-- Fetch the i-th row by ROWNUM; this re-queries TABLE once per iteration.
-- NOTE(review): there is no ORDER BY, so which row is "the i-th" is not
-- guaranteed to be stable between iterations -- confirm.
SELECT doc, mtrid
INTO v_doc, v_mtrid
FROM (SELECT DOC doc, ID mtrid, ROWNUM rnum
FROM TABLE
WHERE ROWNUM <=i)
WHERE rnum >= i;
IF v_doc IS NOT NULL THEN
SELECT REGEXP_COUNT(v_doc, v_regex_fnr) INTO v_cnt FROM DUAL;
IF v_cnt >= 1 THEN
-- For each regex match - time consuming, right? --
FOR j IN 1..v_cnt LOOP
-- Extract the j-th occurrence of the pattern from the document.
SELECT REGEXP_SUBSTR(v_doc, v_regex_fnr, 1, j, 'm') INTO v_fnr FROM DUAL;
-- CHECK_FNR is defined elsewhere; only matches it reports as 'TRUE' are stored.
IF CHECK_FNR(v_fnr) = 'TRUE' THEN
INSERT INTO TABLE2(MTR_ID, FNR)
SELECT v_mtrid, v_fnr FROM DUAL;
END IF;
END LOOP;
END IF;
END IF;
-- Commits once per source row.
COMMIT;
END LOOP;
END IF;
END LOOP;
-- Any error is printed via DBMS_OUTPUT and rolled back; it is not re-raised.
EXCEPTION WHEN OTHERS THEN
DBMS_OUTPUT.PUT_LINE('Error - rollback');
DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
ROLLBACK;
END;
通过直接赋值调用函数,去掉对DUAL表的引用。这样就节省了PL/SQL和SQL引擎之间不必要的
上下文切换。
我还添加了隐式游标来逐条处理记录。
下一个改进级别可以放在另一个表中。虽然我不是在这里做的
-- Extracts every match of v_regex_fnr from the DOC CLOB of each row of TABLE
-- and inserts the matches accepted by CHECK_FNR into TABLE2 with the row's ID.
--
-- Fixes relative to the posted version:
--   * the nested BEGIN / DECLARE / BEGIN was never closed (one END was
--     missing), so the procedure did not compile; declarations now live in
--     the procedure's own declarative section;
--   * the unused locals v_n, v_mtrid and v_doc are removed.
--
-- NOTE(review): TABLE and TABLE2 look like placeholders for the real table
-- names -- substitute before compiling.
CREATE OR REPLACE PROCEDURE SP_MTV_FINN_FNR AS
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    -- Number of pattern matches in the current document.
    v_cnt NUMBER;
    -- The j-th match extracted from the current document.
    v_fnr VARCHAR2(11);
BEGIN
    -- Cursor FOR loop: one pass over TABLE instead of one query per row.
    FOR myrec IN (SELECT DOC doc, ID mtrid
                  FROM TABLE)
    LOOP
        IF myrec.doc IS NOT NULL THEN
            -- Direct PL/SQL assignment; no SELECT ... FROM DUAL round trip.
            v_cnt := REGEXP_COUNT(myrec.doc, v_regex_fnr);
            IF v_cnt >= 1 THEN
                FOR j IN 1..v_cnt LOOP
                    v_fnr := REGEXP_SUBSTR(myrec.doc, v_regex_fnr, 1, j, 'm');
                    -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
                    IF CHECK_FNR(v_fnr) = 'TRUE' THEN
                        INSERT INTO TABLE2 (MTR_ID, FNR)
                        VALUES (myrec.mtrid, v_fnr);
                    END IF;
                END LOOP;
            END IF;
            -- Commit once per processed (non-NULL) document, as in the posted version.
            COMMIT;
        END IF;
    END LOOP;
EXCEPTION
    -- NOTE(review): the error is reported via DBMS_OUTPUT and rolled back but
    -- not re-raised, so callers never see a failure. Kept as posted to avoid
    -- changing what callers observe; consider adding RAISE.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END SP_MTV_FINN_FNR;
/
迭代优化PL/SQL块
迭代0:修复语法错误
-- Syntax-corrected baseline: walks TABLE row by row (fetching the n-th row by
-- ROWNUM), extracts every pattern match from the row's DOC CLOB and stores
-- the matches accepted by CHECK_FNR in TABLE2 together with the row's ID.
CREATE OR REPLACE PROCEDURE SP_MTV_FINN_FNR AS
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    l_fnr_pattern VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    l_row_total   NUMBER;        -- number of rows in TABLE
    l_match_total NUMBER;        -- number of matches in the current document
    l_row_id      NUMBER;        -- ID of the current row
    l_document    CLOB;          -- DOC of the current row
    l_candidate   VARCHAR2(11);  -- match currently being checked
    l_occurrence  PLS_INTEGER;   -- 1-based index of the match being extracted
BEGIN
    SELECT COUNT(*) INTO l_row_total FROM TABLE;

    -- A 1..0 range simply does not iterate, so an empty table needs no guard.
    FOR l_row_no IN 1..l_row_total LOOP
        -- Pick the l_row_no-th row by ROWNUM.
        SELECT doc, mtrid
          INTO l_document, l_row_id
          FROM (SELECT DOC doc, ID mtrid, ROWNUM rnum
                  FROM TABLE
                 WHERE ROWNUM <= l_row_no)
         WHERE rnum >= l_row_no;

        IF l_document IS NOT NULL THEN
            SELECT REGEXP_COUNT(l_document, l_fnr_pattern)
              INTO l_match_total
              FROM DUAL;

            -- Visit occurrences 1..l_match_total; a zero or NULL count skips the loop.
            l_occurrence := 1;
            WHILE l_occurrence <= l_match_total LOOP
                SELECT REGEXP_SUBSTR(l_document, l_fnr_pattern, 1, l_occurrence, 'm')
                  INTO l_candidate
                  FROM DUAL;

                -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
                IF CHECK_FNR(l_candidate) = 'TRUE' THEN
                    INSERT INTO TABLE2 (MTR_ID, FNR)
                    VALUES (l_row_id, l_candidate);
                END IF;

                l_occurrence := l_occurrence + 1;
            END LOOP;
        END IF;

        -- One commit per source row.
        COMMIT;
    END LOOP;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END SP_MTV_FINN_FNR;
迭代2:减少外部循环的数量
-- Single pass over the non-NULL documents of TABLE: counts the pattern
-- matches in each document, extracts them one by one and inserts those
-- accepted by CHECK_FNR into TABLE2. Commits once at the end.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    l_fnr_pattern VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    l_match_total NUMBER;        -- matches found in the current document
    l_candidate   VARCHAR2(11);  -- match currently being checked
BEGIN
    FOR src IN (SELECT doc, id AS mtrid
                  FROM table
                 WHERE doc IS NOT NULL)
    LOOP
        l_match_total := REGEXP_COUNT(src.doc, l_fnr_pattern);

        -- Nothing to extract from this document: move on to the next row.
        CONTINUE WHEN l_match_total IS NULL OR l_match_total < 1;

        FOR occ IN 1..l_match_total LOOP
            l_candidate := REGEXP_SUBSTR(src.doc, l_fnr_pattern, 1, occ, 'm');
            -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
            IF CHECK_FNR(l_candidate) = 'TRUE' THEN
                INSERT INTO TABLE2 (MTR_ID, FNR)
                VALUES (src.mtrid, l_candidate);
            END IF;
        END LOOP;
    END LOOP;

    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
迭代3:缩短迭代2的代码
-- Lets the cursor query do the filtering and counting: only rows whose DOC is
-- non-NULL and contains at least one pattern match are fetched, together with
-- their match count. Each match is then extracted and, if CHECK_FNR accepts
-- it, inserted into TABLE2. Commits once at the end.
--
-- Fixes relative to the posted version:
--   * REGEXP_COUNT referenced rec.doc inside the cursor query, where the loop
--     record is not in scope; it now uses the column doc;
--   * regexp_like used the undeclared name v_regex_fnt; it now uses v_regex_fnr.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    -- The j-th match extracted from the current document.
    v_fnr VARCHAR2(11);
BEGIN
    FOR rec IN (
        select doc, id as mtrid, REGEXP_COUNT(doc, v_regex_fnr) as regexp_cnt
        from table
        where doc is not null
          and regexp_like(doc, v_regex_fnr)
    ) LOOP
        -- regexp_like guarantees at least one match, so regexp_cnt >= 1 here.
        FOR j IN 1..rec.regexp_cnt LOOP
            v_fnr := REGEXP_SUBSTR(rec.doc, v_regex_fnr, 1, j, 'm');
            -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
            IF CHECK_FNR(v_fnr) = 'TRUE' THEN
                INSERT INTO TABLE2(MTR_ID, FNR) values (rec.mtrid, v_fnr);
            END IF;
        END LOOP;
    END LOOP;
    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
迭代4:删除不必要的 regexp_count() 计数
-- Count-driven variant: iterates over all rows of TABLE, skips NULL
-- documents, counts the pattern matches and extracts occurrences 1..count,
-- inserting those accepted by CHECK_FNR into TABLE2. Commits once at the end.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    l_fnr_pattern VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    l_match_total NUMBER;        -- matches found in the current document
    l_candidate   VARCHAR2(11);  -- match currently being checked
    l_occurrence  PLS_INTEGER;   -- 1-based index of the match being extracted
BEGIN
    FOR src IN (SELECT doc, id AS mtrid
                  FROM table)
    LOOP
        IF src.doc IS NOT NULL THEN
            l_match_total := REGEXP_COUNT(src.doc, l_fnr_pattern);

            -- A zero or NULL count makes the condition fail on the first test.
            l_occurrence := 1;
            WHILE l_occurrence <= l_match_total LOOP
                l_candidate := REGEXP_SUBSTR(src.doc, l_fnr_pattern, 1, l_occurrence, 'm');
                -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
                IF CHECK_FNR(l_candidate) = 'TRUE' THEN
                    INSERT INTO TABLE2 (MTR_ID, FNR)
                    VALUES (src.mtrid, l_candidate);
                END IF;
                l_occurrence := l_occurrence + 1;
            END LOOP;
        END IF;
    END LOOP;

    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
-- Drops the separate REGEXP_COUNT pass: for each non-NULL document the
-- occurrences are extracted one after another until REGEXP_SUBSTR returns
-- NULL (no further match). Matches accepted by CHECK_FNR are inserted into
-- TABLE2. Commits once at the end.
--
-- Fix relative to the posted version: the loop exit tested the undeclared
-- name v_fnt, which does not compile; it now tests v_fnr.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    -- The j-th match extracted from the current document.
    v_fnr VARCHAR2(11);
    -- 1-based occurrence index passed to REGEXP_SUBSTR.
    j integer;
BEGIN
    FOR rec IN (
        select doc, id as mtrid
        from table
        where doc is not null
    ) LOOP
        j := 1;
        loop
            v_fnr := REGEXP_SUBSTR(rec.doc, v_regex_fnr, 1, j, 'm');
            -- NULL means there is no j-th occurrence: this document is done.
            exit when v_fnr is null;
            -- CHECK_FNR is defined elsewhere; only 'TRUE' results are stored.
            IF CHECK_FNR(v_fnr) = 'TRUE' THEN
                INSERT INTO TABLE2(MTR_ID, FNR) values (rec.mtrid, v_fnr);
            END IF;
            j := j + 1;
        END LOOP;
    END LOOP;
    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
迭代5:将结果保存到内存中,然后一次性写入数据库(使用集合绑定),同时处理去重(distinct)的需求
-- Schema-level object type holding one (mtr_id, fnr) pair.
CREATE OR REPLACE TYPE obj_table2 AS OBJECT (
    mtr_id INTEGER,        -- identifier of the source row
    fnr    VARCHAR2(4000)  -- matched text
);
/
-- Schema-level nested-table type: a collection of obj_table2 elements.
CREATE OR REPLACE TYPE arr_table2 AS TABLE OF obj_table2;
/
-- Collects every match accepted by CHECK_FNR into an in-memory collection
-- (arr_table2) and writes the collection to TABLE2 with a single
-- INSERT ... SELECT. The MINUS both removes duplicate (mtr_id, fnr) pairs and
-- leaves out pairs that already exist in TABLE2. Commits once at the end.
--
-- Fix relative to the posted version: the loop exit tested the undeclared
-- name v_fnt, which does not compile; it now tests v_fnr.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    -- The j-th match extracted from the current document.
    v_fnr VARCHAR2(11);
    -- 1-based occurrence index passed to REGEXP_SUBSTR.
    j integer;
    -- Accepted (mtr_id, fnr) pairs, accumulated before the single insert.
    table2_bulk arr_table2 := arr_table2();
BEGIN
    FOR rec IN (
        select doc, id as mtrid
        from table
        where doc is not null
    ) LOOP
        j := 1;
        loop
            v_fnr := REGEXP_SUBSTR(rec.doc, v_regex_fnr, 1, j, 'm');
            -- NULL means there is no j-th occurrence: this document is done.
            exit when v_fnr is null;
            -- CHECK_FNR is defined elsewhere; only 'TRUE' results are kept.
            IF CHECK_FNR(v_fnr) = 'TRUE' THEN
                table2_bulk.extend();
                table2_bulk(table2_bulk.last) := new obj_table2(
                    mtr_id => rec.mtrid,
                    fnr => v_fnr
                );
            END IF;
            j := j + 1;
        END LOOP;
    END LOOP;
    -- One SQL statement for all rows; the collection is the data source.
    insert into table2(mtr_id, fnr)
    select mtr_id, fnr
    from table(table2_bulk) X
    minus
    select mtr_id, fnr
    from table2;
    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
第6次迭代:在决定
请注意这些代码片段甚至可能不起作用。由于您没有向我们提供任何测试数据设置,因此我们只能以假设的方式调整您的代码
请注意,最慢的部分仍然是在CLOB值上调用 regexp_substr()。您可能需要考虑使用 regexp_substr() 的 position 参数而不是 occurrence 参数来获取后续的正则表达式匹配。
祝使用愉快。从删除对DUAL表的调用开始。例如:v_cnt := REGEXP_COUNT(v_doc, v_regex_fnr)
不要怀疑循环是缓慢的部分,检查一下。尽管嵌套循环通常不是个好主意。你也在做不必要的上下文切换。您实际插入的行可能有多少行?CHECK_FNR在做什么?您可以将值放入集合中以消除重复项,然后在一个批量操作中插入所有内容,或者使用connect by循环提取匹配项,等等;但是你需要测试瓶颈在哪里。多亏了你们两个。插入行的数量取决于正则表达式匹配的数量,但可能总共有几千行。check_fnr是对“fnr”进行校验和检查,因为正则表达式不足以识别匹配是否为国家标识号。check_fnr过程通常需要0.001-0.003s。并且使用递归CTE;不只是我,那我就拿那些来炫耀了!不过,解释一下它在做什么可能会有帮助,或者至少有一个文档链接。谢谢,伙计们。:-)添加了一些链接,也修复了我第一次忘记的去重(distinct)要求。Alex,当然你不是唯一一个:-),这个递归WITH是一个值得宣扬的奇迹(尽管偶尔会有bug)。哇,谢谢你的详细解释和很好的回答!我一定会尝试这些迭代(并阅读文档以了解迭代6中发生的事情),我应该说明check_fnr函数在做什么,但它与唯一性无关。它是“fnr”的校验和检查。顺便说一句,在迭代5中,执行去重检查的最佳方法是什么?在调用check_fnr函数之前,可能需要检查一下table2_bulk中是否已存在?是的,几分钟前刚刚阅读了它,并相应地修改了我的答案。请不要把这些代码片段当作alpha和omega;相反,要从中找到灵感——用迭代的方法解决性能问题可以让你的生活更轻松。如果我的回答能让您的代码运行得更快,那么我很高兴能提供帮助。:-)对于迭代5中的去重检查……棘手的问题。您可以使用架构级对象+集合类型,并通过将集合(作为数据源,在from
子句中)绑定到insert select而不是使用批量DML(即forall…
)来实现。感谢您的贡献。然而,在本例中,上下文切换不是瓶颈,所有的 regexp_substr() 调用才是瓶颈。如果你感兴趣的话,我已经用我想出的解决方法更新了这个问题!
-- Schema-level object type holding one (mtr_id, fnr) pair.
CREATE OR REPLACE TYPE obj_table2 AS OBJECT (
    mtr_id INTEGER,        -- identifier of the source row
    fnr    VARCHAR2(4000)  -- matched text
);
/
-- Schema-level nested-table type: a collection of obj_table2 elements.
CREATE OR REPLACE TYPE arr_table2 AS TABLE OF obj_table2;
/
-- Collects every match accepted by CHECK_FNR into an in-memory collection
-- (arr_table2) and writes the collection to TABLE2 with a single
-- INSERT ... SELECT. The MINUS both removes duplicate (mtr_id, fnr) pairs and
-- leaves out pairs that already exist in TABLE2. Commits once at the end.
--
-- Fix relative to the posted version: the loop exit tested the undeclared
-- name v_fnt, which does not compile; it now tests v_fnr.
DECLARE
    -- Candidate pattern: 01-31, then 01-12, then two digits, then five digits.
    v_regex_fnr VARCHAR2(54) := '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))';
    -- The j-th match extracted from the current document.
    v_fnr VARCHAR2(11);
    -- 1-based occurrence index passed to REGEXP_SUBSTR.
    j integer;
    -- Accepted (mtr_id, fnr) pairs, accumulated before the single insert.
    table2_bulk arr_table2 := arr_table2();
BEGIN
    FOR rec IN (
        select doc, id as mtrid
        from table
        where doc is not null
    ) LOOP
        j := 1;
        loop
            v_fnr := REGEXP_SUBSTR(rec.doc, v_regex_fnr, 1, j, 'm');
            -- NULL means there is no j-th occurrence: this document is done.
            exit when v_fnr is null;
            -- CHECK_FNR is defined elsewhere; only 'TRUE' results are kept.
            IF CHECK_FNR(v_fnr) = 'TRUE' THEN
                table2_bulk.extend();
                table2_bulk(table2_bulk.last) := new obj_table2(
                    mtr_id => rec.mtrid,
                    fnr => v_fnr
                );
            END IF;
            j := j + 1;
        END LOOP;
    END LOOP;
    -- One SQL statement for all rows; the collection is the data source.
    insert into table2(mtr_id, fnr)
    select mtr_id, fnr
    from table(table2_bulk) X
    minus
    select mtr_id, fnr
    from table2;
    COMMIT;
EXCEPTION
    -- Errors are printed and rolled back; nothing is re-raised.
    WHEN OTHERS THEN
        DBMS_OUTPUT.PUT_LINE('Error - rollback');
        DBMS_OUTPUT.PUT_LINE('The error code is ' || SQLCODE || '- ' || SQLERRM);
        ROLLBACK;
END;
/
-- Pure-SQL variant: a recursive WITH walks the pattern occurrences of every
-- non-NULL document, and the distinct matches accepted by CHECK_FNR are
-- inserted into table2.
INSERT INTO table2 (mtr_id, fnr)
WITH fnr_walk (doc, mtrid, fnr, match_no) AS (
    -- Anchor: one seed row per document, occurrence 0, no match yet.
    SELECT src.doc,
           src.id AS mtrid,
           CAST(NULL AS VARCHAR2(4000)) AS fnr,
           0 AS match_no
    FROM table src
    WHERE src.doc IS NOT NULL
    --
    UNION ALL
    -- Recursive step: fetch the next occurrence; the walk for a document
    -- stops once the previous step found no match.
    SELECT w.doc,
           w.mtrid,
           REGEXP_SUBSTR(w.doc, '(((0[1-9]|[12]\d|3[01])(0[1-9]|1[012])(\d{2}))(\d{5}))', 1, w.match_no + 1, 'm') AS fnr,
           w.match_no + 1
    FROM fnr_walk w
    WHERE w.match_no = 0
       OR (w.match_no > 0 AND w.fnr IS NOT NULL)
)
SELECT DISTINCT
       mtrid,
       fnr
FROM fnr_walk
WHERE match_no > 0
  AND fnr IS NOT NULL
  AND CHECK_FNR(fnr) = 'TRUE'
;
COMMIT;