Regex 查找正则表达式断言以匹配嵌套的HTML表
我希望在一个表中匹配一个特定的表。以下是示例html和我迄今为止失败尝试的摘要:Regex 查找正则表达式断言以匹配嵌套的HTML表,regex,regex-lookarounds,Regex,Regex Lookarounds,我希望在一个表中匹配一个特定的表。以下是示例html和我迄今为止失败尝试的摘要: <table id="parent"> <table class="possible_target"> <tr><td>We're tageting this table</td></tr> </table> </table> <table class="possible_t
<table id="parent">
<table class="possible_target">
<tr><td>We're tageting this table</td></tr>
</table>
</table>
<table class="possible_target">
<tr><td>We're not targeteing this table</td></tr>
</table>
我们的目标是这张表
我们的目标不是这张表
这是我最初的尝试。但即使它有效,也很可能会匹配到第二个未嵌套的表:
~(?=<table.*?)<table class="possible_target".*?</table>~si
~(?=…(此处的正则表达式在原文中被截断)
我发现这个问题很有趣,因为用正则表达式处理嵌套的html标记很有挑战性。
我的尝试可以(应该)做到以下几点:
1.)使用回调函数按深度枚举表。最低深度=1
// Step 1 of the approach: append the nesting depth to every opening and
// closing occurrence of one tag name, so that later matching can select a
// tag pair by depth instead of having to balance tags itself.

// html stuff to process
$source = "your input";
// specify tag to match
$rx_tag = "table";
// match all $rx_tag in depth (lowest = 1)
// NOTE(review): not referenced by the marker pass below; presumably consumed
// by a later matching step that is not part of this snippet.
$rx_depth = 2;
// ----------------------------
// Depth counter that set_tag_depth() reads and updates through `global`.
// Initialise it explicitly so the pass always starts counting from zero.
$tag_depth = 0;
// set markers using callback function
// - preg_quote() keeps a tag name containing regex metacharacters literal.
// - The lookahead requires whitespace, "/" or ">" after the name, so a longer
//   tag name that merely starts with $rx_tag is not counted. The matched text
//   handed to the callback is still just "<name" or "</name".
$source = preg_replace_callback(
    '~<(/)?' . preg_quote($rx_tag, '~') . '(?=[\s/>])~i',
    'set_tag_depth',
    $source
);
/**
 * preg_replace_callback() handler that appends the nesting depth to a tag.
 *
 * An opening tag increments the global counter and is labelled with the new
 * depth; a closing tag is labelled with the depth of the tag it closes and
 * then decrements the counter. A matching pair therefore carries the same
 * number (lowest depth = 1).
 *
 * @param array $out Match array: $out[0] is the matched text ("<tag" or
 *                   "</tag"); $out[1] is "/" for a closing tag. For an
 *                   opening tag the optional group did not participate, so
 *                   $out[1] may be absent from the array altogether.
 * @return string The matched text followed by its depth number.
 */
function set_tag_depth($out)
{
    global $tag_depth;

    // The counter may never have been initialised by the caller; start at 0
    // instead of incrementing/decrementing null.
    if (!isset($tag_depth)) {
        $tag_depth = 0;
    }

    // Trailing unmatched groups are dropped from the match array, so test for
    // existence before reading $out[1] to avoid an undefined-offset warning.
    $is_closing = isset($out[1]) && $out[1] === "/";

    if ($is_closing) {
        // Label the closing tag with the depth it had before leaving the level.
        $closed_depth = $tag_depth;
        $tag_depth--;
        return $out[0] . $closed_depth;
    }

    $tag_depth++;
    return $out[0] . $tag_depth;
}
#echo nl2br(htmlspecialchars($source));
//要处理的html内容
$source=“您的输入”;
//指定要匹配的标记
$rx_tag=“表格”;
//在深度上匹配所有$rx_标记(最低=1)
$rx_深度=2;
// ----------------------------
//使用回调函数设置标记
$source = preg_replace_callback('~<(/)?'.$rx_tag.'~i','set_tag_depth',$source);
另一种做法是搜索具有相同名称的父/同级标记的平衡文本,
这些标记之间只有属性不同。
也就是说,您可以一次性收集所有可能的父表候选项,
然后在第二轮中,在每个候选项内搜索“possible_target”表。
这需要2个几乎相同的正则表达式(仅属性不同)。
有关详细信息,请参见扩展的正则表达式(底部)
PHP示例代码
// PHP sample code
//
// Sample document used to exercise the two regexes below. It contains:
//   - a first id="parent" table holding two class="possible_target" tables,
//   - a second id="parent" table whose possible_target table is wrapped in an
//     additional plain <table>,
//   - a final possible_target table that sits outside of any parent table.
$html =
'
<table id="parent">
<table class="possible_target">
C table data
</table>
<table class="possible_target">
D table data
</table>
</table>
<table id="parent">
<table>
<table class="possible_target">
<tr><td>We\'re targeting this table</td></tr>
</table>
</table>
</table>
<table class="possible_target">
<tr><td>We\'re not targeteing this table</td></tr>
</table>
'
;
// Regexes -
//
// Both patterns have the same shape and differ only in the attribute of the
// opening tag they start on (id="parent" vs. class="possible_target"):
//   (?s)          dot also matches newlines
//   group 1       everything between the opening tag and its closing </table>
//   Table_Core    repeated mix of (a) text that is neither a </table> nor an
//                 opening <table ...> tag and (b) a complete nested table
//   New_Table     a nested <table ...> ... </table> whose opening tag is not
//                 of the pattern's own type; its content is matched by
//                 recursing into Table_Core via (?&Table_Core)
//   (?<!/)>       the closing ">" of an opening tag must not be preceded by "/"
$rx_Parent = '~(?s)<table\s+id="parent">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+id="parent">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~';
$rx_Target = '~(?s)<table\s+class="possible_target">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+class="possible_target">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~';
// Pass 1: collect every parent candidate in $html; report when there is none.
if ( ! preg_match_all( $rx_Parent, $html, $ParentMatches, PREG_PATTERN_ORDER ) )
{
    echo "No parents\n";
}
else
{
    // Dump the full text of all parent candidates first.
    echo "\n============================\n";
    print_r( $ParentMatches[0] );
    echo "\n\n";

    foreach ( $ParentMatches[0] as $parent )
    {
        // Pass 2: look for possible targets inside this one candidate only;
        // candidates without any target produce no output.
        if ( ! preg_match_all( $rx_Target, $parent, $TargetMatches, PREG_SET_ORDER ) )
        {
            continue;
        }

        echo "\n-----------------\n>Found Valid Parent\n";
        foreach ( $TargetMatches as $target )
        {
            echo "Target:\n'{$target[0]}'\n"; // group 0: the whole target table
            echo "Core = \n'{$target[1]}'\n"; // group 1: the table's content
        }
    }
}
//PHP示例代码
$html=
'
表数据
D表数据
我们的目标是这张桌子
我们不是针对这张桌子
'
;
//正则表达式-
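$rx_Parent = '~(?s)<table\s+id="parent">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+id="parent">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~';
$rx_Target = '~(?s)<table\s+class="possible_target">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+class="possible_target">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~';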
//匹配所有可能的家长候选人-
if(preg_match_all($rx_Parent,$html,$ParentMatches,preg_PATTERN_ORDER))
{
打印“\n==============================================\n”;
打印($ParentMatches[0]);
打印“\n\n”;
foreach($ParentMatches[0]作为$parent)
{
//匹配每个父母候选人的可能目标-
if(预匹配所有($rx\u目标,$parent,$TargetMatches,预设置顺序))
{
打印“\n---------\n>找到有效的父项\n”;
foreach($TargetMatches作为$target)
{
打印“Target:\n'.$Target[0]。“\n”;//组0
打印“Core=\n'。$target[1]。“\n”;//组1
}
}
}
}
其他的
{
打印“无父项\n”;
}
输出>>
============================
Array
(
[0] => <table id="parent">
<table class="possible_target">
C table data
</table>
<table class="possible_target">
D table data
</table>
</table>
[1] => <table id="parent">
<table>
<table class="possible_target">
<tr><td>We're targeting this table</td></tr>
</table>
</table>
)
-----------------
>Found Valid Parent
Target:
'<table class="possible_target">
C table data
</table>'
Core =
'
C table data
'
Target:
'<table class="possible_target">
D table data
</table>'
Core =
'
D table data
'
-----------------
>Found Valid Parent
Target:
'<table class="possible_target">
<tr><td>We're targeting this table</td></tr>
</table>'
Core =
'
<tr><td>We're targeting this table</td></tr>
'
============================
排列
(
[0] =>
表数据
D表数据
[1] =>
我们的目标是这张桌子
)
-----------------
>找到有效的父项
目标:
'
表数据
'
核心=
'
表数据
'
目标:
'
D表数据
'
核心=
'
D表数据
'
-----------------
>找到有效的父项
目标:
'
我们的目标是这张桌子
'
核心=
'
我们的目标是这张桌子
'
扩展正则表达式
# BalancedText_PHP_Html.rxf
# Processed by RegexFormat4 (http://www.regexformat.com)
# '~(?s)<table\s+id="parent">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+id="parent">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~'
(?s) # Dot-All
# Parent Table
# ==================
<table \s+ id="parent"> # Parent table start
( # (1 start), Core Start
(?<Table_Core> # (2 start), Table Core
(?:
(?>
(?:
(?! # Not start/end of another table
</table \s* >
|
<table [\s>] [^>]*
(?<! / )
>
)
.
)*
)
|
(?<New_Table> # (3 start), New Table
<table # Table start
(?! \s+ id="parent"> ) # but, not a parent table type
[\s>] [^>]*
(?<! / )
>
(?&Table_Core) # Recurse Table Core
</table \s* > # Table end
) # (3 end)
)*
) # (2 end)
) # (1 end), Core End
</table \s* > # Parent table end
# ==========================================================================
(?s) # Dot-All
# Target Table
# ==================
<table \s+ class="possible_target"> # Target table start
( # (1 start), Core Start
(?<Table_Core> # (2 start), Table Core
(?:
(?>
(?:
(?! # Not start/end of another table
</table \s* >
|
<table [\s>] [^>]*
(?<! / )
>
)
.
)*
)
|
(?<New_Table> # (3 start), New Table
<table # Table start
(?! \s+ class="possible_target"> ) # but, not a target table type
[\s>] [^>]*
(?<! / )
>
(?&Table_Core) # Recurse Table Core
</table \s* > # Table end
) # (3 end)
)*
) # (2 end)
) # (1 end), Core End
</table \s* > # Target table end
#BalancedText_PHP_Html.rxf
#由RegexFormat4处理(http://www.regexformat.com)
#“(?(?(?:(?>(?:(?!|][^>]*(?)*))(?)[\s>][^>]*((?(?!|表核心)))*)”
(s)点全部
#父表
# ==================
#父表开始
(#(1开始),核心开始
(?#(2开始),桌芯
(?:
(?>
(?:
(?!#不是另一张桌子的开始/结束
|
] [^>]*
(?
)
.
)*
)
|
(?#(3开始),新表
)#但是,不是父表类型
[\s>][^>]*
(?
(?&Table_Core)#递归Table Core
#桌尾
)#(三完)
)*
)#(二完)
)#(1端),芯端
#父表结束
# ==========================================================================
(s)点全部
#目标表
# ==================
#目标表开始
(#(1开始),核心开始
(?#(2开始),桌芯
(?:
(?>
(?:
(?!#不是另一张桌子的开始/结束
|
] [^>]*
(?
)
.
)*
)
|
(?#(3开始),新表
)#但是,不是目标表类型
[\s>][^>]*
(?
(?&Table_Core)#递归Table Core
#桌尾
)#(三完)
)*
)#(二完)
)#(1端),芯端
#目标表端
你用的是什么语言?如果你的语言支持递归,我可以用一个正则表达式。看起来
============================
Array
(
[0] => <table id="parent">
<table class="possible_target">
C table data
</table>
<table class="possible_target">
D table data
</table>
</table>
[1] => <table id="parent">
<table>
<table class="possible_target">
<tr><td>We're targeting this table</td></tr>
</table>
</table>
)
-----------------
>Found Valid Parent
Target:
'<table class="possible_target">
C table data
</table>'
Core =
'
C table data
'
Target:
'<table class="possible_target">
D table data
</table>'
Core =
'
D table data
'
-----------------
>Found Valid Parent
Target:
'<table class="possible_target">
<tr><td>We're targeting this table</td></tr>
</table>'
Core =
'
<tr><td>We're targeting this table</td></tr>
'
# BalancedText_PHP_Html.rxf
# Processed by RegexFormat4 (http://www.regexformat.com)
# '~(?s)<table\s+id="parent">((?<Table_Core>(?:(?>(?:(?!</table\s*>|<table[\s>][^>]*(?<!/)>).)*)|(?<New_Table><table(?!\s+id="parent">)[\s>][^>]*(?<!/)>(?&Table_Core)</table\s*>))*))</table\s*>~'
(?s) # Dot-All
# Parent Table
# ==================
<table \s+ id="parent"> # Parent table start
( # (1 start), Core Start
(?<Table_Core> # (2 start), Table Core
(?:
(?>
(?:
(?! # Not start/end of another table
</table \s* >
|
<table [\s>] [^>]*
(?<! / )
>
)
.
)*
)
|
(?<New_Table> # (3 start), New Table
<table # Table start
(?! \s+ id="parent"> ) # but, not a parent table type
[\s>] [^>]*
(?<! / )
>
(?&Table_Core) # Recurse Table Core
</table \s* > # Table end
) # (3 end)
)*
) # (2 end)
) # (1 end), Core End
</table \s* > # Parent table end
# ==========================================================================
(?s) # Dot-All
# Target Table
# ==================
<table \s+ class="possible_target"> # Target table start
( # (1 start), Core Start
(?<Table_Core> # (2 start), Table Core
(?:
(?>
(?:
(?! # Not start/end of another table
</table \s* >
|
<table [\s>] [^>]*
(?<! / )
>
)
.
)*
)
|
(?<New_Table> # (3 start), New Table
<table # Table start
(?! \s+ class="possible_target"> ) # but, not a target table type
[\s>] [^>]*
(?<! / )
>
(?&Table_Core) # Recurse Table Core
</table \s* > # Table end
) # (3 end)
)*
) # (2 end)
) # (1 end), Core End
</table \s* > # Target table end