Python：如何将嵌套括号与正则表达式匹配？_Python_Regex_Nested

Python：如何将嵌套括号与正则表达式匹配？

python regex

Python：如何将嵌套括号与正则表达式匹配？,python,regex,nested,Python,Regex,Nested,我试图匹配一个数学表达式，比如字符串，它有嵌套的括号 import re p = re.compile('\(.+\)') str = '(((1+0)+1)+1)' print p.findall(s) def get_string_inside_outermost_parentheses(text): content_p = re.compile(r"(?<=\().*(?=\))") r = content_p.search(text) return r.

我试图匹配一个数学表达式，比如字符串，它有嵌套的括号

import re

# Greedy pattern: ".+" runs from the first "(" to the last ")", which is why
# findall() reports only the single outermost group.
p = re.compile(r'\(.+\)')
# Bound to "s" (not "str") so the name matches the one used below and the
# builtin str is not shadowed.
s = '(((1+0)+1)+1)'
print(p.findall(s))

def get_string_inside_outermost_parentheses(text):
    """Return what lies between the first "(" and the last ")" of *text*.

    The match is greedy and "." does not match a newline, so the span
    cannot cross a line break.  When nothing matches, the search yields
    None and the ``.group()`` call raises AttributeError.
    """
    match = re.search(r"(?<=\().*(?=\))", text)
    return match.group()

def get_string_inside_innermost_parentheses(text):
    """Strip parenthesis layers from *text* until no "(" is left.

    Each pass replaces the current string with the result of
    get_string_inside_outermost_parentheses(); a string without "(" is
    returned unchanged.
    """
    current = text
    while True:
        if '(' not in current:
            return current
        current = get_string_inside_outermost_parentheses(current)

['(((1+0)+1)+1)']

我希望它匹配所有被括号包含的表达式，例如 (1+0)、((1+0)+1)…
我甚至不在乎它是否会匹配到不需要的内容，比如 ((1+0)，这些我可以自己处理

为什么它还没有这样做，我该怎么做？

正则表达式语言的功能不足以匹配任意嵌套的结构。为此，您需要一个下推自动机（即解析器）。有几种这样的工具可用，例如

Python还为自己的语法提供了一个解析器库（parser模块），这可能会满足您的需要。但是，它的输出非常冗长，需要一段时间才能理解。如果您对这个角度感兴趣，下面的讨论将尝试尽可能简单地解释它

>>> import parser, pprint
>>> pprint.pprint(parser.st2list(parser.expr('(((1+0)+1)+1)')))
[258,
 [327,
  [304,
   [305,
    [306,
     [307,
      [308,
       [310,
        [311,
         [312,
          [313,
           [314,
            [315,
             [316,
              [317,
               [318,
                [7, '('],
                [320,
                 [304,
                  [305,
                   [306,
                    [307,
                     [308,
                      [310,
                       [311,
                        [312,
                         [313,
                          [314,
                           [315,
                            [316,
                             [317,
                              [318,
                               [7, '('],
                               [320,
                                [304,
                                 [305,
                                  [306,
                                   [307,
                                    [308,
                                     [310,
                                      [311,
                                       [312,
                                        [313,
                                         [314,
                                          [315,
                                           [316,
                                            [317,
                                             [318,
                                              [7,
                                               '('],
                                              [320,
                                               [304,
                                                [305,
                                                 [306,
                                                  [307,
                                                   [308,
                                                    [310,
                                                     [311,
                                                      [312,
                                                       [313,
                                                        [314,
                                                         [315,
                                                          [316,
                                                           [317,
                                                            [318,
                                                             [2,
                                                              '1']]]]],
                                                         [14,
                                                          '+'],
                                                         [315,
                                                          [316,
                                                           [317,
                                                            [318,
                                                             [2,
                                                              '0']]]]]]]]]]]]]]]],
                                              [8,
                                               ')']]]]],
                                          [14,
                                           '+'],
                                          [315,
                                           [316,
                                            [317,
                                             [318,
                                              [2,
                                               '1']]]]]]]]]]]]]]]],
                               [8, ')']]]]],
                           [14, '+'],
                           [315,
                            [316,
                             [317,
                              [318, [2, '1']]]]]]]]]]]]]]]],
                [8, ')']]]]]]]]]]]]]]]],
 [4, ''],
 [0, '']]

您可以用下面这个简短的函数来减轻这种痛苦：

def shallow(ast):
    """Collapse single-child wrapper nodes of a nested-list parse tree.

    Non-list values are returned as they are.  A two-element list
    ``[node_id, child]`` is replaced by its (recursively collapsed) child.
    Any other list keeps its first element and has every remaining element
    collapsed recursively.
    """
    # Leaves pass through untouched.
    if not isinstance(ast, list):
        return ast
    # [id, child] carries no branching information: keep only the child.
    if len(ast) == 2:
        return shallow(ast[1])
    collapsed = [ast[0]]
    for child in ast[1:]:
        collapsed.append(shallow(child))
    return collapsed

>>> pprint.pprint(shallow(parser.st2list(parser.expr('(((1+0)+1)+1)'))))
[258,
 [318,
  '(',
  [314,
   [318, '(', [314, [318, '(', [314, '1', '+', '0'], ')'], '+', '1'], ')'],
   '+',
   '1'],
  ')'],
 '',
 '']

这些数字来自Python模块

symbol

和

token

，您可以使用它们构建从数字到名称的查找表：

# Lookup table from numeric node id to name, merging the token names with the
# grammar symbol names.
# NOTE(review): joining the two .items() results with "+" requires them to be
# lists (Python 2 behaviour); the name also shadows the builtin map().
map = dict(token.tok_name.items() + symbol.sym_name.items())

您甚至可以将此映射折叠到

shallow（）

函数中，以便使用字符串而不是数字：

def shallow(ast):
    """Collapse wrapper nodes and replace node ids by their names.

    Non-list values are returned as they are.  A two-element list
    ``[node_id, child]`` is replaced by its (recursively collapsed) child.
    For any other list the first element is looked up in the module-level
    ``map`` table and the remaining elements are collapsed recursively.
    """
    if not isinstance(ast, list):
        return ast
    if len(ast) == 2:
        return shallow(ast[1])
    label = map[ast[0]]
    return [label] + [shallow(child) for child in ast[1:]]

>>> pprint.pprint(shallow(parser.st2list(parser.expr('(((1+0)+1)+1)'))))
['eval_input',
 ['atom',
  '(',
  ['arith_expr',
   ['atom',
    '(',
    ['arith_expr',
     ['atom', '(', ['arith_expr', '1', '+', '0'], ')'],
     '+',
     '1'],
    ')'],
   '+',
   '1'],
  ')'],
 '',
 '']

您应该编写适当的解析器来解析此类表达式（例如，使用pyparsing）。

正则表达式不是编写合适的解析器的合适工具。

正则表达式尝试匹配尽可能多的文本，从而消耗所有字符串。它不会在该字符串的部分上查找正则表达式的其他匹配项。这就是为什么您只得到一个答案

解决方案是不使用正则表达式。如果您确实在尝试解析数学表达式，请使用真正的解析方案。如果您确实只想捕获括号中的片段，只需逐个遍历字符并维护一个计数器：遇到 ( 时加一，遇到 ) 时减一。

我相信这个函数可能适合您的需要，我很快就把它组合起来了，所以请随意清理一下。在进行嵌套时，很容易向后思考并从那里开始工作=]

def fn(string, endparens=False):
    """Return the contents of every balanced parenthesised group in *string*.

    Groups are listed in the order their opening parenthesis appears, so
    for purely nested input the outermost group comes first.  Each entry is
    the exact text between a "(" and its matching ")", including any
    nested groups.

    Text outside all parentheses is ignored.  A ")" without a matching "("
    is ignored, and a "(" that is never closed produces no entry.

    Args:
        string: Text to scan.
        endparens: When true, wrap every returned entry in "(" and ")".

    Returns:
        A list of strings, one per balanced group.
    """
    groups = []      # one slot per "(" seen, filled in when the group closes
    open_stack = []  # (slot index, index of first character inside the group)
    for pos, char in enumerate(string):
        if char == "(":
            open_stack.append((len(groups), pos + 1))
            groups.append(None)
        elif char == ")" and open_stack:
            slot, start = open_stack.pop()
            # Slice the source text instead of rebuilding it piecewise, so
            # text before and after a nested group is never lost.
            groups[slot] = string[start:pos]
    exp = [group for group in groups if group is not None]
    if endparens:
        exp = ["(" + val + ")" for val in exp]
    return exp

正如其他人提到的，正则表达式不是嵌套构造的方法。我将使用以下方法给出一个基本示例：

下面是一个用法示例：

>>> parens.parseString("((a + b) + c)")

输出：

(                          # all of str
 [
  (                        # ((a + b) + c)
   [
    (                      #  (a + b)
     ['a', '+', 'b'], {}   
    ),                     #  (a + b)      [closed]
    '+',
    'c'
   ], {}
  )                        # ((a + b) + c) [closed]
 ], {}  
)                          # all of str    [closed]

[[['12', '+', '2'], '+', '3']]

（手动进行换行/缩进/注释）

编辑：根据Paul McGuire的建议进行修改，以消除不必要的

Forward

要以嵌套列表格式获取输出，请执行以下操作：

res = parens.parseString("((12 + 2) + 3)")
res.asList()

输出：

(                          # all of str
 [
  (                        # ((a + b) + c)
   [
    (                      #  (a + b)
     ['a', '+', 'b'], {}   
    ),                     #  (a + b)      [closed]
    '+',
    'c'
   ], {}
  )                        # ((a + b) + c) [closed]
 ], {}  
)                          # all of str    [closed]

[[['12', '+', '2'], '+', '3']]

平衡对（例如括号）是正则表达式无法识别的语言的一个例子

下面是一个简单的数学解释为什么会这样

正则表达式是定义有限状态自动机（简称FSM）的一种方法。这样的设备有有限数量的可能状态来存储信息。如何使用该状态没有特别的限制，但它确实意味着它可以识别的不同位置的绝对最大数量

例如，状态可以用来计数，比如记录尚未匹配的左括号的个数。但由于用于这种计数的状态数量必须是有界的，给定的FSM最多只能数到n-1，其中n是FSM可以处于的状态数。比如n为10，那么该FSM最多只能跟踪9个未匹配的左括号，再多就会出错。由于输入中完全可能再多出一个左括号，因此不存在能够正确识别完整的括号匹配语言的FSM

那又怎样？假设你只选择一个非常大的n？问题是，作为描述FSM的一种方式，正则表达式基本上描述了从一个状态到另一个状态的所有转换。因为对于任何n，FSM都需要两个状态转换（一个用于匹配左括号，另一个用于匹配右括号），正则表达式本身必须至少增长n的常数倍

相比之下，下一类更好的语言（上下文无关语法）可以完全紧凑地解决这个问题


expression ::= `(` expression `)` expression

             | ε（空串）

您可以使用regexp，但您需要自己进行递归。下面的方法可以实现这一点（如果您只需要找到括号中的所有表达式，如问题所述）：

但是，此代码与“正确”括号不匹配。如果需要这样做，最好使用专门的解析器。

新的正则表达式引擎模块（regex）正准备用来替换Python中现有的re模块。它引入了许多新功能，包括递归调用

import regex

# Sample input: a nested parenthesised expression surrounded by other text.
s = 'aaa(((1+0)+1)+1)bbb'

# Search s with a verbose-mode pattern whose named group "rec" refers back to
# itself through (?&rec), so a group may contain further groups of the same
# shape.  Uses the third-party "regex" module, not the standard "re" module.
result = regex.search(r'''
(?<rec> #capturing group rec
 \( #open parenthesis
 (?: #non-capturing group
  [^()]++ #anyting but parenthesis one or more times without backtracking
  | #or
   (?&rec) #recursive substitute of group rec
 )*
 \) #close parenthesis
)
''',s,flags=regex.VERBOSE)


# Print everything group "rec" captured during the match.
# NOTE(review): presumably this includes the captures made by the recursive
# calls (innermost first) - confirm against the regex module documentation.
print(result.captures('rec'))

regex中的相关bug

：

许多帖子建议，对于嵌套大括号，正则表达式不是这样做的。只需计算大括号：例如，请参见：

下面是一个完整的python示例，用于迭代字符串并计算大括号：

# decided for nested braces to not use regex but brace-counting
import re, string

texta = r'''
nonexistent.\note{Richard Dawkins, \textit{Unweaving the Rainbow: Science, Delusion
and the Appetite for Wonder} (Boston: Houghton Mifflin Co., 1998), pp. 302, 304,
306-309.} more text and more.

 This is a statistical fact, not a
guess.\note{Zheng Wu, \textit{Cohabitation: An Alternative Form
of Family Living} (Ontario, Canada: Oxford University Press,
2000), p. 149; \hbox{Judith} Treas and Deirdre Giesen, ``Title
and another title,''
\textit{Journal of Marriage and the Family}, February 2000,
p.\,51}

more and more text.capitalize
'''
# Print the body of every \note{...} found in texta.  Brace depth is counted
# so that nested groups such as \textit{...} stay inside the note body.
pos = 0
foundpos = 0
openBr = 0 # count open braces
while True:
    foundpos = texta.find(r'\note', pos)
    if foundpos == -1:
        # No further notes: stop here rather than printing an empty result.
        break
    # r'\note' is 5 characters long, so this is the character right after
    # it (assumed to be the "{" that opens the note body).
    pos = foundpos + 5
    openBr = 0
    body = []
    while True:
        pos = pos + 1
        if pos >= len(texta):
            # Unbalanced note: ran off the end of the text; keep what was read.
            break
        char = texta[pos]
        if char == "{":
            openBr = openBr + 1
        elif char == "}":
            openBr = openBr - 1
            if openBr < 0:
                # This "}" closes the note itself; it is not part of the body.
                break
        body.append(char)
    result = "".join(body).replace('\n', ' ') # replace new line with space
    print(result)

#决定嵌套大括号不使用正则表达式，而是使用大括号计数
输入re，字符串
texta=r''
不存在。\n注意{Richard Dawkins，\text它{解开彩虹：科学，错觉
《对奇迹的渴望》（波士顿：霍顿·米夫林公司，1998年），第302、304页，
306-309.}更多文本和更多内容。
这是一个统计事实，而不是事实
猜一猜。{郑武，\text它{同居：另一种形式
家庭生活}（加拿大安大略省：牛津大学出版社，
2000年），第149页；\hbox{Judith}Treas和Deirdre Giesen，`标题
还有另一个标题“我爱你”
\text{婚姻与家庭杂志}，2000年2月，
p、 \，51}
越来越多的文本。大写
'''
pos=0
foundpos=0
openBr=0#计算开括号
而foundpos-1：
openBr=0
foundpos=string.find（texta，r'\note'，pos）
#打印“foundpos”，foundpos
pos=foundpos+5
#打印文本A[pos]
result=“”
而foundpos>-1和openBr
def paren_matcher(n):
    """Build a regex source string for text with balanced parentheses.

    Poor man's matched-paren scanning: the pattern handles nesting up to
    *n* levels and gives up beyond that.  It matches any string with
    balanced parens inside; add the outer parens yourself if needed.
    All quantifiers are non-greedy.
    """
    plain = r"[^()]*?"          # run of non-parenthesis characters
    opening = r"[^()]*?(?:\("   # plain text, then one level opens
    closing = r"\)[^()]*?)*?"   # that level closes, then more plain text
    return "".join([opening * n, plain, closing * n])

import re
def matches(line, opendelim='(', closedelim=')'):
    """Yield the span of every balanced delimiter pair found in *line*.

    A delimiter directly preceded by a backslash is treated as escaped and
    skipped.  Pairs are yielded in the order their closing delimiter is
    met, so inner pairs come before the pairs that enclose them.

    Args:
        line: Text to scan.
        opendelim: Single-character opening delimiter.
        closedelim: Single-character closing delimiter.

    Yields:
        Tuples ``(start, end, level)`` where ``line[start:end]`` is the text
        between the two delimiters and ``level`` is the number of pairs
        still open around it (0 for an outermost pair).

    Unbalanced delimiters are not raised as errors; a message is printed
    for each extraneous closing delimiter and each unclosed opening one.
    """
    stack = []
    # Escape the delimiters so that characters with a special meaning inside
    # a character class (such as "[", "]" or "^") are matched literally.
    delim_pattern = '[{}{}]'.format(re.escape(opendelim), re.escape(closedelim))

    for m in re.finditer(delim_pattern, line):
        pos = m.start()

        # The pos > 0 guard keeps line[pos-1] from wrapping around to the
        # last character when the delimiter is the first character.
        if pos > 0 and line[pos-1] == '\\':
            # skip escape sequence
            continue

        c = line[pos]

        if c == opendelim:
            # Remember where the content of this group starts.
            stack.append(pos+1)

        elif c == closedelim:
            if stack:
                prevpos = stack.pop()
                yield (prevpos, pos, len(stack))
            else:
                # error
                print("encountered extraneous closing quote at pos {}: '{}'".format(pos, line[pos:] ))

    # Whatever is left on the stack was opened but never closed.
    for pos in stack:
        print("expecting closing quote to match open quote starting at: '{}'"
              .format(line[pos-1:]))

line = '(((1+0)+1)+1)'
for openpos, closepos, level in matches(line):
    print(line[openpos:closepos], level)

1+0 2
(1+0)+1 1
((1+0)+1)+1 0

import re

# Sample input: a fully nested parenthesised expression.
s = '(((1+0)+1)+1)'

def getContectWithinBraces( x , *args , **kwargs):
    """Collect the contents of nested bracket groups in x, innermost first.

    The keyword arguments 'left' and 'right' supply the (regex-escaped)
    opening and closing characters that are spliced into character classes;
    a missing key raises KeyError when the pattern is formatted.  Positional
    *args are accepted but never used.

    Each round finds groups that contain no further brackets, then swaps
    the most recently found group in x for a '%s' placeholder so that the
    enclosing group becomes bracket-free for the next round.

    Side effect: prints the intermediate findall() result of every round.

    NOTE(review): the placeholder step always wraps with "(" and ")"
    whatever 'left'/'right' contain, and only the first match of a round is
    expanded again; a literal "%" in x looks likely to break the formatting
    step - confirm before reusing this on other input.
    """
    # Opening char, a run of non-bracket chars (captured), closing char.
    ptn = r'[%(left)s]([^%(left)s%(right)s]*)[%(right)s]' %kwargs
    Res = []
    res = re.findall(ptn , x)
    while res != []:
        Res = Res + res
        # Hide the newest group behind a '%s' placeholder in a copy of x.
        xx = x.replace('(%s)' %Res[-1] , '%s')
        res = re.findall(ptn, xx)
        print(res)
        if res != []:
            # Put the hidden group (with its parentheses) back in the match.
            res[0] = res[0] %('(%s)' %Res[-1])
    return Res

getContectWithinBraces(s , left='\(\[\{' , right = '\)\]\}')

def get_string_inside_outermost_parentheses(text):
    """Return the text enclosed by the first "(" and the last ")" in *text*.

    Matching is greedy, and "." stops at newlines, so the result never
    spans a line break.  If there is no match the search result is None
    and calling ``.group()`` on it raises AttributeError.
    """
    found = re.search(r"(?<=\().*(?=\))", text)
    return found.group()

def get_string_inside_innermost_parentheses(text):
    """Peel parenthesis layers off *text* until it contains no "(".

    Every iteration hands the remaining string to
    get_string_inside_outermost_parentheses(); input without any "(" is
    returned as is.
    """
    remaining = text
    while True:
        if '(' not in remaining:
            return remaining
        remaining = get_string_inside_outermost_parentheses(remaining)