在python中,用什么方法可以快速地进行括号的嵌套拆分?
我有一个以下格式的文件:在python中,用什么方法可以快速地进行括号的嵌套拆分?,python,Python,我有一个以下格式的文件: ID1 { some text } ID2 { some text } 它们不必逐行排列,因此我们可以: ID1 { some [crlf] text [crlf] } ID2 [crlf] { some t [crlf] ex [crlf] t} 依此类推,意味着一些文本可以不止一行,并且在ID之后可能会有一个CRLF。主要不变量是所有ID都由{}包围。 问题是某些文本本身可能包含{和} 在考虑嵌套括号的情况下,将这样一个文件分割成一个字符串列表(每个字符串都是
ID1 { some text }
ID2 { some text }
它们不必逐行排列,因此我们可以:
ID1 { some [crlf]
text [crlf]
}
ID2 [crlf] { some t [crlf]
ex [crlf]
t}
依此类推,意味着一些文本可以不止一行,并且在ID
之后可能会有一个CRLF
。主要不变量是所有ID都由{
}
包围。
问题是某些文本本身可能包含{
和}
在考虑嵌套括号的情况下,将这样一个文件分割成一个字符串列表(每个字符串都是ID{text}
)的快捷方法是什么
考虑到一些错误分析,如果括号不平衡,那就太好了。是不可能的。你看过吗
[编辑]
OTOH这可能有用:
from functools import wraps
def transition(method):
    """Decorate a state method so that calling it switches the state's class.

    The wrapped ``method`` must return a *command*: a callable that takes the
    state object and returns the class the state should have next.  The
    wrapper calls the method, applies the command and assigns the result to
    ``state.__class__``.  The wrapper itself returns ``None``.
    """
    @wraps(method)
    def trans(state, *args, **kwargs):
        command = method(state, *args, **kwargs)
        # Re-classing the instance is how the state machine moves on.
        state.__class__ = command(state)
    return trans
class State(object):
    """Base class for state objects whose class is swapped at runtime.

    Every instance carries ``_identities``, a list used as a stack of
    classes by the ``pushing`` and ``popped`` commands.
    """

    def __new__(cls):
        state = object.__new__(cls)
        # Set up in __new__ so the stack exists without relying on __init__.
        state._identities = []
        return state
def unchanged(state):
    """Command that keeps the state as it is: return its current class."""
    return state.__class__
def shifting(identity):
    """Build a command that switches the state to the class ``identity``.

    The returned command ignores the state it is given and always returns
    ``identity``.
    """
    def command(state):
        return identity
    return command
def pushing(identity, afterwards=None):
    """Build a command that saves a class on the stack and switches state.

    The command appends ``afterwards`` (or, when that is falsy, the state's
    current class) to ``state._identities`` and returns ``identity`` as the
    next class.  A later ``popped`` command retrieves the saved class.
    """
    def command(state):
        state._identities.append(afterwards or state.__class__)
        return identity
    return command
def popped(state):
    """Command that removes and returns the class on top of the state's stack.

    Raises ``IndexError`` (from ``list.pop``) when the stack is empty.
    """
    return state._identities.pop()
##############################################################################
import re
# Token scanner: ``tokenize(text)`` iterates over match objects whose
# ``lastgroup`` names the token type.  White space between tokens is skipped
# because no alternative matches it.
# ``eoi`` uses \Z rather than $: with re.MULTILINE, $ also matches before
# every newline, which produced a spurious end-of-input token on each line.
tokenize = re.compile(flags=re.VERBOSE | re.MULTILINE, pattern=r"""
    (?P<word>       \w+ ) |
    (?P<braceleft>  \{  ) |
    (?P<braceright> \}  ) |
    (?P<eoi>        \Z  ) |
    (?P<error>      \S  )     # catch all (except white space)
""").finditer
def parse(parser, source, builder):
    """Feed every token of ``source`` to the state machine ``parser``.

    For each match produced by ``tokenize`` the method of ``parser`` named
    after the token type (``each.lastgroup``) is looked up and called with
    the token text and ``builder``.  Nothing is returned; results accumulate
    in ``builder``.
    """
    for each in tokenize(source):
        dispatch = getattr(parser, each.lastgroup)
        dispatch(each.group(), builder)
class ParsingState(State):
    """Common base of all parser states; rejects every token by default.

    Subclasses override the handlers for the token types they accept.
    Handlers are looked up by token type name, so any token type without a
    handler falls through to ``__getattr__`` and raises ``ValueError``.
    """

    def eoi(self, token, *args):
        """Reject end of input in a state that has not handled it."""
        raise ValueError('premature end of input in parsing state %s' %
            self.__class__.__name__
        )

    def error(self, token, *args):
        """Reject a character matched by the tokenizer's catch-all group."""
        raise ValueError('parsing state %s does not understand token %s' % (
            self.__class__.__name__, token
        ))

    def __getattr__(self, name):
        """Return a handler that raises ``ValueError`` for unknown tokens."""
        # Special-method probes (copy, pickle, ...) must see a missing
        # attribute rather than a handler that raises ValueError when called.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)

        def raiser(token, *args):
            raise ValueError(
                'parsing state %s does not understand token "%s" of type %s' %
                (self.__class__.__name__, token, name)
            )
        return raiser
class Id(ParsingState):
    """State that expects the identifier starting an entry, or end of input."""

    @transition
    def word(self, token, builder):
        # A word here opens a new entry; its brace must follow.
        builder.add_id(token)
        return shifting(BeginContent)

    @transition
    def eoi(self, token, builder):
        # End of input between entries is the only clean way to finish.
        return shifting(DoneParsing)
class BeginContent(ParsingState):
    """State right after an identifier; only an opening brace is accepted."""

    @transition
    def braceleft(self, token, builder):
        # The outermost brace is a delimiter and is not added to the text.
        return shifting(Content)
class Content(ParsingState):
    """State inside the outermost braces of an entry."""

    @transition
    def word(self, token, builder):
        builder.add_text(token)
        return unchanged

    @transition
    def braceleft(self, token, builder):
        # A nested brace is part of the text; save the current class so the
        # matching closing brace can return to it.
        builder.add_text(token)
        return pushing(PushedContent)

    @transition
    def braceright(self, token, builder):
        # The outermost closing brace ends the entry; expect the next id.
        return shifting(Id)
class PushedContent(Content):
    """State inside nested braces; behaves like ``Content`` except on ``}``."""

    @transition
    def braceright(self, token, builder):
        # A nested closing brace belongs to the text; go back to the class
        # that was saved when the matching opening brace was seen.
        builder.add_text(token)
        return popped
class DoneParsing(ParsingState):
    """Final state reached after end of input; it accepts no further tokens."""
    pass
##############################################################################
class Entry(object):
    """One parsed ``ID { text }`` item: an id name plus its text tokens."""

    def __init__(self, idname):
        self.idname = idname
        # Text tokens in input order, including nested brace tokens.
        self.text = []

    def __str__(self):
        """Render the entry as ``ID { token token ... }``."""
        return '%s { %s }' % (self.idname, ' '.join(self.text))
class Builder(object):
    """Collects the entries produced while parsing."""

    def __init__(self):
        self.entries = []

    def add_id(self, id_token):
        """Start a new entry named ``id_token``."""
        self.entries.append(Entry(id_token))

    def add_text(self, text_token):
        """Append ``text_token`` to the most recently started entry.

        Raises ``IndexError`` if no entry has been started yet.
        """
        self.entries[-1].text.append(text_token)
##############################################################################
# Demo: parse a small two-entry input and print each entry.
if __name__ == '__main__':
    file_content = """
id1 { some text } id2 {
some { text }
}
"""
    builder = Builder()
    parse(Id(), file_content, builder)
    for entry in builder.entries:
        print(entry)
从functools导入包装
def转换(方法):
@包装(方法)
def传输(状态,*args,**kwargs):
命令=方法(状态,*args,**kwargs)
状态。\类\命令(状态)
回程传输
类状态(对象):
定义新的(cls):
状态=对象。\uuuu新建\uuuu(cls)
状态。_标识=[]
返回状态
def未更改(状态):
返回状态__
def转换(标识):
def命令(状态):
返回标识
返回命令
def推送(标识,之后=无):
def命令(状态):
state.\u identifies.append(之后或state.\u类)
返回标识
返回命令
def弹出(状态):
返回状态。_identifies.pop()
##############################################################################
进口稀土
tokenize=re.compile(flags=re.VERBOSE | re.MULTILINE,pattern=r”“”
(?P\w+)|
(?P{)|
(?P})|
(?P$)|
(?P\S)#包罗万象(空白除外)
“”“)。查找程序
def解析(解析器、源代码、生成器):
对于tokenize(源)中的每一项:
dispatch=getattr(解析器,each.lastgroup)
分派(each.group(),生成器)
类别ParsingState(州):
def eoi(自身、令牌、*参数):
raise VALUERROR('分析状态为%s的输入过早结束'%
self.\u类\u.\u名称__
)
def错误(自身、令牌、*args):
raise VALUERROR('分析状态%s不理解令牌%s'%(
self.\uuuuu类\uuuuuuu.\uuuuuuuu名称\uuuuuuuuuu,令牌
))
def _ugetattr _;(self,name):
def提升器(令牌,*args):
升值误差(
'分析状态%s不理解类型为%s的令牌“%s”%
(self.\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
)
回程提升机
类Id(ParsingState):
@过渡
def word(自身、令牌、生成器):
builder.add_id(令牌)
返回移位(起始内容)
@过渡
def eoi(自身、令牌、生成器):
返回换档(完成换档)
类起始内容(ParsingState):
@过渡
定义左括号(self、token、builder):
返回移位(内容)
课程内容(ParsingState):
@过渡
def word(自身、令牌、生成器):
生成器。添加_文本(令牌)
返回不变
@过渡
定义左括号(self、token、builder):
生成器。添加_文本(令牌)
返回推送(推送内容)
@过渡
定义权限(自身、令牌、生成器):
返回移位(Id)
类PushedContent(内容):
@过渡
定义权限(自身、令牌、生成器):
生成器。添加_文本(令牌)
返回弹出
完成类授权(ParsingState):
通过
##############################################################################
类条目(对象):
定义初始化(self,idname):
self.idname=idname
self.text=[]
定义(自我):
返回'%s{%s}'(self.idname',.join(self.text))
类生成器(对象):
定义初始化(自):
self.entries=[]
def add_id(自身,id_令牌):
self.entries.append(条目(id_令牌))
def添加_文本(自身、文本标记):
self.entries[-1].text.append(text\u标记)
##############################################################################
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu':
文件内容=“”
id1{某些文本}id2{
一些{text}
}
"""
builder=builder()
解析(Id(),文件内容,生成器)
对于builder.entries中的条目:
打印条目
这其实是一个简单的问题:“如何编写一个能够匹配括号的递归下降(recursive descent)解析器?”
鉴于这种语法:
STMT_LIST := STMT+
STMT := ID '{' DATA '}'
DATA := TEXT | STMT
ID := [a-z0-9]+
TEXT := [^}]*
解析器可能如下所示:
import sys
import re
def parse(data):
    """Parse a statement list (``STMT+``) and print each statement.

    Statements are consumed from the front of ``data`` until it is empty.
    For every statement the tuple ``(statement_id, clause)`` is printed
    via ``repr``.  ``ValueError`` from ``parse_statement`` propagates.
    """
    while data:
        data, statement_id, clause = parse_statement(data)
        print(repr((statement_id, clause)))
def consume_whitespace(data):
    """Return ``data`` with leading white space removed."""
    return data.lstrip()
def parse_statement(data):
    """Parse one ``ID '{' DATA '}'`` statement from the front of ``data``.

    Returns ``(rest, statement_id, clause)`` where ``rest`` is the
    unconsumed input with leading white space removed.

    Raises ``ValueError`` if ``data`` does not start with an alphanumeric
    id, or if ``parse_clause`` rejects what follows.
    """
    m = re.match('[a-zA-Z0-9]+', data)
    if not m:
        raise ValueError("No ID found")
    statement_id = m.group(0)
    data = consume_whitespace(data[len(statement_id):])
    data, clause = parse_clause(data)
    return consume_whitespace(data), statement_id, clause
def parse_clause(data):
    """Parse one brace-delimited clause from the front of ``data``.

    ``data`` must start with ``{``.  Returns ``(rest, clause)`` where
    ``rest`` is the input after the matching ``}`` and ``clause`` is a list
    that alternates text fragments (``str``) with nested clauses (``list``).

    Raises ``ValueError`` if ``data`` does not start with ``{`` or if a
    closing ``}`` is missing.
    """
    if not data.startswith('{'):
        raise ValueError("No { found")
    data = data[1:]
    clause = []
    while True:
        closebrace = data.find('}')
        if closebrace < 0:
            raise ValueError("No } found")
        openbrace = data.find('{')
        # No nested clause starts before the next closing brace: that
        # closing brace ends this clause.
        if openbrace < 0 or openbrace > closebrace:
            break
        clause.append(data[:openbrace])
        data, subclause = parse_clause(data[openbrace:])
        clause.append(subclause)
    clause.append(data[:closebrace])
    return data[closebrace + 1:], clause
# Demo: each call prints one (statement_id, clause) tuple per statement.
parse("ID { foo { bar } }")
parse("ID { foo { bar } } baz { tee fdsa { fdsa } }")
导入系统
进口稀土
def解析(数据):
"""
STMT
"""
而数据:
数据,语句id,子句=解析语句(数据)
打印报告((声明,条款))
def消耗_空白(数据):
返回data.lstrip()
def parse_语句(数据):
m=重新匹配('[a-zA-Z0-9]+',数据)
如果不是m:
raise VALUE错误,“找不到ID”
语句\u id=m.group(0)
数据=使用空格(数据[len(语句id)])
数据,子句=parse_子句(数据)
返回使用空格(数据)、语句id、子句
def parse_子句(数据):
子句=[]
如果不是data.startswith(“{”):
raise VALUERROR,“未找到{”
数据=数据[1:]
closebrace=data.index(“}”)
尝试:
openbrace=data.index(“{”)
除值错误外:
openbrace=sys.maxint
当openbrace
老实说,这是一个令人讨厌的解析器。如果你把它构造得更好,你最终会得到一个来自lexer和
# parsebrackets.py
def parse_brackets(data):
    """Yield ``(key, text)`` pairs for each top-level ``key {text}`` item.

    ``key`` is the stripped text before a top-level ``{`` and ``text`` is
    the substring from that ``{`` through its matching ``}``, with nested
    braces kept verbatim.

    This is a generator, so nothing is checked until iteration starts.

    Raises ``ValueError`` when the braces are unbalanced (before any pair
    is yielded) or when non-blank text follows the last ``}`` (after all
    pairs have been yielded).
    """
    # step 1: find the 0-nesting-level { and }
    lpos = []
    rpos = []
    nest = 0
    for i, c in enumerate(data):
        if c == '{':
            if nest == 0:
                lpos.append(i)
            nest += 1
        elif c == '}':
            nest -= 1
            if nest < 0:
                raise ValueError('too many } at offset %d' % i)
            if nest == 0:
                rpos.append(i)
    if nest > 0:
        raise ValueError('too many { in data')
    prev = -1
    # step 2: extract the pieces
    for start, end in zip(lpos, rpos):
        key = data[prev+1:start].strip()
        # insert test for empty key here
        text = data[start:end+1]
        prev = end
        yield key, text
    if data[prev+1:].strip():
        raise ValueError('non-blank text after last }')
>>> from parsebrackets import parse_brackets as pb
>>> for k, t in pb(' foo {bar {zot\n}} guff {qwerty}'):
...     print(repr(k), repr(t))
...
'foo' '{bar {zot\n}}'
'guff' '{qwerty}'
>>>
# Demo script: split "ID { ... }" items with pyparsing's nestedExpr.
data = """ID1 { some text } ID2 { some {with some more text nested in braces} text }"""
from pyparsing import Word, alphas, alphanums, dictOf, nestedExpr, originalTextFor
# identifier starts with any alpha, followed by any alpha, num, or '_'
ident = Word(alphas,alphanums+"_")
# Solution 1
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as nested structures
itemlist = dictOf(ident, nestedExpr("{","}"))
items = itemlist.parseString(data)
print(items.dump())
"""
prints:
[['ID1', ['some', 'text']], ['ID2', ['some', ['with', 'some', 'more', ...
- ID1: ['some', 'text']
- ID2: ['some', ['with', 'some', 'more', 'text', 'nested', 'in', 'braces'], 'text']
"""
# Solution 2
# list of items is a dict of pairs of idents and nested {}'s
# - returns {}'s expressions as strings of text extract from the
# original input string
itemlist = dictOf(ident, originalTextFor(nestedExpr("{","}")))
items = itemlist.parseString(data)
print(items.dump())
"""
prints:
[['ID1', '{ some text }'], ['ID2', '{ some {with some more text nested in ...
- ID1: { some text }
- ID2: { some {with some more text nested in braces} text }
"""