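Python 用 Parsimonious 解析配料表 from parsimonious.grammar import Grammar grammar = Grammar( ''' # item = ( ( ingredient+ '(' ingredient+ ')' comma ws?) + / comma+ / ws+ / ingredient )+ item = ((ingredient '(' ingredient ')') / ingredient )+ ingredient = ( ( bold_tag? word+ ws? percent? )+ comma? ws? ) word = ~"[A-Z0-9:]+"i ws = ~"\s*"+ comma = "," bold_tag = "<b>" percent = ~"\([\d\.%]+\)" ''') grammar = Grammar( """ ingredient_item = words open_bracket (subingredient)* closed_bracket comma text_preceding_colon = (text colon)* text = ~"[A-Z 0-9]*"i space = ~"[\\s]*" colon = ":" words = text+ open_bracket = "(" closed_bracket = ")" subingredient = text / b_tag / percentage percentage = open_bracket percentage_num closed_bracket percentage_num = ~"[0-9\.%]*" b_tag = "<b>" comma = "," """) grammar = Grammar( ''' item = (ingredient+ comma? )+ ingredient = word? / bold_tag? / ws? / percent? comma = ',' word = ~"[A-Z0-9:]+"i bold_tag = "<b>" ws = ~"\s*" percent = ~"\([\d\.%]+\)" ''' ) test_string = '''Partially Inverted <b>Brown <b>Sugar Syrup (43.3%), Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil''' test_string = test_string.decode('utf-8') tree = grammar.parse(test_string)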

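Python 用 Parsimonious 解析配料表 from parsimonious.grammar import Grammar grammar = Grammar( ''' # item = ( ( ingredient+ '(' ingredient+ ')' comma ws?) + / comma+ / ws+ / ingredient )+ item = ((ingredient '(' ingredient ')') / ingredient )+ ingredient = ( ( bold_tag? word+ ws? percent? )+ comma? ws? ) word = ~"[A-Z0-9:]+"i ws = ~"\s*"+ comma = "," bold_tag = "<b>" percent = ~"\([\d\.%]+\)" ''') grammar = Grammar( """ ingredient_item = words open_bracket (subingredient)* closed_bracket comma text_preceding_colon = (text colon)* text = ~"[A-Z 0-9]*"i space = ~"[\\s]*" colon = ":" words = text+ open_bracket = "(" closed_bracket = ")" subingredient = text / b_tag / percentage percentage = open_bracket percentage_num closed_bracket percentage_num = ~"[0-9\.%]*" b_tag = "<b>" comma = "," """) grammar = Grammar( ''' item = (ingredient+ comma? )+ ingredient = word? / bold_tag? / ws? / percent? comma = ',' word = ~"[A-Z0-9:]+"i bold_tag = "<b>" ws = ~"\s*" percent = ~"\([\d\.%]+\)" ''' ) test_string = '''Partially Inverted <b>Brown <b>Sugar Syrup (43.3%), Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil''' test_string = test_string.decode('utf-8') tree = grammar.parse(test_string),python,parsing,peg,parsimonious,Python,Parsing,Peg,Parsimonious,嗨，我正在尝试用 Parsimonious 解析一份配料表，但一直没有成功。以上是我到目前为止尝试编写的所有语法。我想按逗号把它拆分成各个配料，同时忽略嵌套括号内的逗号，如下面这个更复杂的测试字符串所示： from parsimonious.grammar import Grammar grammar = Grammar( ''' # item = ( ( ingredient+ '(' ingredient+ ')' comma ws?) + / comma+ / ws+ / ingredient )+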


from parsimonious.grammar import Grammar

# Three earlier attempts are kept for reference; each rebinding of ``grammar``
# replaces the previous one, so only the final definition is used for parsing.
# The grammar sources are raw strings so that regex escapes such as ``\s`` and
# ``\(`` reach parsimonious without Python interpreting them first.

# Attempt 1: ``item`` allows exactly one ``ingredient`` between the literal
# brackets, so a bracketed list of several sub-ingredients does not fit it.
grammar = Grammar(
    r'''
    # item = ( ( ingredient+ '(' ingredient+ ')' comma ws?) + / comma+ / ws+ / ingredient )+
    item = ((ingredient '(' ingredient ')') / ingredient )+

    ingredient = ( ( bold_tag? word+ ws? percent? )+ comma? ws? )
    word = ~"[A-Z0-9:]+"i
    ws = ~"\s*"+
    comma = ","
    bold_tag = "<b>"
    percent = ~"\([\d\.%]+\)"
    '''
)

# Attempt 2: the start rule describes a single ``words ( ... ) ,`` item, so it
# requires brackets and a trailing comma and has no repetition over items.
grammar = Grammar(
    r"""
    ingredient_item = words open_bracket (subingredient)* closed_bracket comma
    text_preceding_colon = (text colon)*
    text       = ~"[A-Z 0-9]*"i
    space = ~"[\\s]*"
    colon = ":"
    words = text+
    open_bracket = "("
    closed_bracket = ")"
    subingredient = text / b_tag / percentage
    percentage = open_bracket percentage_num closed_bracket
    percentage_num = ~"[0-9\.%]*"
    b_tag = "<b>"
    comma = ","
    """
)

# Attempt 3: ``ingredient`` is an ordered choice of optionals. In a PEG the
# first alternative ``word?`` always succeeds (possibly matching nothing), so
# ``bold_tag``, ``ws`` and ``percent`` are never tried.
grammar = Grammar(
    r'''
    item = (ingredient+ comma? )+
    ingredient = word? / bold_tag? / ws? / percent?
    comma = ','
    word = ~"[A-Z0-9:]+"i
    bold_tag = "<b>"
    ws = ~"\s*"
    percent = ~"\([\d\.%]+\)"
    '''
)

# Working grammar: split on top-level commas only. ``group`` is recursive, so
# commas and further brackets inside a bracketed section stay part of the
# enclosing ingredient. ``text`` is any run of characters other than brackets
# and commas, which means ``<b>`` tags, percentages and non-ASCII letters are
# kept verbatim inside the ingredient text rather than tokenised separately.
grammar = Grammar(
    r'''
    ingredients = ingredient (comma ingredient)*
    ingredient  = (text / group)+
    group       = "(" (text / comma / group)* ")"
    text        = ~"[^(),]+"
    comma       = ","
    '''
)


def _to_text(value):
    """Return *value* as text, decoding UTF-8 only when it is a byte string.

    On Python 3 a plain string literal is already text and has no ``decode``
    method; on Python 2 it is a byte string that must be decoded.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def split_top_level(tree):
    """Return the stripped text of each top-level ingredient in *tree*.

    *tree* must be the node produced by the ``ingredients`` rule: its first
    child is the first ingredient and its second child holds one
    ``(comma ingredient)`` pair per remaining ingredient.
    """
    first, rest = tree.children
    nodes = [first] + [pair.children[1] for pair in rest.children]
    return [node.text.strip() for node in nodes]


test_string = '''Partially Inverted <b>Brown <b>Sugar Syrup (43.3%), Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil'''

test_string = _to_text(test_string)

tree = grammar.parse(test_string)
ingredients = split_top_level(tree)


test_string2 = '''Cereal Grains (Whole Grain <b>Oat Flour (28.3%), Whole Grain <b>Wheat (28.3%), Whole Grain <b>Barley Flour (17.1%), Whole Grain Maize Flour (2.0%), Whole Grain Rice Flour (2.0%)), Sugar, <b>Wheat Starch, Partially Inverted Brown Sugar Syrup, Salt, Acidity Regulator: Tripotassium Phosphate, Sunflower Oil, Colours: Carotene, Caramel and Annatto, Antioxidant: Tocopherols, Vitamins and Minerals: Vitamin C, Niacin (B3), Pantothenic Acid, Riboflavin (B2), Vitamin B6, Folic Acid, Vitamin D, Calcium Carbonate, Iron, To produce 100g of this product we have used 77.7g of Whole Grain, We guarantee every Nestlé Cereal with the green banner contains at least 8g of Whole Grain per serving'''

test_string2 = _to_text(test_string2)

tree2 = grammar.parse(test_string2)
ingredients2 = split_top_level(tree2)