Python NLTK:处理句子的语法
我想写一个语法,并得到以下类型句子的解析结果(我写的代码在后面给出):
- 祈使句,如“吃沙拉”
- 双及物句,如“我给她读了那本书”(但不是“我给她读了那本书”)
- 被动句,如“这本书是给她读的”(但不是我给她读的)
from __future__ import division
import sys
from collections import defaultdict
import nltk
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from BetterICP import *
from nltk import InsideChartParser
from nltk import induce_pcfg
from nltk import PCFG
from nltk import ProbabilisticProduction
from nltk import Tree, DictionaryProbDist
from nltk.grammar import Production
# Toy PCFG intended to cover imperative, ditransitive and passive sentences.
# NOTE(review): PRP is never reachable from the start symbol S, and
# "Prp_Obj[0.5]" / "TO[0.2]" lack a space before the bracket -- confirm that
# parse_pgrammar accepts these before relying on this grammar.
toy_grammer = parse_pgrammar("""
# Grammatical productions.
S -> NP VP [1.0]
NP -> Pro [0.1] | Det N [0.3] | N [0.5] | NP PP [0.1]
VP -> Vi [0.05] | Vt NP [0.9] | VP PP [0.05]
Det -> Art [1.0]
PP -> Prep NP [1.0]
PRP -> Prp_Obj[0.5] | vbd VBD [0.3] | to TO[0.2]
# Lexical productions.
Pro -> "i" [0.3] | "we" [0.1] | "you" [0.1] | "he" [0.3] | "she" [0.2]
Art -> "a" [0.4] | "an" [0.1] | "the" [0.4] | "The" [0.1]
Prep -> "with" [0.7] | "in" [0.3]
N -> "salad" [0.3] | "fork" [0.3] | "mushrooms" [0.2] | "book" [0.2]
Vi -> "sneezed" [0.4] | "ran" [0.4] | "read" [0.2]
Vt -> "eat" [0.2] | "eats" [0.2] | "ate" [0.2] | "see" [0.2] | "saw" [0.2]
Prp_Obj -> "her" [0.5] | "I" [0.5]
vbd -> "was" [1.0]
to -> "to" [1.0]
""")
def input_file():
    """Parse each whitespace-tokenized line of input.txt with the toy grammar.

    Builds a BetterICP parser over ``toy_grammer`` and feeds it every
    non-blank line of ``input.txt``, split on whitespace.
    """
    sppc = BetterICP(toy_grammer)
    # Stream the file line by line instead of materialising it in a list first.
    with open("input.txt", "r") as ins:
        for line in ins:
            tokens = line.split()
            # Skip blank lines: parsing an empty token list is meaningless.
            if tokens:
                sppc.parse(tokens)

input_file()
我得到的输出是 "0 total parses found"(共找到 0 个解析)。
我的语法定义正确吗?
更新(input.txt 的内容如下):
BetterICP.py
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart,AbstractChartRule
from nltk.tree import Tree,ProbabilisticTree,_child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log
# The edge-printing helper was renamed between NLTK 3.0 and 3.0.4; alias the
# new name onto the old one when it is missing.
if not hasattr(Chart, 'pretty_format_edge'):
    Chart.pretty_format_edge = Chart.pp_edge

# Work around nltk.parse.pchart inserting edges straight into the chart,
# where the fundamental rule can see them whether or not they have come off
# the agenda yet.  Least-bad external fix: give every edge a boolean
# 'pending' flag, true by default, flipped to false only when the edge is
# popped from the agenda; while true the fundamental rule ignores the edge.
# Possible remaining bug: Chart.insert still considers pending edges when
# checking redundancy, so a failure of best-first ordering can discard a
# cheaper edge because an earlier, still-pending, identical-but-more-expensive
# edge is already in the chart.
nltk.chart.EdgeI.pending = True
def productions_with_left_context(self,lpos=0,leaves=None):
    """
    Return (left_word, Production) pairs for every non-terminal node of the tree.

    For each subtree of the form (P C1 C2 ... Cn) this produces the production
    P -> C1 C2 ... Cn paired with the leaf word immediately to the left of C1,
    or None when C1 is the first leaf of the whole tree.

    :param lpos: index into ``leaves`` of this subtree's first leaf
        (0 when called on the root).
    :param leaves: full leaf list of the root tree; computed on the outermost
        call and threaded through the recursion.
    :rtype: list(tuple(str or None, Production))

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP),
     (None, NP -> D N),
     (None, D -> 'the'),
     ('the', N -> 'dog'),
     ('dog', VP -> V NP),
     ('dog', V -> 'chased'),
     ('dog', NP -> D N),
     ('chased', D -> 'the'),
     ('the', N -> 'cat')]
    """
    if leaves is None:
        leaves=self.leaves()
    #if not isinstance(self._label, string_types):
    #    raise TypeError('Productions can only be generated from trees having node labels that are strings')
    # The left-context word is the leaf just before this subtree's span.
    if lpos>0:
        lc=leaves[lpos-1]
    else:
        lc=None
    prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos,leaves)
            # could be much smarter: re-walks child.leaves() at every level
            lpos+=len(child.leaves())
        else:
            # Terminal child: advance past a single leaf.
            lpos+=1
    return prods
# Install as a Tree method so parsed sentences can call it directly.
Tree.productions_with_left_context=productions_with_left_context
def production_distribution(psents):
    """Count how often each (lexical or grammatical) production occurs.

    :param psents: iterable of parsed sentences supporting ``.productions()``
    :return: mapping from production to its occurrence count
    """
    counts = defaultdict(int)
    all_prods = (p for sent in psents for p in sent.productions())
    for p in all_prods:
        counts[p] += 1
    return counts
def nt_counts(prod_dict):
    '''Sum production counts per left-hand-side non-terminal.

    :param prod_dict: mapping from production to count
    :return: mapping from lhs non-terminal to its total count
    '''
    totals = defaultdict(int)
    for production, n in prod_dict.items():
        totals[production.lhs()] += n
    return totals
def cost(prob):
    """Convert a probability into a bit cost (-log2); 1.0 maps to exactly 0.0."""
    if prob == 1.0:
        return 0.0
    return -log(prob, 2)
def production_cost(production,lhs_counts,production_counts):
    """Relative-frequency cost of a production: -log2(count(prod)/count(lhs))."""
    rule_count = production_counts[production]
    lhs_total = lhs_counts[production.lhs()]
    return cost(float(rule_count) / float(lhs_total))
def get_costed_productions(psents):
    """Build CostedProductions (relative-frequency costs) from parsed sentences."""
    prod_counts = production_distribution(psents)
    lhs_totals = nt_counts(prod_counts)
    return [CostedProduction(p.lhs(), p.rhs(),
                             production_cost(p, lhs_totals, prod_counts))
            for p in prod_counts]
class BetterPBPR(AbstractChartRule):
    """Bottom-up predict rule that skips unary X -> X productions, which would
    otherwise license an unbounded number of parses."""
    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions():
            if prod.rhs()[0] != edge.lhs():
                continue
            # Self-loop check (X -> X): contributes nothing but infinite parses.
            if len(prod.rhs()) == 1 and prod.lhs() == edge.lhs():
                continue
            candidate = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
            if chart.insert(candidate, ()):
                yield candidate
class BetterSEPFR(AbstractChartRule):
    """Single-edge fundamental rule that refuses to combine with edges still
    pending on the agenda (see the EdgeI.pending patch above)."""
    NUM_EDGES = 1
    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 is the left (incomplete) edge; look rightwards for
            # complete edges that supply its next symbol.
            partners = chart.select(start=edge1.end(), is_complete=True,
                                    lhs=edge1.nextsym())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge1, edge2):
                        yield combined
        else:
            # edge1 is the right (complete) edge; look leftwards for
            # incomplete edges expecting its lhs.
            partners = chart.select(end=edge1.start(), is_complete=False,
                                    nextsym=edge1.lhs())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge2, edge1):
                        yield combined
class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser,
    which will show intermediate results, and quit after
    finding a specified number of parses'''
    # NOTE(review): the bare `print "..."` statements below are Python 2 only;
    # this class cannot run unchanged under Python 3.
    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found
        If max>0, quit after finding that many parses

        :param tokens: the sentence to parse, as a sequence of word strings
        :return: iterator over complete ProbabilisticTrees, most probable first
        '''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace=self._trace # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        # (lhs, rhs) -> probability; filled in lazily when a parse is found.
        prod_probs = {}
        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR() # avoid infinite numbers of parses :-(
        fr = BetterSEPFR() # don't look at pending edges
        # Our queue (the agenda): edges stay 'pending' until popped from it.
        queue = []
        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            queue.append(edge)
        found = 0
        while len(queue) > 0 and (max<1 or found<max):
            # Re-sort the queue.
            self.sort_queue(queue, chart)
            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)
            # Get the best edge.
            edge = queue.pop()
            # Now off the agenda: the fundamental rule may combine with it.
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            # A complete start-symbol edge spanning the input is a full parse.
            if (edge.start()==0 and
                edge.end()==chart._num_leaves and
                edge.lhs()==start and
                edge.is_complete()):
                if len(prod_probs)==0:
                    # First parse found: cache each production's probability
                    # for _setprob below.
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))
                        #print tree
                    print "****"
                found+=1
            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))
        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            # NOTE(review): prod_probs is only populated once a parse has been
            # seen above; if no parse was found this loop is over an empty list.
            for parse in parses:
                self._setprob(parse,prod_probs)
        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found"%found
        return iter(parses)
    def _prune(self, queue, chart):
        """ Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2),
                                                       cost(edge.prob())))
            del queue[:split]
    def beam(self,width):
        # Set the beam width used by _prune (0/None disables pruning).
        self.beam_size=width
来自未来进口部的
导入系统
从pprint导入pprint
从集合导入defaultdict
导入nltk
从nltk.corpus导入树库
从nltk导入ConditionalFreqDist,非终端,FreqDist
从fixesNLTK3导入*
从nltk导入InsideChartParser
从nltk.parse.chart导入图表,AbstractChartRule
从nltk.tree导入树、概率树、\u子\u名称
从nltk.parse.pchart导入ProbabilisticFundamentalRule、ProbabilisticBottomUpInitRule、ProbabilisticTreeEdge、ProbabilisticLeafEdge
从nltk.parse.pchart导入SingleEdgeProbabilisticFundamentalRule
从数学导入日志
#在3.0和3.0.4之间重命名:-(
如果不是(hasattr(图表,漂亮的格式边)):
Chart.pretty\u format\u edge=Chart.pp\u edge
#nltk.parse.pchart从根本上被破坏了,因为它直接添加了边
#进入图表,fr可以看到他们是否来过
#是否在议程之外。
#我能想到的最不坏的外部修复在这里实现:
#添加一个名为“pending”的布尔变量,默认为true,仅设置为
#当优势从议程上消失时为假,当为真时为假
#被fr忽略
#可能存在的缺陷?在测试时,甚至会检查待处理的边缘
#冗余(即图表插入未更改),但这意味着
#best first的失败可能会导致放弃更便宜的优势
#因为一个更早,但仍然悬而未决,相同但更昂贵的
#edge在图表中。
nltk.chart.EdgeI.pending=True
带有左上下文(self,lpos=0,leaves=None)的def productions\u:
"""
生成对应于树的非终端节点的产品,并将其左上下文单词(或无)作为单词和产品对。
对于形式(P:c1c2…Cn)的每个子树,这将生成
表格P->C1 C2…Cn和C1左边的单词
>>>t=Tree.fromstring((S(NP(D)(N狗))(VP(V追逐)(NP(D)(N猫))))
>>>t.产品()
[(无,S->NP VP),
(无,NP->D N),
(无,D->“the”)
('the',N->'dog'),
(“狗”,VP->V NP),
('dog',V->'chased'),
('dog',NP->dn),
(“追逐”,D->“the”),
('the',N->'cat')]
:rtype:list(生产)
"""
如果为“无”:
leaves=self.leaves()
#如果不存在(自身标签、字符串类型):
#raise TypeError('只能从具有字符串节点标签的树生成产品')
如果lpos>0:
lc=叶[lpos-1]
其他:
lc=无
产品=[(lc,产品(非终端(自我标签),[子项名称(自我)))]
儿童自我:
如果存在(子级、树):
prods+=具有左上下文(LPO、左上下文)的child.productions\u
#可能会更聪明
lpos+=len(child.leaves())
其他:
lpos+=1
回击棒
Tree.productions\u with\u left\u context=productions\u with\u left\u context
def生产分配(psents):
“”“创建词法和非词法(语法)结果的频率分布”“”
prod_dict=defaultdict(int)
对于psent中的psent:
对于psent.productions()中的生产:
产品目录[生产]+=1
返回产品目录
def nt_计数(生产指令):
''创建非终端及其计数的字典''
nt_dict=defaultdict(int)
对于产品目录项()中的(规则、计数):
nt_dict[rule.lhs()]+=count
返回新台币
def成本(prob):
如果prob==1.0,则返回0.0 else-log(prob,2)
def生产成本(生产、lhs_计数、生产_计数):
pcount=生产计数[生产]
ntcount=lhs\u计数[production.lhs()]
退货成本(浮动(pcount)/浮动(ntcount))
def获得成本产品(psents):
“”“从给定的已解析句子列表中创建成本/加权产品。”“”
产品目录=生产分配(psents)
产品数量=产品数量(产品数量)
成本生产=[成本生产(p.lhs(),p.rhs(),生产成本(p,产品数量,产品目录))
对于产品目录键()中的p
退货成本产品
类BetterBPR(抽象图表规则):
边数=1
def应用(自我、图表、语法、边缘):
如果edge.is_不完整():返回
对于语法中的prod.productions():
如果edge.lhs()==prod.rhs()[0]:
#检查X->X
如果prod.lhs()==edge.lhs()和len(prod.rhs())==1:
持续
new_edge=ProbabilisticTreeEdge.从_生产(prod,edge.start(),prod.prob())
如果插入图表(新边,()):
产生新的优势
类BetterSEPFR(抽象图表规则):
边数=1
_基本规则=概率基本规则()
def应用(自我、图表、语法、edge1):
fr=自我。\基本规则
如果边1.不完整():
#边1=左边缘;边2=右边缘
对于chart.select中的边2(start=edge1.end(),is_complete=True,
lhs=edge1.nextsym()):
如果edge2.pending:
持续
对于fr.apply中的新边(图表、语法、边1、边2):
产生新的优势
其他:
#边2=左边缘;边1=右边缘
对于chart.select中的边2(end=edge1.start(),is_complete=False,
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart,AbstractChartRule
from nltk.tree import Tree,ProbabilisticTree,_child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log
# The edge-printing helper was renamed between NLTK 3.0 and 3.0.4; alias the
# new name onto the old one when it is missing.
if not hasattr(Chart, 'pretty_format_edge'):
    Chart.pretty_format_edge = Chart.pp_edge

# Work around nltk.parse.pchart inserting edges straight into the chart,
# where the fundamental rule can see them whether or not they have come off
# the agenda yet.  Least-bad external fix: give every edge a boolean
# 'pending' flag, true by default, flipped to false only when the edge is
# popped from the agenda; while true the fundamental rule ignores the edge.
# Possible remaining bug: Chart.insert still considers pending edges when
# checking redundancy, so a failure of best-first ordering can discard a
# cheaper edge because an earlier, still-pending, identical-but-more-expensive
# edge is already in the chart.
nltk.chart.EdgeI.pending = True
def productions_with_left_context(self,lpos=0,leaves=None):
    """
    Return (left_word, Production) pairs for every non-terminal node of the tree.

    For each subtree of the form (P C1 C2 ... Cn) this produces the production
    P -> C1 C2 ... Cn paired with the leaf word immediately to the left of C1,
    or None when C1 is the first leaf of the whole tree.

    :param lpos: index into ``leaves`` of this subtree's first leaf
        (0 when called on the root).
    :param leaves: full leaf list of the root tree; computed on the outermost
        call and threaded through the recursion.
    :rtype: list(tuple(str or None, Production))

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP),
     (None, NP -> D N),
     (None, D -> 'the'),
     ('the', N -> 'dog'),
     ('dog', VP -> V NP),
     ('dog', V -> 'chased'),
     ('dog', NP -> D N),
     ('chased', D -> 'the'),
     ('the', N -> 'cat')]
    """
    if leaves is None:
        leaves=self.leaves()
    #if not isinstance(self._label, string_types):
    #    raise TypeError('Productions can only be generated from trees having node labels that are strings')
    # The left-context word is the leaf just before this subtree's span.
    if lpos>0:
        lc=leaves[lpos-1]
    else:
        lc=None
    prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos,leaves)
            # could be much smarter: re-walks child.leaves() at every level
            lpos+=len(child.leaves())
        else:
            # Terminal child: advance past a single leaf.
            lpos+=1
    return prods
# Install as a Tree method so parsed sentences can call it directly.
Tree.productions_with_left_context=productions_with_left_context
def production_distribution(psents):
    """Count how often each (lexical or grammatical) production occurs.

    :param psents: iterable of parsed sentences supporting ``.productions()``
    :return: mapping from production to its occurrence count
    """
    counts = defaultdict(int)
    all_prods = (p for sent in psents for p in sent.productions())
    for p in all_prods:
        counts[p] += 1
    return counts
def nt_counts(prod_dict):
    '''Sum production counts per left-hand-side non-terminal.

    :param prod_dict: mapping from production to count
    :return: mapping from lhs non-terminal to its total count
    '''
    totals = defaultdict(int)
    for production, n in prod_dict.items():
        totals[production.lhs()] += n
    return totals
def cost(prob):
    """Convert a probability into a bit cost (-log2); 1.0 maps to exactly 0.0."""
    if prob == 1.0:
        return 0.0
    return -log(prob, 2)
def production_cost(production,lhs_counts,production_counts):
    """Relative-frequency cost of a production: -log2(count(prod)/count(lhs))."""
    rule_count = production_counts[production]
    lhs_total = lhs_counts[production.lhs()]
    return cost(float(rule_count) / float(lhs_total))
def get_costed_productions(psents):
    """Build CostedProductions (relative-frequency costs) from parsed sentences."""
    prod_counts = production_distribution(psents)
    lhs_totals = nt_counts(prod_counts)
    return [CostedProduction(p.lhs(), p.rhs(),
                             production_cost(p, lhs_totals, prod_counts))
            for p in prod_counts]
class BetterPBPR(AbstractChartRule):
    """Bottom-up predict rule that skips unary X -> X productions, which would
    otherwise license an unbounded number of parses."""
    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions():
            if prod.rhs()[0] != edge.lhs():
                continue
            # Self-loop check (X -> X): contributes nothing but infinite parses.
            if len(prod.rhs()) == 1 and prod.lhs() == edge.lhs():
                continue
            candidate = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
            if chart.insert(candidate, ()):
                yield candidate
class BetterSEPFR(AbstractChartRule):
    """Single-edge fundamental rule that refuses to combine with edges still
    pending on the agenda (see the EdgeI.pending patch above)."""
    NUM_EDGES = 1
    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 is the left (incomplete) edge; look rightwards for
            # complete edges that supply its next symbol.
            partners = chart.select(start=edge1.end(), is_complete=True,
                                    lhs=edge1.nextsym())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge1, edge2):
                        yield combined
        else:
            # edge1 is the right (complete) edge; look leftwards for
            # incomplete edges expecting its lhs.
            partners = chart.select(end=edge1.start(), is_complete=False,
                                    nextsym=edge1.lhs())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge2, edge1):
                        yield combined
class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser,
    which will show intermediate results, and quit after
    finding a specified number of parses'''
    # NOTE(review): the bare `print "..."` statements below are Python 2 only;
    # this class cannot run unchanged under Python 3.
    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found
        If max>0, quit after finding that many parses

        :param tokens: the sentence to parse, as a sequence of word strings
        :return: iterator over complete ProbabilisticTrees, most probable first
        '''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace=self._trace # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        # (lhs, rhs) -> probability; filled in lazily when a parse is found.
        prod_probs = {}
        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR() # avoid infinite numbers of parses :-(
        fr = BetterSEPFR() # don't look at pending edges
        # Our queue (the agenda): edges stay 'pending' until popped from it.
        queue = []
        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            queue.append(edge)
        found = 0
        while len(queue) > 0 and (max<1 or found<max):
            # Re-sort the queue.
            self.sort_queue(queue, chart)
            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)
            # Get the best edge.
            edge = queue.pop()
            # Now off the agenda: the fundamental rule may combine with it.
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            # A complete start-symbol edge spanning the input is a full parse.
            if (edge.start()==0 and
                edge.end()==chart._num_leaves and
                edge.lhs()==start and
                edge.is_complete()):
                if len(prod_probs)==0:
                    # First parse found: cache each production's probability
                    # for _setprob below.
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))
                        #print tree
                    print "****"
                found+=1
            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))
        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            # NOTE(review): prod_probs is only populated once a parse has been
            # seen above; if no parse was found this loop is over an empty list.
            for parse in parses:
                self._setprob(parse,prod_probs)
        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found"%found
        return iter(parses)
    def _prune(self, queue, chart):
        """ Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2),
                                                       cost(edge.prob())))
            del queue[:split]
    def beam(self,width):
        # Set the beam width used by _prune (0/None disables pruning).
        self.beam_size=width
# fix buggy NLTK 3 :-(
# different fixes for different versions :-((((
import re, sys
import nltk
from nltk.grammar import _ARROW_RE, _PROBABILITY_RE, _DISJUNCTION_RE, Production
from nltk.draw import CFGEditor
from nltk.probability import ImmutableProbabilisticMixIn
ARROW = u'\u2192'
TOKEN = u'([\\w ]|\\\\((x[0-9a-f][0-9a-f])|(u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])))+'
CFGEditor.ARROW = ARROW
CFGEditor._TOKEN_RE=re.compile(u"->|u?'"+TOKEN+u"'|u?\""+TOKEN+u"\"|\\w+|("+ARROW+u")")
CFGEditor._PRODUCTION_RE=re.compile(ur"(^\s*\w+\s*)" +
ur"(->|("+ARROW+"))\s*" +
ur"((u?'"+TOKEN+"'|u?\""+TOKEN+"\"|''|\"\"|\w+|\|)\s*)*$")
nltk.grammar._TERMINAL_RE = re.compile(ur'( u?"[^"]+" | u?\'[^\']+\' ) \s*', re.VERBOSE)
nltk.grammar._ARROR_RE = re.compile(ur'\s* (->|'+ARROW+') \s*', re.VERBOSE)
from nltk.grammar import _TERMINAL_RE
# Pick the grammar-parsing API for this NLTK version (>=3 / 2.7+ vs older 2.x).
if sys.version_info[0]>2 or sys.version_info[1]>6:
    from nltk.grammar import PCFG, CFG, ProbabilisticProduction as FixPP
    parse_grammar=CFG.fromstring
    parse_pgrammar=PCFG.fromstring
    from nltk import InsideChartParser
    def nbest_parse(self,tokens,n=None):
        """Return up to n parses of tokens (all parses when n is None)."""
        parses=self.parse(tokens)
        if n is None:
            return [parse for parse in parses]
        else:
            # BUG FIX: `parses.next()` is Python-2-only; the builtin next()
            # works on Python 2.6+ and Python 3 iterators alike.
            return [next(parses) for i in range(n)]
    InsideChartParser.nbest_parse=nbest_parse
else:
    from nltk.grammar import WeightedGrammar as PCFG, WeightedProduction as FixPP
    from nltk import parse_cfg, parse_pcfg
    parse_grammar=parse_cfg
    parse_pgrammar=parse_pcfg
def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return
    a list of productions.

    :param line: one grammar line, e.g. ``A -> 'b' B [0.5] | C [0.5]``
    :param nonterm_parser: callable ``(line, pos) -> (Nonterminal, new_pos)``
    :param probabilistic: when true, read trailing ``[p]`` probabilities and
        build FixPP productions instead of plain Productions
    :raises ValueError: on a missing arrow, an unterminated string, or a
        probability greater than 1.0
    """
    pos = 0
    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)
    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()
    # Parse the right hand side: one entry per '|'-separated alternative.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            # group(1) includes the surrounding brackets; strip them.
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1],))
        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos+2] in ('u"',"u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            # NOTE(review): eval() turns the quoted source text into a Python
            # string; tolerable only because _TERMINAL_RE admits string
            # literals, but never feed untrusted grammar text through here.
            rhsides[-1].append(eval(m.group(1)))
            pos = m.end()
        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()
        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)
    if probabilistic:
        return [FixPP(lhs, rhs, prob=probability)
                for (rhs, probability) in zip(rhsides, probabilities)]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]
# Install the fixed rule reader under whichever name this NLTK version uses.
_target = ('_read_production'
           if sys.version_info[0] > 2 or sys.version_info[1] > 6
           else 'parse_production')
setattr(nltk.grammar, _target, fix_parse_production)
class CostedProduction(FixPP):
    """
    A probabilistic context-free grammar production expressed as a cost.

    Like a PCFG ``ProbabilisticProduction``, but constructed from a *cost*
    (a non-negative -log2 probability) rather than from a probability.  The
    cost records how unlikely it is that this production's right-hand side
    is the correct instantiation for any given occurrence of its left-hand
    side.
    :see: ``Production``
    """
    def __init__(self, lhs, rhs, cost):
        """
        Construct a new ``CostedProduction``.
        :param lhs: The left-hand side of the new ``CostedProduction``.
        :type lhs: Nonterminal
        :param rhs: The right-hand side of the new ``CostedProduction``.
        :type rhs: sequence(Nonterminal and terminal)
        :param cost: Cost (-log2 probability) of the new ``CostedProduction``.
        """
        # Store the cost as a log-probability: logprob = -cost.
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)
    def __str__(self):
        # NOTE(review): Production.__unicode__ exists under Python 2's
        # compatibility decorator -- confirm before running on Python 3.
        return Production.__unicode__(self) + \
            (' [0.0]' if (self.logprob() == 0.0) else ' [%g]' % -self.logprob())
    def __repr__(self):
        return '%s'%str(self)
    def cost(self):
        # Recover the non-negative cost from the stored logprob.
        return 0.0 if self.logprob() == 0.0 else -self.logprob()