Python NLTK:处理句子的语法
我想写一个语法,并得到以下类型句子的解析结果(我写的代码在后面给出):
- 祈使句,如“吃沙拉”
- 双及物句,如“我给她读了那本书”(但不是“我给她读了那本书”)
- 被动句,如“这本书是给她读的”(但不是我给她读的)
from __future__ import division
import sys
from collections import defaultdict
import nltk
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from BetterICP import *
from nltk import InsideChartParser
from nltk import induce_pcfg
from nltk import PCFG
from nltk import ProbabilisticProduction
from nltk import Tree, DictionaryProbDist
from nltk.grammar import Production
# Toy PCFG intended to cover imperative, ditransitive and passive sentences.
# NOTE(review): PRP is never reachable from the start symbol S, and
# "Prp_Obj[0.5]" / "TO[0.2]" lack a space before the bracket -- confirm that
# parse_pgrammar accepts these before relying on this grammar.
toy_grammer = parse_pgrammar("""
# Grammatical productions.
S -> NP VP [1.0]
NP -> Pro [0.1] | Det N [0.3] | N [0.5] | NP PP [0.1]
VP -> Vi [0.05] | Vt NP [0.9] | VP PP [0.05]
Det -> Art [1.0]
PP -> Prep NP [1.0]
PRP -> Prp_Obj[0.5] | vbd VBD [0.3] | to TO[0.2]
# Lexical productions.
Pro -> "i" [0.3] | "we" [0.1] | "you" [0.1] | "he" [0.3] | "she" [0.2]
Art -> "a" [0.4] | "an" [0.1] | "the" [0.4] | "The" [0.1]
Prep -> "with" [0.7] | "in" [0.3]
N -> "salad" [0.3] | "fork" [0.3] | "mushrooms" [0.2] | "book" [0.2]
Vi -> "sneezed" [0.4] | "ran" [0.4] | "read" [0.2]
Vt -> "eat" [0.2] | "eats" [0.2] | "ate" [0.2] | "see" [0.2] | "saw" [0.2]
Prp_Obj -> "her" [0.5] | "I" [0.5]
vbd -> "was" [1.0]
to -> "to" [1.0]
""")
def input_file():
    """Parse each whitespace-tokenized line of input.txt with the toy grammar.

    Builds a BetterICP parser over ``toy_grammer`` and feeds it every
    non-blank line of ``input.txt``, split on whitespace.
    """
    sppc = BetterICP(toy_grammer)
    # Stream the file line by line instead of materialising it in a list first.
    with open("input.txt", "r") as ins:
        for line in ins:
            tokens = line.split()
            # Skip blank lines: parsing an empty token list is meaningless.
            if tokens:
                sppc.parse(tokens)

input_file()
我得到的输出是 "0 total parses found"(共找到 0 个解析)。
我的语法定义正确吗?
更新(input.txt 的内容如下):
BetterICP.py
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart,AbstractChartRule
from nltk.tree import Tree,ProbabilisticTree,_child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log
# The edge-printing helper was renamed between NLTK 3.0 and 3.0.4; alias the
# new name onto the old one when it is missing.
if not hasattr(Chart, 'pretty_format_edge'):
    Chart.pretty_format_edge = Chart.pp_edge

# Work around nltk.parse.pchart inserting edges straight into the chart,
# where the fundamental rule can see them whether or not they have come off
# the agenda yet.  Least-bad external fix: give every edge a boolean
# 'pending' flag, true by default, flipped to false only when the edge is
# popped from the agenda; while true the fundamental rule ignores the edge.
# Possible remaining bug: Chart.insert still considers pending edges when
# checking redundancy, so a failure of best-first ordering can discard a
# cheaper edge because an earlier, still-pending, identical-but-more-expensive
# edge is already in the chart.
nltk.chart.EdgeI.pending = True
def productions_with_left_context(self,lpos=0,leaves=None):
    """
    Return (left_word, Production) pairs for every non-terminal node of the tree.

    For each subtree of the form (P C1 C2 ... Cn) this produces the production
    P -> C1 C2 ... Cn paired with the leaf word immediately to the left of C1,
    or None when C1 is the first leaf of the whole tree.

    :param lpos: index into ``leaves`` of this subtree's first leaf
        (0 when called on the root).
    :param leaves: full leaf list of the root tree; computed on the outermost
        call and threaded through the recursion.
    :rtype: list(tuple(str or None, Production))

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP),
     (None, NP -> D N),
     (None, D -> 'the'),
     ('the', N -> 'dog'),
     ('dog', VP -> V NP),
     ('dog', V -> 'chased'),
     ('dog', NP -> D N),
     ('chased', D -> 'the'),
     ('the', N -> 'cat')]
    """
    if leaves is None:
        leaves=self.leaves()
    #if not isinstance(self._label, string_types):
    #    raise TypeError('Productions can only be generated from trees having node labels that are strings')
    # The left-context word is the leaf just before this subtree's span.
    if lpos>0:
        lc=leaves[lpos-1]
    else:
        lc=None
    prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos,leaves)
            # could be much smarter: re-walks child.leaves() at every level
            lpos+=len(child.leaves())
        else:
            # Terminal child: advance past a single leaf.
            lpos+=1
    return prods
# Install as a Tree method so parsed sentences can call it directly.
Tree.productions_with_left_context=productions_with_left_context
def production_distribution(psents):
    """Count how often each (lexical or grammatical) production occurs.

    :param psents: iterable of parsed sentences supporting ``.productions()``
    :return: mapping from production to its occurrence count
    """
    counts = defaultdict(int)
    all_prods = (p for sent in psents for p in sent.productions())
    for p in all_prods:
        counts[p] += 1
    return counts
def nt_counts(prod_dict):
    '''Sum production counts per left-hand-side non-terminal.

    :param prod_dict: mapping from production to count
    :return: mapping from lhs non-terminal to its total count
    '''
    totals = defaultdict(int)
    for production, n in prod_dict.items():
        totals[production.lhs()] += n
    return totals
def cost(prob):
    """Convert a probability into a bit cost (-log2); 1.0 maps to exactly 0.0."""
    if prob == 1.0:
        return 0.0
    return -log(prob, 2)
def production_cost(production,lhs_counts,production_counts):
    """Relative-frequency cost of a production: -log2(count(prod)/count(lhs))."""
    rule_count = production_counts[production]
    lhs_total = lhs_counts[production.lhs()]
    return cost(float(rule_count) / float(lhs_total))
def get_costed_productions(psents):
    """Build CostedProductions (relative-frequency costs) from parsed sentences."""
    prod_counts = production_distribution(psents)
    lhs_totals = nt_counts(prod_counts)
    return [CostedProduction(p.lhs(), p.rhs(),
                             production_cost(p, lhs_totals, prod_counts))
            for p in prod_counts]
class BetterPBPR(AbstractChartRule):
    """Bottom-up predict rule that skips unary X -> X productions, which would
    otherwise license an unbounded number of parses."""
    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions():
            if prod.rhs()[0] != edge.lhs():
                continue
            # Self-loop check (X -> X): contributes nothing but infinite parses.
            if len(prod.rhs()) == 1 and prod.lhs() == edge.lhs():
                continue
            candidate = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
            if chart.insert(candidate, ()):
                yield candidate
class BetterSEPFR(AbstractChartRule):
    """Single-edge fundamental rule that refuses to combine with edges still
    pending on the agenda (see the EdgeI.pending patch above)."""
    NUM_EDGES = 1
    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 is the left (incomplete) edge; look rightwards for
            # complete edges that supply its next symbol.
            partners = chart.select(start=edge1.end(), is_complete=True,
                                    lhs=edge1.nextsym())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge1, edge2):
                        yield combined
        else:
            # edge1 is the right (complete) edge; look leftwards for
            # incomplete edges expecting its lhs.
            partners = chart.select(end=edge1.start(), is_complete=False,
                                    nextsym=edge1.lhs())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge2, edge1):
                        yield combined
class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser,
    which will show intermediate results, and quit after
    finding a specified number of parses'''
    # NOTE(review): the bare `print "..."` statements below are Python 2 only;
    # this class cannot run unchanged under Python 3.
    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found
        If max>0, quit after finding that many parses

        :param tokens: the sentence to parse, as a sequence of word strings
        :return: iterator over complete ProbabilisticTrees, most probable first
        '''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace=self._trace # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        # (lhs, rhs) -> probability; filled in lazily when a parse is found.
        prod_probs = {}
        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR() # avoid infinite numbers of parses :-(
        fr = BetterSEPFR() # don't look at pending edges
        # Our queue (the agenda): edges stay 'pending' until popped from it.
        queue = []
        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            queue.append(edge)
        found = 0
        while len(queue) > 0 and (max<1 or found<max):
            # Re-sort the queue.
            self.sort_queue(queue, chart)
            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)
            # Get the best edge.
            edge = queue.pop()
            # Now off the agenda: the fundamental rule may combine with it.
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            # A complete start-symbol edge spanning the input is a full parse.
            if (edge.start()==0 and
                edge.end()==chart._num_leaves and
                edge.lhs()==start and
                edge.is_complete()):
                if len(prod_probs)==0:
                    # First parse found: cache each production's probability
                    # for _setprob below.
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))
                        #print tree
                    print "****"
                found+=1
            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))
        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            # NOTE(review): prod_probs is only populated once a parse has been
            # seen above; if no parse was found this loop is over an empty list.
            for parse in parses:
                self._setprob(parse,prod_probs)
        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found"%found
        return iter(parses)
    def _prune(self, queue, chart):
        """ Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2),
                                                       cost(edge.prob())))
            del queue[:split]
    def beam(self,width):
        # Set the beam width used by _prune (0/None disables pruning).
        self.beam_size=width
来自未来进口部的
导入系统
从pprint导入pprint
从集合导入defaultdict
导入nltk
从nltk.corpus导入树库
从nltk导入ConditionalFreqDist,非终端,FreqDist
从fixesNLTK3导入*
从nltk导入InsideChartParser
从nltk.parse.chart导入图表,AbstractChartRule
从nltk.tree导入树、概率树、\u子\u名称
从nltk.parse.pchart导入ProbabilisticFundamentalRule、ProbabilisticBottomUpInitRule、ProbabilisticTreeEdge、ProbabilisticLeafEdge
从nltk.parse.pchart导入SingleEdgeProbabilisticFundamentalRule
从数学导入日志
#在3.0和3.0.4之间重命名:-(
如果不是(hasattr(图表,漂亮的格式边)):
Chart.pretty\u format\u edge=Chart.pp\u edge
#nltk.parse.pchart从根本上被破坏了,因为它直接添加了边
#进入图表,fr可以看到他们是否来过
#是否在议程之外。
#我能想到的最不坏的外部修复在这里实现:
#添加一个名为“pending”的布尔变量,默认为true,仅设置为
#当优势从议程上消失时为假,当为真时为假
#被fr忽略
#可能存在的缺陷?在测试时,甚至会检查待处理的边缘
#冗余(即图表插入未更改),但这意味着
#best first的失败可能会导致放弃更便宜的优势
#因为一个更早,但仍然悬而未决,相同但更昂贵的
#edge在图表中。
nltk.chart.EdgeI.pending=True
带有左上下文(self,lpos=0,leaves=None)的def productions\u:
"""
生成对应于树的非终端节点的产品,并将其左上下文单词(或无)作为单词和产品对。
对于形式(P:c1c2…Cn)的每个子树,这将生成
表格P->C1 C2…Cn和C1左边的单词
>>>t=Tree.fromstring((S(NP(D)(N狗))(VP(V追逐)(NP(D)(N猫))))
>>>t.产品()
[(无,S->NP VP),
(无,NP->D N),
(无,D->“the”)
('the',N->'dog'),
(“狗”,VP->V NP),
('dog',V->'chased'),
('dog',NP->dn),
(“追逐”,D->“the”),
('the',N->'cat')]
:rtype:list(生产)
"""
如果为“无”:
leaves=self.leaves()
#如果不存在(自身标签、字符串类型):
#raise TypeError('只能从具有字符串节点标签的树生成产品')
如果lpos>0:
lc=叶[lpos-1]
其他:
lc=无
产品=[(lc,产品(非终端(自我标签),[子项名称(自我)))]
儿童自我:
如果存在(子级、树):
prods+=具有左上下文(LPO、左上下文)的child.productions\u
#可能会更聪明
lpos+=len(child.leaves())
其他:
lpos+=1
回击棒
Tree.productions\u with\u left\u context=productions\u with\u left\u context
def生产分配(psents):
“”“创建词法和非词法(语法)结果的频率分布”“”
prod_dict=defaultdict(int)
对于psent中的psent:
对于psent.productions()中的生产:
产品目录[生产]+=1
返回产品目录
def nt_计数(生产指令):
''创建非终端及其计数的字典''
nt_dict=defaultdict(int)
对于产品目录项()中的(规则、计数):
nt_dict[rule.lhs()]+=count
返回新台币
def成本(prob):
如果prob==1.0,则返回0.0 else-log(prob,2)
def生产成本(生产、lhs_计数、生产_计数):
pcount=生产计数[生产]
ntcount=lhs\u计数[production.lhs()]
退货成本(浮动(pcount)/浮动(ntcount))
def获得成本产品(psents):
“”“从给定的已解析句子列表中创建成本/加权产品。”“”
产品目录=生产分配(psents)
产品数量=产品数量(产品数量)
成本生产=[成本生产(p.lhs(),p.rhs(),生产成本(p,产品数量,产品目录))
对于产品目录键()中的p
退货成本产品
类BetterBPR(抽象图表规则):
边数=1
def应用(自我、图表、语法、边缘):
如果edge.is_不完整():返回
对于语法中的prod.productions():
如果edge.lhs()==prod.rhs()[0]:
#检查X->X
如果prod.lhs()==edge.lhs()和len(prod.rhs())==1:
持续
new_edge=ProbabilisticTreeEdge.从_生产(prod,edge.start(),prod.prob())
如果插入图表(新边,()):
产生新的优势
类BetterSEPFR(抽象图表规则):
边数=1
_基本规则=概率基本规则()
def应用(自我、图表、语法、edge1):
fr=自我。\基本规则
如果边1.不完整():
#边1=左边缘;边2=右边缘
对于chart.select中的边2(start=edge1.end(),is_complete=True,
lhs=edge1.nextsym()):
如果edge2.pending:
持续
对于fr.apply中的新边(图表、语法、边1、边2):
产生新的优势
其他:
#边2=左边缘;边1=右边缘
对于chart.select中的边2(end=edge1.start(),is_complete=False,
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from nltk import InsideChartParser
from nltk.parse.chart import Chart,AbstractChartRule
from nltk.tree import Tree,ProbabilisticTree,_child_names
from nltk.parse.pchart import ProbabilisticFundamentalRule,ProbabilisticBottomUpInitRule,ProbabilisticTreeEdge,ProbabilisticLeafEdge
from nltk.parse.pchart import SingleEdgeProbabilisticFundamentalRule
from math import log
# The edge-printing helper was renamed between NLTK 3.0 and 3.0.4; alias the
# new name onto the old one when it is missing.
if not hasattr(Chart, 'pretty_format_edge'):
    Chart.pretty_format_edge = Chart.pp_edge

# Work around nltk.parse.pchart inserting edges straight into the chart,
# where the fundamental rule can see them whether or not they have come off
# the agenda yet.  Least-bad external fix: give every edge a boolean
# 'pending' flag, true by default, flipped to false only when the edge is
# popped from the agenda; while true the fundamental rule ignores the edge.
# Possible remaining bug: Chart.insert still considers pending edges when
# checking redundancy, so a failure of best-first ordering can discard a
# cheaper edge because an earlier, still-pending, identical-but-more-expensive
# edge is already in the chart.
nltk.chart.EdgeI.pending = True
def productions_with_left_context(self,lpos=0,leaves=None):
    """
    Return (left_word, Production) pairs for every non-terminal node of the tree.

    For each subtree of the form (P C1 C2 ... Cn) this produces the production
    P -> C1 C2 ... Cn paired with the leaf word immediately to the left of C1,
    or None when C1 is the first leaf of the whole tree.

    :param lpos: index into ``leaves`` of this subtree's first leaf
        (0 when called on the root).
    :param leaves: full leaf list of the root tree; computed on the outermost
        call and threaded through the recursion.
    :rtype: list(tuple(str or None, Production))

    >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    >>> t.productions_with_left_context()
    [(None, S -> NP VP),
     (None, NP -> D N),
     (None, D -> 'the'),
     ('the', N -> 'dog'),
     ('dog', VP -> V NP),
     ('dog', V -> 'chased'),
     ('dog', NP -> D N),
     ('chased', D -> 'the'),
     ('the', N -> 'cat')]
    """
    if leaves is None:
        leaves=self.leaves()
    #if not isinstance(self._label, string_types):
    #    raise TypeError('Productions can only be generated from trees having node labels that are strings')
    # The left-context word is the leaf just before this subtree's span.
    if lpos>0:
        lc=leaves[lpos-1]
    else:
        lc=None
    prods = [(lc,Production(Nonterminal(self._label), _child_names(self)))]
    for child in self:
        if isinstance(child, Tree):
            prods += child.productions_with_left_context(lpos,leaves)
            # could be much smarter: re-walks child.leaves() at every level
            lpos+=len(child.leaves())
        else:
            # Terminal child: advance past a single leaf.
            lpos+=1
    return prods
# Install as a Tree method so parsed sentences can call it directly.
Tree.productions_with_left_context=productions_with_left_context
def production_distribution(psents):
    """Count how often each (lexical or grammatical) production occurs.

    :param psents: iterable of parsed sentences supporting ``.productions()``
    :return: mapping from production to its occurrence count
    """
    counts = defaultdict(int)
    all_prods = (p for sent in psents for p in sent.productions())
    for p in all_prods:
        counts[p] += 1
    return counts
def nt_counts(prod_dict):
    '''Sum production counts per left-hand-side non-terminal.

    :param prod_dict: mapping from production to count
    :return: mapping from lhs non-terminal to its total count
    '''
    totals = defaultdict(int)
    for production, n in prod_dict.items():
        totals[production.lhs()] += n
    return totals
def cost(prob):
    """Convert a probability into a bit cost (-log2); 1.0 maps to exactly 0.0."""
    if prob == 1.0:
        return 0.0
    return -log(prob, 2)
def production_cost(production,lhs_counts,production_counts):
    """Relative-frequency cost of a production: -log2(count(prod)/count(lhs))."""
    rule_count = production_counts[production]
    lhs_total = lhs_counts[production.lhs()]
    return cost(float(rule_count) / float(lhs_total))
def get_costed_productions(psents):
    """Build CostedProductions (relative-frequency costs) from parsed sentences."""
    prod_counts = production_distribution(psents)
    lhs_totals = nt_counts(prod_counts)
    return [CostedProduction(p.lhs(), p.rhs(),
                             production_cost(p, lhs_totals, prod_counts))
            for p in prod_counts]
class BetterPBPR(AbstractChartRule):
    """Bottom-up predict rule that skips unary X -> X productions, which would
    otherwise license an unbounded number of parses."""
    NUM_EDGES = 1

    def apply(self, chart, grammar, edge):
        if edge.is_incomplete():
            return
        for prod in grammar.productions():
            if prod.rhs()[0] != edge.lhs():
                continue
            # Self-loop check (X -> X): contributes nothing but infinite parses.
            if len(prod.rhs()) == 1 and prod.lhs() == edge.lhs():
                continue
            candidate = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob())
            if chart.insert(candidate, ()):
                yield candidate
class BetterSEPFR(AbstractChartRule):
    """Single-edge fundamental rule that refuses to combine with edges still
    pending on the agenda (see the EdgeI.pending patch above)."""
    NUM_EDGES = 1
    _fundamental_rule = ProbabilisticFundamentalRule()

    def apply(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 is the left (incomplete) edge; look rightwards for
            # complete edges that supply its next symbol.
            partners = chart.select(start=edge1.end(), is_complete=True,
                                    lhs=edge1.nextsym())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge1, edge2):
                        yield combined
        else:
            # edge1 is the right (complete) edge; look leftwards for
            # incomplete edges expecting its lhs.
            partners = chart.select(end=edge1.start(), is_complete=False,
                                    nextsym=edge1.lhs())
            for edge2 in partners:
                if not edge2.pending:
                    for combined in fr.apply(chart, grammar, edge2, edge1):
                        yield combined
class BetterICP(InsideChartParser):
    '''Implement a more user-friendly InsideChartParser,
    which will show intermediate results, and quit after
    finding a specified number of parses'''
    # NOTE(review): the bare `print "..."` statements below are Python 2 only;
    # this class cannot run unchanged under Python 3.
    def parse(self, tokens, notify=True, max=0):
        '''Run a probabilistic parse of tokens.
        If notify is true, display each complete parse as it is found
        If max>0, quit after finding that many parses

        :param tokens: the sentence to parse, as a sequence of word strings
        :return: iterator over complete ProbabilisticTrees, most probable first
        '''
        self._grammar.check_coverage(tokens)
        chart = Chart(list(tokens))
        chart._trace=self._trace # Bad form. . .
        grammar = self._grammar
        start = grammar.start()
        # (lhs, rhs) -> probability; filled in lazily when a parse is found.
        prod_probs = {}
        # Chart parser rules.
        bu_init = ProbabilisticBottomUpInitRule()
        bu = BetterPBPR() # avoid infinite numbers of parses :-(
        fr = BetterSEPFR() # don't look at pending edges
        # Our queue (the agenda): edges stay 'pending' until popped from it.
        queue = []
        # Initialize the chart.
        for edge in bu_init.apply(chart, grammar):
            if self._trace > 1:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            queue.append(edge)
        found = 0
        while len(queue) > 0 and (max<1 or found<max):
            # Re-sort the queue.
            self.sort_queue(queue, chart)
            # Prune the queue to the correct size if a beam was defined
            if self.beam_size:
                self._prune(queue, chart)
            # Get the best edge.
            edge = queue.pop()
            # Now off the agenda: the fundamental rule may combine with it.
            edge.pending = False
            if self._trace > 0:
                print(' %-50s [%.4g]' % (chart.pretty_format_edge(edge,width=2),
                                         cost(edge.prob())))
            # A complete start-symbol edge spanning the input is a full parse.
            if (edge.start()==0 and
                edge.end()==chart._num_leaves and
                edge.lhs()==start and
                edge.is_complete()):
                if len(prod_probs)==0:
                    # First parse found: cache each production's probability
                    # for _setprob below.
                    for prod in grammar.productions():
                        prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
                if notify:
                    print "****"
                    for tree in chart.trees(edge, tree_class=ProbabilisticTree,
                                            complete=True):
                        self._setprob(tree, prod_probs)
                        print tree, '%.4g (%.4g)'%(cost(tree.prob()),cost(edge.prob()))
                        #print tree
                    print "****"
                found+=1
            # Apply BU & FR to it.
            queue.extend(fr.apply(chart, grammar, edge))
            queue.extend(bu.apply(chart, grammar, edge))
        # Get a list of complete parses.
        parses = list(chart.parses(grammar.start(), ProbabilisticTree))
        if not notify:
            # NOTE(review): prod_probs is only populated once a parse has been
            # seen above; if no parse was found this loop is over an empty list.
            for parse in parses:
                self._setprob(parse,prod_probs)
        # Sort by probability
        parses.sort(reverse=True, key=lambda tree: tree.prob())
        if notify:
            print "%s total parses found"%found
        return iter(parses)
    def _prune(self, queue, chart):
        """ Discard items in the queue if the queue is longer than the beam."""
        if len(queue) > self.beam_size:
            split = len(queue)-self.beam_size
            if self._trace > 2:
                for edge in queue[:split]:
                    print(' %-50s [%.4g DISCARDED]' % (chart.pretty_format_edge(edge,2),
                                                       cost(edge.prob())))
            del queue[:split]
    def beam(self,width):
        # Set the beam width used by _prune (0/None disables pruning).
        self.beam_size=width
# fix buggy NLTK 3 :-(
# different fixes for different versions :-((((
import re, sys
import nltk
from nltk.grammar import _ARROW_RE, _PROBABILITY_RE, _DISJUNCTION_RE, Production
from nltk.draw import CFGEditor
from nltk.probability import ImmutableProbabilisticMixIn
ARROW = u'\u2192'
TOKEN = u'([\\w ]|\\\\((x[0-9a-f][0-9a-f])|(u[0-9a-f][0-9a-f][0-9a-f][0-9a-f])))+'
CFGEditor.ARROW = ARROW
CFGEditor._TOKEN_RE=re.compile(u"->|u?'"+TOKEN+u"'|u?\""+TOKEN+u"\"|\\w+|("+ARROW+u")")
CFGEditor._PRODUCTION_RE=re.compile(ur"(^\s*\w+\s*)" +
ur"(->|("+ARROW+"))\s*" +
ur"((u?'"+TOKEN+"'|u?\""+TOKEN+"\"|''|\"\"|\w+|\|)\s*)*$")
nltk.grammar._TERMINAL_RE = re.compile(ur'( u?"[^"]+" | u?\'[^\']+\' ) \s*', re.VERBOSE)
nltk.grammar._ARROR_RE = re.compile(ur'\s* (->|'+ARROW+') \s*', re.VERBOSE)
from nltk.grammar import _TERMINAL_RE
# Pick the grammar-parsing API for this NLTK version (>=3 / 2.7+ vs older 2.x).
if sys.version_info[0]>2 or sys.version_info[1]>6:
    from nltk.grammar import PCFG, CFG, ProbabilisticProduction as FixPP
    parse_grammar=CFG.fromstring
    parse_pgrammar=PCFG.fromstring
    from nltk import InsideChartParser
    def nbest_parse(self,tokens,n=None):
        """Return up to n parses of tokens (all parses when n is None)."""
        parses=self.parse(tokens)
        if n is None:
            return [parse for parse in parses]
        else:
            # BUG FIX: `parses.next()` is Python-2-only; the builtin next()
            # works on Python 2.6+ and Python 3 iterators alike.
            return [next(parses) for i in range(n)]
    InsideChartParser.nbest_parse=nbest_parse
else:
    from nltk.grammar import WeightedGrammar as PCFG, WeightedProduction as FixPP
    from nltk import parse_cfg, parse_pcfg
    parse_grammar=parse_cfg
    parse_pgrammar=parse_pcfg
def fix_parse_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return
    a list of productions.

    :param line: one grammar line, e.g. ``A -> 'b' B [0.5] | C [0.5]``
    :param nonterm_parser: callable ``(line, pos) -> (Nonterminal, new_pos)``
    :param probabilistic: when true, read trailing ``[p]`` probabilities and
        build FixPP productions instead of plain Productions
    :raises ValueError: on a missing arrow, an unterminated string, or a
        probability greater than 1.0
    """
    pos = 0
    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)
    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()
    # Parse the right hand side: one entry per '|'-separated alternative.
    probabilities = [0.0]
    rhsides = [[]]
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            # group(1) includes the surrounding brackets; strip them.
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1],))
        # String -- add terminal.
        elif (line[pos] in "\'\"" or line[pos:pos+2] in ('u"',"u'")):
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            # NOTE(review): eval() turns the quoted source text into a Python
            # string; tolerable only because _TERMINAL_RE admits string
            # literals, but never feed untrusted grammar text through here.
            rhsides[-1].append(eval(m.group(1)))
            pos = m.end()
        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            pos = m.end()
        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)
            rhsides[-1].append(nonterm)
    if probabilistic:
        return [FixPP(lhs, rhs, prob=probability)
                for (rhs, probability) in zip(rhsides, probabilities)]
    else:
        return [Production(lhs, rhs) for rhs in rhsides]
# Install the fixed rule reader under whichever name this NLTK version uses.
_target = ('_read_production'
           if sys.version_info[0] > 2 or sys.version_info[1] > 6
           else 'parse_production')
setattr(nltk.grammar, _target, fix_parse_production)
class CostedProduction(FixPP):
    """
    A probabilistic context-free grammar production expressed as a cost.

    Like a PCFG ``ProbabilisticProduction``, but constructed from a *cost*
    (a non-negative -log2 probability) rather than from a probability.  The
    cost records how unlikely it is that this production's right-hand side
    is the correct instantiation for any given occurrence of its left-hand
    side.
    :see: ``Production``
    """
    def __init__(self, lhs, rhs, cost):
        """
        Construct a new ``CostedProduction``.
        :param lhs: The left-hand side of the new ``CostedProduction``.
        :type lhs: Nonterminal
        :param rhs: The right-hand side of the new ``CostedProduction``.
        :type rhs: sequence(Nonterminal and terminal)
        :param cost: Cost (-log2 probability) of the new ``CostedProduction``.
        """
        # Store the cost as a log-probability: logprob = -cost.
        ImmutableProbabilisticMixIn.__init__(self, logprob=-cost)
        Production.__init__(self, lhs, rhs)
    def __str__(self):
        # NOTE(review): Production.__unicode__ exists under Python 2's
        # compatibility decorator -- confirm before running on Python 3.
        return Production.__unicode__(self) + \
            (' [0.0]' if (self.logprob() == 0.0) else ' [%g]' % -self.logprob())
    def __repr__(self):
        return '%s'%str(self)
    def cost(self):
        # Recover the non-negative cost from the stored logprob.
        return 0.0 if self.logprob() == 0.0 else -self.logprob()