Java 为什么斯坦福CoreNLP服务器将命名实体拆分为单个令牌?
我使用这个命令发布数据(斯坦福网站的一点复制): 响应如下所示:Java 为什么斯坦福CoreNLP服务器将命名实体拆分为单个令牌?,java,nlp,stanford-nlp,Java,Nlp,Stanford Nlp,我使用这个命令发布数据(斯坦福网站的一点复制): 响应如下所示: { "sentences": [{ "index": 0, "tokens": [{ "index": 1, "word": "Barack", "originalText": "Barack", "lemma": "Barack", "characterOffsetBeg
{
"sentences": [{
"index": 0,
"tokens": [{
"index": 1,
"word": "Barack",
"originalText": "Barack",
"lemma": "Barack",
"characterOffsetBegin": 0,
"characterOffsetEnd": 6,
"pos": "NNP",
"ner": "PERSON",
"before": "",
"after": " "
}, {
"index": 2,
"word": "Obama",
"originalText": "Obama",
"lemma": "Obama",
"characterOffsetBegin": 7,
"characterOffsetEnd": 12,
"pos": "NNP",
"ner": "PERSON",
"before": " ",
"after": " "
}, {
"index": 3,
"word": "was",
"originalText": "was",
"lemma": "be",
"characterOffsetBegin": 13,
"characterOffsetEnd": 16,
"pos": "VBD",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 4,
"word": "President",
"originalText": "President",
"lemma": "President",
"characterOffsetBegin": 17,
"characterOffsetEnd": 26,
"pos": "NNP",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 5,
"word": "of",
"originalText": "of",
"lemma": "of",
"characterOffsetBegin": 27,
"characterOffsetEnd": 29,
"pos": "IN",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 6,
"word": "the",
"originalText": "the",
"lemma": "the",
"characterOffsetBegin": 30,
"characterOffsetEnd": 33,
"pos": "DT",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 7,
"word": "United",
"originalText": "United",
"lemma": "United",
"characterOffsetBegin": 34,
"characterOffsetEnd": 40,
"pos": "NNP",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 8,
"word": "States",
"originalText": "States",
"lemma": "States",
"characterOffsetBegin": 41,
"characterOffsetEnd": 47,
"pos": "NNPS",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 9,
"word": "of",
"originalText": "of",
"lemma": "of",
"characterOffsetBegin": 48,
"characterOffsetEnd": 50,
"pos": "IN",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 10,
"word": "America",
"originalText": "America",
"lemma": "America",
"characterOffsetBegin": 51,
"characterOffsetEnd": 58,
"pos": "NNP",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 11,
"word": "in",
"originalText": "in",
"lemma": "in",
"characterOffsetBegin": 59,
"characterOffsetEnd": 61,
"pos": "IN",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 12,
"word": "2016",
"originalText": "2016",
"lemma": "2016",
"characterOffsetBegin": 62,
"characterOffsetEnd": 66,
"pos": "CD",
"ner": "DATE",
"normalizedNER": "2016",
"before": " ",
"after": "",
"timex": {
"tid": "t1",
"type": "DATE",
"value": "2016"
}
}]
}]
}
我做错什么了吗?我有一段 Java 客户端代码,至少可以将
Barack Obama
和 United States of America
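识别为完整的命名实体(NER),但使用该服务器时,它似乎会将每个令牌分别对待。你知道为什么吗?你应该将 entitymentions 注释器(annotator)添加到注释器列表中:wget --post-data 'Barack Obama was President of the United States of America in 2016' 'localhost:9000/?properties={"annotators": "ner,entitymentions", "outputFormat": "json"}' -O out.json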
{
"sentences": [{
"index": 0,
"tokens": [{
"index": 1,
"word": "Barack",
"originalText": "Barack",
"lemma": "Barack",
"characterOffsetBegin": 0,
"characterOffsetEnd": 6,
"pos": "NNP",
"ner": "PERSON",
"before": "",
"after": " "
}, {
"index": 2,
"word": "Obama",
"originalText": "Obama",
"lemma": "Obama",
"characterOffsetBegin": 7,
"characterOffsetEnd": 12,
"pos": "NNP",
"ner": "PERSON",
"before": " ",
"after": " "
}, {
"index": 3,
"word": "was",
"originalText": "was",
"lemma": "be",
"characterOffsetBegin": 13,
"characterOffsetEnd": 16,
"pos": "VBD",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 4,
"word": "President",
"originalText": "President",
"lemma": "President",
"characterOffsetBegin": 17,
"characterOffsetEnd": 26,
"pos": "NNP",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 5,
"word": "of",
"originalText": "of",
"lemma": "of",
"characterOffsetBegin": 27,
"characterOffsetEnd": 29,
"pos": "IN",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 6,
"word": "the",
"originalText": "the",
"lemma": "the",
"characterOffsetBegin": 30,
"characterOffsetEnd": 33,
"pos": "DT",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 7,
"word": "United",
"originalText": "United",
"lemma": "United",
"characterOffsetBegin": 34,
"characterOffsetEnd": 40,
"pos": "NNP",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 8,
"word": "States",
"originalText": "States",
"lemma": "States",
"characterOffsetBegin": 41,
"characterOffsetEnd": 47,
"pos": "NNPS",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 9,
"word": "of",
"originalText": "of",
"lemma": "of",
"characterOffsetBegin": 48,
"characterOffsetEnd": 50,
"pos": "IN",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 10,
"word": "America",
"originalText": "America",
"lemma": "America",
"characterOffsetBegin": 51,
"characterOffsetEnd": 58,
"pos": "NNP",
"ner": "LOCATION",
"before": " ",
"after": " "
}, {
"index": 11,
"word": "in",
"originalText": "in",
"lemma": "in",
"characterOffsetBegin": 59,
"characterOffsetEnd": 61,
"pos": "IN",
"ner": "O",
"before": " ",
"after": " "
}, {
"index": 12,
"word": "2016",
"originalText": "2016",
"lemma": "2016",
"characterOffsetBegin": 62,
"characterOffsetEnd": 66,
"pos": "CD",
"ner": "DATE",
"normalizedNER": "2016",
"before": " ",
"after": "",
"timex": {
"tid": "t1",
"type": "DATE",
"value": "2016"
}
}]
}]
}