Java 向语言工具建议列表添加单词
我使用LanguageTool在我的应用程序中执行一些拼写检查和拼写更正功能 LanguageTool描述了如何从拼写检查中排除单词(使用正在使用的拼写检查规则的Java 向语言工具建议列表添加单词,java,spell-checking,languagetool,Java,Spell Checking,Languagetool,我使用LanguageTool在我的应用程序中执行一些拼写检查和拼写更正功能 LanguageTool描述了如何从拼写检查中排除单词(使用正在使用的拼写检查规则的addIgnoreTokens(…)方法调用) 如何在拼写检查中添加一些单词(例如,来自特定词典)?也就是说,LanguageTool可以修复拼写错误的单词并从我的特定词典中推荐单词吗?不幸的是,我认为API不支持这一点。在没有API的情况下,您可以将单词添加到spelling.txt,以使它们被接受并用作建议。使用API,您可能需要扩
addIgnoreTokens(…)
方法调用)
如何在拼写检查中添加一些单词(例如,来自特定词典)?也就是说,LanguageTool可以修复拼写错误的单词并从我的特定词典中推荐单词吗?不幸的是,我认为API不支持这一点。在没有API的情况下,您可以将单词添加到
spelling.txt
,以使它们被接受并用作建议。使用API,您可能需要扩展MorfologikSpellerRule并进行更改。(披露:我是LanguageTool的维护者)我有类似的要求,那就是将一些自定义词作为“建议词”加载到词典中,而不仅仅是“忽略词”。最后,我扩展了MorfologikSpellerRule来实现这一点:
- 创建类MorfologikSpellerRuleEx从MorfologikSpellerRule扩展而来,重写方法“match()”,并编写我自己的“initSpellerEx()”来创建拼写器
- 然后,对于语言工具,创建此自定义拼写器规则以替换现有规则
public class MorfologikSpellerRuleEx extends MorfologikSpellerRule {
private String spellingFilePath = null;
private boolean ignoreTaggedWords = false;
/**
 * Creates the extended speller rule.
 *
 * @param messages localization bundle passed through to the base rule
 * @param language language whose dictionaries this rule checks against
 * @throws IOException if the base rule cannot load its resources
 */
public MorfologikSpellerRuleEx(ResourceBundle messages, Language language) throws IOException {
super(messages, language);
}
/**
 * Resource path of the binary Morfologik dictionary used by this rule.
 * Hard-wired to the bundled US-English dictionary; resolved through
 * JLanguageTool's data broker during lazy initialization.
 */
@Override
public String getFileName() {
    final String binaryDictPath = "/en/hunspell/en_US.dict";
    return binaryDictPath;
}
/**
 * Unique rule ID, distinct from the stock Morfologik rule's ID so both
 * rules can be registered side by side without clashing.
 */
@Override
public String getId() {
    final String ruleId = "MORFOLOGIK_SPELLING_RULE_EX";
    return ruleId;
}
/**
 * Tells this rule to skip any token that already carries a POS reading,
 * i.e. words the tagger recognizes are treated as correctly spelled
 * (see canBeIgnored()).
 */
@Override
public void setIgnoreTaggedWords() {
    this.ignoreTaggedWords = true;
}
/**
 * @return path of the user-supplied plain-text dictionary file, or
 *         {@code null} if none was configured
 */
public String getSpellingFilePath() {
    return this.spellingFilePath;
}
/**
 * Configures a plain-text dictionary file whose words should be accepted
 * and offered as suggestions. The file is loaded lazily on the first
 * call to match().
 *
 * @param spellingFilePath path of the word-list file (one word per line
 *        is assumed by the Morfologik speller — TODO confirm format)
 */
public void setSpellingFilePath(String spellingFilePath) {
    this.spellingFilePath = spellingFilePath;
}
/**
 * Initializes the three Morfologik spellers (max edit distances 1-3),
 * optionally merging the words from the user-supplied plain-text
 * dictionary ({@link #setSpellingFilePath(String)}) so they are both
 * accepted and offered as suggestions.
 *
 * @param binaryDict resource path of the binary Morfologik dictionary
 * @throws IOException if a speller cannot be built from the dictionaries
 */
private void initSpellerEx(String binaryDict) throws IOException {
    // Locate the bundled plain-text spelling file (e.g. spelling.txt).
    String plainTextDict = null;
    if (JLanguageTool.getDataBroker().resourceExists(getSpellingFileName())) {
        plainTextDict = getSpellingFileName();
    }
    if (plainTextDict == null) {
        // Keep the original failure mode: the bundled spelling file is mandatory.
        throw new RuntimeException("Could not find ignore spell file in path: " + getSpellingFileName());
    }
    boolean customDictLoaded = false;
    if (this.spellingFilePath != null) {
        // BUG FIX: the previous code passed ONE BufferedReader to all three
        // speller constructors; the first constructor drained the stream, so
        // spellers 2 and 3 never saw the custom words. Each speller now gets
        // its own freshly opened reader. try-with-resources also closes the
        // readers even when a constructor throws (the old code leaked br in
        // that case, since close() was not in a finally block).
        try (BufferedReader br1 = new BufferedReader(new FileReader(this.spellingFilePath));
             BufferedReader br2 = new BufferedReader(new FileReader(this.spellingFilePath));
             BufferedReader br3 = new BufferedReader(new FileReader(this.spellingFilePath))) {
            speller1 = new MorfologikMultiSpeller(binaryDict, br1, plainTextDict, 1);
            speller2 = new MorfologikMultiSpeller(binaryDict, br2, plainTextDict, 2);
            speller3 = new MorfologikMultiSpeller(binaryDict, br3, plainTextDict, 3);
            customDictLoaded = true;
        } catch (IOException e) {
            // Custom dictionary missing or unreadable: fall back to the bundled
            // dictionaries only, mirroring the original silent-fallback behavior
            // (but scoped to IOException instead of swallowing everything).
            customDictLoaded = false;
        }
    }
    if (!customDictLoaded) {
        speller1 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 1);
        speller2 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 2);
        speller3 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 3);
    }
    setConvertsCase(speller1.convertsCase());
}
/**
 * Decides whether the speller should skip this token entirely.
 * A token is skipped when it is the sentence-start marker, immunized,
 * explicitly hidden from the speller, a URL or e-mail address, a
 * POS-tagged word (when ignoreTaggedWords is enabled), or matched by
 * the base rule's ignore logic.
 */
private boolean canBeIgnored(AnalyzedTokenReadings[] tokens, int idx, AnalyzedTokenReadings token)
        throws IOException {
    // Structural or explicitly shielded tokens.
    if (token.isSentenceStart() || token.isImmunized() || token.isIgnoredBySpeller()) {
        return true;
    }
    // URLs and e-mail addresses are never spell-checked.
    String surface = token.getToken();
    if (isUrl(surface) || isEMail(surface)) {
        return true;
    }
    // Optionally trust the tagger: words with a POS reading count as correct.
    if (ignoreTaggedWords && token.isTagged()) {
        return true;
    }
    // Finally, delegate to the base rule's ignore logic.
    return ignoreToken(tokens, idx);
}
/**
 * Spell-checks one analyzed sentence and returns the matches
 * (misspelled words plus their suggestions).
 *
 * Mirrors the base implementation except that lazy initialization goes
 * through initSpellerEx(), which can merge the custom dictionary set via
 * setSpellingFilePath() into the suggestion engine.
 *
 * @param sentence the sentence to check
 * @return array of rule matches (empty when nothing is misspelled)
 * @throws IOException if the spellers cannot be initialized
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
// lazy init
if (speller1 == null) {
String binaryDict = null;
if (JLanguageTool.getDataBroker().resourceExists(getFileName())) {
binaryDict = getFileName();
}
if (binaryDict != null) {
initSpellerEx(binaryDict); //here's the change
} else {
// should not happen, as we only configure this rule (or rather its subclasses)
// when we have the resources:
return toRuleMatchArray(ruleMatches);
}
}
int idx = -1;
for (AnalyzedTokenReadings token : tokens) {
idx++;
// Skip sentence starts, immunized/ignored tokens, URLs, e-mails and
// (optionally) POS-tagged words -- see canBeIgnored().
if (canBeIgnored(tokens, idx, token)) {
continue;
}
// if we use token.getToken() we'll get ignored characters inside and speller
// will choke
String word = token.getAnalyzedToken(0).getToken();
if (tokenizingPattern() == null) {
// No sub-tokenization configured: check the whole token at once.
ruleMatches.addAll(getRuleMatches(word, token.getStartPos(), sentence));
} else {
// Split the token on the rule's tokenizing pattern and check each piece,
// keeping character offsets relative to the sentence start.
int index = 0;
Matcher m = tokenizingPattern().matcher(word);
while (m.find()) {
String match = word.subSequence(index, m.start()).toString();
ruleMatches.addAll(getRuleMatches(match, token.getStartPos() + index, sentence));
index = m.end();
}
if (index == 0) { // tokenizing char not found
ruleMatches.addAll(getRuleMatches(word, token.getStartPos(), sentence));
} else {
// Check the tail segment after the last separator match.
ruleMatches.addAll(getRuleMatches(word.subSequence(index, word.length()).toString(),
token.getStartPos() + index, sentence));
}
}
}
return toRuleMatchArray(ruleMatches);
}
公共类MorfologikSpillerRulex扩展了MorfologikSpillerRule{
私有字符串拼写FilePath=null;
私有布尔ignoreTaggedWords=false;
公共morfologikspellerrulex(ResourceBundle消息、语言)引发IOException{
超级(信息、语言);
}
@凌驾
公共字符串getFileName(){
返回“/en/hunspell/en_US.dict”;
}
@凌驾
公共字符串getId(){
返回“MORFOLOGIK\u拼写规则\u EX”;
}
@凌驾
public void setIgnoreTaggedWords(){
ignoreTaggedWords=真;
}
公共字符串getSpellingFilePath(){
返回拼写文件路径;
}
public void setSpellingFilePath(字符串spellingFilePath){
this.spellingFilePath=spellingFilePath;
}
私有void initSpellerEx(字符串binaryDict)引发IOException{
字符串plainTextDict=null;
如果(JLanguageTool.getDataBroker().resourceExists(getSpellingFileName())){
plainTextDict=getSpellingFileName();
}
if(明文dict!=null){
BufferedReader br=null;
if(this.spellingFilePath!=null){
试一试{
br=新的BufferedReader(新文件读取器(this.spellingFilePath));
}
捕获(例外e){
br=null;
}
}
如果(br!=null){
speller1=新的MorfologyMultiSpeller(二进制dict,br,明文dict,1);
speller2=新的MorfologyMultiSpeller(二进制dict,br,明文dict,2);
speller3=新的MorfologyMultiSpeller(二进制dict,br,明文dict,3);
br.close();
}
否则{
speller1=新的MorfologyMultiSpeller(二进制dict,明文dict,1);
speller2=新的MorfologyMultiSpeller(二进制dict,明文dict,2);
speller3=新的MorfologyMultiSpeller(二进制dict,明文dict,3);
}
setConvertsCase(拼写器1.convertsCase());
}否则{
抛出新的RuntimeException(“在路径:+getSpellingFileName()中找不到忽略拼写文件”);
}
}
私有布尔值可以忽略(AnalyzedTokenReadings[]标记,int idx,AnalyzedTokenReadings标记)
抛出IOException{
返回token.isSentenceStart()| | | token.isImmunized()| | | token.isIgnoredBySpeller()| | isUrl(token.getToken())
||isEMail(token.getToken())| |(ignoreTaggedWords&&token.istaged())| | ignoreToken(tokens,idx);
}
@凌驾
公共规则匹配[]匹配(AnalyzedEntence语句)引发IOException{
List ruleMatches=new ArrayList();
AnalyzedTokenRedings[]tokens=getSentenceWithImmunization(句子)。GetTokenWithWithHiteSpace();
//惰性初始化
if(拼写器1==null){
字符串binaryDict=null;
如果(JLanguageTool.getDataBroker().resourceExists(getFileName())){
binaryDict=getFileName();
}
if(二进制dict!=null){
initSpellerEx(binaryDict);//这是更改
}否则{
//不应该发生,因为我们只配置此规则(或其子类)
//当我们有资源时:
返回圆环匹配数组(ruleMatches);
}
}
intidx=-1;
for(AnalyzedTokenReadings令牌:令牌){
idx++;
if(可识别(令牌、idx、令牌)){
继续;
}
//如果我们使用token.getToken()我们将在和拼写器中获得被忽略的字符
//会窒息
String word=token.getAnalyzedToken(0.getToken();
if(tokenizingPattern()==null){
addAll(getRuleMatches(word,token.getStartPos(),句子));
}否则{
int指数=0;
Matcher m=tokenizingPattern().Matcher(word);
while(m.find()){
字符串匹配=word.subSequence(index,m.start()).toString();
addAll(getRuleMatches(match,token.getStartPos()+索引,句子));
索引=m.end();
}
如果(索引==0){//未找到标记化字符
addAll(getRuleMatches(word,token.getStartPos(),句子));
}否则{
ruleMatches.addAll(getRuleMatches(word.subSequence(index,word.length()).toString(),
getStartPos()+索引,句子));
}
}
}
返回圆环匹配数组(ruleMatches);
}
}非常感谢!据我所知,如果我将单词添加到spelling.txt,那么它将导致仅在初始化时将这些单词添加到建议列表?若我在正在运行的应用程序中添加单词,我需要扩展MorfologikSpellerRule(并更改代码)?是的,就是这样。我创建了一个类并扩展了MorfologikSpellerRule,但什么时候
public class MorfologikSpellerRuleEx extends MorfologikSpellerRule {
private String spellingFilePath = null;
private boolean ignoreTaggedWords = false;
/**
 * Creates the extended speller rule.
 *
 * @param messages localization bundle passed through to the base rule
 * @param language language whose dictionaries this rule checks against
 * @throws IOException if the base rule cannot load its resources
 */
public MorfologikSpellerRuleEx(ResourceBundle messages, Language language) throws IOException {
super(messages, language);
}
/**
 * Resource path of the binary Morfologik dictionary used by this rule.
 * Hard-wired to the bundled US-English dictionary; resolved through
 * JLanguageTool's data broker during lazy initialization.
 */
@Override
public String getFileName() {
    final String binaryDictPath = "/en/hunspell/en_US.dict";
    return binaryDictPath;
}
/**
 * Unique rule ID, distinct from the stock Morfologik rule's ID so both
 * rules can be registered side by side without clashing.
 */
@Override
public String getId() {
    final String ruleId = "MORFOLOGIK_SPELLING_RULE_EX";
    return ruleId;
}
/**
 * Tells this rule to skip any token that already carries a POS reading,
 * i.e. words the tagger recognizes are treated as correctly spelled
 * (see canBeIgnored()).
 */
@Override
public void setIgnoreTaggedWords() {
    this.ignoreTaggedWords = true;
}
/**
 * @return path of the user-supplied plain-text dictionary file, or
 *         {@code null} if none was configured
 */
public String getSpellingFilePath() {
    return this.spellingFilePath;
}
/**
 * Configures a plain-text dictionary file whose words should be accepted
 * and offered as suggestions. The file is loaded lazily on the first
 * call to match().
 *
 * @param spellingFilePath path of the word-list file (one word per line
 *        is assumed by the Morfologik speller — TODO confirm format)
 */
public void setSpellingFilePath(String spellingFilePath) {
    this.spellingFilePath = spellingFilePath;
}
/**
 * Initializes the three Morfologik spellers (max edit distances 1-3),
 * optionally merging the words from the user-supplied plain-text
 * dictionary ({@link #setSpellingFilePath(String)}) so they are both
 * accepted and offered as suggestions.
 *
 * @param binaryDict resource path of the binary Morfologik dictionary
 * @throws IOException if a speller cannot be built from the dictionaries
 */
private void initSpellerEx(String binaryDict) throws IOException {
    // Locate the bundled plain-text spelling file (e.g. spelling.txt).
    String plainTextDict = null;
    if (JLanguageTool.getDataBroker().resourceExists(getSpellingFileName())) {
        plainTextDict = getSpellingFileName();
    }
    if (plainTextDict == null) {
        // Keep the original failure mode: the bundled spelling file is mandatory.
        throw new RuntimeException("Could not find ignore spell file in path: " + getSpellingFileName());
    }
    boolean customDictLoaded = false;
    if (this.spellingFilePath != null) {
        // BUG FIX: the previous code passed ONE BufferedReader to all three
        // speller constructors; the first constructor drained the stream, so
        // spellers 2 and 3 never saw the custom words. Each speller now gets
        // its own freshly opened reader. try-with-resources also closes the
        // readers even when a constructor throws (the old code leaked br in
        // that case, since close() was not in a finally block).
        try (BufferedReader br1 = new BufferedReader(new FileReader(this.spellingFilePath));
             BufferedReader br2 = new BufferedReader(new FileReader(this.spellingFilePath));
             BufferedReader br3 = new BufferedReader(new FileReader(this.spellingFilePath))) {
            speller1 = new MorfologikMultiSpeller(binaryDict, br1, plainTextDict, 1);
            speller2 = new MorfologikMultiSpeller(binaryDict, br2, plainTextDict, 2);
            speller3 = new MorfologikMultiSpeller(binaryDict, br3, plainTextDict, 3);
            customDictLoaded = true;
        } catch (IOException e) {
            // Custom dictionary missing or unreadable: fall back to the bundled
            // dictionaries only, mirroring the original silent-fallback behavior
            // (but scoped to IOException instead of swallowing everything).
            customDictLoaded = false;
        }
    }
    if (!customDictLoaded) {
        speller1 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 1);
        speller2 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 2);
        speller3 = new MorfologikMultiSpeller(binaryDict, plainTextDict, 3);
    }
    setConvertsCase(speller1.convertsCase());
}
/**
 * Decides whether the speller should skip this token entirely.
 * A token is skipped when it is the sentence-start marker, immunized,
 * explicitly hidden from the speller, a URL or e-mail address, a
 * POS-tagged word (when ignoreTaggedWords is enabled), or matched by
 * the base rule's ignore logic.
 */
private boolean canBeIgnored(AnalyzedTokenReadings[] tokens, int idx, AnalyzedTokenReadings token)
        throws IOException {
    // Structural or explicitly shielded tokens.
    if (token.isSentenceStart() || token.isImmunized() || token.isIgnoredBySpeller()) {
        return true;
    }
    // URLs and e-mail addresses are never spell-checked.
    String surface = token.getToken();
    if (isUrl(surface) || isEMail(surface)) {
        return true;
    }
    // Optionally trust the tagger: words with a POS reading count as correct.
    if (ignoreTaggedWords && token.isTagged()) {
        return true;
    }
    // Finally, delegate to the base rule's ignore logic.
    return ignoreToken(tokens, idx);
}
/**
 * Spell-checks one analyzed sentence and returns the matches
 * (misspelled words plus their suggestions).
 *
 * Mirrors the base implementation except that lazy initialization goes
 * through initSpellerEx(), which can merge the custom dictionary set via
 * setSpellingFilePath() into the suggestion engine.
 *
 * @param sentence the sentence to check
 * @return array of rule matches (empty when nothing is misspelled)
 * @throws IOException if the spellers cannot be initialized
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
List<RuleMatch> ruleMatches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();
// lazy init
if (speller1 == null) {
String binaryDict = null;
if (JLanguageTool.getDataBroker().resourceExists(getFileName())) {
binaryDict = getFileName();
}
if (binaryDict != null) {
initSpellerEx(binaryDict); //here's the change
} else {
// should not happen, as we only configure this rule (or rather its subclasses)
// when we have the resources:
return toRuleMatchArray(ruleMatches);
}
}
int idx = -1;
for (AnalyzedTokenReadings token : tokens) {
idx++;
// Skip sentence starts, immunized/ignored tokens, URLs, e-mails and
// (optionally) POS-tagged words -- see canBeIgnored().
if (canBeIgnored(tokens, idx, token)) {
continue;
}
// if we use token.getToken() we'll get ignored characters inside and speller
// will choke
String word = token.getAnalyzedToken(0).getToken();
if (tokenizingPattern() == null) {
// No sub-tokenization configured: check the whole token at once.
ruleMatches.addAll(getRuleMatches(word, token.getStartPos(), sentence));
} else {
// Split the token on the rule's tokenizing pattern and check each piece,
// keeping character offsets relative to the sentence start.
int index = 0;
Matcher m = tokenizingPattern().matcher(word);
while (m.find()) {
String match = word.subSequence(index, m.start()).toString();
ruleMatches.addAll(getRuleMatches(match, token.getStartPos() + index, sentence));
index = m.end();
}
if (index == 0) { // tokenizing char not found
ruleMatches.addAll(getRuleMatches(word, token.getStartPos(), sentence));
} else {
// Check the tail segment after the last separator match.
ruleMatches.addAll(getRuleMatches(word.subSequence(index, word.length()).toString(),
token.getStartPos() + index, sentence));
}
}
}
return toRuleMatchArray(ruleMatches);
}