用于从输入文本中提取关键字的Java库
我正在寻找一个Java库来从文本块中提取关键字。该过程应如下所示:停用词清理 -> 词干提取 -> 根据英语语言统计信息搜索关键字——也就是说,如果一个单词在文本中出现的频率高于它在英语中通常出现的频率,那么它就是一个候选关键字。(标签:java, nlp, extract, keyword, stemming)
是否有执行此任务的库?这里有一个使用 Apache Lucene 的可能解决方案。我没有使用最新版本,而是使用了 3.6 版(代码中对应 Version.LUCENE_36),因为这是我最熟悉的版本。除了
/lucene-core-x.x.jar
之外,不要忘记将下载的归档文件中的 /contrib/analyzers/common/lucene-analyzers-x.x.jar
添加到您的项目中:它包含特定于语言的分析器(尤其是您案例中的英语分析器)。
请注意,这将仅根据输入文本单词各自的词干查找其频率。之后应将这些频率与英语统计数据进行比较(顺便说一句,可能会有所帮助)
数据模型 一个关键字对应一个词干。不同的单词可能有相同的词干,因此需要一个 terms(术语)集合。
每次添加一个术语时,关键字的频率都会增加(即使该术语之前已经出现过——集合会自动去除重复项)
/**
 * A candidate keyword: one stem together with every distinct term that was
 * recorded for it and the number of times a term was recorded.
 */
public class Keyword implements Comparable<Keyword> {

    /** Stem that identifies this keyword. */
    private final String stem;

    /** Distinct terms recorded for this stem. */
    private final Set<String> terms = new HashSet<String>();

    /** Number of calls to {@link #add(String)}, duplicates included. */
    private int frequency = 0;

    /**
     * Creates a keyword for the given stem with no terms and a frequency of 0.
     *
     * @param stem the stem identifying this keyword
     */
    public Keyword(String stem) {
        this.stem = stem;
    }

    /**
     * Records one occurrence of a term for this stem.
     *
     * @param term the term to record
     */
    public void add(String term) {
        terms.add(term);
        // counted even when the term is already in the set
        frequency++;
    }

    /**
     * Orders keywords by frequency, highest first.
     *
     * @param o the keyword to compare with
     * @return a negative value if this keyword is more frequent than {@code o},
     *         zero if equally frequent, a positive value otherwise
     */
    @Override
    public int compareTo(Keyword o) {
        // descending order
        return Integer.valueOf(o.frequency).compareTo(frequency);
    }

    /**
     * Two keywords are equal when their stems are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        } else if (!(obj instanceof Keyword)) {
            return false;
        } else {
            return stem.equals(((Keyword) obj).stem);
        }
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(new Object[] { stem });
    }

    /** @return the stem identifying this keyword */
    public String getStem() {
        return stem;
    }

    /** @return the set of distinct terms recorded for this stem */
    public Set<String> getTerms() {
        return terms;
    }

    /** @return how many times a term has been recorded */
    public int getFrequency() {
        return frequency;
    }

}
工具方法
对单个术语进行词干提取:
公共静态字符串干(字符串术语)引发IOException{
TokenStream-TokenStream=null;
试一试{
//标记化
tokenStream=新的ClassicTokenizer(Version.LUCENE_36,新的StringReader(术语));
//茎
tokenStream=新PorterStemFilter(tokenStream);
//将每个令牌添加到一个集合中,以便删除重复的令牌
Set stems=newhashset();
CharterMattAttribute令牌=tokenStream.getAttribute(charterMattAttribute.class);
tokenStream.reset();
while(tokenStream.incrementToken()){
steps.add(token.toString());
}
//如果未找到阀杆或2+阀杆,则返回null
if(steps.size()!=1){
返回null;
}
String stem=stems.iterator().next();
//如果阀杆具有非字母数字字符,则返回null
如果(!阀杆匹配(“[a-zA-Z0-9-]+”){
返回null;
}
回流杆;
}最后{
if(令牌流!=null){
tokenStream.close();
}
}
}
要搜索集合(将由潜在关键字列表使用),请执行以下操作:
publicstatict-find(集合集合,T示例){
for(T元素:集合){
if(element.equals(示例)){
返回元素;
}
}
集合。添加(示例);
返回示例;
}
核心
以下是处理输入字符串的主要方法:
公共静态列表guessFromString(字符串输入)引发IOException{
TokenStream-TokenStream=null;
试一试{
//黑客保留虚词(例如,“非特定”而不是“非”和“特定”)
input=input.replaceAll(“-+”,“-0”);
//用空格替换除撇号和破折号以外的任何标点字符
input=input.replaceAll(“[\\p{Punct}&[^'-]]+”,”);
//替换最常见的英语缩略语
input=input.replaceAll(“(?:”(?:[tdsm]|[vr]e | ll))+\\b“,”);
//标记化输入
tokenStream=新的ClassicTokenizer(Version.LUCENE_36,新的StringReader(输入));
//小写
tokenStream=新的小写过滤器(Version.LUCENE_36,tokenStream);
//删除首字母缩略词中的点(和“'s”,但上面已经手动完成)
令牌流=新的ClassicFilter(令牌流);
//将任何字符转换为ASCII
令牌流=新的ASCIIFoldingFilter(令牌流);
//删除英语停止词
tokenStream=新的StopFilter(Version.LUCENE_36,tokenStream,EnglishAnalyzer.getDefaultStopSet());
列表关键字=新建LinkedList();
CharterMattAttribute令牌=tokenStream.getAttribute(charterMattAttribute.class);
tokenStream.reset();
while(tokenStream.incrementToken()){
字符串项=token.toString();
//每学期干一次
串杆=杆(术语);
如果(阀杆!=null){
//创建关键字或获取现有关键字(如果有)
关键字=查找(关键字,新关键字(stem.replaceAll(“-0”,“-”));
//添加其相应的初始标记
关键字.add(term.replaceAll(“-0”和“-”);
}
}
//按频率反向排序
集合。排序(关键字);
返回关键字;
}最后{
if(令牌流!=null){
tokenStream.close();
}
}
}
例子
对 Java 维基百科条目的引言部分使用 guessFromString
方法,以下是找到的前10个最常见的关键字(即词干):
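java         x12    [java]
compil       x5     [compiled, compiler, compilers]
sun          x5     [sun]
develop      x4     [developed, developers]
languag      x3     [languages, language]
implement    x3     [implementation, implementations]
applic       x3     [application, applications]
run          x3     [run]
origin       x3     [originally, original]
gnu          x3     [gnu]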
迭代输出列表,并读取每个关键字的 terms 集合(即上例中方括号 [...] 内显示的内容),即可知道每个词干对应的原始单词。
import java.util.HashSet;
import java.util.Set;
/**
* Keyword card with stem form, terms dictionary and frequency rank
*/
class CardKeyword implements Comparable<CardKeyword> {
/**
* Stem form of the keyword
*/
private final String stem;
/**
* Terms dictionary
*/
private final Set<String> terms = new HashSet<>();
/**
* Frequency rank
*/
private int frequency;
/**
* Build keyword card with stem form
*
* @param stem
*/
public CardKeyword(String stem) {
this.stem = stem;
}
/**
* Add term to the dictionary and update its frequency rank
*
* @param term
*/
public void add(String term) {
this.terms.add(term);
this.frequency++;
}
/**
* Compare two keywords by frequency rank
*
* @param keyword
* @return int, which contains comparison results
*/
@Override
public int compareTo(CardKeyword keyword) {
return Integer.valueOf(keyword.frequency).compareTo(this.frequency);
}
/**
* Get stem's hashcode
*
* @return int, which contains stem's hashcode
*/
@Override
public int hashCode() {
return this.getStem().hashCode();
}
/**
* Check if two stems are equal
*
* @param o
* @return boolean, true if two stems are equal
*/
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof CardKeyword)) return false;
CardKeyword that = (CardKeyword) o;
return this.getStem().equals(that.getStem());
}
/**
* Get stem form of keyword
*
* @return String, which contains getStemForm form
*/
public String getStem() {
return this.stem;
}
/**
* Get terms dictionary of the stem
*
* @return Set<String>, which contains set of terms of the getStemForm
*/
public Set<String> getTerms() {
return this.terms;
}
/**
* Get stem frequency rank
*
* @return int, which contains getStemForm frequency
*/
public int getFrequency() {
return this.frequency;
}
}
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Keywords extractor functionality handler
*/
/**
 * Extracts keyword cards from a text: the text is cleaned with a few regular
 * expressions, tokenized and filtered through a Lucene analysis chain, each
 * surviving token is stemmed, and tokens sharing a stem are grouped into one
 * {@link CardKeyword}.
 */
class KeywordsExtractor {

    /** One or more consecutive dashes. */
    private static final Pattern DASH_RUN = Pattern.compile("-+");

    /** Runs of punctuation other than apostrophes and dashes. */
    private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}&&[^'-]]+");

    /** Most common English contractions ('t, 'd, 's, 'm, 've, 're, 'll) at the end of a word. */
    private static final Pattern CONTRACTIONS = Pattern.compile("(?:'(?:[tdsm]|[vr]e|ll))+\\b");

    /** A stem is accepted only when made of ASCII letters, digits and dashes. */
    private static final Pattern VALID_STEM = Pattern.compile("[a-zA-Z0-9-]+");

    /** Marker substituted for every dash run before tokenizing, turned back into "-" afterwards. */
    private static final String DASH_MARKER = "-0";

    /**
     * Get list of keywords with stem form, frequency rank, and terms dictionary.
     *
     * @param fullText the text to analyze; {@code null} or empty yields an empty list
     * @return the keyword cards, sorted by descending frequency; cards with the
     *         same frequency keep the order in which their stem was first seen
     * @throws IOException if a token stream fails while being read or closed
     */
    static List<CardKeyword> getKeywordsList(String fullText) throws IOException {
        if (fullText == null || fullText.isEmpty()) {
            return new ArrayList<>();
        }

        // mark dashed words so they can be recognised after tokenizing
        // NOTE(review): the marker only helps if the tokenizer keeps "a-0b" as one token;
        // confirm StandardTokenizer does so — TODO verify
        String text = DASH_RUN.matcher(fullText).replaceAll(DASH_MARKER);
        // replace any punctuation char but apostrophes and dashes with a space
        text = PUNCTUATION.matcher(text).replaceAll(" ");
        // drop most common English contractions
        text = CONTRACTIONS.matcher(text).replaceAll("");

        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(text));

        // keyed by stem for constant-time lookup; insertion order is kept so that
        // the stable sort below leaves equally frequent cards in first-seen order
        Map<String, CardKeyword> cardsByStem = new LinkedHashMap<>();

        // chain: lower-case -> ClassicFilter -> ASCII folding -> English stop-word removal
        try (TokenStream tokenStream = new StopFilter(
                new ASCIIFoldingFilter(new ClassicFilter(new LowerCaseFilter(tokenizer))),
                EnglishAnalyzer.getDefaultStopSet())) {
            CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = token.toString();
                String stem = getStemForm(term);
                if (stem == null) {
                    continue;
                }
                String key = stem.replace(DASH_MARKER, "-");
                CardKeyword cardKeyword = cardsByStem.get(key);
                if (cardKeyword == null) {
                    cardKeyword = new CardKeyword(key);
                    cardsByStem.put(key, cardKeyword);
                }
                // turn the marker back into a plain dash before storing the term
                cardKeyword.add(term.replace(DASH_MARKER, "-"));
            }
            // complete the TokenStream workflow before the stream is closed
            tokenStream.end();
        }

        List<CardKeyword> cardKeywords = new ArrayList<>(cardsByStem.values());
        // CardKeyword's natural order puts the most frequent first
        Collections.sort(cardKeywords);
        return cardKeywords;
    }

    /**
     * Get stem form of the term.
     *
     * @param term the term to stem
     * @return the stem, or {@code null} when the term does not yield exactly one
     *         stem or the stem contains characters other than ASCII letters,
     *         digits and dashes
     * @throws IOException if the token stream fails while being read or closed
     */
    private static String getStemForm(String term) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(term));

        // a set, so that identical stems coming from several tokens count once
        Set<String> stems = new HashSet<>();
        try (TokenStream tokenStream = new PorterStemFilter(tokenizer)) {
            CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                stems.add(token.toString());
            }
            tokenStream.end();
        }

        // no stem, or two or more distinct stems: the term is not usable
        if (stems.size() != 1) {
            return null;
        }
        String stem = stems.iterator().next();
        return VALID_STEM.matcher(stem).matches() ? stem : null;
    }
}
// Usage example: pass the text to analyze to KeywordsExtractor.getKeywordsList;
// the returned list holds one CardKeyword per stem, sorted by descending frequency.
String text = "…";
List<CardKeyword> keywordsList = KeywordsExtractor.getKeywordsList(text);