Plugins 创建用于中文标记化的自定义插件
我正在努力将斯坦福分词器（Stanford Segmenter）正确地集成到 SOLR 中，以实现中文标记化。这个插件需要加载额外的 jar 文件和模型文件。我目前通过硬编码文件的完整路径，使它以一种粗糙的方式工作。我正在寻找方法来创建路径不需要硬编码的插件，并使插件符合 SOLR 插件体系结构。如果有任何推荐的网站或教程，请告诉我。我的代码如下：

public class ChineseTokenizerFactory extends TokenizerFactory {
/**
 * Creates a new ChineseTokenizerFactory. (The original comment said
 * "WhitespaceTokenizerFactory" — a copy/paste leftover from the template
 * this factory was based on.)
 *
 * @param args factory arguments from the field type definition; this
 *             factory consumes none, so any remaining entry is an error
 * @throws IllegalArgumentException if {@code args} is not empty
 */
public ChineseTokenizerFactory(Map<String,String> args) {
super(args);
// Fails fast if luceneMatchVersion was not supplied to the factory.
assureMatchVersion();
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/**
 * Builds a tokenizer over the given input. The raw stream is first wrapped
 * in a ProcessedStringReader, which runs the Stanford segmenter over the
 * text so the whitespace-based tokenizer can split the result.
 */
@Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
    return new ChineseTokenizer(luceneMatchVersion, factory, new ProcessedStringReader(input));
}
}

您可以通过工厂的 args 参数传递配置参数（例如模型文件的路径），从而避免在代码中硬编码路径。
// Chunk size used when draining the wrapped Reader into memory.
private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
// NOTE(review): hard-coded absolute path — should be supplied via factory
// args or Solr's resource loader instead of being baked into the class.
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
// Lazily-built segmenter configuration, shared by all instances.
static Properties props = null;
// Lazily-loaded Stanford CRF segmenter, shared by all instances.
static CRFClassifier<CoreLabel> segmenter = null;
// Fully segmented text served out through read().
private char[] m_inputData = null;
// Next index into m_inputData to serve.
private int m_offset = 0;
// Total number of characters in m_inputData.
private int m_length = 0;
public ProcessedStringReader(Reader input){
char[] arr = new char[BUFFER_SIZE];
StringBuffer buf = new StringBuffer();
int numChars;
if(segmenter == null)
{
segmenter = new CRFClassifier<CoreLabel>(getProperties());
segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
}
try {
while ((numChars = input.read(arr, 0, arr.length)) > 0) {
buf.append(arr, 0, numChars);
}
} catch (IOException e) {
e.printStackTrace();
}
m_inputData = processText(buf.toString()).toCharArray();
m_offset = 0;
m_length = m_inputData.length;
}
/**
 * Reads up to {@code len} characters into {@code cbuf} starting at index
 * {@code off}, per the {@link java.io.Reader#read(char[], int, int)}
 * contract.
 *
 * Bug fix: the original used {@code off} as an offset into the SOURCE data
 * (skipping input) and always wrote starting at {@code cbuf[0]}; the Reader
 * contract defines {@code off} as the start index in the DESTINATION buffer.
 *
 * @param cbuf destination buffer
 * @param off  start index in {@code cbuf}
 * @param len  maximum number of characters to copy
 * @return number of characters copied, or -1 when all data has been served
 */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    if (m_offset >= m_length) {
        return -1; // end of buffered data
    }
    int n = Math.min(len, m_length - m_offset);
    System.arraycopy(m_inputData, m_offset, cbuf, off, n);
    m_offset += n;
    return n;
}
/** Drops the buffered data; any subsequent read() reports end-of-stream. */
@Override
public void close() throws IOException {
    m_length = 0;
    m_offset = 0;
    m_inputData = null;
}
/**
 * Runs the input through the Stanford segmenter and joins the resulting
 * tokens with single spaces, so a downstream whitespace tokenizer can
 * split them back apart.
 *
 * @param inputText raw Chinese text
 * @return space-delimited segmented text ("" when nothing was segmented)
 */
public String processText(String inputText)
{
    List<String> segmented = segmenter.segmentString(inputText);
    // String.join replaces the original O(n^2) concatenation loop and
    // naturally yields "" for an empty list. The per-call
    // System.out.println debug output was removed — it ran for every
    // document indexed.
    return String.join(" ", segmented);
}
/**
 * Builds (once) and returns the shared Stanford segmenter configuration.
 *
 * Synchronized: {@code props} is a shared static and the original
 * unsynchronized check-then-assign could let two threads race; the
 * properties are also fully populated before being published.
 *
 * @return the shared segmenter {@link Properties}
 */
static synchronized Properties getProperties()
{
    if (props == null) {
        Properties p = new Properties();
        p.setProperty("sighanCorporaDict", basedir);
        // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
        // props.setProperty("normTableEncoding", "UTF-8");
        // below is needed because CTBSegDocumentIteratorFactory accesses it
        p.setProperty("serDictionary", basedir + "dict-chris6.ser.gz");
        p.setProperty("inputEncoding", "UTF-8");
        p.setProperty("sighanPostProcessing", "true");
        props = p;
    }
    return props;
}
/**
 * Creates a ChineseTokenizer with the default attribute factory.
 *
 * @param matchVersion Lucene compatibility version
 * @param in character stream to tokenize
 */
public ChineseTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
/**
 * Creates a ChineseTokenizer using the given attribute factory.
 *
 * @param matchVersion Lucene compatibility version
 * @param factory attribute factory used to create token attributes
 * @param in character stream to tokenize
 */
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/**
 * Collects only characters which do not satisfy
 * {@link Character#isWhitespace(int)} — i.e. a character belongs to a
 * token exactly when it is not whitespace.
 */
@Override
protected boolean isTokenChar(int c) {
    if (Character.isWhitespace(c)) {
        return false;
    }
    return true;
}