
Creating a Custom Plugin for Chinese Tokenization in Solr

I am working on properly integrating the Stanford Segmenter into Solr to implement Chinese tokenization.

The plugin involves loading additional jar files and model files. I got it working in a crude way by hardcoding the full paths to those files.

I am looking for a way to build the plugin so that the paths do not need to be hardcoded, and so that the plugin conforms to the Solr plugin architecture. Please let me know if there are any recommended websites or tutorials for this.

I have added my code below:

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class ChineseTokenizerFactory extends TokenizerFactory {

/** Creates a new ChineseTokenizerFactory */
public ChineseTokenizerFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

@Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
    Reader processedStringReader = new ProcessedStringReader(input);
    return new ChineseTokenizer(luceneMatchVersion, factory, processedStringReader);
}

}
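
For context, a factory like this gets registered on a field type in schema.xml, once the plugin jar and the Stanford Segmenter jars have been made visible to Solr (for example via <lib dir="..."/> directives in solrconfig.xml). A minimal sketch, where the field type name text_zh and the com.example package are placeholders of mine, not part of the question's setup:

<!-- Hypothetical schema.xml registration of the factory above -->
<fieldType name="text_zh" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="com.example.ChineseTokenizerFactory"/>
  </analyzer>
</fieldType>

The reader class that performs the actual segmentation follows.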

import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class ProcessedStringReader extends Reader {

private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
static Properties props = null;
static CRFClassifier<CoreLabel> segmenter = null;
private char[] m_inputData = null;
private int m_offset = 0;
private int m_length = 0;

public ProcessedStringReader(Reader input){
    char[] arr = new char[BUFFER_SIZE];
    StringBuilder buf = new StringBuilder();
    int numChars;

    // Lazily load the shared segmenter once; synchronize because Solr may
    // create tokenizers from multiple threads.
    synchronized (ProcessedStringReader.class) {
        if (segmenter == null) {
            segmenter = new CRFClassifier<CoreLabel>(getProperties());
            segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
        }
    }

    try {
        while ((numChars = input.read(arr, 0, arr.length)) > 0) {
            buf.append(arr, 0, numChars);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    m_inputData = processText(buf.toString()).toCharArray();
    m_offset = 0;
    m_length = m_inputData.length;
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    // Copy from the processed buffer into cbuf starting at off
    // ("off" indexes the destination array, not the source data).
    int charNumber = 0;
    while (m_offset < m_length && charNumber < len) {
        cbuf[off + charNumber] = m_inputData[m_offset];
        m_offset++;
        charNumber++;
    }
    if (charNumber == 0) {
        return -1;
    }
    return charNumber;
}
@Override
public void close() throws IOException {
    m_inputData = null;
    m_offset = 0;
    m_length = 0;
}
public String processText(String inputText)
{
    // Join the segmenter's tokens with single spaces so the
    // whitespace-based tokenizer below can split on them.
    List<String> segmented = segmenter.segmentString(inputText);
    StringBuilder output = new StringBuilder();
    for (int i = 0; i < segmented.size(); i++) {
        if (i > 0) {
            output.append(' ');
        }
        output.append(segmented.get(i));
    }
    System.out.println(output); // debug output
    return output.toString();
}
static Properties getProperties()
{
    if (props == null) {
        props = new Properties();
        props.setProperty("sighanCorporaDict", basedir);
        // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
        // props.setProperty("normTableEncoding", "UTF-8");
        // below is needed because CTBSegDocumentIteratorFactory accesses it
        props.setProperty("serDictionary",basedir+"dict-chris6.ser.gz");
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");
    }
    return props;
}
}
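
To sanity-check the reader outside Solr, a quick throwaway test along these lines can be used (ReaderDemo and the sample sentence are illustrative only; the exact split depends on the loaded model):

import java.io.Reader;
import java.io.StringReader;

public class ReaderDemo {
    public static void main(String[] args) throws Exception {
        // Wrap a plain StringReader and read back the segmented text.
        Reader reader = new ProcessedStringReader(new StringReader("我喜欢自然语言处理"));
        char[] buf = new char[256];
        StringBuilder out = new StringBuilder();
        int n;
        while ((n = reader.read(buf, 0, buf.length)) > 0) {
            out.append(buf, 0, n);
        }
        reader.close();
        // Prints space-separated tokens, e.g. "我 喜欢 自然语言 处理".
        System.out.println(out);
    }
}

The tokenizer subclass itself is then trivial:
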
import java.io.Reader;

import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;

public class ChineseTokenizer extends CharTokenizer {

public ChineseTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
}
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
    super(matchVersion, factory, in);
}

/** Collects only characters which do not satisfy
 * {@link Character#isWhitespace(int)}.*/
@Override
protected boolean isTokenChar(int c) {
    return !Character.isWhitespace(c);
}
}

You can pass the parameters through the args parameter of the factory.
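
Following that suggestion, a minimal sketch of an args-aware factory is below. The "basedir" attribute name is an assumption of mine (it only has to match the attribute on the <tokenizer .../> element in schema.xml, e.g. <tokenizer class="com.example.ChineseTokenizerFactory" basedir="/path/to/models/"/>), and it assumes ProcessedStringReader gains an extra constructor that accepts the directory:

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class ChineseTokenizerFactory extends TokenizerFactory {

    // Model directory read from the fieldType definition instead of hardcoded.
    private final String basedir;

    public ChineseTokenizerFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        // require() consumes the attribute and fails fast if it is missing.
        basedir = require(args, "basedir");
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    @Override
    public ChineseTokenizer create(AttributeFactory factory, Reader input) {
        // Assumes a hypothetical ProcessedStringReader(Reader, String basedir)
        // constructor that forwards the directory to the segmenter setup.
        return new ChineseTokenizer(luceneMatchVersion, factory,
                new ProcessedStringReader(input, basedir));
    }
}

With this, the model path lives in the schema rather than in the Java source, which is also how the stock Lucene analysis factories take their configuration.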