Plugins 创建用于中文标记化的自定义插件
我正在努力将斯坦福分词器（Stanford Segmenter）正确地集成到 SOLR 中，以实现中文标记化。这个插件需要加载额外的 jar 文件和模型文件。我目前通过硬编码文件的完整路径，使它以一种粗糙的方式工作。我正在寻找方法来创建路径不需要硬编码的插件，并使插件符合 SOLR 插件体系结构。如果有任何推荐的网站或教程，请告诉我。我的代码如下：

public class ChineseTokenizerFactory extends TokenizerFactory {
/**
 * Creates a new ChineseTokenizerFactory. (The original comment said
 * "WhitespaceTokenizerFactory" — a copy/paste leftover from the template
 * this factory was based on.)
 *
 * @param args factory arguments from the field type definition; this
 *             factory consumes none, so any remaining entry is an error
 * @throws IllegalArgumentException if {@code args} is not empty
 */
public ChineseTokenizerFactory(Map<String,String> args) {
super(args);
// Fails fast if luceneMatchVersion was not supplied to the factory.
assureMatchVersion();
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/**
 * Builds a tokenizer over the given input. The raw stream is first wrapped
 * in a ProcessedStringReader, which runs the Stanford segmenter over the
 * text so the whitespace-based tokenizer can split the result.
 */
@Override
public ChineseTokenizer create(AttributeFactory factory, Reader input) {
    return new ChineseTokenizer(luceneMatchVersion, factory, new ProcessedStringReader(input));
}
}

您可以通过工厂的 args 参数传递配置参数（例如模型文件的路径），从而避免在代码中硬编码路径。
// Chunk size used when draining the wrapped Reader into memory.
private static final int BUFFER_SIZE = 1024 * 8;
//private static TextProcess m_textProcess = null;
// NOTE(review): hard-coded absolute path — should be supplied via factory
// args or Solr's resource loader instead of being baked into the class.
private static final String basedir = "/home/praveen/PDS_Meetup/solr-4.9.0/custom_plugins/";
// Lazily-built segmenter configuration, shared by all instances.
static Properties props = null;
// Lazily-loaded Stanford CRF segmenter, shared by all instances.
static CRFClassifier<CoreLabel> segmenter = null;
// Fully segmented text served out through read().
private char[] m_inputData = null;
// Next index into m_inputData to serve.
private int m_offset = 0;
// Total number of characters in m_inputData.
private int m_length = 0;
public ProcessedStringReader(Reader input){
char[] arr = new char[BUFFER_SIZE];
StringBuffer buf = new StringBuffer();
int numChars;
if(segmenter == null)
{
segmenter = new CRFClassifier<CoreLabel>(getProperties());
segmenter.loadClassifierNoExceptions(basedir + "ctb.gz", getProperties());
}
try {
while ((numChars = input.read(arr, 0, arr.length)) > 0) {
buf.append(arr, 0, numChars);
}
} catch (IOException e) {
e.printStackTrace();
}
m_inputData = processText(buf.toString()).toCharArray();
m_offset = 0;
m_length = m_inputData.length;
}
/**
 * Reads up to {@code len} characters into {@code cbuf} starting at index
 * {@code off}, per the {@link java.io.Reader#read(char[], int, int)}
 * contract.
 *
 * Bug fix: the original used {@code off} as an offset into the SOURCE data
 * (skipping input) and always wrote starting at {@code cbuf[0]}; the Reader
 * contract defines {@code off} as the start index in the DESTINATION buffer.
 *
 * @param cbuf destination buffer
 * @param off  start index in {@code cbuf}
 * @param len  maximum number of characters to copy
 * @return number of characters copied, or -1 when all data has been served
 */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    if (m_offset >= m_length) {
        return -1; // end of buffered data
    }
    int n = Math.min(len, m_length - m_offset);
    System.arraycopy(m_inputData, m_offset, cbuf, off, n);
    m_offset += n;
    return n;
}
/** Drops the buffered data; any subsequent read() reports end-of-stream. */
@Override
public void close() throws IOException {
    m_length = 0;
    m_offset = 0;
    m_inputData = null;
}
/**
 * Runs the input through the Stanford segmenter and joins the resulting
 * tokens with single spaces, so a downstream whitespace tokenizer can
 * split them back apart.
 *
 * @param inputText raw Chinese text
 * @return space-delimited segmented text ("" when nothing was segmented)
 */
public String processText(String inputText)
{
    List<String> segmented = segmenter.segmentString(inputText);
    // String.join replaces the original O(n^2) concatenation loop and
    // naturally yields "" for an empty list. The per-call
    // System.out.println debug output was removed — it ran for every
    // document indexed.
    return String.join(" ", segmented);
}
/**
 * Builds (once) and returns the shared Stanford segmenter configuration.
 *
 * Synchronized: {@code props} is a shared static and the original
 * unsynchronized check-then-assign could let two threads race; the
 * properties are also fully populated before being published.
 *
 * @return the shared segmenter {@link Properties}
 */
static synchronized Properties getProperties()
{
    if (props == null) {
        Properties p = new Properties();
        p.setProperty("sighanCorporaDict", basedir);
        // props.setProperty("NormalizationTable", "data/norm.simp.utf8");
        // props.setProperty("normTableEncoding", "UTF-8");
        // below is needed because CTBSegDocumentIteratorFactory accesses it
        p.setProperty("serDictionary", basedir + "dict-chris6.ser.gz");
        p.setProperty("inputEncoding", "UTF-8");
        p.setProperty("sighanPostProcessing", "true");
        props = p;
    }
    return props;
}
/**
 * Creates a ChineseTokenizer with the default attribute factory.
 *
 * @param matchVersion Lucene compatibility version
 * @param in character stream to tokenize
 */
public ChineseTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
/**
 * Creates a ChineseTokenizer using the given attribute factory.
 *
 * @param matchVersion Lucene compatibility version
 * @param factory attribute factory used to create token attributes
 * @param in character stream to tokenize
 */
public ChineseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/**
 * Collects only characters which do not satisfy
 * {@link Character#isWhitespace(int)} — i.e. a character belongs to a
 * token exactly when it is not whitespace.
 */
@Override
protected boolean isTokenChar(int c) {
    if (Character.isWhitespace(c)) {
        return false;
    }
    return true;
}