Solr:根据词典(Dictionary)中的单词在正文中自动生成链接

Solr:从单词Dictionary在正文上自动链接,dictionary,solr,Dictionary,Solr,我正在寻找生成自动链接在身体的solr结果。链接上的单词必须在词汇表中 例如: 一份文件: <doc> [...] <str name="title">Il faut, quand on gouverne, voir les hommes tels qu’ils sont, et les choses telles qu’elles devraient être.</str> <str name="path">citat

我想在 Solr 搜索结果的正文(body)中自动生成链接。只有出现在词典中的单词才会被加上链接。

例如:

一份文件:

<doc>

    [...]

    <str name="title">Il faut, quand on gouverne, voir les hommes tels qu’ils sont, et les choses telles qu’elles devraient être.</str>
    <str name="path">citation/faut-gouverne-voir-hommes-tels-choses-telles-devraient-etre-15.php</str>
    <str name="ss_field_citation_keywords">#faut#gouverne#voir#hommes#tels#choses#telles#devraient#etre#</str>

    [...]
</doc>
要生成链接的单词来自 ss_field_citation_keywords 字段:

#faut#gouverne#voir#hommes#tels#choses#telles#devraient#etre#
正文必须如下所示:

Il <a href="foo/faut">faut</a>, quand on <a href="foo/gouverne">gouverne</a>, <a href="foo/voir">voir</a> les <a href="foo/hommes">hommes</a> <a href="foo/tels">tels</a> qu’ils sont, et les <a href="foo/choses">choses</a> <a href="foo/telles">telles</a> qu’elles <a href="foo/devraient">devraient</a> <a href="foo/etre">être</a>.
渲染后显示为:Il faut, quand on gouverne, voir les hommes tels qu’ils sont, et les choses telles qu’elles devraient être.(其中的关键词均为链接)


有什么想法吗?

这里有两个阶段:

  • 识别关键词。为此,您需要正确构建分析器(analyzer)链:空格分词器(whitespace tokenizer)、小写过滤器(lowercase filter),以及——这是关键部分——KeepWordFilterFactory。这样 Solr 就只会保留文本中属于关键词的词元及其偏移量
  • 获取这些偏移量。方法可能有好几种,其中之一是复用字段分析器(Field Analyzer),您可以在较新版本(4+)Solr 的管理界面(Admin Web UI)中试用它,注意勾选 verbose(详细)复选框。它使用的是 /analysis/field 端点,您也可以直接调用该端点(带上 verbose 标志)。返回结果对您来说可能过于冗长,但作为起点已经足够。之后,您可以寻找更好的实现,或者复制并精简现有的实现

  • 使用velocity和java类进行内部处理的建议

    public class autoLinkCitationDirective extends Directive{
    
    public String getName() {
        return "autolinkcitation";
    }
    
    public int getType() {
        return LINE;
    }
    
    public boolean render(InternalContextAdapter context, Writer writer, Node node)
            throws IOException, ResourceNotFoundException, ParseErrorException, MethodInvocationException {
    
        String CitationMe   = null;
        String KeyWords     = null;
        String SchemaUrl    = null;
    
        //params
        if (node.jjtGetChild(0) != null) {
            CitationMe = String.valueOf(node.jjtGetChild(0).value(context));
        }
        if (node.jjtGetChild(1) != null) {
            KeyWords = String.valueOf(node.jjtGetChild(1).value(context));
        }
    
        //schema url
        if (node.jjtGetChild(2) != null) {
            SchemaUrl = String.valueOf(node.jjtGetChild(2).value(context));
        }
    
        writer.write(autoLinkCitation(CitationMe, KeyWords, SchemaUrl));
    
        return true;
    }
    
    public String autoLinkCitation(String CitationMe, String KeyWords, String SchemaUrl) {
        if (CitationMe == null) {
            return null;
        }
    
        List<String> tokens = new ArrayList<String>();
        StringTokenizer stkKeyWords = new StringTokenizer(KeyWords, "#");
        while ( stkKeyWords.hasMoreTokens() ) {
            tokens.add(stkKeyWords.nextToken());
        }
    
    
        String patternString = "\\b(" + StringUtils.join(tokens, "|") + ")\\b";
        Pattern pattern = Pattern.compile(patternString);
    
        String strippedHtml = CitationMe.replaceAll("<(.|\n)*?>", "");
        StringTokenizer st = new StringTokenizer(strippedHtml, ".,! ()[]");
    
        while (st.hasMoreTokens())
        {
            String token = st.nextToken().trim();
            if (token.length() > 3)
            {
                Matcher matcher = pattern.matcher(cleanString(token));
                while (matcher.find()) {
                    if(CitationMe.indexOf( SchemaUrl + cleanString(token) + "'") == -1)
                    {
                        String tmpStringreplacement = "<a href='" + SchemaUrl + cleanString(token) + "'>"+token+"</a>";
                        CitationMe = CitationMe.replaceAll("\\b"+token+"\\b(?!/)",tmpStringreplacement);
                    }
                }
            }
        }
    
        return CitationMe;
    }
    
    public String cleanString(String CleanStringMe) {
        if (CleanStringMe == null) {
            return null;
        }
    
        CleanStringMe =  Normalizer.normalize(CleanStringMe, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        CleanStringMe = CleanStringMe.toLowerCase();
        CleanStringMe = CleanStringMe.replaceAll("[^A-Za-z0-9]", "-");
        return CleanStringMe;
    }
    }
    

    谢谢你的回答
    public class autoLinkCitationDirective extends Directive{
    
    public String getName() {
        return "autolinkcitation";
    }
    
    public int getType() {
        return LINE;
    }
    
    public boolean render(InternalContextAdapter context, Writer writer, Node node)
            throws IOException, ResourceNotFoundException, ParseErrorException, MethodInvocationException {
    
        String CitationMe   = null;
        String KeyWords     = null;
        String SchemaUrl    = null;
    
        //params
        if (node.jjtGetChild(0) != null) {
            CitationMe = String.valueOf(node.jjtGetChild(0).value(context));
        }
        if (node.jjtGetChild(1) != null) {
            KeyWords = String.valueOf(node.jjtGetChild(1).value(context));
        }
    
        //schema url
        if (node.jjtGetChild(2) != null) {
            SchemaUrl = String.valueOf(node.jjtGetChild(2).value(context));
        }
    
        writer.write(autoLinkCitation(CitationMe, KeyWords, SchemaUrl));
    
        return true;
    }
    
    public String autoLinkCitation(String CitationMe, String KeyWords, String SchemaUrl) {
        if (CitationMe == null) {
            return null;
        }
    
        List<String> tokens = new ArrayList<String>();
        StringTokenizer stkKeyWords = new StringTokenizer(KeyWords, "#");
        while ( stkKeyWords.hasMoreTokens() ) {
            tokens.add(stkKeyWords.nextToken());
        }
    
    
        String patternString = "\\b(" + StringUtils.join(tokens, "|") + ")\\b";
        Pattern pattern = Pattern.compile(patternString);
    
        String strippedHtml = CitationMe.replaceAll("<(.|\n)*?>", "");
        StringTokenizer st = new StringTokenizer(strippedHtml, ".,! ()[]");
    
        while (st.hasMoreTokens())
        {
            String token = st.nextToken().trim();
            if (token.length() > 3)
            {
                Matcher matcher = pattern.matcher(cleanString(token));
                while (matcher.find()) {
                    if(CitationMe.indexOf( SchemaUrl + cleanString(token) + "'") == -1)
                    {
                        String tmpStringreplacement = "<a href='" + SchemaUrl + cleanString(token) + "'>"+token+"</a>";
                        CitationMe = CitationMe.replaceAll("\\b"+token+"\\b(?!/)",tmpStringreplacement);
                    }
                }
            }
        }
    
        return CitationMe;
    }
    
    public String cleanString(String CleanStringMe) {
        if (CleanStringMe == null) {
            return null;
        }
    
        CleanStringMe =  Normalizer.normalize(CleanStringMe, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        CleanStringMe = CleanStringMe.toLowerCase();
        CleanStringMe = CleanStringMe.replaceAll("[^A-Za-z0-9]", "-");
        return CleanStringMe;
    }
    }
    
    ## Calls the autolinkcitation directive with the document's 'body' field, its '#'-separated keyword field, and the URL prefix used for each generated link.
    #autolinkcitation($doc.getFieldValue('body'),$doc.getFieldValue('ss_field_citation_keywords'), '/citations/mot.php?mot=' )