Java 如何通过考虑上一个单词和下一个单词来精确匹配字符串

Java 如何通过考虑上一个单词和下一个单词来精确匹配字符串,java,regex,xml,string,Java,Regex,Xml,String,我不熟悉javaregex。因此,我有一个xml文件,其中包含不同的节点。文件是- <Node id="855"/>PROFILE<Node id="862"/>:<Node id="863"/> <Node id="864"/>8<Node id="865"/> <Node id="866"/>years<Node id="871"/> <Node id="872"/>IT<Node id="

我不熟悉 Java 正则表达式(regex)。我有一个 XML 文件,其中包含不同的节点。文件内容如下:

<Node id="855"/>PROFILE<Node id="862"/>:<Node id="863"/>
<Node id="864"/>8<Node id="865"/> <Node id="866"/>years<Node id="871"/> <Node id="872"/>IT<Node id="874"/> <Node id="875"/>industry<Node id="883"/> <Node id="884"/>experience<Node id="894"/> <Node id="895"/>in<Node id="897"/> <Node id="898"/>web<Node id="901"/> <Node id="902"/>based<Node id="907"/> <Node id="908"/>applications<Node id="920"/> <Node id="921"/>that<Node id="925"/> <Node id="926"/>involved<Node id="934"/> <Node id="935"/>extensive<Node id="944"/> <Node id="945"/>development<Node id="956"/> <Node id="957"/>work<Node id="961"/> <Node id="962"/>in<Node id="964"/> <Node id="965"/>Java<Node id="969"/>/<Node id="970"/>J<Node id="971"/>2<Node id="972"/>EE<Node id="974"/>,<Node id="975"/>Jquery<Node id="981"/>,<Node id="982"/>Jqgrid<Node id="988"/>,<Node id="989"/>Ajax<Node id="993"/>.<Node id="994"/>
<Node id="995"/>Good<Node id="999"/> <Node id="1000"/>experience<Node id="1010"/> <Node id="1011"/>in<Node id="1013"/> <Node id="1014"/>agile<Node id="1019"/> <Node id="1020"/>methodology<Node id="1031"/> <Node id="1032"/>.<Node id="1033"/>
所以

private void parseXml(ArrayList<String> elements, String filePath) {
    boolean flag = false;
    String nextId = "0";
    String xmlData = getTextWithNodesDataFromXml(filePath);
    for (String s : elements) {
        System.out.println(s);
        String token;
        int id;
        String regex = "";
        if (flag == false) {
            regex = "<Node id=\"([0-9]+)\"\\/>(" + s + ")";
            flag = true;
            Pattern pattern1 = Pattern.compile(regex);
            Matcher matcher1 = pattern1.matcher(xmlData);
            if (matcher1.find()) {
                System.out.println("match found -->" + s);
            }
        }

第一个参数是一个 ArrayList,其中包含待匹配字符串的各个标记(token);第二个参数是文件的路径。xmlData 的内容就是我前面列出的那些节点,我需要在其中进行匹配。如果 PROFILE 在文件中匹配到了三次,我该如何检查它后面的整个字符串?我必须用这些节点精确匹配完整的字符串,应该怎么做?

我建议比较可比较的内容:您显示的XML是句子的标记化。您尝试将其与整个字符串进行比较

如果将 XML 转换为字符串数组,并通过 String.split("\\s+") 对 profile 句子进行分词,那么需要比较的就是两个字符串数组。

也许,你并不需要完全匹配?

在这种情况下,您必须计算相似度百分比 p,确定阈值 t,并仅保留满足 p>t 的 profile。

我对需求的内容做了一些假设,并按照我的理解写了一个可以运行的解决方案。

为了省事,我没有真正去读取文件,而是直接把 XML 内容作为字符串传入。

package stacktest;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Locates an exact, token-by-token occurrence of a sentence inside text that
 * has been tokenised with {@code <Node id="N"/>} markers.
 *
 * <p>The text is cut into "profile" sections, each starting at a
 * {@code PROFILE} token that is immediately followed by a {@code :} token.
 * A section matches when its token sequence equals the expected token list
 * exactly (same length, same order, same content).
 */
public class StringMatch 
{
    /** Start of a profile section: a node, the word PROFILE, a node, a colon and the next tag opener. */
    private static final Pattern PROFILE_START =
            Pattern.compile("<Node id=\"[0-9]+\"\\/>PROFILE<Node id=\"[0-9]+\"\\/>:<");

    /** One node marker (group 1: its id) followed by the non-empty text up to the next tag (group 2). */
    private static final Pattern NODE_TOKEN =
            Pattern.compile("<Node id=\"([0-9]+)\"\\/>([^<]+)");

    /**
     * Returns the node-annotated text to search.
     *
     * @param filePath in this demo, the XML text itself rather than a path
     * @return the argument, unchanged
     */
    private String getTextWithNodesDataFromXml(String filePath)
    {
        // Cheating here: the XML is passed in directly as a string, no file is read.
        return filePath;
    }

    /**
     * Finds the first profile section whose tokens equal {@code elements}.
     *
     * @param elements expected tokens, in order (whitespace and punctuation tokens included)
     * @param filePath forwarded to {@link #getTextWithNodesDataFromXml(String)}
     * @return a two-element array holding the id of the node preceding the first
     *         token and the id of the node preceding the last token of the matching
     *         section, or {@code null} when no section matches
     */
    private int[] findMatches(List<String> elements, String filePath)
    {
        String xmlData = getTextWithNodesDataFromXml(filePath);

        for (String profile : splitProfiles(xmlData))
        {
            List<String> tokens = new ArrayList<String>();
            List<String> ids = new ArrayList<String>();

            Matcher tokenMatcher = NODE_TOKEN.matcher(profile);
            while (tokenMatcher.find())
            {
                ids.add(tokenMatcher.group(1));
                tokens.add(tokenMatcher.group(2));
            }

            // List.equals checks size and element-wise equality, stops at the
            // first difference, and treats a null list or null element as "no match".
            if (tokens.equals(elements))
            {
                int firstId = Integer.parseInt(ids.get(0));
                int lastId = Integer.parseInt(ids.get(ids.size() - 1));
                return new int[] { firstId, lastId };
            }
        }

        return null;
    }

    /**
     * Cuts the text into sections, each running from one profile start up to
     * (but not including) the next one; the last section runs to the end of the text.
     *
     * @param xmlData node-annotated text
     * @return the sections in document order; empty when no profile start is present
     */
    private static List<String> splitProfiles(String xmlData)
    {
        List<String> profiles = new ArrayList<String>();
        Matcher starts = PROFILE_START.matcher(xmlData);
        int previousStart = -1;

        while (starts.find())
        {
            if (previousStart >= 0)
            {
                profiles.add(xmlData.substring(previousStart, starts.start()));
            }
            previousStart = starts.start();
        }

        // The last profile found has no successor, so it extends to the end.
        if (previousStart >= 0)
        {
            profiles.add(xmlData.substring(previousStart));
        }
        return profiles;
    }

    /**
     * Runs the matcher against three sample inputs and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Profile followed by extra tokens ("Good experience"): token count differs, so no match.
        String nodes = "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>\n" +
                "<Node id=\"995\"/>Good<Node id=\"999\"/> <Node id=\"1000\"/>experience<Node id=\"1010\"/>";

        // Exactly one profile containing exactly the expected tokens.
        String nodes2 = "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>";

        // Three profiles; only the middle one carries the expected sentence.
        String nodes3 = "<Node id=\"1\"/>PROFILE<Node id=\"2\"/>:<Node id=\"3\"/>This<Node id=\"4\"/>is<Node id=\"5\"/>not<Node id=\"6\"/>the<Node id=\"7\"/>Profile<Node id=\"8\"/>\n" +
                "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>" +
                "PROFILE<Node id=\"1021\"/>:<Node id=\"1022\"/>This<Node id=\"1023\"/>is<Node id=\"1024\"/>not<Node id=\"1025\"/>the<Node id=\"1026\"/>Profile<Node id=\"1027\"/>\n";

        String[] el = { "PROFILE", ":", "\n",
                        "8", " ", "years", " ", "IT", " ", "industry", " ", "experience", " ", "in", " ", "web",
                        " ", "based", " ", "applications", " ", "that", " ", "involved", " ", "extensive", " ", 
                        "development", " ", "work", " ", "in", " ", "Java", "/", "J", "2", "EE", ",", "Jquery", 
                        ",", "Jqgrid", ",", "Ajax", "."
                        };

        List<String> elements = Arrays.asList(el);

        StringMatch sm = new StringMatch();
        printTest(sm.findMatches(elements, nodes));
        printTest(sm.findMatches(elements, nodes2));
        printTest(sm.findMatches(elements, nodes3));
    }

    /**
     * Prints one result followed by a separator line.
     *
     * @param vals the start/end id pair from {@code findMatches}, or {@code null} for "no match"
     */
    private static void printTest(int[] vals)
    {
        if (vals == null)
        {
            System.out.println("no match");
        }
        else
        {
            System.out.println("found match from id: " + vals[0] + " to " + vals[1]);
        }
        System.out.println("--------------------------------");
    }

}

我被标题中的“考虑上一个和下一个单词”弄糊涂了,“profile”一词的用法也让我困惑。你能添加几个不同的输入并说明你期望的输出吗?如果匹配,你是返回真/假,还是返回句子(元素)在节点文件中的起始位置?是的,我想找到句子开始和结束处节点的 id。基本上,我想要开始和结束偏移量。@hack_on 我的意思是文件中可以多次出现 PROFILE 这个词,但 PROFILE 之后的其余字符串也必须相同。比如 PROFILE 出现了三次,而其中某个 PROFILE 之后的下一个标记并不是“:”,那么它就不是要找的那一个;所以我想精确匹配整个字符串。我已经更新了答案,以便处理一个文件中出现多个 PROFILE 的情况,并提取开始和结束 ID。
private void parseXml(ArrayList<String> elements, String filePath) {
    boolean flag = false;
    String nextId = "0";
    String xmlData = getTextWithNodesDataFromXml(filePath);
    for (String s : elements) {
        System.out.println(s);
        String token;
        int id;
        String regex = "";
        if (flag == false) {
            regex = "<Node id=\"([0-9]+)\"\\/>(" + s + ")";
            flag = true;
            Pattern pattern1 = Pattern.compile(regex);
            Matcher matcher1 = pattern1.matcher(xmlData);
            if (matcher1.find()) {
                System.out.println("match found -->" + s);
            }
        }
package stacktest;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Locates an exact, token-by-token occurrence of a sentence inside text that
 * has been tokenised with {@code <Node id="N"/>} markers.
 *
 * <p>The text is cut into "profile" sections, each starting at a
 * {@code PROFILE} token that is immediately followed by a {@code :} token.
 * A section matches when its token sequence equals the expected token list
 * exactly (same length, same order, same content).
 */
public class StringMatch 
{
    /** Start of a profile section: a node, the word PROFILE, a node, a colon and the next tag opener. */
    private static final Pattern PROFILE_START =
            Pattern.compile("<Node id=\"[0-9]+\"\\/>PROFILE<Node id=\"[0-9]+\"\\/>:<");

    /** One node marker (group 1: its id) followed by the non-empty text up to the next tag (group 2). */
    private static final Pattern NODE_TOKEN =
            Pattern.compile("<Node id=\"([0-9]+)\"\\/>([^<]+)");

    /**
     * Returns the node-annotated text to search.
     *
     * @param filePath in this demo, the XML text itself rather than a path
     * @return the argument, unchanged
     */
    private String getTextWithNodesDataFromXml(String filePath)
    {
        // Cheating here: the XML is passed in directly as a string, no file is read.
        return filePath;
    }

    /**
     * Finds the first profile section whose tokens equal {@code elements}.
     *
     * @param elements expected tokens, in order (whitespace and punctuation tokens included)
     * @param filePath forwarded to {@link #getTextWithNodesDataFromXml(String)}
     * @return a two-element array holding the id of the node preceding the first
     *         token and the id of the node preceding the last token of the matching
     *         section, or {@code null} when no section matches
     */
    private int[] findMatches(List<String> elements, String filePath)
    {
        String xmlData = getTextWithNodesDataFromXml(filePath);

        for (String profile : splitProfiles(xmlData))
        {
            List<String> tokens = new ArrayList<String>();
            List<String> ids = new ArrayList<String>();

            Matcher tokenMatcher = NODE_TOKEN.matcher(profile);
            while (tokenMatcher.find())
            {
                ids.add(tokenMatcher.group(1));
                tokens.add(tokenMatcher.group(2));
            }

            // List.equals checks size and element-wise equality, stops at the
            // first difference, and treats a null list or null element as "no match".
            if (tokens.equals(elements))
            {
                int firstId = Integer.parseInt(ids.get(0));
                int lastId = Integer.parseInt(ids.get(ids.size() - 1));
                return new int[] { firstId, lastId };
            }
        }

        return null;
    }

    /**
     * Cuts the text into sections, each running from one profile start up to
     * (but not including) the next one; the last section runs to the end of the text.
     *
     * @param xmlData node-annotated text
     * @return the sections in document order; empty when no profile start is present
     */
    private static List<String> splitProfiles(String xmlData)
    {
        List<String> profiles = new ArrayList<String>();
        Matcher starts = PROFILE_START.matcher(xmlData);
        int previousStart = -1;

        while (starts.find())
        {
            if (previousStart >= 0)
            {
                profiles.add(xmlData.substring(previousStart, starts.start()));
            }
            previousStart = starts.start();
        }

        // The last profile found has no successor, so it extends to the end.
        if (previousStart >= 0)
        {
            profiles.add(xmlData.substring(previousStart));
        }
        return profiles;
    }

    /**
     * Runs the matcher against three sample inputs and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Profile followed by extra tokens ("Good experience"): token count differs, so no match.
        String nodes = "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>\n" +
                "<Node id=\"995\"/>Good<Node id=\"999\"/> <Node id=\"1000\"/>experience<Node id=\"1010\"/>";

        // Exactly one profile containing exactly the expected tokens.
        String nodes2 = "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>";

        // Three profiles; only the middle one carries the expected sentence.
        String nodes3 = "<Node id=\"1\"/>PROFILE<Node id=\"2\"/>:<Node id=\"3\"/>This<Node id=\"4\"/>is<Node id=\"5\"/>not<Node id=\"6\"/>the<Node id=\"7\"/>Profile<Node id=\"8\"/>\n" +
                "<Node id=\"855\"/>PROFILE<Node id=\"862\"/>:<Node id=\"863\"/>\n" +
                "<Node id=\"864\"/>8<Node id=\"865\"/> <Node id=\"866\"/>years<Node id=\"871\"/> <Node id=\"872\"/>IT<Node id=\"874\"/> <Node id=\"875\"/>industry<Node id=\"883\"/> <Node id=\"884\"/>experience<Node id=\"894\"/> <Node id=\"895\"/>in<Node id=\"897\"/> <Node id=\"898\"/>web<Node id=\"901\"/> <Node id=\"902\"/>based<Node id=\"907\"/> <Node id=\"908\"/>applications<Node id=\"920\"/> <Node id=\"921\"/>that<Node id=\"925\"/> <Node id=\"926\"/>involved<Node id=\"934\"/> <Node id=\"935\"/>extensive<Node id=\"944\"/> <Node id=\"945\"/>development<Node id=\"956\"/> <Node id=\"957\"/>work<Node id=\"961\"/> <Node id=\"962\"/>in<Node id=\"964\"/> <Node id=\"965\"/>Java<Node id=\"969\"/>/<Node id=\"970\"/>J<Node id=\"971\"/>2<Node id=\"972\"/>EE<Node id=\"974\"/>,<Node id=\"975\"/>Jquery<Node id=\"981\"/>,<Node id=\"982\"/>Jqgrid<Node id=\"988\"/>,<Node id=\"989\"/>Ajax<Node id=\"993\"/>.<Node id=\"994\"/>" +
                "PROFILE<Node id=\"1021\"/>:<Node id=\"1022\"/>This<Node id=\"1023\"/>is<Node id=\"1024\"/>not<Node id=\"1025\"/>the<Node id=\"1026\"/>Profile<Node id=\"1027\"/>\n";

        String[] el = { "PROFILE", ":", "\n",
                        "8", " ", "years", " ", "IT", " ", "industry", " ", "experience", " ", "in", " ", "web",
                        " ", "based", " ", "applications", " ", "that", " ", "involved", " ", "extensive", " ", 
                        "development", " ", "work", " ", "in", " ", "Java", "/", "J", "2", "EE", ",", "Jquery", 
                        ",", "Jqgrid", ",", "Ajax", "."
                        };

        List<String> elements = Arrays.asList(el);

        StringMatch sm = new StringMatch();
        printTest(sm.findMatches(elements, nodes));
        printTest(sm.findMatches(elements, nodes2));
        printTest(sm.findMatches(elements, nodes3));
    }

    /**
     * Prints one result followed by a separator line.
     *
     * @param vals the start/end id pair from {@code findMatches}, or {@code null} for "no match"
     */
    private static void printTest(int[] vals)
    {
        if (vals == null)
        {
            System.out.println("no match");
        }
        else
        {
            System.out.println("found match from id: " + vals[0] + " to " + vals[1]);
        }
        System.out.println("--------------------------------");
    }

}
no match
--------------------------------
found match from id: 855 to 993
--------------------------------
found match from id: 855 to 993
--------------------------------