Java语法分析器_Java_Regex_Tokenize

Java语法分析器

java regex

Java语法分析器,java,regex,tokenize,Java,Regex,Tokenize,我的代币课 public class Token { public enum TokenType { RELATIONALOPERATOR("==|<>|<=|>=|>|<"), MULTIPLYINGOPERATOR("[*/]"), SIGNADDINGOP("[+-]"), LEFTPAREN( "\\("), RIGHTPAREN("\\)"), COMMA(","), PEROID("\\."), ASSIGNMENTOP("="), SEM

我的 Token 类

/**
 * A lexical token: a {@link TokenType} paired with the exact text that matched it.
 */
public class Token {
  /**
   * Token categories, each carrying the regular expression that recognises it.
   *
   * <p>Declaration order matters to code that joins the patterns into a single
   * alternation in {@code values()} order (as {@code lex} does): the earliest
   * alternative that matches at a position wins. That is why the keywords are
   * declared before {@code VARIABLE}, and {@code INVALID} comes last as a catch-all.
   *
   * <p>Every pattern must consume at least one character. A pattern that can match
   * the empty string (for example {@code [0-9]*}) succeeds at every position and
   * makes the lexer emit empty tokens.
   */
  public enum TokenType {
    RELATIONALOPERATOR("==|<>|<=|>=|>|<"),
    MULTIPLYINGOPERATOR("[*/]"),
    SIGNADDINGOP("[+-]"),
    LEFTPAREN("\\("),
    RIGHTPAREN("\\)"),
    COMMA(","),
    PEROID("\\."),
    // Must stay after RELATIONALOPERATOR so that "==" is not read as two assignments.
    ASSIGNMENTOP("="),
    SEMICOLON(";"),
    WHILE("while"),
    IF("if"),
    ELSE("else"),
    // NOTE(review): MULTIPLYINGOPERATOR is declared earlier and also matches '/',
    // so in a declaration-order alternation "//" is read as two operators and this
    // constant is never selected. Fixing that needs a reorder or a lookahead.
    COMMENT("//"),
    PUBLIC("public"),
    PRIVATE("private"),
    PACKAGE("package"),
    IMPORT("import"),
    ENUM("enum"),
    // One or more digits; '+' rather than '*' so the empty string is not a constant.
    CONSTANT("[0-9]+"),
    VARIABLE("[a-zA-Z][a-zA-Z0-9]*"),
    // One or more whitespace characters. \s already covers tab, and the previous
    // character class "[\\s+\\t]" also matched a literal '+'.
    SKIP("\\s+"),
    // Exactly one otherwise-unrecognised character, so lexing resumes right after it.
    INVALID(".");

    /** Regular expression source that recognises this token type. */
    public final String pattern;

    private TokenType(String pattern) {
      this.pattern = pattern;
    }
  }

  /** Category of this token. */
  public TokenType type;

  /** Exact input text that was matched for this token. */
  public String data;

  /**
   * Creates a token.
   *
   * @param type the token category
   * @param data the matched text
   */
  public Token(TokenType type, String data) {
    this.type = type;
    this.data = data;
  }

  /**
   * Formats the token as {@code [ TYPE, text ]}.
   *
   * @return the type name and matched text in bracketed form
   */
  @Override
  public String toString() {
    return String.format("[ %s, %s ]", type.name(), this.data);
  }
}

我希望它能打印出类似 [PUBLIC, public]、[CLASS, class] 这样的内容……也就是以 [类型名, 实际匹配到的文本] 的形式输出，直到读完整个文件。多亏了欧文的建议，特殊字符现在已经被转义了，但问题依然存在。

最初我的正则表达式定义为

CONSTANT("[0-9]*"), VARIABLE("[a-zA-Z][a-zA-Z0-9]*"), SKIP("[\\s+\\t]*"), INVALID(".*")
`[0-9]*` 在任何位置都能匹配成功（包括匹配空字符串），因为星号 `*` 表示“零次或多次”。将正则表达式中的所有 `*` 替换为 `+`（一次或多次）修复了该问题。
不是答案，但您的解析器将需要不同的标记来表示=，您应该转义标记中的特殊字符，如lparen，否则它们将被解释为正则表达式特殊字符。
/**
 * Splits {@code input} into tokens using the patterns declared on {@code TokenType}.
 *
 * <p>All patterns are joined, in declaration order, into one alternation of named
 * groups of the form {@code (?<NAME>pattern)}. At each position the first alternative
 * that matches wins, and the name of the group that captured text identifies the
 * token type.
 *
 * <p>Matches of {@code SKIP} are discarded, as are zero-length matches, so the
 * result never contains a token with empty text.
 *
 * @param input the text to tokenize
 * @return the tokens in the order in which they occur in {@code input}
 */
public static ArrayList<Token> lex(String input) {
  ArrayList<Token> tokens = new ArrayList<Token>();

  // Build "(?<A>patternA)|(?<B>patternB)|..." from the enum constants.
  StringBuilder alternation = new StringBuilder();
  for (TokenType tokenType : TokenType.values()) {
    if (alternation.length() > 0) {
      alternation.append('|');
    }
    alternation.append(String.format("(?<%s>%s)", tokenType.name(), tokenType.pattern));
  }
  Pattern tokenPatterns = Pattern.compile(alternation.toString());

  Matcher matcher = tokenPatterns.matcher(input);
  while (matcher.find()) {
    // A pattern that can match the empty string succeeds everywhere; such a match
    // carries no text and must not become a token.
    if (matcher.start() == matcher.end()) {
      continue;
    }
    // Whitespace separates tokens but is not one itself.
    if (matcher.group(TokenType.SKIP.name()) != null) {
      continue;
    }
    // Only the alternative that matched has a non-null group, so stop at the first hit.
    for (TokenType tokenType : TokenType.values()) {
      String text = matcher.group(tokenType.name());
      if (text != null) {
        tokens.add(new Token(tokenType, text));
        break;
      }
    }
  }

  return tokens;
}

/**
 * Reads the whole file at {@code path} and decodes its bytes as UTF-8 text.
 *
 * <p>The charset is given explicitly so the result does not depend on the
 * platform default encoding.
 *
 * @param path location of the file to read
 * @return the file contents as a string
 * @throws IOException if the file cannot be read
 */
static String readFile(String path) throws IOException {
  byte[] encoded = Files.readAllBytes(Paths.get(path));
  return new String(encoded, java.nio.charset.StandardCharsets.UTF_8);
}

/**
 * Lexes the file {@code Test.java} (resolved against the working directory) and
 * prints each resulting token on its own line, using {@code Token.toString()}.
 *
 * @param args command-line arguments; not used
 * @throws IOException if {@code Test.java} cannot be read
 */
public static void main(String[] args) throws IOException {
  String source = readFile("Test.java");
  ArrayList<Token> tokens = lex(source);
  for (Token token : tokens) {
    System.out.println(token);
  }
}

package myLex;
/**
 * Class whose only member is a {@code main} method with an empty body.
 *
 * NOTE(review): presumably the contents of the Test.java sample file that the
 * lexer's main method reads - confirm before relying on its exact text.
 */
public class Test {

   // Entry point with an empty body; running it does nothing.
   public static void main(String[] args) {

   }

}