Java 制作词法分析器_Java_Analyzer_Lexical

Java 制作词法分析器

java

Java 制作词法分析器,java,analyzer,lexical,Java,Analyzer,Lexical,我现在正在使用一个词法分析器程序，我正在使用Java。我一直在研究这个问题的答案，但直到现在我还没有找到任何答案。我的问题是：输入： System.out.println ("Hello World"); 期望输出： Lexeme----------------------Token System [Key_Word] . [Object_Accessor] out [Key_Word] . [Object_Accessor] println [Key_Word]

我现在正在使用一个词法分析器程序，我正在使用Java。我一直在研究这个问题的答案，但直到现在我还没有找到任何答案。我的问题是：

输入：

System.out.println ("Hello World");

期望输出：

Lexeme----------------------Token

System [Key_Word]

.       [Object_Accessor]

out   [Key_Word]

. [Object_Accessor]

println  [Key_Word]

(  [left_Parenthesis]

"Hello World"    [String_Literal]

)   [right_Parenthesis]

;  [statement_separator]

我还是个初学者，所以我希望你们能在这方面帮助我。谢谢。

词法分析本身就是一个主题，通常与编译器设计和分析一起讨论。在尝试编写任何代码之前，您应该先仔细阅读相关资料。关于这个主题，我最喜欢的书是《龙书》（Compilers: Principles, Techniques, and Tools），它能给你一个很好的编译器设计入门，甚至为编译器的各个阶段提供了伪代码，你可以很容易地将其翻译成Java并从那里开始。

简而言之，主要思想是使用有限状态机解析输入并将其划分为属于特定类（例如，在所需输出中的括号或关键字）的标记。状态机构建过程实际上是这一分析中唯一困难的部分，《龙之书》将为您提供关于这一点的深刻见解。

我会使用 ANTLR 4 以及

Java.g4

参考语法来实现这一点。根据您希望Unicode转义序列的处理遵循语言规范的程度，有两个选项

：此语法仅将Unicode转义序列作为字符串或字符文本中的字符处理
（使用前必须重命名为Java.g4）：此语法要求将
```
ANTLRInputStream
```
包装在一个文件中，该文件在将Unicode转义序列馈送到lexer之前，根据JLS处理Unicode转义序列

编辑：此语法生成的标记的名称与您的表略有不同

您的
```
Key_Word
```
标记是
```
Identifier
```
您的
```
Object_Accessor
```
令牌是
```
DOT
```
您的
```
left_Parenthesis
```
标记是
```
LPAREN
```
您的
```
String_Literal
```
标记是
```
StringLiteral
```
您的
```
right_Parenthesis
```
标记是
```
RPAREN
```
您的
```
statement_separator
```
标记是
```
SEMI
```

您可以使用C语言中的

Lex&Bison

或Java语言中的

Antlr

等库。词法分析可以通过制作自动机来完成。我给你举个小例子：

假设您需要标记一个字符串，其中关键字（语言）是

{'echo', '.', ' ', 'end'}

echo .
end .

我的lexer应该输出

echo ECHO
 SPACE
. DOT
end END
 SPACE
. DOT

现在要为这样一个标记器构建自动机，我可以从

  ->(SPACE) (Back)
 |   
(S)-------------E->C->H->O->(ECHO) (Back)
 |              |
 .->(DOT)(Back)  ->N->D ->(END) (Back to Start)

上面的图表画得很粗糙，但其思想是：您有一个用 S 表示的开始状态；现在您消费字符 E 并进入另一个状态，接下来您期待 N 或 C 出现，它们分别通向 END 和 ECHO。您不断消费字符，并在这个简单的有限状态机中到达不同的状态。最终，您会到达某个“发射”（Emit）状态，例如，在依次消费 E、N、D 之后，您到达 END 的发射状态，它发射该标记，然后返回到开始状态。只要还有字符流入，这个循环就会一直持续下去。对于无效字符，您可以抛出错误，也可以忽略，具体取决于设计。

手动编写简单的词法分析器既不需要ANTLR，也不需要龙书，即使是用于更完整语言（如Java）的词法分析器，手写起来也并不十分复杂。显然，如果你面对的是工业级任务，你可能需要考虑工业级的工具，如ANTLR或某种Lex变体；但为了学习词法分析的工作原理，手写一个可能会被证明是一个有用的练习。我假设你属于这种情况，因为你说你还是个初学者。

这是一个简单的词法分析器，用Java编写，用于类Scheme语言的一个子集，是我在看到这个问题后编写的。我认为即使您以前从未见过词法分析器，代码也相对容易理解，这仅仅是因为将字符流（在本例中是 String）分解为标记流（在本例中是 List<Token>）并不难。如果您有问题，我可以尝试更深入地解释。

import java.util.List;
import java.util.ArrayList;

/**
 * Lexical analyzer for a Scheme-like mini-language, for example
 * {@code (define (foo x) (bar (baz x)))}.
 *
 * <p>The input is split into three kinds of token: opening parentheses,
 * closing parentheses, and atoms. An atom is any maximal run of characters
 * that contains neither whitespace nor a parenthesis, so {@code foo},
 * {@code 42}, {@code +} and {@code list->vector} are each a single atom.
 * Whitespace separates tokens and is otherwise discarded.
 */
public class Lexer {
    /** The token categories produced by {@link #lex(String)}. */
    public static enum Type {
        // This Scheme-like language has three token types:
        // open parens, close parens, and an "atom" type
        LPAREN, RPAREN, ATOM;
    }

    /** An immutable token: its category plus the source text it was read from. */
    public static class Token {
        /** Category of this token. */
        public final Type t;
        /** Source text of the token; mainly of interest for atoms. */
        public final String c;
        // could have column and line number fields too, for reporting errors later

        /**
         * @param t category of the token
         * @param c source text the token was read from
         */
        public Token(Type t, String c) {
            this.t = t;
            this.c = c;
        }

        /**
         * Renders atoms as {@code ATOM<text>} and every other token as the
         * name of its type.
         */
        @Override
        public String toString() {
            if (t == Type.ATOM) {
                return "ATOM<" + c + ">";
            }
            return t.toString();
        }
    }

    /** Returns true for the characters that end an atom: whitespace and parentheses. */
    private static boolean isDelimiter(char ch) {
        return ch == '(' || ch == ')' || Character.isWhitespace(ch);
    }

    /**
     * Given a String and an index, returns the atom starting at that index.
     *
     * <p>The atom extends up to, but not including, the next whitespace
     * character or parenthesis, or to the end of the string. Atoms are not
     * restricted to letters: accepting only letters made every digit or symbol
     * produce an empty atom, which left {@link #lex(String)} unable to advance.
     *
     * @param s the text being scanned
     * @param i index of the first character of the atom
     * @return the atom text; empty if {@code i} is at a delimiter or at the
     *         end of {@code s}
     * @throws StringIndexOutOfBoundsException if {@code i} is negative or
     *         greater than {@code s.length()}
     */
    public static String getAtom(String s, int i) {
        int j = i;
        while (j < s.length() && !isDelimiter(s.charAt(j))) {
            j++;
        }
        return s.substring(i, j);
    }

    /**
     * Splits {@code input} into tokens.
     *
     * @param input the source text to tokenize
     * @return the tokens in source order; empty if {@code input} contains
     *         only whitespace or nothing at all
     */
    public static List<Token> lex(String input) {
        List<Token> result = new ArrayList<Token>();
        for (int i = 0; i < input.length(); ) {
            char ch = input.charAt(i);
            switch (ch) {
            case '(':
                result.add(new Token(Type.LPAREN, "("));
                i++;
                break;
            case ')':
                result.add(new Token(Type.RPAREN, ")"));
                i++;
                break;
            default:
                if (Character.isWhitespace(ch)) {
                    i++;
                } else {
                    // ch is not a delimiter here, so the atom has at least one
                    // character and the scan position always moves forward.
                    String atom = getAtom(input, i);
                    i += atom.length();
                    result.add(new Token(Type.ATOM, atom));
                }
                break;
            }
        }
        return result;
    }

    /**
     * Tokenizes the first command-line argument and prints one token per line.
     * Prints a usage message when no argument is supplied.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: java Lexer \"((some Scheme) (code to) lex)\".");
            return;
        }
        List<Token> tokens = lex(args[0]);
        for (Token t : tokens) {
            System.out.println(t);
        }
    }
}

import java.util.List；
导入java.util.ArrayList；
/*
*用于类似Scheme的小型语言的词法分析器：
*（定义（foo x）（bar（baz x）））
*/
公共类Lexer{
公共静态枚举类型{
//此类似于Scheme的语言有三种令牌类型：
//打开排列、关闭排列和“原子”类型
LPAREN，RPAREN，ATOM；
}
公共静态类令牌{
公共最终类型t；
公共最终字符串c；//主要用于atom标记的内容
//也可以有列和行号字段，以便以后报告错误
公共令牌（类型t，字符串c）{
t=t；
这个.c=c；
}
公共字符串toString（）{
if（t==Type.ATOM）{
返回“原子”；
}
返回t.toString（）；
}
}
/*
*给定一个字符串和一个索引，让原子从该索引开始
*/
公共静态字符串getAtom（字符串s，int i）{
int j=i；
对于（；j ……（机器翻译的代码副本在此处被截断）

使用示例：

~/code/scratch $ java Lexer ""
~/code/scratch $ java Lexer "("
LPAREN
~/code/scratch $ java Lexer "()"
LPAREN
RPAREN
~/code/scratch $ java Lexer "(foo)"
LPAREN
ATOM<foo>
RPAREN
~/code/scratch $ java Lexer "(foo bar)"
LPAREN
ATOM<foo>
ATOM<bar>
RPAREN
~/code/scratch $ java Lexer "(foo (bar))"
LPAREN
ATOM<foo>
LPAREN
ATOM<bar>
RPAREN
RPAREN


#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

    /*
     * Forward declarations. The checks are defined after main(), and calling
     * them without a prototype relies on implicit int declarations, which
     * conflict with their void definitions and are rejected by C99 and later.
     */
    void keyword_check(void);
    void identifier_check(void);
    void math_operator_check(void);
    void logical_operator_check(void);
    void numerical_check(void);
    void others_check(void);

    /*
     * Entry point: runs each token-category check in turn. Every check makes
     * its own pass over input.txt and prints its findings to stdout.
     * Returns 0.
     */
    int main(){
    /* By Ashik Rabbani
    Daffodil International University,CSE43 */
    keyword_check();
    identifier_check();
    math_operator_check();
    logical_operator_check();
    numerical_check();
    others_check();


        return 0;
    }


    /*
     * Scans input.txt and prints, in file order, every character that is one
     * of the arithmetic operators + - * / %, each followed by a space, after
     * the label "Math Operators : ".
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void math_operator_check()
    {
        static const char operators[] = "+-*/%";
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }
        printf("\nMath Operators : ");
        while((ch = fgetc(fp)) != EOF){
            /* strchr() also matches the string terminator, so a NUL byte in
               the file must be rejected explicitly. */
            if(ch != '\0' && strchr(operators, ch) != NULL)
                printf("%c ", ch);
        }
        printf("\n");

        fclose(fp);
    }


    /*
     * Scans input.txt and prints, in file order, every character that is one
     * of & | < >, each followed by a space, after the label
     * "Logical Operators : ".
     * Each matching character in the file is printed exactly once; the
     * operator set therefore lists & and | a single time, because a set with
     * repeated entries reports every occurrence twice.
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void logical_operator_check()
    {
        static const char operators[] = "&|<>";
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }
        printf("\nLogical Operators : ");
        while((ch = fgetc(fp)) != EOF){
            /* strchr() also matches the string terminator, so a NUL byte in
               the file must be rejected explicitly. */
            if(ch != '\0' && strchr(operators, ch) != NULL)
                printf("%c ", ch);
        }
        printf("\n");

        fclose(fp);
    }

    /*
     * Scans input.txt and prints, in file order, every decimal digit
     * character (0 through 9), each followed by a space, after the label
     * "Numerical Values : ".
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void numerical_check()
    {
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }
        printf("\nNumerical Values : ");
        while((ch = fgetc(fp)) != EOF){
            /* Range test covers all ten digits; scanning only the first six
               entries of a digit table misses 6, 7, 8 and 9. */
            if(ch >= '0' && ch <= '9')
                printf("%c ", ch);
        }
        printf("\n");

        fclose(fp);
    }

    /*
     * Scans input.txt and prints, in file order, every bracket character
     * ( ) { } [ ], each followed by a space, after the label "Others : ".
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void others_check()
    {
        static const char symbols[] = "(){}[]";
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }
        printf("\nOthers : ");
        while((ch = fgetc(fp)) != EOF){
            /* strchr() also matches the string terminator, so a NUL byte in
               the file must be rejected explicitly. */
            if(ch != '\0' && strchr(symbols, ch) != NULL)
                printf("%c ", ch);
        }
        printf("\n");

        fclose(fp);
    }

    /*
     * Scans input.txt and prints every word that isKeyword() does not
     * recognise, each followed by a space, after the label "Identifiers : ".
     *
     * A word is a maximal run of letters, digits and underscores. Any other
     * character, or the end of the file, ends the current word. Words longer
     * than the buffer are truncated to 14 characters rather than written
     * past its end.
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void identifier_check()
    {
        /* isKeyword() is defined further down the file; declare it before use. */
        int isKeyword(char string_input[]);
        char string_input[15];
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */
        size_t j = 0;

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }

        printf("\nIdentifiers : ");
        for(;;){
            ch = fgetc(fp);

            if(ch != EOF && (isalnum(ch) || ch == '_')){
                /* Keep one slot free for the terminator; drop the overflow. */
                if(j < sizeof(string_input) - 1)
                    string_input[j++] = (char)ch;
                continue;
            }

            /* Delimiter or end of file: emit the word collected so far. */
            if(j != 0){
                string_input[j] = '\0';
                j = 0;

                if(isKeyword(string_input) != 1)
                    printf("%s ", string_input);
            }

            if(ch == EOF)
                break;
        }

        printf("\n");

        fclose(fp);
    }

    /*
     * Looks string_input up in a fixed table of 32 keywords.
     * Returns 1 if it equals one of the entries exactly (strcmp match),
     * 0 otherwise.
     */
    int isKeyword(char string_input[]){
        char table[32][10] = {
            "auto","break","case","char","const","continue","default",
            "do","double","else","enum","extern","float","for","goto",
            "if","int","long","register","return","short","signed",
            "sizeof","static","struct","switch","typedef","union",
            "unsigned","void","volatile","while"
        };
        int idx = 0;

        /* Linear scan; leave as soon as an entry matches. */
        while(idx < 32){
            if(strcmp(table[idx], string_input) == 0)
                return 1;
            idx++;
        }

        return 0;
    }

    /*
     * Prints the program banner, then scans input.txt and prints every word
     * that isKeyword() recognises, each followed by a space, after the label
     * "Keywords : ".
     *
     * A word is a maximal run of letters, digits and underscores. Any other
     * character, or the end of the file, ends the current word. Words longer
     * than the buffer are truncated to 14 characters rather than written
     * past its end.
     * Prints an error message and terminates the process if the file cannot
     * be opened.
     */
    void keyword_check()
    {
        char string_input[15];
        FILE *fp;
        int ch; /* int, not char: fgetc() reports EOF outside the char range */
        size_t j = 0;

        printf(" Token Identification using C \n By Ashik-E-Rabbani \n 161-15-7093\n\n");

        fp = fopen("input.txt","r");

        if(fp == NULL){
            printf("error while opening the file\n");
            /* NOTE(review): status 0 on failure is kept from the original. */
            exit(0);
        }

        printf("\nKeywords : ");
        for(;;){
            ch = fgetc(fp);

            if(ch != EOF && (isalnum(ch) || ch == '_')){
                /* Keep one slot free for the terminator; drop the overflow. */
                if(j < sizeof(string_input) - 1)
                    string_input[j++] = (char)ch;
                continue;
            }

            /* Delimiter or end of file: emit the word collected so far. */
            if(j != 0){
                string_input[j] = '\0';
                j = 0;

                if(isKeyword(string_input) == 1)
                    printf("%s ", string_input);
            }

            if(ch == EOF)
                break;
        }

        printf("\n");

        fclose(fp);
    }