Java 文件中单词的n-gram相似性

Java 文件中单词的n-gram相似性,java,similarity,Java,Similarity,在上面的例子中,我必须将答案下的单词与[2]下的每个单词进行比较,如{high,rules}{high,outline}{high,high}{high,source}{high,knowledge},并且我必须存储上述比较的最大值,然后再次使用答案中的第二个单词,然后采用类似的过程。最后,取每个迭代的最大值的平均值。至少尽可能准确地描述预期输出的样子。现在描述事物的方式很难理解问题是什么。@Ashalynd我的输出应该是相似值,例如单词{high,rules}{high,outline}{hi

在上面的例子中,我必须将答案下的单词与[2]下的每个单词进行比较,如{high,rules}{high,outline}{high,high}{high,source}{high,knowledge},并且我必须存储上述比较的最大值,然后再次使用答案中的第二个单词,然后采用类似的过程。最后,取每个迭代的最大值的平均值。

至少尽可能准确地描述预期输出的样子。现在描述事物的方式很难理解问题是什么。@Ashalynd我的输出应该是相似值,例如单词{high,rules}{high,outline}{high,high}{high,source}{high,knowledge},应该作为bigram进行比较,并返回相似值。下一个单词“risk”我必须取最大相似度值,并从每个单词比较所存储的最大值中取整体答案的平均值,作为{risk,rules}{risk,outline}{risk,high}{risk,source}{risk,knowledge}进行比较。
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package sim;
import java.io.*;
import java.util.Arrays;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import static jdk.nashorn.internal.objects.NativeMath.max;

/**
 *
 * @author admin
 */
public class Sim {
    public String[][] bigramizedWords = new String[500][100];
    public String[] words = new String[500];
    public File file1 = new File("file1.txt");
    public File file2 = new File("file2.txt");
    public int tracker = 0;
    public double matches = 0;
    public double denominator = 0; //This will hold the sum of the bigrams of the 2 words
    public double res;
    public double results;

    public Scanner a;
    public PrintWriter pw1;
    public Sim(){
        intialize();
       // bigramize();
        results = max(res);
        System.out.println("\n\nThe Bigram Similarity value between " + words[0] + " and " + words[1] + " is " + res + ".");
        pw1.close();

    }


    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Sim si=new Sim();
        // TODO code application logic here
    }
    public void intialize() {
        int j[]=new int[35];
        try {

            File file1=new File("input.txt");
            File file2=new File("out.txt");
            Scanner a = new Scanner(file1);
            PrintWriter pw1= new PrintWriter(file2);
            int i=0,count = 0;

            while (a.hasNext()) {

                java.lang.String gram = a.next();
                if(gram.startsWith("question")|| gram.endsWith("?"))
                {
                    count=0;
                    count-=1;
                }

                if(gram.startsWith("[")||gram.startsWith("answer")||gram.endsWith(" ") )
                {
                    //pw1.println(count);
                    j[i++]=count;

                    count=0;
                    //pw1.println(gram);
                    //System.out.println(count);

                }
                else
                {
                    // System.out.println(count);
                    count+=1;
//System.out.println(count + " " + gram);

                }
       int line=gram.length();
    int sa_length;
                        //int[] j = null;
            int refans_length=j[1];
            //System.out.println(refans_length);
            for(int k=2;k<=35;k++)
               // System.out.println(j[k]);
            //System.out.println(refans_length);
            for(int m=2;m<=33;m++)

            {
                sa_length=j[2];
                //System.out.println(sa_length);

                for(int s=0;s<=refans_length;s++)
                {
                    for(int l=0;l<=sa_length;l++)
                    {
                        for (int x = 0; x <= line - 2; x++) {
                            int tracker = 0;
                    bigramizedWords[tracker][x] = gram.substring(x, x + 2);
                    System.out.println(gram.substring(x, x + 2) + "");
                    //bigramize(); 
                }
              // bigramize(); 

            }
        }

    }
            bigramize();
         words[tracker] = gram;
                tracker++;

            }
            //pw1.close();

    }


    catch (FileNotFoundException ex) {
            Logger.getLogger(Sim.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    public void bigramize() {
        //for(int p=0;p<=sa_length;p++)
        denominator = (words[0].length() - 1) + (words[1].length() - 1);
        for (int k = 0; k < bigramizedWords[0].length; k++) {
            if (bigramizedWords[0][k] != null) {
                for (int i = 0; i < bigramizedWords[1].length; i++) {
                    if (bigramizedWords[1][i] != null) {
                        if (bigramizedWords[0][k].equals(bigramizedWords[1][i])) {
                            matches++;
                        }
                    }
                }
            }
        }
        matches *= 2;
        res = matches / denominator;
    }


}
answer:
high
risk
simulate
behaviour
solution
set
rules
[2]
rules
outline
high
source
knowledge
[1]
set
rules
simulate
behaviour