Java 从字符串中计算单词的出现次数

Java 从字符串中计算单词的出现次数,java,string,arraylist,hashmap,Java,String,Arraylist,Hashmap,我希望能够计算出每个单词在给定文件中重复的次数。然而,我做这件事有困难。我试过两种不同的方法。我使用HashMap并将单词作为键,将其频率作为关联值。但是,这似乎不起作用,因为使用HashMap,您无法访问指定索引处的元素。现在我尝试使用两个单独的数组列表,一个用于单词,另一个用于该单词的每次出现。我的想法是:在向wordsCount arrayList添加单词时,如果单词已经在wordsCount中,则在已经看到的单词的索引处增加cnt arrayList中元素的值。但是,我不确定写什么来增加

我希望能够计算出每个单词在给定文件中重复的次数。然而,我做这件事有困难。我试过两种不同的方法。我使用HashMap并将单词作为键,将其频率作为关联值。但是,这似乎不起作用,因为使用HashMap,您无法访问指定索引处的元素。现在我尝试使用两个单独的数组列表,一个用于单词,另一个用于该单词的每次出现。我的想法是:在向wordsCount arrayList添加单词时,如果单词已经在wordsCount中,则在已经看到的单词的索引处增加cnt arrayList中元素的值。但是,我不确定写什么来增加值

import java.io.*;
import java.lang.reflect.Array;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;

public class MP0 {
    Random generator;
    String delimiters = " \t,;.?!-:@[](){}_*/";
    String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};
    private static String str;
    private static File file;
    private static Scanner s;   

    public MP0() {
    }

    public void process() throws Exception{
        ArrayList<Integer> cnt = new ArrayList<Integer>();
        boolean isStopWord = false;
        StringTokenizer st = new StringTokenizer(s.nextLine(), delimiters);
        ArrayList<String> wordsCount = new ArrayList<String>();

        while(st.hasMoreTokens()) {
            String s = st.nextToken().toLowerCase();
            if(!wordsCount.contains(s)) {
                for(int i = 0; i < stopWordsArray.length; i++) {
                    isStopWord = false;
                    if(s.equals(stopWordsArray[i])) {
                        isStopWord = true;
                        break;
                    }
                }
                if(isStopWord == false) {
                    wordsCount.add(s);
                    cnt.add(1);
                }
            }
            else { // i tried this but only displayed "1" for all words
                cnt.set(wordsCount.indexOf(s), cnt.get(wordsCount.indexOf(s) + 1));
            }
        }


        for(int i = 0; i < wordsCount.size(); i++) {
            System.out.println(wordsCount.get(i) + " " + cnt.get(i));
        }

    }

    public static void main(String args[]) throws Exception {
            try {
                file = new File("input.txt");
                s = new Scanner(file);
                str = s.nextLine();
                String[] topItems;
                MP0 mp = new MP0();
                while(s.hasNext()) {
                    mp.process();
                    str = s.nextLine();
                }
            }
            catch(FileNotFoundException e) {
                System.out.println("File not found");
            }
    }

}
import java.io.*;
导入java.lang.reflect.Array;
导入java.security.MessageDigest;
导入java.security.NoSuchAlgorithmException;
导入java.util.*;
公共类MP0{
随机发生器;
字符串分隔符=“\t,;.?!-:@[](){}\u*/”;
String[]stopWordsArray={“我”、“我”、“我的”、“我自己”、“我们”、“我们的”、“我们的”、“我们自己”、“你”、“你的”、“你的”,
“你自己”、“你自己”、“他”、“他”、“他”、“他自己”、“她”、“她的”、“她自己”、“它”、“它”,
“自身”、“他们”、“他们”、“他们的”、“他们的”、“他们自己”、“什么”、“哪个”、“谁”、“谁”、“这个”、“那个”,
“这些”、“那些”、“我”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“有”、“有”、“有”、“有”、“有”、“有”,
“do”、“does”、“did”、“doing”、“a”、“an”、“the”、“and”、“but”、“if”、“or”、“because”、“as”、“until”、“while”,
“of”、“at”、“by”、“for”、“with”、“about”、“offer”、“between”、“into”、“through”、“during”、“before”,
“后”、“上”、“下”、“到”、“从”、“上”、“下”、“进”、“出”、“开”、“关”、“过”、“下”、“再”,
“进一步”、“然后”、“一次”、“这里”、“那里”、“何时”、“何地”、“为什么”、“如何”、“全部”、“任何”、“两者”、“各自”,
“少数”、“更多”、“大多数”、“其他”、“一些”、“这样”、“不”、“也”、“不”、“仅”、“拥有”、“相同”、“所以”、“比”,
“太”、“非常”、“s”、“t”、“can”、“will”、“just”、“don”、“should”、“now”};
私有静态字符串str;
私有静态文件;
专用静态扫描仪;
公共MP0(){
}
public void进程()引发异常{
ArrayList cnt=新的ArrayList();
布尔isStopWord=false;
StringTokenizer st=新的StringTokenizer(s.nextLine(),分隔符);
ArrayList wordsCount=新的ArrayList();
而(st.hasMoreTokens()){
字符串s=st.nextToken().toLowerCase();
如果(!wordscont.contains)){
for(int i=0;i
我相信你可以使用hashmap做你想做的事情。大概是这样的:

              HashMap<String, Integer> mymap= new HashMap<>();

                for(String word: stopWordsArray) {
                    if (mymap.containsKey(word))
                        mymap.put(word, mymap.get(word) + 1);
                    else{
                        mymap.put(word, new Integer(1));
                    }
                }

我相信你可以使用hashmap做你想做的事情。大概是这样的:

              HashMap<String, Integer> mymap= new HashMap<>();

                for(String word: stopWordsArray) {
                    if (mymap.containsKey(word))
                        mymap.put(word, mymap.get(word) + 1);
                    else{
                        mymap.put(word, new Integer(1));
                    }
                }

您还可以使用Pattern和matcher

String in = "our goal is our power";
int i = 0;
Pattern p = Pattern.compile("our");
Matcher m = p.matcher( in );
while (m.find()) {
    i++;
}

您还可以使用Pattern和matcher

String in = "our goal is our power";
int i = 0;
Pattern p = Pattern.compile("our");
Matcher m = p.matcher( in );
while (m.find()) {
    i++;
}

我认为地图绝对是表示每个单词计数的方式。在我看来,获取地图的最佳方式(或者至少是一种尚未提及的不同方式)是将单词穿过一个特定的区域。这样,您就可以利用Java标准库中已经编写的大量代码,使您的代码更加简洁,避免了重新设计所有轮子的需要。流可以有一点学习曲线,但一旦你理解了,它们会非常有用。例如,观察您的20+行方法,减少为2行:

import java.util.Map;
import java.util.ArrayList;
import java.util.Arrays;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.summingInt;
import static java.util.function.Function.identity;

public class CountWords
{
    private static String delimiters = "[ \t,;.?!\\-:@\\[\\](){}_*/]+";
    private static ArrayList<String> stopWords =    new ArrayList<>(Arrays.asList(new String[] {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
                                                "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
                                                "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
                                                "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
                                                "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
                                                "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
                                                "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
                                                "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
                                                "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
                                                "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"}));
    public static void main(String[] args) throws IOException //Your code should likely catch this
    {
        Path fLoc = Paths.get("test.txt"); //Or get from stdio, args[0], etc...
        CountWords cw = new CountWords();
        Map<String, Integer> counts = cw.count(Files.lines(fLoc).flatMap(s -> Arrays.stream(s.split(delimiters))));
        counts.forEach((k, v) -> System.out.format("Key: %s, Val: %d\n", k, v));
    }

    public Map<String, Integer> count(Stream<String> words)
    {
        return words.filter(s -> !stopWords.contains(s))
                    .collect(groupingBy(identity(), summingInt(s -> 1)));
    }
}
import java.util.Map;
导入java.util.ArrayList;
导入java.util.array;
导入java.io.IOException;
导入java.nio.file.Files;
导入java.nio.file.Path;
导入java.nio.file.path;
导入java.util.stream.stream;
导入静态java.util.stream.Collectors.groupingBy;
导入静态java.util.stream.Collectors.summingit;
导入静态java.util.function.function.identity;
公共类CountWords
{
专用静态字符串分隔符=“[\t,;.?!\\-:@\\[\\](){}\u*/]+”;
private static ArrayList stopWords=new ArrayList(Arrays.asList)(新字符串[]{“我”、“我”、“我的”、“我的”、“我的”、“我们的”、“我们的”、“我们的”、“你的”、“你的”,
“你自己”、“你自己”、“他”、“他”、“他”、“他自己”、“她”、“她的”、“她自己”、“它”、“它”,
“自身”、“他们”、“他们”、“他们的”、“他们的”、“他们自己”、“什么”、“哪个”、“谁”、“谁”、“这个”、“那个”,
“这些”、“那些”、“我”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“有”、“有”、“有”、“有”、“有”、“有”,
“做”、“做”、“做”、“做”、“a”、“an”、“the”,