Java 从字符串中计算单词的出现次数
我希望能够计算出每个单词在给定文件中重复的次数。然而,我做这件事有困难。我试过两种不同的方法。我使用HashMap并将单词作为键,将其频率作为关联值。但是,这似乎不起作用,因为使用HashMap,您无法访问指定索引处的元素。现在我尝试使用两个单独的数组列表,一个用于单词,另一个用于该单词的出现次数。我的想法是:在向wordsCount arrayList添加单词时,如果单词已经在wordsCount中,则在已经看到的单词的索引处增加cnt arrayList中元素的值。但是,我不确定写什么来增加值。 Java 从字符串中计算单词的出现次数,java,string,arraylist,hashmap,Java,String,Arraylist,Hashmap。 我希望能够计算出每个单词在给定文件中重复的次数。然而,我做这件事有困难。我试过两种不同的方法。我使用HashMap并将单词作为键,将其频率作为关联值。但是,这似乎不起作用,因为使用HashMap,您无法访问指定索引处的元素。现在我尝试使用两个单独的数组列表,一个用于单词,另一个用于该单词的出现次数。我的想法是:在向wordsCount arrayList添加单词时,如果单词已经在wordsCount中,则在已经看到的单词的索引处增加cnt arrayList中元素的值。但是,我不确定写什么来增加值。
import java.io.*;
import java.lang.reflect.Array;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
/**
 * Counts how often each word occurs in {@code input.txt}, ignoring case and
 * skipping the common English stop words listed in {@link #stopWordsArray}.
 *
 * <p>{@link #main(String[])} feeds the file to {@link #process()} one line at a
 * time; the counts are accumulated over the whole file and printed once at the
 * end as {@code "<word> <count>"}, in order of first appearance.
 */
public class MP0 {
    /** Not referenced anywhere in this class; kept so the visible fields stay unchanged. */
    Random generator;

    /** Every character in this string is treated as a word separator by the tokenizer. */
    String delimiters = " \t,;.?!-:@[](){}_*/";

    /** Lower-case words that are never counted. */
    String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};

    /** The line currently being processed; set by {@link #main(String[])} before each {@link #process()} call. */
    private static String str;
    private static File file;
    private static Scanner s;

    /** Hash-based copy of {@link #stopWordsArray} so each lookup does not scan the array. */
    private final Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsArray));

    /** Word -> occurrences so far; a LinkedHashMap keeps first-seen order for printing. */
    private final Map<String, Integer> counts = new LinkedHashMap<String, Integer>();

    public MP0() {
    }

    /**
     * Tokenizes the current line ({@code str}) and adds its words to the running counts.
     *
     * <p>The line is NOT read from the scanner here: the caller has already read it.
     * Reading again would silently skip every other line of the file.
     *
     * @throws Exception never thrown by this implementation; declared to keep the
     *         original signature
     */
    public void process() throws Exception {
        if (str == null) {
            return; // nothing has been read yet
        }
        StringTokenizer st = new StringTokenizer(str, delimiters);
        while (st.hasMoreTokens()) {
            String word = st.nextToken().toLowerCase(Locale.ROOT);
            if (!stopWords.contains(word)) {
                // Inserts 1 for a new word, otherwise increments the stored count.
                counts.merge(word, 1, Integer::sum);
            }
        }
    }

    /** Prints every counted word followed by a space and its count, one per line. */
    private void printCounts() {
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }

    /**
     * Reads {@code input.txt} from the working directory and prints the word counts.
     * Prints {@code "File not found"} if the file does not exist.
     */
    public static void main(String args[]) throws Exception {
        file = new File("input.txt");
        MP0 mp = new MP0();
        // try-with-resources closes the scanner (and the underlying file) on every path.
        try (Scanner in = new Scanner(file)) {
            s = in;
            // hasNextLine/nextLine are paired so empty files and single-line files work.
            while (s.hasNextLine()) {
                str = s.nextLine();
                mp.process();
            }
        }
        catch (FileNotFoundException e) {
            System.out.println("File not found");
        }
        mp.printCounts();
    }
}
import java.io.*;
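import java.lang.reflect.Array;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
public class MP0 {
Random generator;
String delimiters = " \t,;.?!-:@[](){}_*/";
String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
"yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
"itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
"these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
"do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
"of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
"after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
"further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
"few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
"too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};
private static String str;
private static File file;
private static Scanner s;
public MP0() {
}
public void process() throws Exception{
ArrayList<Integer> cnt = new ArrayList<Integer>();
boolean isStopWord = false;
StringTokenizer st = new StringTokenizer(s.nextLine(), delimiters);
ArrayList<String> wordsCount = new ArrayList<String>();
while(st.hasMoreTokens()) {
String s = st.nextToken().toLowerCase();
if(!wordsCount.contains(s)) {
for(int i = 0; i < stopWordsArray.length; i++) {
// ... (the rest of this duplicated listing is cut off in the source page)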
我相信你可以使用hashmap做你想做的事情。大概是这样的:
// Tally how many times each word is seen.
// NOTE(review): the loop iterates stopWordsArray exactly as the original snippet did;
// substitute the collection of tokens you actually want to count.
HashMap<String, Integer> mymap= new HashMap<>();
for(String word: stopWordsArray) {
    // merge() stores 1 for a word not yet in the map, otherwise adds 1 to its count.
    // This replaces the containsKey/get/put sequence and the deprecated new Integer(1).
    mymap.merge(word, 1, Integer::sum);
}
我相信你可以使用hashmap做你想做的事情。大概是这样的:
// Tally how many times each word is seen.
// NOTE(review): the loop iterates stopWordsArray exactly as the original snippet did;
// substitute the collection of tokens you actually want to count.
HashMap<String, Integer> mymap= new HashMap<>();
for(String word: stopWordsArray) {
    // merge() stores 1 for a word not yet in the map, otherwise adds 1 to its count.
    // This replaces the containsKey/get/put sequence and the deprecated new Integer(1).
    mymap.merge(word, 1, Integer::sum);
}
您还可以使用Pattern和matcher
// Count how many times the whole word "our" occurs in the input string.
String in = "our goal is our power";
int i = 0;
// \b anchors the match at word boundaries; a bare "our" would also match inside
// longer words such as "ours", "hour" or "your" and inflate the count.
Pattern p = Pattern.compile("\\bour\\b");
Matcher m = p.matcher( in );
// Each successful find() advances past the previous match, so this visits every occurrence once.
while (m.find()) {
    i++;
}
您还可以使用Pattern和matcher
// Count how many times the whole word "our" occurs in the input string.
String in = "our goal is our power";
int i = 0;
// \b anchors the match at word boundaries; a bare "our" would also match inside
// longer words such as "ours", "hour" or "your" and inflate the count.
Pattern p = Pattern.compile("\\bour\\b");
Matcher m = p.matcher( in );
// Each successful find() advances past the previous match, so this visits every occurrence once.
while (m.find()) {
    i++;
}
我认为 Map 绝对是表示每个单词计数的合适方式。在我看来,获取该 Map 的最佳方式(或者至少是一种尚未提及的不同方式)是让单词经过一个 Stream(流)。这样,您就可以利用 Java 标准库中已经编写好的大量代码,使您的代码更加简洁,而不必重新发明轮子。流有一定的学习曲线,但一旦您理解了,它们会非常有用。例如,请看您 20 多行的方法如何缩减为 2 行:
import java.util.Map;
import java.util.ArrayList;
import java.util.Arrays;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.summingInt;
import static java.util.function.Function.identity;
/**
 * Counts word occurrences in a text file using streams, ignoring case and
 * skipping common English stop words.
 */
public class CountWords
{
    /** Regex matching one or more separator characters; used with {@link String#split(String)}. */
    private static final String delimiters = "[ \t,;.?!\\-:@\\[\\](){}_*/]+";

    /** Lower-case words that are never counted. */
    private static final ArrayList<String> stopWords = new ArrayList<>(Arrays.asList(new String[] {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"}));

    /**
     * Reads {@code test.txt} from the working directory and prints one
     * {@code "Key: <word>, Val: <count>"} line per counted word.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException //Your code should likely catch this
    {
        Path fLoc = Paths.get("test.txt"); //Or get from stdio, args[0], etc...
        CountWords cw = new CountWords();
        Map<String, Integer> counts;
        // Files.lines keeps the file open until the stream is closed, so close it explicitly.
        try (Stream<String> lines = Files.lines(fLoc))
        {
            counts = cw.count(lines.flatMap(s -> Arrays.stream(s.split(delimiters))));
        }
        counts.forEach((k, v) -> System.out.format("Key: %s, Val: %d\n", k, v));
    }

    /**
     * Groups the given words and counts how often each one occurs.
     *
     * <p>Words are lower-cased before counting so that "The" matches the stop
     * list and "Cat"/"cat" share one entry. Empty strings, which {@code split}
     * produces for blank lines and leading separators, are dropped.
     *
     * @param words the tokens to count; consumed by this call
     * @return a map from lower-cased word to its number of occurrences
     */
    public Map<String, Integer> count(Stream<String> words)
    {
        return words.map(w -> w.toLowerCase(java.util.Locale.ROOT))
                    .filter(w -> !w.isEmpty() && !stopWords.contains(w))
                    .collect(groupingBy(identity(), summingInt(w -> 1)));
    }
}
import java.util.Map;
import java.util.ArrayList;
import java.util.Arrays;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.summingInt;
import static java.util.function.Function.identity;
公共类CountWords
{
专用静态字符串分隔符=“[\t,;.?!\\-:@\\[\\](){}\u*/]+”;
private static ArrayList stopWords=new ArrayList(Arrays.asList)(新字符串[]{“我”、“我”、“我的”、“我的”、“我的”、“我们的”、“我们的”、“我们的”、“你的”、“你的”,
“你自己”、“你自己”、“他”、“他”、“他”、“他自己”、“她”、“她的”、“她自己”、“它”、“它”,
“自身”、“他们”、“他们”、“他们的”、“他们的”、“他们自己”、“什么”、“哪个”、“谁”、“谁”、“这个”、“那个”,
“这些”、“那些”、“我”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“是”、“有”、“有”、“有”、“有”、“有”、“有”,
“做”、“做”、“做”、“做”、“a”、“an”、“the”,