在java中计算多个文件/文档中的词频
我想计算java中多个文件/文档的词频 e.g. 所以,我想计算每个文件的词频:在java中计算多个文件/文档中的词频,java,words,word-frequency,Java,Words,Word Frequency,我想计算java中多个文件/文档的词频 e.g. 所以,我想计算每个文件的词频: for a1 file {aaa = 3, bbb = 1} for a2 file {aaa = 2, hhh = 1} for a3 file {aaa = 1, hhh = 1, bbb =2} 我有一种方法,可以从文件中读取单词,然后将它们存储在LinkedHashMap中。尽管如此,这将计算所有文件中特定单词的频率,但我想分别计算每个文件的单词频率 有人有办法吗 然后,我写了这个, Set mapset
for a1 file {aaa = 3, bbb = 1}
for a2 file {aaa = 2, hhh = 1}
for a3 file {aaa = 1, hhh = 1, bbb =2}
我有一种方法,可以从文件中读取单词,然后将它们存储在 LinkedHashMap 中。不过,这样统计出来的是某个单词在所有文件中的总频率,而我想分别统计每个文件的词频。
有人有办法吗
然后,我写了这个,
// Print every key (file name) currently stored in fileToWordCount.
// Set<?> replaces the raw Set type: it accepts the key set of either a raw or a
// generic map and avoids the raw-type warning, while the printed output is unchanged.
// NOTE(review): the loop prints one line per key, so no output means the key set was
// empty when this ran — presumably nothing had been put into the map yet; verify.
Set<?> mapset = fileToWordCount.keySet();
for (Object filenameFromMap : mapset) {
    System.out.println("FILENAME::" + filenameFromMap);
}
但是,它不会打印任何内容。
您可以再创建一个 Map,把文件名映射到保存该文件词频的 LinkedHashMap。这样就会得到类似下面的结构:
Map<String, LinkedHashMap<String, Integer>> fileToWordCount = new HashMap<String, LinkedHashMap<String, Integer>();
导入java.io。;
导入java.util
公共类文件1{
公共静态void main(字符串[]args)引发异常{
HashMap words_fre=新HashMap();
HashSet words=新的HashSet();
试试{
文件夹=新文件(“”);
File[]listOfFiles=folder.listFiles();
BufferedReader BufferedReader=null;
FileInputStream inputfilename=null;
BufferedWriter out=新的BufferedWriter(新的OutputStreamWriter(新的FileOutputStream(“outfilename.txt”,false),“UTF-8”);
对于(文件:listOfFiles){
inputfilename=新文件InputStream(文件);
/*System.out.println(文件);*/
bufferedReader=新的bufferedReader(新的InputStreamReader(inputfilename,“UTF-8”);
字符串s;
而((s=bufferedReader.readLine())!=null){
/*系统输出打印项次(行)*/
s=s.replaceAll(“\\”,“”);
如果(s)包含(“॥") || s、 包含(“:”)| | s.contains(“।")||
s、 包含(“,”)| s。包含(“!”)| s。包含(“?”){
s=s。替换(“॥"," ");
s=s。替换(“:”,“”);
s=s。替换(“।"," ");
s=s。替换(“,”,”);
s=s。替换(“!”,“);
s=s。替换(“?”,”);
}
StringTokenizer st=新的StringTokenizer,“”;
而(st.hasMoreTokens()){
/*out.write(st.nextToken()+“\n”)*/
字符串str=(st.nextToken()).toString();
添加(str);
}
for(字符串str:words){
如果(单词_fre.containsKey(str)){
int a=单词(str);
单词(str,a+1);
}否则{
单词_fre.put(str,1);/*uwords++;//唯一单词计数*/
}
}
单词。清除();
/*输出。写入(“\n”);
out.close()*/
}
Object[]key=words_fre.keySet().toArray();
数组。排序(键);
对于(int i=0;i
}
}
为每个文件创建一个新的 LinkedHashMap。
(可能是重复问题。)
那么,我该如何把元素放入这个数据结构中?
// Outer map keyed by file path; each value holds the word counts of that file only.
// (The original line was missing the final '>' of the constructor's type arguments.)
Map<String, LinkedHashMap<String, Integer>> fileToWordCount = new HashMap<String, LinkedHashMap<String, Integer>>();
// Store the counts under the file's path.
// NOTE(review): wordCountMap is presumably a fresh LinkedHashMap<String, Integer>
// built for this one file — confirm it is not shared between files.
fileToWordCount.put(file.getPath(), wordCountMap);
/**
 * Reads every file in a folder as UTF-8 text, counts words, and writes the sorted
 * "word : count" listing to outfilename.txt.
 *
 * NOTE(review): words_fre is created once and never cleared, so the counts written
 * after each file are cumulative over all files processed so far, not per-file.
 */
public class file1{
public static void main(String[] args) throws Exception{
// word -> count; shared by all files (never reset inside the file loop).
HashMap<String,Integer> words_fre = new HashMap<String,Integer>();
// Distinct tokens of the line currently being processed; cleared after every line.
HashSet<String> words = new HashSet<String>();
try{
File folder = new File("</file path>");
// NOTE(review): File.listFiles() returns null when the path is not a directory,
// which would make the for-each loop below throw NullPointerException.
File[] listOfFiles = folder.listFiles();
BufferedReader bufferedReader=null;
FileInputStream inputfilename=null;
// Output file is opened in overwrite mode (append = false) with UTF-8 encoding.
BufferedWriter out= new BufferedWriter(new OutputStreamWriter(new FileOutputStream("outfilename.txt",false), "UTF-8"));
for(File file : listOfFiles){
// NOTE(review): a new stream/reader is opened per file, but only the last
// reader is closed (after the loop); earlier ones are never closed.
inputfilename= new FileInputStream(file);
/*System.out.println(file); */
bufferedReader= new BufferedReader(new InputStreamReader(inputfilename, "UTF-8"));
String s;
while((s = bufferedReader.readLine()) != null){
/*System.out.println(line);*/
// Replace every "<...>" span (shortest match) with a space.
s = s.replaceAll("\\<.*?>"," ");
// Replace punctuation (double danda, colon, danda, comma, '!', '?') with spaces.
// The contains() guard only skips the replace calls; replace() on a string
// without the character would leave it unchanged anyway.
if(s.contains("॥") || s.contains(":")|| s.contains("।")||
s.contains(",")|| s.contains("!")|| s.contains("?")){
s=s.replace("॥"," ");
s=s.replace(":"," ");
s=s.replace("।"," ");
s=s.replace(","," ");
s=s.replace("!"," ");
s=s.replace("?"," ");
}
// Split the cleaned line on the space character only.
StringTokenizer st = new StringTokenizer(s," ");
while (st.hasMoreTokens()) {
/*out.write(st.nextToken()+"\n");*/
String str=(st.nextToken()).toString();
words.add(str);
}
// NOTE(review): because tokens pass through a HashSet, a word repeated on the
// same line is counted once — the count is "lines containing the word", not
// total occurrences.
for(String str : words){
if(words_fre.containsKey(str)){
int a = words_fre.get(str);
words_fre.put(str,a+1);
}else{
words_fre.put(str,1);/*uwords++;//unique words count */
}
}
words.clear();
/*out.write("\n");
out.close();*/
}
// After each file: write all words accumulated so far, sorted by key.
Object[] key = words_fre.keySet().toArray();
Arrays.sort(key);
for (int i = 0; i < key.length; i++) {
//System.out.println(key[i]+"= "+words_fre.get(key[i]));
out.write(key[i]+" : "+words_fre.get(key[i]) +"\n");
}
}
out.close();
// NOTE(review): bufferedReader is still null here if the folder had no files,
// so this call would throw NullPointerException.
bufferedReader.close();
}catch(FileNotFoundException ex){
System.out.println("Error in reading line");
}catch(IOException ex){
/*System.out.println("Error in reading line"+fileReader );*/
ex.printStackTrace();
}