Java 从文本中查找前10个单词:95%-工作代码
我几个月前开始学习Java。对于我的家庭作业,我写了以下代码——从文本文件中查找前10个单词。对我来说,这是一项非常有趣的任务。而且很难。第三天我无法完成这个软件-我发现很难在代码中导航。我认为,这项任务可以用更少的代码和更高的性能来完成。但我的一般问题是,我无法修复下面的一个代码块 总而言之:我读取文件,剪切一些无意义的单词,将文本中的所有单词添加到数组中(没有重复),同时我还有第二个数组来计算单词的频率。我的问题是找到十个最流行单词的代码块。我所需要的是——从freq[]中找到最大的数字,而不丢失这些数字的索引,因为单词[9]==freq[9]--九位单词在freq[]中的九位有频率数字Java 从文本中查找前10个单词:95%-工作代码,java,arrays,sorting,text,statistics,Java,Arrays,Sorting,Text,Statistics,我几个月前开始学习Java。对于我的家庭作业,我写了以下代码——从文本文件中查找前10个单词。对我来说,这是一项非常有趣的任务。而且很难。第三天我无法完成这个软件-我发现很难在代码中导航。我认为,这项任务可以用更少的代码和更高的性能来完成。但我的一般问题是,我无法修复下面的一个代码块 总而言之:我读取文件,剪切一些无意义的单词,将文本中的所有单词添加到数组中(没有重复),同时我还有第二个数组来计算单词的频率。我的问题是找到十个最流行单词的代码块。我所需要的是——从freq[]中找到最大的数字,而
package CountWords21;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
public class CountWords21 {
public static void main(String[] args) throws IOException {
String textSource = ReadFromFileToString.readFile("/home/vitaly/Desktop/text", StandardCharsets.UTF_8);
int start = 0;
int end = 0;
int w = 0; // counter for array of words - position for add new word in array
StringBuilder textStringBuilder = new StringBuilder(textSource);
for (int i = 0; i < textSource.length(); i++) {
if (textSource.charAt(i) == ','
|| textSource.charAt(i) == '.'
|| textSource.charAt(i) == '!'
|| textSource.charAt(i) == ';'
|| textSource.charAt(i) == '\''
|| textSource.charAt(i) == '-'
|| textSource.charAt(i) == '+'
|| textSource.charAt(i) == '*'
|| textSource.charAt(i) == '/'
|| textSource.charAt(i) == '('
|| textSource.charAt(i) == '>'
|| textSource.charAt(i) == ':'
|| textSource.charAt(i) == '\n' // new line
|| (textSource.charAt(i) == 'a'
&& textSource.charAt(i - 1) == ' '
&& textSource.charAt(i + 1) == ' '))
textStringBuilder.setCharAt(i, ' ');
if (textSource.charAt(i) == '\''
&& textSource.charAt(i + 1) == 's') {
textStringBuilder.setCharAt(i, ' '); // 's
textStringBuilder.setCharAt(i + 1, ' ');
}
if (textSource.charAt(i) == 'a' && textSource.charAt(i + 1) == 's'
&& textSource.charAt(i + 2) == ' '
&& textSource.charAt(i - 1) == ' ' && i > 2) {
textStringBuilder.setCharAt(i, ' '); // as
textStringBuilder.setCharAt(i + 1, ' ');
}
if ((textSource.charAt(i) == 't' || textSource.charAt(i) == 'T')&& textSource.charAt(i + 1) == 'h'
&& textSource.charAt(i + 2) == 'e'
&& textSource.charAt(i - 1) == ' '
&& textSource.charAt(i + 3) == ' ' && i > 3) {
textStringBuilder.setCharAt(i, ' '); // the
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
}
if ( textSource.charAt(i) == 'a' // and
&& textSource.charAt(i + 1) == 'n'
&& textSource.charAt(i + 2) == 'd'
&& (textSource.charAt(i - 1) == ' ' || textSource.charAt(i - 1) == '\n'
&& textSource.charAt(i + 3) == ' ' && i > 3)) {
textStringBuilder.setCharAt(i, ' ');
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
}
if ((textSource.charAt(i) == 'f' || i < (textSource.length() - 4)) // for
&& textSource.charAt(i + 1) == 'o'
&& textSource.charAt(i + 2) == 'r'
&& (i == 0 || textSource.charAt(i - 1) == ' ' || textSource.charAt(i - 1) == '\n')
&& textSource.charAt(i + 3) == ' ') {
textStringBuilder.setCharAt(i, ' ');
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
}
if (textSource.charAt(i) == 'o' && textSource.charAt(i + 1) == 'f'
&& textSource.charAt(i - 1) == ' '
&& textSource.charAt(i + 2) == ' ' && i > 2) {
textStringBuilder.setCharAt(i, ' '); // of
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
}
if (textSource.charAt(i) == 'i' && textSource.charAt(i + 1) == 's'
&& textSource.charAt(i + 2) == ' '
&& textSource.charAt(i - 1) == ' ' && i > 2) {
textStringBuilder.setCharAt(i, ' '); // is
textStringBuilder.setCharAt(i + 1, ' ');
}
if (textSource.charAt(i) == 't' && textSource.charAt(i + 1) == 'o'
&& textSource.charAt(i + 2) == ' '
&& textSource.charAt(i - 1) == ' ' && i > 2) {
textStringBuilder.setCharAt(i, ' '); // to
textStringBuilder.setCharAt(i + 1, ' ');
}
if (textSource.charAt(i) == 'i' && textSource.charAt(i + 1) == 'n'
&& textSource.charAt(i + 2) == ' '
&& textSource.charAt(i - 1) == ' ' && i > 2) {
textStringBuilder.setCharAt(i, ' '); // in
textStringBuilder.setCharAt(i + 1, ' ');
}
if ((textSource.charAt(i) == 't' || textSource.charAt(i) == 'T')
&& textSource.charAt(i + 1) == 'h'
&& textSource.charAt(i + 2) == 'i'
&& textSource.charAt(i + 3) == 's'
&& textSource.charAt(i + 4) == ' '
&& (i == 0 || textSource.charAt(i - 1) == ' ')) {
textStringBuilder.setCharAt(i, ' '); // this
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
textStringBuilder.setCharAt(i + 3, ' ');
}
if ((textSource.charAt(i) == 't' || textSource.charAt(i) == 'T')
&& textSource.charAt(i + 1) == 'h'
&& textSource.charAt(i + 2) == 'a'
&& textSource.charAt(i + 3) == 't'
&& textSource.charAt(i + 4) == ' '
&& (i == 0 || textSource.charAt(i - 1) == ' ')) {
textStringBuilder.setCharAt(i, ' '); // that
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
textStringBuilder.setCharAt(i + 3, ' ');
}
if ((textSource.charAt(i) == 'm' || textSource.charAt(i) == 'M')
&& textSource.charAt(i + 1) == 'o'
&& textSource.charAt(i + 2) == 's'
&& textSource.charAt(i + 3) == 't'
&& textSource.charAt(i + 4) == ' '
&& (i == 0 || textSource.charAt(i - 1) == ' ')) {
textStringBuilder.setCharAt(i, ' '); // this
textStringBuilder.setCharAt(i + 1, ' ');
textStringBuilder.setCharAt(i + 2, ' ');
textStringBuilder.setCharAt(i + 3, ' ');
}
}
textSource = textStringBuilder.toString();
textSource = textSource.trim();
String[] words = new String[textSource.length() / 2];
int[] freq = new int[textSource.length() / 2];
// number of usage - index the same for words[]
while (end < textSource.length()) {
// global search for next word
while (end != -1) { // -1 will be if string finish
end = textSource.indexOf(' ', end);
// search for space, start from end
// (last founded space position)
if (end != -1) // when space
break; // because we already have end position
else { // (end == -1) -- finish of sourceText - without space on end
end = textSource.length();
break;
}
}
if (words[w] != null) w++; // or last word in array will be replaced
if (end != start) {
words[w] = (textSource.substring(start, end)).trim(); // add word to array
freq[w] = 1;
}
if (end == start) w--;
if (w == 0) { // first word
freq[0] = 1;
w = 1; // next word will be added in next position
}
if (words[1] != null) // if it not for the first word
searchDuplicate: for (int i = 0; i <= w - 1; i++) {
// search word in array for duplicate
if (words[w].compareToIgnoreCase(words[i]) == 0) {
// if new word equal to one of word from array
freq[i] += 1;
// increment count of this word or set 1 if first added
words[w] = null;
freq[w] = 0;
// clear duplicated new (last added) word
break searchDuplicate; // exit from for-search-iterator
} // if we can't find duplicate and this is new word
}
if (w != 1 && words[w] != null && words[w].compareToIgnoreCase(words[w - 1]) != 0 && freq[w] == 0) {
freq[w] += 1; // new word achieve first 1.
w += 1; // next time add new word in next position
}
start = end + 1;
// start next word extract from first symbol after space
end += 1;
// without this line end == current space end
// indexOf will find this current space
}
int[] PopularWordsIndexes = new int[10]; // indexes of max used words
for (int index = 0, minFreqIndexTemp = 1; freq[index] != 0; index++) { // max frequency
if (index < PopularWordsIndexes.length)
PopularWordsIndexes[index] = index;
// add first words in empty TOP
else { // TODO error somewhere here
/* if we don't have empty position in PopularWordsIndexes
find the lowest freq index and replace with higher index */
for (int top10IndexIterator = 0; top10IndexIterator < PopularWordsIndexes.length; top10IndexIterator++)
if (freq[PopularWordsIndexes[minFreqIndexTemp]] < freq[PopularWordsIndexes[top10IndexIterator]])
minFreqIndexTemp = top10IndexIterator;
if (freq[PopularWordsIndexes[minFreqIndexTemp]] < freq[index])
PopularWordsIndexes[minFreqIndexTemp] = index;
}
}
// output
for (int i2 = 0; i2 < PopularWordsIndexes.length; i2++) {
System.out.println(words[PopularWordsIndexes[i2]] + " : "
+ freq[PopularWordsIndexes[i2]]);
}
}
}
package countwords 21;
导入java.io.IOException;
导入java.nio.charset.StandardCharset;
公共类countwords 21{
公共静态void main(字符串[]args)引发IOException{
String textSource=ReadFromFileToString.readFile(“/home/vitaly/Desktop/text”,StandardCharsets.UTF_8);
int start=0;
int end=0;
int w=0;//单词数组的计数器-在数组中添加新词的位置
StringBuilder textStringBuilder=新的StringBuilder(textSource);
对于(int i=0;i'
||textSource.charAt(i)=':'
||textSource.charAt(i)='\n'//新行
||(textSource.charAt(i)='a'
&&textSource.charAt(i-1)=''
&&textSource.charAt(i+1)='')
textStringBuilder.setCharAt(i',);
如果(textSource.charAt(i)='\''
&&textSource.charAt(i+1)='s'){
textStringBuilder.setCharAt(i,,);/'s
textStringBuilder.setCharAt(i+1,');
}
如果(textSource.charAt(i)='a'&&textSource.charAt(i+1)='s'
&&textSource.charAt(i+2)=''
&&textSource.charAt(i-1)=''&&i>2){
textStringBuilder.setCharAt(i,,);//作为
textStringBuilder.setCharAt(i+1,');
}
如果((textSource.charAt(i)='t'| | textSource.charAt(i)='t')&&textSource.charAt(i+1)='h'
&&textSource.charAt(i+2)='e'
&&textSource.charAt(i-1)=''
&&textSource.charAt(i+3)=''&&i>3){
textStringBuilder.setCharAt(i,,);//
textStringBuilder.setCharAt(i+1,');
textStringBuilder.setCharAt(i+2',);
}
if(textSource.charAt(i)='a'//和
&&textSource.charAt(i+1)='n'
&&textSource.charAt(i+2)='d'
&&(textSource.charAt(i-1)=''| | textSource.charAt(i-1)='\n'
&&textSource.charAt(i+3)=''&&i>3)){
textStringBuilder.setCharAt(i',);
textStringBuilder.setCharAt(i+1,');
textStringBuilder.setCharAt(i+2',);
}
如果((textSource.charAt(i)='f'| i<(textSource.length()-4))//for
&&textSource.charAt(i+1)='o'
&&textSource.charAt(i+2)='r'
&&(i==0 | | textSource.charAt(i-1)=''| | textSource.charAt(i-1)='\n')
&&textSource.charAt(i+3)=''){
textStringBuilder.setCharAt(i',);
textStringBuilder.setCharAt(i+1,');
textStringBuilder.setCharAt(i+2',);
}
if(textSource.charAt(i)='o'&&textSource.charAt(i+1)='f'
&&textSource.charAt(i-1)=''
&&textSource.charAt(i+2)=''&&i>2){
textStringBuilder.setCharAt(i,,);//的
textStringBuilder.setCharAt(i+1,');
textStringBuilder.setCharAt(i+2',);
}
如果(textSource.charAt(i)='i'&&textSource.charAt(i+1)=='s'
&&textSource.charAt(i+2)=''
&&textSource.charAt(i-1)=''&&i>2){
textStringBuilder.setCharAt(i,,);//是
textStringBuilder.setCharAt(i+1,');
}
if(textSource.charAt(i)='t'&&textSource.charAt(i+1)='o'
&&textSource.charAt(i+2)=''
&&textSource.charAt(i-1)=''&&i>2){
textStringBuilder.setCharAt(i,,);//到
textStringBuilder.setCharAt(i+1,');
}
if(textSource.charAt(i)='i'&&textSource.charAt(i+1)='n'
&&textSource.charAt(i+2)=''
&&textSource.charAt(i-1)=''&&i>2){
textStringBuilder.setCharAt(i,,);//在
textStringBuilder.setCharAt(i+1,');
}
if((textSource.charAt(i)='t'| | textSource.charAt(i)='t')
&&textSource.charAt(i+1)='h'
&&textSource.charAt(i+2)='i'
&&textSource.charAt(i+3)='s'
&&textSource.charAt(i+4)=''
&&(i==0 | | textSource.charAt(i-1)=''){
textStringBuilder.setCharAt(i,,);//此
textStringBuilder.setCharAt(i+1,');
textStringBuilder.setCharAt(i+2',);
textStringBuilder.setCharAt(i+3,'
substring() // get a string of x length rather than so many charAt(i)
toLowerCase() // to ignore capitals
contains() // to see if the string you have contains the string you dont want
equals() // to see if the string you have is the same the string you dont want
split() // split into an array based on a expression, in you case I assume a space