Java-具有大量数据的字符串频率

Java-具有大量数据的字符串频率,java,memory,substring,Java,Memory,Substring,我试图找到一个巨大字符串中最长子字符串的频率 'Huge string' can be up to 2M characters long, only a-z 'Substring' may be between 100k to 2M characters long 'Substring' is always same or smaller size than 'Huge string' 目前,我正在使用我创建的以下方法: public static int[] countSubstri

我试图找到一个巨大字符串中最长子字符串的频率

'Huge string' can be up to 2M characters long, only a-z
'Substring' may be between 100k to 2M characters long
'Substring' is always same or smaller size than 'Huge string'
目前,我正在使用我创建的以下方法:

    public static int[] countSubstringOccurence(String input, int substringLength) {
      // input = from 100 000 to 2 000 000 alphanumeric characters long string;
      // substringLength = from 100 000 to 2 000 000, always smaller than input

      LinkedHashMap < String, Integer > substringOccurence = new LinkedHashMap < > ();
      int l;

      for (int i = 0; i < (input.length() - substringLength) + 1; i++) {
          String substring = input.substring(i, i + substringLength);
          if (substringOccurence.containsKey(substring)) {
              l = substringOccurence.get(substring);
              substringOccurence.put(substring, ++l);
          } else {
              substringOccurence.put(substring, 1);
          }
      }


      List < Integer > substringOccurenceList = new ArrayList < > (substringOccurence.values());
      int numberOfUniqueSubstrings = substringOccurenceList.size();
      int numberOfOccurenciesOfMostCommonSubstrings = 0;
      int numberOfSubstringsOfMostCommonSubstring = 0;

      for (int i: substringOccurenceList) {
          if (i > numberOfOccurenciesOfMostCommonSubstrings) {
              numberOfOccurenciesOfMostCommonSubstrings = i;
              numberOfSubstringsOfMostCommonSubstring = 1;
          } else if (i == numberOfOccurenciesOfMostCommonSubstrings) {
              numberOfSubstringsOfMostCommonSubstring++;
          }
      }

      return new int[] {
          numberOfUniqueSubstrings,
          numberOfOccurenciesOfMostCommonSubstrings,
          numberOfSubstringsOfMostCommonSubstring
      };
  }
我将代码更改为:

public static int[] countSubstringOccurence(String text, int substringLength) {
    int textLength = text.length();

    int numberOfUniqueSubstrings = 0;
    List<Integer> substrIndexes = new ArrayList<>();

    for (int i = 0; i < (textLength - substringLength) + 1; i++) {
        boolean doesNotExists = true;
        for (int j = i + 1; j < (textLength - substringLength) + 1; j++) {
            String actualSubstr = text.substring(i, i + substringLength);
            String indexSubstr = text.substring(j, j + substringLength);
            if (actualSubstr.equals(indexSubstr)) {
                doesNotExists = false;
                substrIndexes.add(j);
            }
        }

        if (doesNotExists) {
            numberOfUniqueSubstrings++;
            substrIndexes.add(i);
        }
    }

    LinkedHashMap<Integer, Integer> substrCountMap = new LinkedHashMap<>();

    for (int i : substrIndexes) {
        String substr = text.substring(i, i + substringLength);

        int lastIndex = 0;
        int count = 0;

        while (lastIndex != -1) {
            lastIndex = text.indexOf(substr, lastIndex);

            if (lastIndex != -1) {
                count++;
                lastIndex += substr.length();
            }
        }

        substrCountMap.put(i, count);
    }

    List<Integer> substrCountList = new ArrayList<>(substrCountMap.values());

    int numberOfOccurenciesOfMostCommonSubstrings = 0;
    int numberOfSubstringsOfMostCommonSubstring = 0;

    for (int count : substrCountList) {
        if (count > numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfOccurenciesOfMostCommonSubstrings = count;
            numberOfSubstringsOfMostCommonSubstring = 1;
        } else if (count == numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfSubstringsOfMostCommonSubstring++;
        }
    }

    return new int[] {
            numberOfUniqueSubstrings,
            numberOfOccurenciesOfMostCommonSubstrings,
            numberOfSubstringsOfMostCommonSubstring
    };
}
public static int[]countsubstringoccurrence(字符串文本,int substringLength){
int textLength=text.length();
int numberofuniquesubstring=0;
List substrindex=new ArrayList();
对于(int i=0;i<(textLength-substringLength)+1;i++){
布尔doesNotExists=true;
对于(int j=i+1;j<(textLength-substringLength)+1;j++){
String actualSubstr=text.substring(i,i+substringLength);
String indexSubstr=text.substring(j,j+substringLength);
if(实际substr.等于(indexSubstr)){
doesNotExists=假;
子索引。添加(j);
}
}
如果(无纺织工人){
numberofuniquesubstring++;
子索引。添加(i);
}
}
LinkedHashMap substrCountMap=新LinkedHashMap();
for(int i:子索引){
String substr=text.substring(i,i+substringLength);
int lastIndex=0;
整数计数=0;
while(lastIndex!=-1){
lastIndex=text.indexOf(substr,lastIndex);
如果(lastIndex!=-1){
计数++;
lastIndex+=子字符串长度();
}
}
substrCountMap.put(i,count);
}
List substrCountList=newarraylist(substrCountMap.values());
int numberOfOccurenceSostCommonSubstrings=0;
int numberOfSubstringsOfMostCommonSubstring=0;
用于(整数计数:子计数列表){
如果(计数>发生次数ESOFMOSTCOMONSUBSTRINGS){
numberOfOccurenceSostCommonSubstrings=计数;
numberOfSubstringsOfMostCommonSubstring=1;
}else if(count==numberOfOccurrenciesOfMostCommonSubstrings){
numberOfSubstringsOfMostCommonSubstring++;
}
}
返回新的int[]{
唯一子串的数目,
OccurrenciesOfMostCommonSubstring的数量,
最常见子串的子串数
};
}
这段代码没有崩溃,只是非常非常慢(我想至少是O(2n^2))。有人能想出一个更快的方法吗


如果它能在1GB内存和15分钟内安装在相当于i3-3xxx的CPU上,那就太棒了。今天我已经做完了。

只需使用
StringTokenizer
类并提取每个单词。然后将每个单词存储在字符串数组中,字符串类型的大小由方法
.countTokens()给定

然后,您可以轻松地计算给定单词的频率,并在Java6上运行它。不是开玩笑


Java 6子字符串不复制字符,只复制引用、索引和长度。

为什么需要将找到的整个字符串存储在内存中?您是否可以将子字符串的开始和结束存储为整数(与频率一起),然后在需要时使用该数据取出这些子字符串?您是否通过探查器运行过此操作?它告诉了你什么?你的记忆、空间和时间限制是什么?您根本无法存储子字符串,而是只需获取第一个子字符串,迭代字符串以了解它发生的频率,然后移动到下一个子字符串,等等。然后,您应该实现一种更好的方法来检查子字符串是否包含-不要获取子字符串并比较结果,但是试着看看这个字符串是否包含从给定索引开始的子字符串。这里的用例是什么?目前我唯一的建议是保存第一次出现的索引,而不是在LinkedHashMap中保存子字符串。但如果没有进一步的信息,就不可能找到“最佳解决方案”,因为实际的设计缺陷可能存在于代码的周围部分(即依赖此方法的部分)。请查看Knuth-Morris-Pratt算法。使用它查找i=0到input.length()-substringLength的input.substring(i,i+substringLength)的所有匹配项。这不会像O(n²)那么糟糕,因为KMP会快速跳过无希望的部分。此外,只需比较所有元素,您就会得到最长的子字符串,您可以通过从数组中提取该单词来计算每个字母的频率,而且速度更快、内存效率更高、抗OOM?你需要提供更多关于这个答案的详细信息。不确定是否更快,但是的,这个人面临的问题将得到解决。请尝试详细回答为什么你的建议有用,以及它是如何帮助的。正如原作者所问,这是一个答案,如何使这个问题更快。用Java6运行它可以做到这一点。再说一遍:不是开玩笑。
public static int[] countSubstringOccurence(String text, int substringLength) {
    int textLength = text.length();

    int numberOfUniqueSubstrings = 0;
    List<Integer> substrIndexes = new ArrayList<>();

    for (int i = 0; i < (textLength - substringLength) + 1; i++) {
        boolean doesNotExists = true;
        for (int j = i + 1; j < (textLength - substringLength) + 1; j++) {
            String actualSubstr = text.substring(i, i + substringLength);
            String indexSubstr = text.substring(j, j + substringLength);
            if (actualSubstr.equals(indexSubstr)) {
                doesNotExists = false;
                substrIndexes.add(j);
            }
        }

        if (doesNotExists) {
            numberOfUniqueSubstrings++;
            substrIndexes.add(i);
        }
    }

    LinkedHashMap<Integer, Integer> substrCountMap = new LinkedHashMap<>();

    for (int i : substrIndexes) {
        String substr = text.substring(i, i + substringLength);

        int lastIndex = 0;
        int count = 0;

        while (lastIndex != -1) {
            lastIndex = text.indexOf(substr, lastIndex);

            if (lastIndex != -1) {
                count++;
                lastIndex += substr.length();
            }
        }

        substrCountMap.put(i, count);
    }

    List<Integer> substrCountList = new ArrayList<>(substrCountMap.values());

    int numberOfOccurenciesOfMostCommonSubstrings = 0;
    int numberOfSubstringsOfMostCommonSubstring = 0;

    for (int count : substrCountList) {
        if (count > numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfOccurenciesOfMostCommonSubstrings = count;
            numberOfSubstringsOfMostCommonSubstring = 1;
        } else if (count == numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfSubstringsOfMostCommonSubstring++;
        }
    }

    return new int[] {
            numberOfUniqueSubstrings,
            numberOfOccurenciesOfMostCommonSubstrings,
            numberOfSubstringsOfMostCommonSubstring
    };
}