Java-具有大量数据的字符串频率_Java_Memory_Substring

Java-具有大量数据的字符串频率

java memory

Java-具有大量数据的字符串频率,java,memory,substring,Java,Memory,Substring,我试图找到一个巨大字符串中最长子字符串的频率 'Huge string' can be up to 2M characters long, only a-z 'Substring' may be between 100k to 2M characters long 'Substring' is always same or smaller size than 'Huge string' 目前，我正在使用我创建的以下方法： public static int[] countSubstri

我试图找到一个巨大字符串中最长子字符串的频率

'Huge string' can be up to 2M characters long, only a-z
'Substring' may be between 100k to 2M characters long
'Substring' is always same or smaller size than 'Huge string'

目前，我正在使用我创建的以下方法：

    public static int[] countSubstringOccurence(String input, int substringLength) {
      // input = from 100 000 to 2 000 000 alphanumeric characters long string;
      // substringLength = from 100 000 to 2 000 000, always smaller than input

      LinkedHashMap < String, Integer > substringOccurence = new LinkedHashMap < > ();
      int l;

      for (int i = 0; i < (input.length() - substringLength) + 1; i++) {
          String substring = input.substring(i, i + substringLength);
          if (substringOccurence.containsKey(substring)) {
              l = substringOccurence.get(substring);
              substringOccurence.put(substring, ++l);
          } else {
              substringOccurence.put(substring, 1);
          }
      }


      List < Integer > substringOccurenceList = new ArrayList < > (substringOccurence.values());
      int numberOfUniqueSubstrings = substringOccurenceList.size();
      int numberOfOccurenciesOfMostCommonSubstrings = 0;
      int numberOfSubstringsOfMostCommonSubstring = 0;

      for (int i: substringOccurenceList) {
          if (i > numberOfOccurenciesOfMostCommonSubstrings) {
              numberOfOccurenciesOfMostCommonSubstrings = i;
              numberOfSubstringsOfMostCommonSubstring = 1;
          } else if (i == numberOfOccurenciesOfMostCommonSubstrings) {
              numberOfSubstringsOfMostCommonSubstring++;
          }
      }

      return new int[] {
          numberOfUniqueSubstrings,
          numberOfOccurenciesOfMostCommonSubstrings,
          numberOfSubstringsOfMostCommonSubstring
      };
  }

我将代码更改为：

public static int[] countSubstringOccurence(String text, int substringLength) {
    int textLength = text.length();

    int numberOfUniqueSubstrings = 0;
    List<Integer> substrIndexes = new ArrayList<>();

    for (int i = 0; i < (textLength - substringLength) + 1; i++) {
        boolean doesNotExists = true;
        for (int j = i + 1; j < (textLength - substringLength) + 1; j++) {
            String actualSubstr = text.substring(i, i + substringLength);
            String indexSubstr = text.substring(j, j + substringLength);
            if (actualSubstr.equals(indexSubstr)) {
                doesNotExists = false;
                substrIndexes.add(j);
            }
        }

        if (doesNotExists) {
            numberOfUniqueSubstrings++;
            substrIndexes.add(i);
        }
    }

    LinkedHashMap<Integer, Integer> substrCountMap = new LinkedHashMap<>();

    for (int i : substrIndexes) {
        String substr = text.substring(i, i + substringLength);

        int lastIndex = 0;
        int count = 0;

        while (lastIndex != -1) {
            lastIndex = text.indexOf(substr, lastIndex);

            if (lastIndex != -1) {
                count++;
                lastIndex += substr.length();
            }
        }

        substrCountMap.put(i, count);
    }

    List<Integer> substrCountList = new ArrayList<>(substrCountMap.values());

    int numberOfOccurenciesOfMostCommonSubstrings = 0;
    int numberOfSubstringsOfMostCommonSubstring = 0;

    for (int count : substrCountList) {
        if (count > numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfOccurenciesOfMostCommonSubstrings = count;
            numberOfSubstringsOfMostCommonSubstring = 1;
        } else if (count == numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfSubstringsOfMostCommonSubstring++;
        }
    }

    return new int[] {
            numberOfUniqueSubstrings,
            numberOfOccurenciesOfMostCommonSubstrings,
            numberOfSubstringsOfMostCommonSubstring
    };
}

public static int[]countsubstringoccurrence（字符串文本，int substringLength）{
int textLength=text.length（）；
int numberofuniquesubstring=0；
List substrindex=new ArrayList（）；
对于（int i=0；i<（textLength-substringLength）+1；i++）{
布尔doesNotExists=true；
对于（int j=i+1；j<（textLength-substringLength）+1；j++）{
String actualSubstr=text.substring（i，i+substringLength）；
String indexSubstr=text.substring（j，j+substringLength）；
if（实际substr.等于（indexSubstr））{
doesNotExists=假；
子索引。添加（j）；
}
}
如果（无纺织工人）{
numberofuniquesubstring++；
子索引。添加（i）；
}
}
LinkedHashMap substrCountMap=新LinkedHashMap（）；
for（int i：子索引）{
String substr=text.substring（i，i+substringLength）；
int lastIndex=0；
整数计数=0；
while（lastIndex！=-1）{
lastIndex=text.indexOf（substr，lastIndex）；
如果（lastIndex！=-1）{
计数++；
lastIndex+=子字符串长度（）；
}
}
substrCountMap.put（i，count）；
}
List substrCountList=newarraylist（substrCountMap.values（））；
int numberOfOccurenceSostCommonSubstrings=0；
int numberOfSubstringsOfMostCommonSubstring=0；
用于（整数计数：子计数列表）{
如果（计数>发生次数ESOFMOSTCOMONSUBSTRINGS）{
numberOfOccurenceSostCommonSubstrings=计数；
numberOfSubstringsOfMostCommonSubstring=1；
}else if（count==numberOfOccurrenciesOfMostCommonSubstrings）{
numberOfSubstringsOfMostCommonSubstring++；
}
}
返回新的int[]{
唯一子串的数目，
OccurrenciesOfMostCommonSubstring的数量，
最常见子串的子串数
};
}

这段代码没有崩溃，只是非常非常慢（我想至少是O（2n^2））。有人能想出一个更快的方法吗

如果它能在1GB内存和15分钟内安装在相当于i3-3xxx的CPU上，那就太棒了。今天我已经做完了。

只需使用

StringTokenizer

类并提取每个单词。然后将每个单词存储在字符串数组中，字符串类型的大小由方法

.countTokens（）给定
然后，您可以轻松地计算给定单词的频率，并在Java6上运行它。不是开玩笑
Java 6子字符串不复制字符，只复制引用、索引和长度。为什么需要将找到的整个字符串存储在内存中？您是否可以将子字符串的开始和结束存储为整数（与频率一起），然后在需要时使用该数据取出这些子字符串？您是否通过探查器运行过此操作？它告诉了你什么？你的记忆、空间和时间限制是什么？您根本无法存储子字符串，而是只需获取第一个子字符串，迭代字符串以了解它发生的频率，然后移动到下一个子字符串，等等。然后，您应该实现一种更好的方法来检查子字符串是否包含-不要获取子字符串并比较结果，但是试着看看这个字符串是否包含从给定索引开始的子字符串。这里的用例是什么？目前我唯一的建议是保存第一次出现的索引，而不是在LinkedHashMap中保存子字符串。但如果没有进一步的信息，就不可能找到“最佳解决方案”，因为实际的设计缺陷可能存在于代码的周围部分（即依赖此方法的部分）。请查看Knuth-Morris-Pratt算法。使用它查找i=0到input.length（）-substringLength的input.substring（i，i+substringLength）的所有匹配项。这不会像O（n²）那么糟糕，因为KMP会快速跳过无希望的部分。此外，只需比较所有元素，您就会得到最长的子字符串，您可以通过从数组中提取该单词来计算每个字母的频率，而且速度更快、内存效率更高、抗OOM？你需要提供更多关于这个答案的详细信息。不确定是否更快，但是的，这个人面临的问题将得到解决。请尝试详细回答为什么你的建议有用，以及它是如何帮助的。正如原作者所问，这是一个答案，如何使这个问题更快。用Java6运行它可以做到这一点。再说一遍：不是开玩笑。
public static int[] countSubstringOccurence(String text, int substringLength) {
    int textLength = text.length();

    int numberOfUniqueSubstrings = 0;
    List<Integer> substrIndexes = new ArrayList<>();

    for (int i = 0; i < (textLength - substringLength) + 1; i++) {
        boolean doesNotExists = true;
        for (int j = i + 1; j < (textLength - substringLength) + 1; j++) {
            String actualSubstr = text.substring(i, i + substringLength);
            String indexSubstr = text.substring(j, j + substringLength);
            if (actualSubstr.equals(indexSubstr)) {
                doesNotExists = false;
                substrIndexes.add(j);
            }
        }

        if (doesNotExists) {
            numberOfUniqueSubstrings++;
            substrIndexes.add(i);
        }
    }

    LinkedHashMap<Integer, Integer> substrCountMap = new LinkedHashMap<>();

    for (int i : substrIndexes) {
        String substr = text.substring(i, i + substringLength);

        int lastIndex = 0;
        int count = 0;

        while (lastIndex != -1) {
            lastIndex = text.indexOf(substr, lastIndex);

            if (lastIndex != -1) {
                count++;
                lastIndex += substr.length();
            }
        }

        substrCountMap.put(i, count);
    }

    List<Integer> substrCountList = new ArrayList<>(substrCountMap.values());

    int numberOfOccurenciesOfMostCommonSubstrings = 0;
    int numberOfSubstringsOfMostCommonSubstring = 0;

    for (int count : substrCountList) {
        if (count > numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfOccurenciesOfMostCommonSubstrings = count;
            numberOfSubstringsOfMostCommonSubstring = 1;
        } else if (count == numberOfOccurenciesOfMostCommonSubstrings) {
            numberOfSubstringsOfMostCommonSubstring++;
        }
    }

    return new int[] {
            numberOfUniqueSubstrings,
            numberOfOccurenciesOfMostCommonSubstrings,
            numberOfSubstringsOfMostCommonSubstring
    };
}