Java 搜索字符串中未知模式的最有效方法？_Java_Algorithm_Substring

Java 搜索字符串中未知模式的最有效方法？

java algorithm

Java 搜索字符串中未知模式的最有效方法？,java,algorithm,substring,Java,Algorithm,Substring,我试图找到以下模式：多次发生长度超过1个字符不是任何其他已知模式的子字符串不知道任何可能发生的模式例如：字符串“The boy Fall by The bell”将返回'ell'，b'，y' 字符串“The boy Fall by The bell，The boy Fall by The bell”将返回“The boy Fall by The bell” 使用double for循环，可以强制执行非常低效的： ArrayList<String> patterns

我试图找到以下模式：

出现不止一次
长度超过 1 个字符
不是任何其他已找到模式的子字符串

并且事先并不知道可能出现的任何模式。

例如：

字符串 "the boy fell by the bell" 将返回
```
'ell', 'the b', 'y '
```
字符串 "the boy fell by the bell, the boy fell by the bell" 将返回
```
'the boy fell by the bell'
```

使用双重 for 循环可以暴力求解，但效率非常低：

// Brute-force search for repeated patterns in `string` (a String expected to be in scope).
// A pattern is a substring that
//   * is at least 2 characters long,
//   * occurs again, without overlapping, later in the text, and
//   * is not a substring of any other collected pattern.
ArrayList<String> patternsList = new ArrayList<>();
int length = string.length();
for (int i = 0; i < length; i++) {
    // A candidate starting at i can span at most half of the remaining text;
    // otherwise a second, non-overlapping occurrence could not fit after it.
    int limit = (length - i) / 2;
    // Try the longest candidate first; lengths below 2 are never patterns.
    for (int j = limit; j >= 2; j--) {
        int candidateEndIndex = i + j;
        String candidate = string.substring(i, candidateEndIndex);
        // The candidate must show up again somewhere after its own end.
        if (!string.substring(candidateEndIndex).contains(candidate)) {
            continue;
        }
        // Reject the candidate if a known pattern already contains it.
        boolean notASubpattern = true;
        for (String pattern : patternsList) {
            if (pattern.contains(candidate)) {
                notASubpattern = false;
                break;
            }
        }
        if (!notASubpattern) {
            continue;
        }
        // Evict earlier patterns that the new, longer candidate contains, so the
        // result never holds a pattern that is a substring of another one.
        // Iterate backwards so removals do not shift the indices still to visit.
        for (int k = patternsList.size() - 1; k >= 0; k--) {
            if (candidate.contains(patternsList.get(k))) {
                patternsList.remove(k);
            }
        }
        patternsList.add(candidate);
    }
}

ArrayList patternsList=新的ArrayList（）； int length=string.length（）； for（int i=0；i=1；j--）{ int候选指数=i+j；字符串候选者=String.substring（i，candidatendIndex）；如果（candidate.length（）可以在线性时间内为字符串构建后缀树：您正在查找的模式是与只有叶子树的内部节点对应的字符串。您可以使用n-grams在字符串中查找模式。这需要O（n）扫描字符串以查找n-gram的时间到了。当您使用n-gram查找子字符串时，请将其放入哈希表中，并记录在字符串中找到该子字符串的次数。在字符串中搜索n-gram后，请在哈希表中搜索大于1的计数，以查找字符串中的重复模式 Dictionary<string, int>dict = new Dictionary<string, int>(); int count = 0; int ngramcount = 6; string substring = ""; // Add entries to the hash table while (count < str.length) { // copy the words into the substring int i = 0; substring = ""; while (ngramcount > 0 && count < str.length) { substring[i] = str[count]; if (str[i] == ' ') ngramcount--; i++; count++; } ngramcount = 6; substring.Trim(); // get rid of the last blank in the substring // Update the dictionary (hash table) with the substring if (dict.Contains(substring)) { // substring is already in hash table so increment the count int hashCount = dict[substring]; hashCount++; dict[substring] = hashCount; } else dict[substring] = 1; } // Find the most commonly occurrring pattern in the string // by searching the hash table for the greatest count. 
int maxCount = 0; string mostCommonPattern = ""; foreach (KeyValuePair<string, int> pair in dict) { if (pair.Value > maxCount) { maxCount = pair.Value; mostCommonPattern = pair.Key; } } 例如，在字符串“the boy Fall by the bell，the boy Fall by the bell”中，使用6克将找到子字符串“the boy Fall by the bell”。具有该子字符串的哈希表项的计数将为2，因为它在字符串中出现了两次。更改n-gram中的字数将有助于发现字符串中的不同模式 Dictionary<string, int>dict = new Dictionary<string, int>(); int count = 0; int ngramcount = 6; string substring = ""; // Add entries to the hash table while (count < str.length) { // copy the words into the substring int i = 0; substring = ""; while (ngramcount > 0 && count < str.length) { substring[i] = str[count]; if (str[i] == ' ') ngramcount--; i++; count++; } ngramcount = 6; substring.Trim(); // get rid of the last blank in the substring // Update the dictionary (hash table) with the substring if (dict.Contains(substring)) { // substring is already in hash table so increment the count int hashCount = dict[substring]; hashCount++; dict[substring] = hashCount; } else dict[substring] = 1; } // Find the most commonly occurrring pattern in the string // by searching the hash table for the greatest count. 
int maxCount = 0; string mostCommonPattern = ""; foreach (KeyValuePair<string, int> pair in dict) { if (pair.Value > maxCount) { maxCount = pair.Value; mostCommonPattern = pair.Key; } } Dictionarydict=newdictionary（）；整数计数=0； int ngramcount=6；字符串子字符串=”； //将条目添加到哈希表中 while（计数0&&countmaxCount）{ maxCount=pair.Value； mostCommonPattern=pair.Key； } } 我写这篇文章只是为了好玩。我希望我已经正确地理解了这个问题，这是有效的，而且速度足够快；如果没有，请对我放松：）如果有人发现它有用，我可能会对它进行进一步优化 private static IEnumerable<string> getPatterns(string txt) { char[] arr = txt.ToArray(); BitArray ba = new BitArray(arr.Length); for (int shingle = getMaxShingleSize(arr); shingle >= 2; shingle--) { char[] arr1 = new char[shingle]; int[] indexes = new int[shingle]; HashSet<int> hs = new HashSet<int>(); Dictionary<int, int[]> dic = new Dictionary<int, int[]>(); for (int i = 0, count = arr.Length - shingle; i <= count; i++) { for (int j = 0; j < shingle; j++) { int index = i + j; arr1[j] = arr[index]; indexes[j] = index; } int h = getHashCode(arr1); if (hs.Add(h)) { int[] indexes1 = new int[indexes.Length]; Buffer.BlockCopy(indexes, 0, indexes1, 0, indexes.Length * sizeof(int)); dic.Add(h, indexes1); } else { bool exists = false; foreach (int index in indexes) if (ba.Get(index)) { exists = true; break; } if (!exists) { int[] indexes1 = dic[h]; if (indexes1 != null) foreach (int index in indexes1) if (ba.Get(index)) { exists = true; break; } } if (!exists) { foreach (int index in indexes) ba.Set(index, true); int[] indexes1 = dic[h]; if (indexes1 != null) foreach (int index in indexes1) ba.Set(index, true); dic[h] = null; yield return new string(arr1); } } } } } private static int getMaxShingleSize(char[] arr) { for (int shingle = 2; shingle <= arr.Length / 2 + 1; shingle++) { char[] arr1 = new char[shingle]; HashSet<int> hs = new HashSet<int>(); bool noPattern = true; for (int i = 0, count = arr.Length - shingle; i <= count; i++) { for (int j = 0; j < shingle; j++) arr1[j] = arr[i + j]; int h = getHashCode(arr1); if (!hs.Add(h)) { noPattern = false; 
break; } } if (noPattern) return shingle - 1; } return -1; } private static int getHashCode(char[] arr) { unchecked { int hash = (int)2166136261; foreach (char c in arr) hash = (hash * 16777619) ^ c.GetHashCode(); return hash; } } private静态IEnumerable getPatterns（string txt） { char[]arr=txt.ToArray（）； BitArray ba=新的位数组（arr.Length）；对于（内部木瓦=getMaxShingleSize（arr）；木瓦>=2；木瓦--） { char[]arr1=新的char[shingle]； int[]索引=新的int[shingle]； HashSet hs=新的HashSet（）； Dictionary dic=新字典（）；对于（inti=0，count=arr.Length-shingle；i后缀数组是正确的想法，但缺少一个非常重要的部分，即识别文献中已知的“超最大重复”。这是一个GitHub repo，其工作代码为：。后缀数组构造使用SAIS库，作为子模块在中提供。超最大重复使用的是中findsmaxr 中的伪代码的更正版本我将使用（线性时间复杂度O（n））查找子字符串。我将尝试查找最大的子字符串模式，将其从输入字符串中删除，并尝试查找第二大的子字符串，依此类推。我将执行以下操作： string pattern = input.substring(0,lenght/2); string toMatchString = input.substring(pattern.length, input.lenght - 1); List<string> matches = new List<string>(); while(pattern.lenght > 0) { int index = KMP(pattern, toMatchString); if(index > 0) { matches.Add(pattern); // remove the matched pattern occurences from the input string // I would do something like this: // 0 to pattern.lenght gets removed // check for all occurences of pattern in toMatchString and remove them // get the remaing shrinked input, reassign values for pattern & toMatchString // keep looking for the next largest substring } else { pattern = input.substring(0, pattern.lenght - 1); toMatchString = input.substring(pattern.length, input.lenght - 1); } } string模式=输入。子字符串（0，长度/2）； string-toMatchString=input.substring（pattern.length，input.lenght-1）；列表匹配项=新列表（）； while（pattern.lenght>0） { int index=KMP（模式，toMatchString）；如果（索引>0） { 匹配。添加（模式）； //从输入字符串中删除匹配的模式出现 //我会这样做： //0到pattern.lenght被删除 //检查toMatchString中出现的所有图案，并将其删除 //获取重新生成的收缩输入，重新为pattern和toMatchString分配值 //继续查找下一个最大的子字符串 } 其他的 { pattern=input.substring（0，pattern.lenght-1）； toMatchString=input.substring（pattern.length，input.lenght-1）； } } 其中KMP 实现了Knuth-Morris-Pratt算法。您可以在或自己编写它的Java实现 
PS：我不使用Java编写代码，我的第一笔奖金很快就要结束了。因此，如果我遗漏了一些琐碎的东西或犯了+/-1错误，请不要给我棍子。从某种意义上说，这是一种压缩形式。你可以对各种压缩算法进行一些研究。为什么在你的第一个结果示例中单个空格不是一个元素@Björn，因为它只有一个字符长。当然/me会清理玻璃为什么“，”一个带空格的逗号不是secound结果示例的一部分？嗨@AlexQuilliam。我想知道你是否找到了一个好的解决方案。如果是这样，如果你能友好地添加代码，那将是非常好的。我很好奇respec的代码的性能和有效性 Davids-MBP:commonsub eisen$ ./repsub input ["\u000a" ," S" ," as " ," co" ," ide" ," in " ," li" ," n" ," p" ," the " ," us" ," ve" ," w" ,"\"" ,"–" ,"(" ,")" ,". " ,"0" ,"He" ,"Suffix array" ,"`" ,"a su" ,"at " ,"code" ,"com" ,"ct" ,"do" ,"e f" ,"ec" ,"ed " ,"ei" ,"ent" ,"ere's a " ,"find" ,"her" ,"https://" ,"ib" ,"ie" ,"ing " ,"ion " ,"is" ,"ith" ,"iv" ,"k" ,"mon" ,"na" ,"no" ,"nst" ,"ons" ,"or" ,"pdf" ,"ri" ,"s are " ,"se" ,"sing" ,"sub" ,"supermaximal repeats" ,"te" ,"ti" ,"tr" ,"ub " ,"uffix arrays" ,"via" ,"y, " ] string pattern = input.substring(0,lenght/2); string toMatchString = input.substring(pattern.length, input.lenght - 1); List<string> matches = new List<string>(); while(pattern.lenght > 0) { int index = KMP(pattern, toMatchString); if(index > 0) { matches.Add(pattern); // remove the matched pattern occurences from the input string // I would do something like this: // 0 to pattern.lenght gets removed // check for all occurences of pattern in toMatchString and remove them // get the remaing shrinked input, reassign values for pattern & toMatchString // keep looking for the next largest substring } else { pattern = input.substring(0, pattern.lenght - 1); toMatchString = input.substring(pattern.length, input.lenght - 1); } }