Arrays 如何在给定数组中查找重复的字符序列？_Arrays_Algorithm_Language Agnostic

Arrays 如何在给定数组中查找重复的字符序列？

arrays algorithm language-agnostic

Arrays 如何在给定数组中查找重复的字符序列？,arrays,algorithm,language-agnostic,Arrays,Algorithm,Language Agnostic,我的问题是找到给定数组中的重复字符序列。简单地说，识别字符出现的模式 ——————。 1:| J | A | M | E | S | O | N | J | A | M | E | S | O | N| —————————————————————————————————————————— ——————————。 2:| R | O | N | R | O | N | R | O | N | R | O | N| ——————————————————————————————————————————

我的问题是找到给定数组中的重复字符序列。简单地说，识别字符出现的模式

——————。
1:| J | A | M | E | S | O | N | J | A | M | E | S | O | N|
——————————————————————————————————————————

——————————。
2:| R | O | N | R | O | N | R | O | N | R | O | N|
————————————————————————————————————————————

————。
3:| S | H | A | M | I | L | S | H | A | M | I | L|
————————————————————————————————————————————

——————————。
4:| C | A | R | P | E | N | T | E | R | C | A | R | P | E | N | T | E | R|
————————————————————————————————————

例子
根据之前的数据，结果应为：
“JAMESON”
“RON”
“SHAMIL”
“CARPENTER”

问题:

如何有效地处理这个问题
举个例子，我的第一个方法是
获取数组的第一个字符（对于上一个示例，它将是C
）
获取该字符在数组中下一次出现的索引（例如9）
如果找到，则在字符的两个外观之间搜索子字符串的下一个外观（在本例中为CARPENTER
）
如果找到了，就完成了（结果就是这个子字符串）
当然，这只适用于可能数组的一个非常有限的子集，其中同一个单词从一开始就重复了一次又一次，中间没有散乱的字符，并且它的第一个字符在单词中不重复。但你们所有的例子都属于这一类——我更喜欢可能有效的最简单的解决方案：——）
如果重复的单词多次包含第一个字符（例如，CACTUS
），则该算法可以扩展以查找该字符的后续出现，而不仅仅是第一个字符（以便它查找整个重复单词，而不仅仅是其子串）
请注意，对于第二个示例，此扩展算法将给出不同的结果，即RONRON
，而不是RON
伪代码
// Find the shortest prefix of str whose repetition reproduces str exactly.
len = str.length
// Candidate unit lengths are tried from shortest to longest, so the first hit
// is the shortest unit.
for (i in 1..len) {
   // Only a length that divides len evenly can tile the whole string.
   if (len%i==0) {
      // "repeat" is an invented helper: repeat(k) concatenates k copies.
      // For i == len this compares str with itself, so a string without any
      // repetition is returned unchanged.
      if (str==str.substr(0,i).repeat(len/i)) {
         return str.substr(0,i)
      }
   }
}

注意：为了简洁起见，我虚构了一个字符串的“repeat”方法，它实际上并不是Java String的一部分；例如 "abc".repeat(2) = "abcabc"（在Python中可以写成 "abc" * 2）。
可以利用正则表达式，因此：
def recurrence(text):
    """Return the shortest unit that, repeated two or more times, forms ``text``.

    Candidate unit lengths are tried from 1 up to half the length of ``text``,
    so the first match is the shortest unit.

    :param text: the string to analyse.
    :return: the repeating unit, or ``None`` when ``text`` is not a whole
        number (>= 2) of repetitions of any unit.
    """
    import re
    # Floor division keeps the bound an int; with "/" Python 3 produces a
    # float and range() raises TypeError.
    for i in range(1, len(text) // 2 + 1):
        # Group 1 is the first i characters; \1+ requires the rest of the
        # string to consist of one or more further copies of that group.
        m = re.match(r'^(.{%d})\1+$' % i, text)
        if m:
            return m.group(1)
    return None

recurrence('abcabc') # Returns 'abc'

我不确定这将如何转换为Java或C。（我想这是我喜欢Python的原因之一。:-)）
使用C++:
// Splits a string into consecutive fragments of the given size.
//
// The string is cut at offsets 0, frag, 2*frag, ...; when its length is not a
// multiple of frag the last fragment is shorter than the others.
//
// s    - the string to split
// frag - fragment length in characters
//
// Returns the set of distinct fragments. A set with exactly one element means
// every fragment is identical. The set is empty when s is empty or when frag
// is not positive.
std::set<std::string> split(const std::string& s, int frag)
{
    std::set<std::string> uni;

    // A zero step would never advance the offset (endless loop) and a
    // negative one would run it backwards, so reject both up front.
    if (frag <= 0)
    {
        return uni;
    }

    const std::size_t step = static_cast<std::size_t>(frag);
    for (std::size_t i = 0; i < s.size(); i += step)
    {
        uni.insert(s.substr(i, step));
    }

    return uni;
}

// Prints the longest unit whose back-to-back repetition forms the sample
// string, or nothing when the string is not a repetition of any unit.
int main()
{
    const std::string s = "carpentercarpenter";
    std::string out;

    // Optimistic approach: start with the longest possible unit (half the
    // string, i.e. exactly two repetitions) and shrink it one character at a
    // time. The search goes all the way down to a unit of one character so
    // that inputs such as "aa" or "aaaa" are handled as well.
    // Because the longest unit wins, "RONRONRONRON" yields "RONRON".
    for (int i = static_cast<int>(s.length()) / 2; i >= 1; --i)
    {
        // A single distinct fragment means every fragment of length i is the
        // same string, so the whole input is that fragment repeated.
        const std::set<std::string> uni = split(s, i);
        if (uni.size() == 1)
        {
            out = *uni.begin();
            break;
        }
    }

    std::cout << out;
    return 0;
}

//将字符串拆分为给定大小的片段
//返回可用的拆分字符串集
设置拆分（字符串s，整数帧）
{
设置uni；
int len=s.length（）；
对于（int i=0；i1；--i）
{
设置uni=拆分（s，i）；
如果（单位大小（）==1）
{
out=*uni.begin（）；
打破
}
}
cout<<out;
首先编写一个方法，在容器字符串中查找重复的子字符串 sub，如下所示：
boolean findSubRepeating(String sub, String container);

现在不断增大子字符串的长度来调用此方法：先尝试 1 个字符的子字符串，然后是 2 个字符，以此类推，直到 container.length/2
我想到的第一个想法是尝试所有长度能整除字符串长度 N 的重复序列。此类长度的最大值为 N/2，因此这将导致 O（N^2）算法
但我相信它可以改进…
开玩笑的O（NlogN）解决方案
对字符串执行FFT（将字符视为数值）。结果图中的每个峰值对应于子字符串周期。
下面是一个具体的工作示例：
/*
 * Find the greatest repeated substring: the longest run of characters in s
 * that occurs a second time later in s without overlapping its first
 * occurrence.
 *
 * s - NUL-terminated string to search (NULL is treated as "nothing found")
 * l - receives the length of the run; set to 0 when nothing repeats
 *
 * Returns a pointer to the first occurrence of the run inside s, or NULL when
 * no character of s occurs twice. On ties the earliest start position wins.
 * The result points into s; it is typed char * only to keep the original
 * signature and must not be written through when s is read-only.
 */
char *fgrs(const char *s,size_t *l)
{
  const char *r=0;
  const char *a;
  *l=0;
  if( !s )
    return 0;
  for( a=s; *a; ++a )
  {
    /* last later occurrence of the starting character */
    const char *e=strrchr(a+1,*a);
    /* This character never recurs: move on to the next start position
       instead of abandoning the whole search. */
    if( !e )
      continue;
    /* Walk every later occurrence of *a, from the last one back towards a. */
    for( ; e!=a; --e )
    {
      size_t t;
      if( *e!=*a )
        continue;
      /* Extend the match while the characters agree and the first copy
         has not run into the start of the second one. */
      for( t=1; &a[t]!=e && a[t]==e[t]; ++t );
      if( t>*l )
      {
        *l=t;
        r=a;
      }
    }
  }
  return (char *)r;
}

  /* Demo driver: for each sample string, call fgrs and print the t
     characters starting at the returned pointer. */
  size_t t;
  const char *p;
  p=fgrs("BARBARABARBARABARBARA",&t);
  /* putchar writes no separator, so successive results are printed back to
     back on the same line. */
  while( t-- ) putchar(*p++);
  p=fgrs("0123456789",&t);
  /* When fgrs reports a length of 0 the loop body never runs, so the
     returned pointer is not dereferenced. */
  while( t-- ) putchar(*p++);
  p=fgrs("1111",&t);
  while( t-- ) putchar(*p++);
  p=fgrs("11111",&t);
  while( t-- ) putchar(*p++);

我会将数组转换为字符串对象并使用regex，但不确定如何定义“高效”。为了便于/快速实现，您可以在Java中执行此操作：
    /**
     * Finds the shortest unit whose repetition makes up the whole text.
     *
     * The reluctant group {@code (.+?)} prefers the shortest candidate and
     * the back-reference {@code \1+} demands at least one further copy;
     * {@code matches()} only succeeds when the entire input is consumed.
     *
     * @param text the text to analyse
     * @return the repeating unit, or {@code null} when the text is not a
     *         repetition of any unit
     */
    private static String findSequence(String text) {
        final Matcher m = Pattern.compile("(.+?)\\1+").matcher(text);
        if (!m.matches()) {
            return null;
        }
        return m.group(1);
    }

它试图找到最短的字符串（+？
），该字符串必须至少重复一次（\1+
）才能匹配整个输入文本。
将所有字符放入数组e.x.a[]
// Sketch only: i counts consecutive matching positions, j is the number of
// shifts rejected so far.
i=0; j=0;
// NOTE(review): the loop header is not valid C; presumably it means "repeat
// while i stays below count", with count the number of elements - confirm.
for( 0 < i < count ) 
{
// Compare position i with the position j+1 places further on.
if (a[i] == a[i+j+1])
    {++i;}
else
    // Mismatch: try the next larger shift and restart from the beginning.
    {++j;i=0;}
}

i=0；j=0；
对于（0

然后（i/j）的比率=数组中的重复计数。
您必须注意i
和j
的限制，但这是一个简单的解决方案。
这是一个更一般的问题解决方案，它将在一个序列（任何序列）中找到重复的子序列，其中子序列不必从开头开始，也不必立即彼此跟随
给定一个序列b[0..n]，包含所讨论的数据，阈值t是要查找的最小子序列长度
// Scan every pair of start positions (i, j) with j at least t places after i
// and measure how many consecutive elements of b agree from both positions.
// l_max / i_max / j_max remember the longest match seen so far.
l_max = 0, i_max = 0, j_max = 0;
// NOTE(review): both loop bounds use "<", so a second occurrence that ends
// exactly at position n is never examined - confirm whether "<=" was meant.
for (i=0; i<n-(t*2);i++) {
  for (j=i+t;j<n-t; j++) {
    l=0;
    // i+l<j keeps the first occurrence from running into the second one;
    // j+l<n keeps the second occurrence inside the data.
    while (i+l<j && j+l<n && b[i+l] == b[j+l])
      l++;
    // NOTE(review): only matches strictly longer than t are reported, while
    // the accompanying text calls t the minimum length - confirm ">" vs ">=".
    if (l>t) {
      // NOTE(review): the ")" before ";" on the next line has no matching "(".
      print "Sequence of length " + l + " found at " + i + " and " + j);
      if (l>l_max) {
        l_max = l;
        i_max = i;
        j_max = j;
      }
    }
  }
}
// The message says "subsequence", but the comparison above only matches
// contiguous elements.
if (l_max>t) {
  print "longest common subsequence found at " + i_max + " and " + j_max + " (" + l_max + " long)";
}

l_max=0，i_max=0，j_max=0；
对于（i=0；i我自己刚刚解决了这个问题，并为此编写了一些代码（用C#编写），其中有很多注释。希望这对某人有所帮助：
//检查字符串是否包含重复序列。
公共静态bool包含重复序列（string str）
{
if（string.IsNullOrEmpty（str））返回false；
对于（inti=0；i这是我使用队列提出的解决方案，它通过了codeforces中类似问题的所有测试用例。问题号是745A
#include<bits/stdc++.h>
using namespace std;
typedef long long ll;

// Reads one word from standard input, builds a rotating window over its
// characters and prints either the window length or the length of the
// (possibly extended) word.
int main()
{
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);

    string s;
    cin >> s;

    // When both halves of the word are identical, append one more copy of
    // that half before scanning.
    const string s1 = s.substr(0, s.size() / 2);
    const string s2 = s.substr(s.size() / 2);
    if (s1 == s2)
    {
        s += s1;
    }

    // Feed the word through the queue: a character equal to the current
    // front evicts the front before being appended at the back.
    queue<char> window;
    window.push(s[0]);
    for (size_t pos = 1; pos < s.size(); ++pos)
    {
        const char ch = s[pos];
        if (window.front() == ch)
        {
            window.pop();
        }
        window.push(ch);
    }
    const int cycle = static_cast<int>(window.size());

    // Drain the queue while checking it against the leading characters of
    // the word; stop at the first difference.
    bool matches_prefix = true;
    for (size_t pos = 0; !window.empty(); ++pos, window.pop())
    {
        if (s[pos] != window.front())
        {
            matches_prefix = false;
            break;
        }
    }

    if (matches_prefix)
    {
        cout << cycle;
    }
    else
    {
        cout << s.size();
    }
    return 0;
}

#包括
使用名称空间std；
typedef长
    /**
     * Finds the shortest unit whose repetition makes up the whole text.
     *
     * The reluctant group {@code (.+?)} prefers the shortest candidate and
     * the back-reference {@code \1+} demands at least one further copy;
     * {@code matches()} only succeeds when the entire input is consumed.
     *
     * @param text the text to analyse
     * @return the repeating unit, or {@code null} when the text is not a
     *         repetition of any unit
     */
    private static String findSequence(String text) {
        final Matcher m = Pattern.compile("(.+?)\\1+").matcher(text);
        if (!m.matches()) {
            return null;
        }
        return m.group(1);
    }

// Sketch only: i counts consecutive matching positions, j is the number of
// shifts rejected so far.
i=0; j=0;
// NOTE(review): the loop header is not valid C; presumably it means "repeat
// while i stays below count", with count the number of elements - confirm.
for( 0 < i < count ) 
{
// Compare position i with the position j+1 places further on.
if (a[i] == a[i+j+1])
    {++i;}
else
    // Mismatch: try the next larger shift and restart from the beginning.
    {++j;i=0;}
}

// Scan every pair of start positions (i, j) with j at least t places after i
// and measure how many consecutive elements of b agree from both positions.
// l_max / i_max / j_max remember the longest match seen so far.
l_max = 0, i_max = 0, j_max = 0;
// NOTE(review): both loop bounds use "<", so a second occurrence that ends
// exactly at position n is never examined - confirm whether "<=" was meant.
for (i=0; i<n-(t*2);i++) {
  for (j=i+t;j<n-t; j++) {
    l=0;
    // i+l<j keeps the first occurrence from running into the second one;
    // j+l<n keeps the second occurrence inside the data.
    while (i+l<j && j+l<n && b[i+l] == b[j+l])
      l++;
    // NOTE(review): only matches strictly longer than t are reported -
    // confirm whether ">=" was intended if t is a minimum length.
    if (l>t) {
      // NOTE(review): the ")" before ";" on the next line has no matching "(".
      print "Sequence of length " + l + " found at " + i + " and " + j);
      if (l>l_max) {
        l_max = l;
        i_max = i;
        j_max = j;
      }
    }
  }
}
// The message says "subsequence", but the comparison above only matches
// contiguous elements.
if (l_max>t) {
  print "longest common subsequence found at " + i_max + " and " + j_max + " (" + l_max + " long)";
}

#include<bits/stdc++.h>
using namespace std;
typedef long long ll;

// Reads one word from standard input, builds a rotating window over its
// characters and prints either the window length or the length of the
// (possibly extended) word.
int main()
{
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);

    string s;
    cin >> s;

    // When both halves of the word are identical, append one more copy of
    // that half before scanning.
    const string s1 = s.substr(0, s.size() / 2);
    const string s2 = s.substr(s.size() / 2);
    if (s1 == s2)
    {
        s += s1;
    }

    // Feed the word through the queue: a character equal to the current
    // front evicts the front before being appended at the back.
    queue<char> window;
    window.push(s[0]);
    for (size_t pos = 1; pos < s.size(); ++pos)
    {
        const char ch = s[pos];
        if (window.front() == ch)
        {
            window.pop();
        }
        window.push(ch);
    }
    const int cycle = static_cast<int>(window.size());

    // Drain the queue while checking it against the leading characters of
    // the word; stop at the first difference.
    bool matches_prefix = true;
    for (size_t pos = 0; !window.empty(); ++pos, window.pop())
    {
        if (s[pos] != window.front())
        {
            matches_prefix = false;
            break;
        }
    }

    if (matches_prefix)
    {
        cout << cycle;
    }
    else
    {
        cout << s.size();
    }
    return 0;
}