C 如何搜索trie中有一个错误的字符串？_C_Algorithm_Time Complexity_Bioinformatics_Trie

C：如何在 trie 中搜索允许有一个字符不匹配的字符串？

c algorithm time-complexity

C 如何搜索trie中有一个错误的字符串？,c,algorithm,time-complexity,bioinformatics,trie,C,Algorithm,Time Complexity,Bioinformatics,Trie,给定：一些大型输入文件，其中包含长度相同的字符串k（通常为99个字符），由字符A、G、T、C组成任务：用C编写一种算法，该算法将搜索m>k（通常为110）个字符的字符串，这样，如果字符串的最后k-1个字符与另一个字符串的前k-1个字符相同，则应将它们连接起来。例如，如果我在集合{AGTC，GTCC}中搜索字符串AGTCC，它将给出一个正输出算法还应输出最佳精确前缀，以防搜索的字符串不在列表中此外，如果搜索字符串与当前字符串相差一个字符，则该算法还应给出正输出我能想到的最聪明的方法是将这个

给定：一些大型输入文件，其中包含若干长度均为 k（通常为 99 个字符）的字符串，这些字符串由字符 A、G、T、C 组成

任务：用 C 编写一种算法，搜索长度为 m>k（通常为 110）个字符的字符串。搜索时，如果列表中某个字符串的最后 k-1 个字符与另一个字符串的前 k-1 个字符相同，则应将这两个字符串视为首尾重叠地连接在一起。例如，如果我在集合

{AGTC，GTCC}

中搜索字符串

AGTCC

，它将给出一个正输出

算法还应输出最佳精确前缀，以防搜索的字符串不在列表中

此外，如果搜索字符串与列表中存在的字符串只相差一个字符，则该算法也应给出正输出

我能想到的最聪明的方法是将这个巨大的列表组织成一棵四叉树：每个节点包含一个由 4 个指向其他节点的指针组成的数组，这些指针最初都为空。当添加一个字符时，我按以下方式更改数组：如果输入是 A，我让数组的第 0 个位置指向一个数组全空的新节点；如果是 C，则使用第 1 个位置；如果是 G，则使用第 2 个位置；如果是 T，则使用第 3 个位置

例如，最初根是

{NULL，NULL，NULL，NULL}

。如果我添加的第一个字符是C，我将创建一个带有空数组的新节点N，然后我将根更改为

{NULL，N，NULL，NULL}

。我向我的教授解释了这一点，他说这是一种实际使用的数据结构，称为trie

我可以使用此结构执行搜索并在O（m）时间内找到最佳子字符串，只需跟随树，直到到达

{NULL，NULL，NULL，NULL}

节点

对于只有一个字符不匹配的情况，我能想到的最好方法是：在最佳前缀的每个位置上尝试每一种可能的替换字符，然后重新执行整个搜索，这样复杂度为 O（4m^2）。这还不错，但我希望能做到线性时间

是否有某种疯狂的算法可以在trie中执行非常有效的单一不匹配搜索？以下是我目前掌握的情况：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Length of every string stored in the trie (the insert and lookup loops run k steps).
#define k 99
// Length of the target string that is searched for.
#define m 110 
// Upper bound on the number of lines processed by each pass over the list file.
#define L 10000000 //Check variable in case the input file isn't properly formatted with EOF
// A trie node. base[i] is the node reached through the nucleotide whose
// hash() code is i, or NULL when there is no such node.
struct tree;
typedef struct tree node;
struct tree {
  node *base[4];
};
// Maps a nucleotide letter to its slot index in a node's base[] array:
// 'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3. 'N' shares the slot of 'T'.
// Any other character yields -1.
int hash(char r){
  switch (r) {
    case 'A':
      return 0;
    case 'C':
      return 1;
    case 'G':
      return 2;
    case 'T':
    case 'N': // 'N' is deliberately folded onto 'T'
      return 3;
    default:
      return -1;
  }
}

// Inverse of hash(): turns a slot index 0..3 back into 'A', 'C', 'G' or 'T'.
// Any index outside that range yields -1 (converted to char).
char invhash(int x){
  static const char letters[4] = {'A', 'C', 'G', 'T'};
  if (x < 0 || x > 3) {
    return -1;
  }
  return letters[x];
}

// Allocates a trie node whose four child slots are all NULL.
// Returns the new node; the caller owns it.
// On allocation failure an error is printed to stderr and the program exits,
// so the returned pointer is never NULL.
node* newnode(){
  node* new=malloc(sizeof *new);
  if (new==NULL){
    fprintf(stderr, "newnode: out of memory\n");
    exit(EXIT_FAILURE);
  }
  // The slots only need to be cleared. Allocating a node for each slot and
  // then overwriting the pointer with NULL would leak four nodes per call.
  for (int i=0; i<4; i++) {
    new->base[i]=NULL;
  }
  return new;
}

// Looks up a string of length k in the trie.
//   target: the characters to follow; only target[level..k-1] are read, so
//           no terminator is required.
//   s:      node to start from (the root for a full lookup).
//   level:  index of the first character of target still to be matched.
// Returns the node reached after matching up to index k-1 (the leaf of the
// string), or NULL if the string is not in the trie, contains a character
// that hash() rejects, or target/s is NULL.
node* trovak(char* target, node* s, int level){
  if (target==NULL) return NULL;
  while (s!=NULL && level<k){
    int h=hash(target[level]);
    // hash() returns -1 for non-nucleotide characters; indexing base[-1]
    // would read outside the array, so treat it as "not found".
    if (h<0) return NULL;
    s=s->base[h];
    level++;
  }
  return s;
}


// Adds the string kmero (its first k characters) to the trie rooted at s,
// creating any missing nodes along the path.
// If s or kmero is NULL, or any of the k characters is rejected by hash()
// (this includes a string shorter than k), nothing is inserted and the trie
// is left unchanged.
void newtree(node* s, char* kmero){
  if (s==NULL || kmero==NULL) return;
  // Validate the whole string first. A malformed input then never leaves a
  // partial path behind, and base[] is never indexed with -1.
  for (int l=0; l<k; l++){
    if (hash(kmero[l])<0) return;
  }
  for (int l=0; l<k; l++){
    int h=hash(kmero[l]);
    if (s->base[h]==NULL){
      s->base[h]=newnode();
    }
    s=s->base[h];
  }
}


// Follows the target string (m characters) through the trie.
//   target: the string being searched for.
//   s:      node to start from.
//   level:  index of the first character of target still to be matched.
// Returns m when every character up to index m-1 could be followed.
// Otherwise returns the index of the first character that could not be
// followed, i.e. the length of the longest matching prefix, and prints a
// diagnostic line to stdout. If target or s is NULL, level is returned
// unchanged.
int trova(char* target, node* s,int level){
  if (target==NULL || s==NULL) return level;
  while (level<m){
    int h=hash(target[level]);
    if (h<0){
      // A non-nucleotide character (e.g. the terminator of a target shorter
      // than m) can never match, and must not be used to index base[].
      printf("Invalid character (character number %d)\n", level);
      return level;
    }
    if (s->base[h]==NULL){
      printf("There isn't %c (character number %d)\n", invhash(h), level);
      return level;
    }
    s=s->base[h];
    level++;
  }
  return m;
}

// Frees the trie rooted at t, where t sits at the given depth.
// Nodes at depth k are leaves whose base[] slots hold links to other leaves
// rather than owned children, so those links are not followed.
static void free_trie(node* t, int depth){
  if (t==NULL) return;
  if (depth<k){
    for (int i=0; i<4; i++) free_trie(t->base[i], depth+1);
  }
  free(t);
}

// Reads the next well-formed line from in into buf (at least k+1 bytes).
// A well-formed line holds exactly k characters accepted by hash(), followed
// by an optional "\r" and a newline or end of file. Blank lines are skipped
// silently; other malformed lines are skipped with a warning on stderr.
// Returns 1 when buf holds a k-mer, 0 at end of file.
static int read_kmer(FILE* in, char* buf){
  while (fgets(buf, k+1, in)!=NULL){
    size_t len=strcspn(buf, "\r\n");
    int extra=0;
    if (strchr(buf, '\n')==NULL){
      // fgets stopped before the end of the line: discard the remainder so
      // the next call starts on a fresh line.
      int c;
      while ((c=fgetc(in))!=EOF && c!='\n'){
        if (c!='\r') extra=1;
      }
    }
    int valid=(len==(size_t)k && !extra);
    for (int l=0; valid && l<k; l++){
      if (hash(buf[l])<0) valid=0;
    }
    if (valid) return 1;
    if (len>0) fprintf(stderr, "Skipping malformed line in the list file\n");
  }
  return 0;
}

// Builds the trie from KAPPA105.txt, links every leaf to the leaves of the
// strings that overlap it by k-1 characters, then searches the target read
// from ricerca.txt.
// Exit status: 2 if the whole target was found, 0 if only a prefix was found
// (the best prefix is printed), 1 on an I/O or input-format error.
int main(){
  int status=1;
  int level=0; //Number of list lines processed in the current pass
  int pos=0;
  FILE* file=NULL; //List of strings file
  FILE* ricerca=NULL; //Target string file
  node* s=NULL; //Root of tree
  char target[m+1]; //Actual target string; +1 for the terminator written by fgets
  char kmero[k+1]; //Current line of the list; +1 for the terminator
  char SUCC_KMER[k]; //Candidate successor; read by trovak for exactly k chars, no terminator

  file=fopen("KAPPA105.txt", "r");
  if (file==NULL){
    perror("KAPPA105.txt");
    goto cleanup;
  }
  ricerca=fopen("ricerca.txt", "r");
  if (ricerca==NULL){
    perror("ricerca.txt");
    goto cleanup;
  }

  if (fgets(target, m+1, ricerca)==NULL){
    fprintf(stderr, "ricerca.txt: cannot read the target string\n");
    goto cleanup;
  }
  // hash() rejects the terminator too, so this also catches a short target
  // before anything reads past its end.
  for (int i=0; i<m; i++){
    if (hash(target[i])<0){
      fprintf(stderr, "ricerca.txt: the target must consist of %d characters among A, C, G, T, N\n", m);
      goto cleanup;
    }
  }

  s=newnode();
  if (s==NULL) goto cleanup;

  //Creating the tree
  while (level<L && read_kmer(file, kmero)){
    newtree(s, kmero);
    level++;
  }
  rewind(file);
  level=0;
  //The tree has always height k, so I attach all the leaves in order to do the concatenation thing
  while (level<L && read_kmer(file, kmero)){
    node* leaf=s;
    for (int l=0; l<k; l++) leaf=leaf->base[hash(kmero[l])];
    // A successor shares the last k-1 characters of this string and adds one more.
    memcpy(SUCC_KMER, kmero+1, k-1);
    for (int i=0; i<4; i++){
      SUCC_KMER[k-1]=invhash(i);
      leaf->base[i]=trovak(SUCC_KMER, s, 0); // NULL when that successor is not in the list
    }
    level++;
  }

  ////////////////////////////////////////////////////////////
  //Formatting thing
  for (int i=0; i<m; i++) putchar(target[i]);
  putchar('\n');
  for (int i=0; i<m; i++) putchar('-');
  putchar('\n');
  pos=trova(target, s, 0); //Perform the search
  if (pos==m){
    printf("The string is present in the list.\n");
    status=2;
  }
  else{
    printf("Best prefix (%d characters):\n", pos);
    for (int i=0; i<pos; i++) putchar(target[i]);
    putchar('\n');
    //TODO: the single-mismatch search is not implemented yet
    status=0;
  }

cleanup:
  free_trie(s, 0);
  if (ricerca!=NULL) fclose(ricerca);
  if (file!=NULL) fclose(file);
  return status;
}

#包括
#包括
#包括
#定义k99
#定义m 110
#定义L 10000000//检查变量，以防输入文件的EOF格式不正确
类型定义结构树{
结构树*base[4]；
}节点；
int散列（charr）{
如果（r=='A'）返回0；
如果（r=='C'）返回1；
如果（r=='G'）返回2；
if（r=='T'| r=='N'）返回3；//忽略N位
返回-1；
}
char invhash（int x）{
如果（x==0）返回'A'；
如果（x==1）返回'C'；
如果（x==2）返回'G'；
如果（x==3）返回'T'；
返回-1；
}
node*newnode（）{
node*new=malloc（sizeof（node））；
对于（inti=0；ibase[i]=malloc（sizeof（node））；
新建->基础[i]=NULL；
}
归还新的；
}
node*trovak（char*target，node*s，int-level）{//搜索长度为k的字符串，如果找到该字符串，则输出指向其叶的指针，否则为NULL
如果（level==k）返回s；
int h=散列（目标[级别]）；
如果（s->base[h]==NULL）{
返回NULL；
}
返回特鲁瓦克（目标，s->基础[h]，等级+1）；
}
void newtree（node*s，char*kmero）{//将字符串kmero添加到树s中
int-h；
对于（int l=0；lbase[h]==NULL）{
s->base[h]=newnode（）；
}
s=s->base[h]；
}
返回；
}
int-trova（char*target，node*s，int-level）{//这将搜索树中的整个目标字符串
如果（level==m）返回m；
int h=散列（目标[级别]）；
如果（s->base[h]==NULL）{
printf（“没有%c（字符号%d）\n”，invhash（h），级别）；
回报水平；
}
返回trova（目标，s->基础[h]，等级+1）；
}
int main（）{
FILE*FILE=fopen（“KAPPA105.txt”，“r”）；//字符串列表文件
FILE*ricerca=fopen（“ricerca.txt”，“r”）；//目标字符串文件
node*s=newnode（）；//树的根
int level=0；//与L相同，不用担心
char*target=malloc（m）；//实际目标字符串
char*kmero=malloc（k）；
node*leaf=malloc（sizeof（node））；//用于构建树，不用担心
node*linksucc=malloc（sizeof（node））；//相同
目标=fgets（目标，m+1，ricerca）；
char*suc_KMER=malloc（k）；
//创建树
做{
kmero=fgets（kmero，k+1，文件）；
纽特里(s,kmero);；
级别++；
}while（fgetc（file）！=EOF&&level注意：}while（fgetc（file）！=EOF&&level我知道，由于输入的格式，它是一个“\n”。程序工作正常，只是我不知道如何巧妙地做下一位。fgets（）
也会消耗一个“\n”。有核苷酸的那一行后面有多余的空行吗？实际上没有，但我刚才试着让它打印它读取的字符串，它们是正确的#define k 99