C 在大文件中搜索特定字符串_C_Search

C 在大文件中搜索特定字符串

c search

C 在大文件中搜索特定字符串,c,search,C,Search,我正在用C语言编写一个程序，它可以在一个大的.txt文件中搜索一个特定的字符串，然后对它进行计数，然后打印出来。但似乎出了什么问题，因为我的程序的输出与两个文本编辑器的输出不同。根据文本编辑器，总共有3000个单词，在本例中，我在.txt文件中搜索单词“make”。但是我的程序的输出只有2970 我无法找出我的程序的问题。所以我对文本编辑器如何能如此准确地搜索特定字符串感到好奇？人们如何实现这一点？有人能给我看一些C语言的代码吗说清楚一点：那是一个大的.txt文件，大约20米，包含很多字符。所

我正在用C语言编写一个程序，它可以在一个大的.txt文件中搜索一个特定的字符串，然后对它进行计数，然后打印出来。但似乎出了什么问题，因为我的程序的输出与两个文本编辑器的输出不同。根据文本编辑器，总共有3000个单词，在本例中，我在.txt文件中搜索单词“make”。但是我的程序的输出只有2970
我无法找出我的程序的问题。所以我对文本编辑器如何能如此准确地搜索特定字符串感到好奇？人们如何实现这一点？有人能给我看一些C语言的代码吗
说清楚一点：那是一个大的.txt文件，大约20米，包含很多字符。所以我觉得一下子把它读入记忆是不太好的。我已经实现了我的程序，将我的程序分割成若干部分，然后扫描所有这些部分进行解析。然而，它以某种方式失败了
也许我应该把代码放在这里。请稍等
代码有点长，大约70行。我已经把它放在我的github上了，如果你有兴趣，请帮忙。请注意，唯一相关的文件是wordCount.c和testfile.txt，如下所示：

#include<stdio.h> #include<stdlib.h> #include<stdbool.h> char arr[51]; int flag=0; int flag2=0; int flag3=0; int flag4=0; int pieceCount(FILE*); int main() { //the file in which I want to search the word is testfile.txt //I have formatted the file so that it contain no newlins any more FILE* fs=fopen("testfile.txt","r"); int n=pieceCount(fs); printf("%d\n",n); rewind(fs); //refresh the file... static bool endOfPiece1=false,endOfPiece2=false,endOfPiece3=false; bool begOfPiece1,begOfPiece2,begOfPiece3; for(int start=0;start<n;++start){ fgets(arr,sizeof(arr),fs); for(int i=0;i<=46;++i){ if((arr[i]=='M'||arr[i]=='m')&&(arr[i+1]=='A'||arr[i+1]=='a')&&(arr[i+2]=='K'||arr[i+2]=='k')&&(arr[i+3]=='E'||arr[i+3]=='e')){ flag+=1; //continue; } } //check the border begOfPiece1=((arr[1]=='e'||arr[1]=='E')); if(begOfPiece1==true&&endOfPiece1==true) flag2+=1; endOfPiece1=((arr[47]=='m'||arr[47]=='M')&&(arr[48]=='a'||arr[48]=='A')&&(arr[49]=='k'||arr[49]=='K')); begOfPiece2=((arr[1]=='k'||arr[1]=='K')&&(arr[2]=='e'||arr[2]=='E')); if(begOfPiece2==true&&endOfPiece2==true) flag3+=1; endOfPiece2=((arr[48]=='m'||arr[48]=='M')&&(arr[49]=='a'||arr[49]=='A')); begOfPiece3=((arr[1]=='a'||arr[1]=='A')&&(arr[2]=='k'||arr[2]=='K')&&(arr[3]=='e'||arr[3]=='E')); if(begOfPiece3==true&&endOfPiece3==true) flag4+=1; endOfPiece3=(arr[49]=='m'||arr[49]=='M'); } printf("%d\n%d\n%d\n%d\n",flag,flag2,flag3,flag4); getchar(); return 0; } //the function counts how many pieces have I split the file into int pieceCount(FILE* file){ static int count=0; char arr2[51]={'\0'}; while(fgets(arr2,sizeof(arr),file)){ count+=1; continue; } return count; }

#包括 #包括 #包括 char-arr[51]； int标志=0； int flag2=0； int flag3=0； int-flag4=0；整数计件计数（文件*）； int main（） { //我要在其中搜索单词的文件是testfile.txt //我已格式化该文件，使其不再包含newlin FILE*fs=fopen（“testfile.txt”、“r”）； int n=计件计数（fs）； printf（“%d\n”，n）；倒带（fs）；//刷新文件。。。静态bool endofpiec1=false，endofpiec2=false，endofpiec3=false； bool beginofpiec1，beginofpiec2，beginofpiec3；对于（int start=0；start，只需使用一个滚动缓冲区就可以做到这一点，不需要将文件分成多个部分 #include <stdio.h> #include <string.h> int main(void) { char buff [4]; // word buffer int count = 0; // occurrences FILE* fs=fopen("test.txt","r"); // open the file if (fs != NULL) { // if the file opened if (4 == fread(buff, 1, 4, fs)) { // fill the buffer do { // if it worked if (strnicmp(buff, "make", 4) == 0) // check for target word count++; // tally memmove(buff, buff+1, 3); // shift the buffer down } while (1 == fread(buff+3, 1, 1, fs)); // fill the last position } // end of file fclose(fs); // close the file } printf("%d\n", count); // report the result return 0; } #包括 #包括内部主（空）{ 字符缓冲区[4]；//字缓冲区 int count=0；//出现次数 FILE*fs=fopen（“test.txt”，“r”）；//打开文件如果（fs！=NULL）{//如果文件已打开如果（4==fread（buff，1，4，fs））{//填充缓冲区如果它有效的话，你会做什么 if（strnicmp（buff，“make”，4）==0）//检查目标字计数+++；//计数 memmove（buff，buff+1，3）；//向下移动缓冲区 }while（1==fread（buff+3,1,1,fs））；//填充最后一个位置 }//文件结尾 fclose（fs）；//关闭文件 } printf（“%d\n”，count）；//报告结果返回0； } 为了简单起见，我没有让搜索词“更软”，也没有分配正确的缓冲区和各种大小，因为这不在问题的范围内。我必须让OP做一些事情。你只需拥有一个滚动缓冲区就可以做到这一点。你不需要将文件分成多个部分 #include <stdio.h> #include <string.h> int main(void) { char buff [4]; // word buffer int count = 0; // occurrences FILE* fs=fopen("test.txt","r"); // open the file if (fs != NULL) { // if the file opened if (4 == fread(buff, 1, 4, fs)) { // fill the buffer do { // if it worked if (strnicmp(buff, "make", 4) == 0) // check for target word count++; // tally memmove(buff, buff+1, 3); // shift the buffer down } while (1 == fread(buff+3, 1, 1, fs)); // fill the last position } // end of file fclose(fs); // close the file } printf("%d\n", count); // report the result return 0; } #包括 #包括内部主（空）{ 字符缓冲区[4]；//字缓冲区 int count=0；//出现次数 FILE*fs=fopen（“test.txt”，“r”）；//打开文件如果（fs！=NULL）{//如果文件已打开如果（4==fread（buff，1，4，fs））{//填充缓冲区如果它有效的话，你会做什么 if（strnicmp（buff，“make”，4）==0）//检查目标字计数+++；//计数 memmove（buff，buff+1，3）；//向下移动缓冲区 }while（1==fread（buff+3,1,1,fs））；//填充最后一个位置 }//文件结尾 fclose（fs）；//关闭文件 } printf（“%d\n”，count）；//报告结果返回0； } 为了简单起见，我没有让搜索词“更柔和”分配正确的缓冲区和各种大小，因为这不是问题。我必须留给OP做一些事情。尝试较小的输入。找到程序输出与vim不同的最小输入。调试/尝试找出是什么弄乱了你的算法。修复它。开心点。看看你的测试txt文件在哪里被分割。Th在这里，我将查找丢失的30个事件。添加有关如何拆分文件的信息，这可能会有所帮助。生成一个只包含您想要查看的单词的文件。一个正在拆分，一个不包含，如果后者的输出错误->这是您的答案。这是一段非常出色的代码。您将文件拆分为50字节的片段d正在逐个字母检查硬编码单词“make” 是否存在，并试图跨越块边界。这是一个无处藏匿的问题。如果您的20字节输入文件“太大而无法读取”（在我的系统上，我有超过2字节可用），为什么你不一个字符一个字符地读取它，并保持一个4字节的当前缓冲区，长度为“make” ？除了框架和打开/关闭文件之外，它可以用10行代码来完成。char buff[4]；int count=0；if（4==fread（buff，1，4，fs））{do{if（strnicmp）（buff，“make”，4）==0）count++；memmove（buff，buff+1，3）；}而（1==fread（buff+3，1，1，fs））；}printf（“%d\n”，count）；文本编辑器通常将整个文件存储在内存中，不需要处理跨越任何k的字符串