Html 如何在C中查找所有出现的子字符串_Html_C_Algorithm_Parsing

Html 如何在C中查找所有出现的子字符串

html c algorithm parsing

Html 如何在C中查找所有出现的子字符串,html,c,algorithm,parsing,Html,C,Algorithm,Parsing,我试图用C语言编写一个解析程序，从HTML文档中提取某些文本片段。为此，我需要在文档中找到子字符串“name”的每个实例；但是，C函数strstr只查找子字符串的第一个实例。我找不到一个函数可以在第一个实例之外找到任何东西，我已经考虑过在找到每个子字符串之后删除它，以便strstr将返回下一个子字符串。我无法使这两种方法都起作用顺便说一句，我知道while循环将此限制为六次迭代，但我只是测试一下，看看是否可以首先让函数工作 while(entry_count < 6) {

我试图用C语言编写一个解析程序，从HTML文档中提取某些文本片段。为此，我需要在文档中找到子字符串“name”的每个实例；但是，C函数strstr只查找子字符串的第一个实例。我找不到一个函数可以在第一个实例之外找到任何东西，我已经考虑过在找到每个子字符串之后删除它，以便strstr将返回下一个子字符串。我无法使这两种方法都起作用

顺便说一句，我知道while循环将此限制为六次迭代，但我只是测试一下，看看是否可以首先让函数工作

/* Test loop from the question: tries to print the text that follows each
 * "name": key in buffer, capped at six entries while testing. */
while(entry_count < 6)
{   
    printf("test");
    /* The search always starts at the beginning of buffer, so every pass
     * looks for the first occurrence again. */
    if((ptr = strstr(buffer, "\"name\":")) != NULL)
    {   
        /* The key "name": is 7 characters; 8 skips one more character,
         * presumably the value's opening quote - confirm against the input. */
        ptr += 8;
        int i = 0;
        /* Copy up to the next double quote. There is no bound on
         * company_name and no check for the end of the string. */
        while(*ptr != '\"')
        {   
            company_name[i] = *ptr;
            ptr++;
            i++;
        }   
        /* A newline, not '\0', marks the end of the copied name. */
        company_name[i] = '\n';
        int j;
        for(j = 0; company_name[j] != '\n'; j++)
            printf("%c", company_name[j]);
        printf("\n");
        /* NOTE(review): strtok treats its second argument as a set of
         * delimiter characters, not as a substring, and may write a '\0'
         * into buffer; it does not remove the matched text. */
        strtok(buffer, "\"name\":");
        entry_count++;
    }   
    /* entry_count only changes inside the if, so the loop never ends once
     * strstr returns NULL. */
}

while(entry_count < 6)
{
    printf("test");
    if((ptr = strstr(buffer, "\"name\":")) != NULL)
    {
        ptr += 8;
        int i = 0;
        while(*ptr != '\"')
        {
            company_name[i] = *ptr;
            ptr++;
            i++;
        }
        company_name[i] = '\n';
        int j;
        for(j = 0; company_name[j] != '\n'; j++)
            printf("%c", company_name[j]);
        printf("\n");
        strtok(buffer, "\"name\":");
        entry_count++;
    }
}

只需将返回的指针加一，再传回给

strstr（）

即可找到下一个匹配项：

/* Visit every occurrence of target in buffer, in order. */
char *ptr = strstr(buffer, target);
while (ptr) {
    /* ... do something with ptr ... */
    /* Resume one character past the current match, so overlapping
     * occurrences are found as well. */
    ptr = strstr(ptr+1, target);
}

附注：虽然你确实可以用 C 做到这一点，但我建议你不妨考虑更适合这项工作的工具：

C是一种非常低级的语言，试图用它编写字符串解析代码是很费力的（特别是如果你坚持从头开始编写代码，而不是使用现有的解析库或解析器生成器），而且容易出现错误（其中一些错误，如缓冲区溢出，会造成安全漏洞）。有很多更高级的脚本语言（如Perl、Ruby、Python甚至JavaScript）更适合此类任务
解析HTML时，确实应该使用适当的HTML解析器（最好与良好的DOM生成器和查询工具结合使用）。这将允许您根据文档的结构定位所需的数据，而不仅仅是在原始HTML源代码中匹配子字符串。真正的HTML解析器还将透明地处理字符集转换和字符实体解码等问题。（是的，确实有针对C的HTML解析器，因此即使坚持使用C，您也可以而且应该使用一个。）

/*******\
*                                                  *
*执行行中带有参数的SubStg*
*必须使用2个参数*
*第一个是要搜索的字符串*
*第二个是子字符串*
*例如：./Srch "this is the list" "is" >stuff*
*例如：./Srch "$(<Srch.c)" "siz"*

strstr(strstr(html, "name") + 1, "name")
可以查找第二个出现的“name”（前提是内层的 strstr 没有返回 NULL）。

我的算法有一些有趣的地方：
•没有保留字符（例如，您可以搜索包含空格的子字符串）
•所有操作都在内存中完成，并且内存只分配一次，因此程序不必反复申请空间。
•循环中只执行逻辑运算，对CPU的开销很小。
•因此，我认为此方案相当高效。
/*  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *\
 *                                                  *
 *  SubStg with parameters in the execution line    *
 *  Must use 2 parameters                           *
 *  The 1st is the string to be searched            *
 *  The 2nd is the substring                        *
 *  e.g.:  ./Srch "this is the list" "is" >stuff    *
 *  e.g.:  ./Srch "$(<Srch.c)" "siz"                *
 *  (ref: http://1drv.ms/1PuVpzS)                   *
 *  © SJ Hersh 15-Jun-2020                          *
 *                                                  *
\*  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Shorthand alias for a pointer to char. */
typedef char* char_ptr;
/* Shorthand alias for a pointer to unsigned int. */
typedef unsigned int* int_ptr;
/* Null pointer constant typed as int_ptr. */
#define NOMEM ( int_ptr )0

/*
 * Srch entry point: list every position at which a substring occurs in a
 * string, overlapping occurrences included.
 *
 * Arguments (taken from the command line):
 *    stgs[ 1 ]  the string to be searched
 *    stgs[ 2 ]  the substring to look for
 *
 * Prints the 0-based match positions separated by tabs, then the number
 * of matches, on standard output.
 *
 * Returns 0 on success, -8 when the argument count is wrong, -6 when
 * either argument is empty, -7 when the substring is longer than the
 * string, and -9 when the work array cannot be allocated.
 */
int main( int parm, char** stgs )
{
   const char *string, *substg;
   unsigned int sizstg, sizsub, endsiz;
   unsigned int *match;
   unsigned int x, y, ctr = 0;

   if( parm != 3 )
   {
      printf( "ERR: You need exactly 2 string arguments\n" );
      return( -8 );
   }

   string = stgs[ 1 ];
   substg = stgs[ 2 ];
   sizstg = ( unsigned int )strlen( string );
   sizsub = ( unsigned int )strlen( substg );


      /* Check boundary conditions: */

   if( ( sizstg == 0 ) || ( sizsub == 0 ) )
   {
      printf( "ERR: Neither string can be nul\n" );
      return( -6 );
   }

   if( sizsub > sizstg )
   {
      printf( "ERR: Substring is larger than String\n" );
      return( -7 );
   }

   /* Number of candidate start positions.  Computed only after the checks
      above, so the unsigned subtraction cannot wrap around. */
   endsiz = sizstg - sizsub + 1;

   /* One flag per candidate start position. */
   match = malloc( endsiz * sizeof *match );
   if( match == NULL )
   {
      printf( "ERR: Not enough memory\n" );
      return( -9 );
   }


      /* Algorithm */

   printf( "Positions:\t" );

   /* Pass 0: flag every position whose character equals the first
      character of the substring. */
   for( x = 0; x < endsiz; x++ )
      match[ x ] = string[ x ] == substg[ 0 ];

   /* Pass y: clear the flag wherever character y of the substring differs
      from the string character y places after the candidate start.
      x + y never exceeds sizstg - 1. */
   for( y = 1; y < sizsub; y++ )
      for( x = 0; x < endsiz; x++ )
         match[ x ] &= string[ x + y ] == substg[ y ];

   /* Whatever is still flagged is a full match.  x and ctr are unsigned,
      hence %u. */
   for( x = 0; x < endsiz; x++ )
      if( match[ x ] )
      {
         printf( "%u\t", x );
         ctr++;
      }

   printf( "\nCount:\t%u\n", ctr );
   free( match );
   return( 0 );
}