使用C解析URL的最佳方法？_C_Url_Parsing

使用C解析URL的最佳方法？

c url parsing

使用C解析URL的最佳方法？,c,url,parsing,C,Url,Parsing,我有这样一个URL： http://192.168.0.1:8080/servlet/rece 我想解析URL以获取值： IP: 192.168.0.1 Port: 8080 page: /servlet/rece 如何操作？编写一个自定义解析器，或者使用一个字符串替换函数替换分隔符“：”，然后使用带分隔符的sscanf（）。否则使用/ 您也可以使用我使用sscanf编写了一个简单的代码，它可以解析非常基本的URL #include <stdio.h> int main(vo

我有这样一个URL：

http://192.168.0.1:8080/servlet/rece

我想解析URL以获取值：

IP: 192.168.0.1
Port: 8080
page:  /servlet/rece

如何操作？

编写一个自定义解析器；或者先用字符串替换函数把分隔符“:”替换掉，再用带分隔符的 sscanf() 进行解析。否则使用/
您也可以使用
我使用sscanf编写了一个简单的代码，它可以解析非常基本的URL
#include <stdio.h>

/*
 * Demo: split a hard-coded "http://host:port/page" URL with sscanf() and
 * print the three pieces.
 *
 * Returns 0 on success, 1 when the URL does not match the expected shape.
 */
int main(void)
{
    const char text[] = "http://192.168.0.2:8888/servlet/rece";
    char ip[100] = "";
    int port = 80;
    char page[100] = "";   /* stays empty when the URL has no page part */
    int port_end = 0;      /* offset in text just past the port digits */

    /*
     * %5d caps the port at five digits (65535 is the largest valid port), so
     * an absurdly long digit run cannot overflow the int.  %n records where
     * the port stopped so a sixth digit can be rejected below instead of
     * being silently dropped.  %n does not count towards the return value.
     */
    int matched = sscanf(text, "http://%99[^:]:%5d%n/%99[^\n]", ip, &port, &port_end, page);

    /* Host and port are mandatory; the page is optional. */
    if (matched < 2 || port < 0 || port > 65535 ||
        (text[port_end] != '/' && text[port_end] != '\0'))
    {
        fprintf(stderr, "cannot parse URL: %s\n", text);
        return 1;
    }

    printf("ip = \"%s\"\n", ip);
    printf("port = \"%d\"\n", port);
    printf("page = \"%s\"\n", page);
    return 0;
}

./urlparse
ip = "192.168.0.2"
port = "8888"
page = "servlet/rece"

#include <stdio.h>

int main(void)
{
    const char text[] = "http://192.168.0.2:8888/servlet/rece";
    char ip[100];
    int port = 80;
    char page[100];
    sscanf(text, "http://%99[^:]:%99d/%99[^\n]", ip, &port, page);
    printf("ip = \"%s\"\n", ip);
    printf("port = \"%d\"\n", port);
    printf("page = \"%s\"\n", page);
    return 0;
}

./urlparse
ip = "192.168.0.2"
port = "8888"
page = "servlet/rece"
就我个人而言，我偷了HTParse.c
模块（例如，它用于Web浏览器）。然后，您可以执行以下操作：
 /* Copy the host component that HTParse() extracts from url into hostname.
  * NOTE: strncpy() leaves hostname without a terminating NUL when the host
  * is `size` bytes or longer, so terminate it explicitly after the copy.
  * NOTE(review): whether the string returned by HTParse() must be released
  * by the caller is not visible here -- confirm against the HTParse docs. */
 strncpy(hostname, HTParse(url, "", PARSE_HOST), size)

使用一个建立良好且经过调试的库的重要一点是，您不会陷入典型的
URL解析陷阱（当主机是IP地址时，许多regexp会失败，例如，特别是IPv6地址）。
这一个减小了大小，对我来说效果非常好。只有两个文件（*.c，*.h）
我必须修改代码[1]

[1] 将所有函数调用从http_parsed_url_free（purl）更改为parsed_url_free（purl）
这个C要点可能有用。它使用sscanf实现了纯C解决方案

它使用
// Parse tmp_source, trying the most specific URL shape first:
//   host:port/page, then host/page, then host:port, then host alone.
// The host scanset stops at both ':' and '/' where a port may follow, so a
// colon that appears later in the path is never mistaken for the port
// separator. The port is read with %d (decimal only); %i would also accept
// octal and hexadecimal prefixes, which are not valid in a URL port.
// ip must hold at least 100 bytes and page at least 200 bytes.
if (sscanf(tmp_source, "http://%99[^:/]:%d/%199[^\n]", ip, &port, page) == 3) { succ_parsing = 1;}
else if (sscanf(tmp_source, "http://%99[^/]/%199[^\n]", ip, page) == 2) { succ_parsing = 1;}
else if (sscanf(tmp_source, "http://%99[^:/]:%d", ip, &port) == 2) { succ_parsing = 1;}
else if (sscanf(tmp_source, "http://%99[^/\n]", ip) == 1) { succ_parsing = 1;}
(...)

可能来晚了……
我使用的是 http_parser_parse_url() 函数，以及从该库中单独拆出来的所需宏，它们工作得很好，大约 600 行代码（LOC）。

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
/*
 * Components of a URL as filled in by split_url().
 *
 * After a successful split_url() call all four strings live in ONE heap
 * allocation that starts at `protocol`; release them together with
 * free((void*)info.protocol).  Do not free the other members.
 */
typedef struct
{
    const char* protocol; /* scheme, e.g. "ftp"; "http" or "tcp" if the URL has none */
    const char* site;     /* host name or address, without the port */
    const char* port;     /* port as decimal text; "80" if the URL has none */
    const char* path;     /* everything from the first '/'; "/" if the URL has none */
} URL_INFO;

/*
 * Copies len bytes from src to *cursor, NUL-terminates the copy, advances
 * *cursor past the terminator and returns the start of the copy.
 */
static const char* store_field(char** cursor, const char* src, size_t len)
{
    char* start = *cursor;
    memcpy(start, src, len);
    start[len] = '\0';
    *cursor = start + len + 1;
    return start;
}

/*
 * Splits url ("[scheme://]host[:port][/path]") into its components.
 *
 *   info  receives the components; must not be NULL.
 *   url   NUL-terminated URL text; must not be NULL.  It is not modified.
 *
 * Defaults: a missing port becomes "80", a missing path becomes "/", and a
 * missing scheme becomes "http" when the port is "80" and "tcp" otherwise.
 * A bracketed IPv6 literal such as "[::1]" is kept whole in `site`.
 *
 * Returns info on success.  Returns NULL if either argument is NULL or if
 * memory cannot be allocated; on allocation failure every member of *info
 * is set to NULL.
 */
URL_INFO* split_url(URL_INFO* info, const char* url)
{
    if (!info || !url)
        return NULL;

    info->protocol = NULL;
    info->site = NULL;
    info->port = NULL;
    info->path = NULL;

    /* A scheme is whatever precedes the first ':' or '/', provided "://"
     * follows it immediately.  This keeps "host:8080/x" from being read as
     * a scheme and works for schemes of any length. */
    size_t protocol_len = strcspn(url, ":/");
    const char* rest = url;
    if (strncmp(url + protocol_len, "://", 3) == 0)
        rest = url + protocol_len + 3;
    else
        protocol_len = 0;

    /* The authority (host[:port]) runs up to the first '/'. */
    size_t authority_len = strcspn(rest, "/");

    const char* path = rest + authority_len;
    size_t path_len = strlen(path);
    if (path_len == 0)
    {
        path = "/";
        path_len = 1;
    }

    /* Skip a bracketed IPv6 literal so its inner colons are not mistaken
     * for the port separator. */
    const char* search = rest;
    size_t search_len = authority_len;
    if (*rest == '[')
    {
        const char* close = memchr(rest, ']', authority_len);
        if (close)
        {
            search = close + 1;
            search_len = authority_len - (size_t)(search - rest);
        }
    }

    const char* colon = memchr(search, ':', search_len);
    size_t site_len = colon ? (size_t)(colon - rest) : authority_len;

    /* Only a ':' that is followed by a digit introduces a port. */
    const char* port = "80";
    size_t port_len = 2;
    if (colon && isdigit((unsigned char)colon[1]))
    {
        port = colon + 1;
        port_len = authority_len - site_len - 1;
    }

    /* Without a scheme, guess one from the port. */
    const char* protocol = url;
    if (protocol_len == 0)
    {
        protocol = (port_len == 2 && memcmp(port, "80", 2) == 0) ? "http" : "tcp";
        protocol_len = strlen(protocol);
    }

    /* One buffer for all four strings, each with its own terminator. */
    char* buffer = malloc(protocol_len + site_len + port_len + path_len + 4);
    if (!buffer)
        return NULL;

    char* cursor = buffer;
    info->protocol = store_field(&cursor, protocol, protocol_len);
    info->site = store_field(&cursor, rest, site_len);
    info->port = store_field(&cursor, port, port_len);
    info->path = store_field(&cursor, path, path_len);
    return info;
}

Out
/* Demo: split a sample URL and print each component on its own line. */
int main()
{
    URL_INFO parts;

    split_url(&parts, "ftp://192.168.0.1:8080/servlet/rece");

    printf("Protocol: %s\nSite: %s\nPort: %s\nPath: %s\n",
           parts.protocol,
           parts.site,
           parts.port,
           parts.path);

    return 0;
}

Protocol: ftp
Site: 192.168.0.1
Port: 8080
Path: /servlet/rece

Libcurl 现在有 curl_url_get() 函数，可以提取主机、路径等部分。
示例代码：
基于纯sscanf（）
的解决方案：
//Code
#include <stdio.h>

/*
 * Demo: split a "[scheme]://[host]:[port][/path]" URI with a single
 * sscanf() call and print host, port and path.
 *
 * Every string conversion carries a field width one less than its buffer
 * size, so an over-long host or path is truncated instead of overrunning
 * the stack buffers.
 */
int
main (int argc, char *argv[])
{
    const char *uri = "http://192.168.0.1:8080/servlet/rece";
    char ip_addr[64], path[100];
    int port;

    /* %*[^:] skips the scheme, %*[:/] skips "://", %63[^:] reads the host,
     * :%d reads the port and %99s reads the rest as the path. */
    int uri_scan_status = sscanf(uri, "%*[^:]%*[:/]%63[^:]:%d%99s", ip_addr, &port, path);

    printf("[info] URI scan status : %d\n", uri_scan_status);
    /* All three fields are only valid when all three conversions matched. */
    if( uri_scan_status == 3 )
    {
        printf("[info] IP Address : '%s'\n", ip_addr);
        printf("[info] Port: '%d'\n", port);
        printf("[info] Path : '%s'\n", path);
    }

    return 0;
}


//Code
#include <stdio.h>

int
main (int argc, char *argv[])
{
    char *uri = "http://192.168.0.1:8080/servlet/rece"; 
    char ip_addr[12], path[100];
    int port;
    
    int uri_scan_status = sscanf(uri, "%*[^:]%*[:/]%[^:]:%d%s", ip_addr, &port, path);
    
    printf("[info] URI scan status : %d\n", uri_scan_status);
    if( uri_scan_status == 3 )
    {   
        printf("[info] IP Address : '%s'\n", ip_addr);
        printf("[info] Port: '%d'\n", port);
        printf("[info] Path : '%s'\n", path);
    }
    
    return 0;
}

但是，请记住，此解决方案是为 [protocol_name]://[ip_address]:[port][/path] 这种形式的 URI 量身定制的。要了解 URI 语法中各组成部分的更多信息，请参阅 URI 标准 RFC 3986。
现在，让我们把这个定制的格式字符串 "%*[^:]%*[:/]%[^:]:%d%s" 逐段拆开来看：


%*[^:] 用于忽略协议/方案（例如 http、https、ftp 等）。
它从字符串开头开始匹配，直到第一次遇到 : 字符为止。由于我们在 % 字符后面使用了 *，匹配到的字符串会被丢弃。

%*[:/]
用于忽略协议和 IP 地址之间的分隔符，即 ://


%[^:]
用于捕获分隔符之后的字符串，直到遇到 : 为止。这个捕获到的字符串就是 IP 地址。

:%d
用于捕获位于 : 字符（即捕获 IP 地址时停下的那个字符）之后的数字。这里捕获到的数字就是端口号。

%s
如您所知，将帮助您捕获剩余字符串，该字符串只不过是您正在查找的资源的路径

这是在什么平台上？我不知道你可以把像[^:]这样的regexp放在sscanf格式中。我的平台是：uname-a Linux ubuntu 2.6.24-21-generic#1 SMP Tue 10月21日23:43:45 UTC 2008 i686 GNU/Linux[^:]在这种上下文中不是regexp，它只是sscanf（）的一个特殊格式说明符。这是标准的。例如，请参阅此手册页：。解析在没有端口号时出错，无法正常工作。如何修复它。有很多陷阱需要注意，所以自定义解析器在我看来是个坏主意。@bortzmeye:这并不意味着建议无效。这是模糊的推理。此外，自定义解析器是最强大/高效/无依赖性的。sscanf更容易出错。“写一些你需要的代码”是一个公认的答案吗？事实上，使用库似乎是唯一合理的事情，因为有很多陷阱（http与https、显式端口、路径编码等）。嗨，我为url写了一个BNF，如下所示。URL=“http://”{IP}{PORT}？{PAGE}？flex生成了一个解析url的文件。但是如何获取单独的部分，如IP、端口和页面。特别是在URL中，请注意，在IPv6中，如果尝试使用冒号分隔符，则会出现不明确的情况。e、 g.3ffe:0501:：1:2，这是2的端口，还是默认端口的完整地址。URL规范已经处理了这一点，正如预编写的库一样。URI标准RFC3986是明确的，您的示例是非法的（您需要方括号）。谢谢，这让人感到安慰。我误以为面向用户的代码，就像浏览器地址栏一样，接受没有方括号的地址。快速浏览一下一些流行的浏览器，就会发现情况并非如此。HTParse.c有很多依赖项，你能解释一下如何轻松地从项目中“窃取”这些依赖项吗？也许在2009年它没有；）@非常好的链接。它就像一个符咒。遗憾的是，优秀的代码是版权所有的“保留所有权利”，所以它不应该用于个人项目以外的其他项目。对于windows，请使用CoInternetParseUrlthird
/* extract host name from the parsed URL */ 
uc = curl_url_get(h, CURLUPART_HOST, &host, 0);
/* A zero return code is treated as success; only then is host used. */
if(!uc) {
  printf("Host name: %s\n", host);
  /* Release the string that curl_url_get() handed back through &host. */
  curl_free(host);
}

//Code
#include <stdio.h>

/*
 * Demo: split a "[scheme]://[host]:[port][/path]" URI with a single
 * sscanf() call and print host, port and path.
 *
 * Every string conversion carries a field width one less than its buffer
 * size, so an over-long host or path is truncated instead of overrunning
 * the stack buffers.
 */
int
main (int argc, char *argv[])
{
    const char *uri = "http://192.168.0.1:8080/servlet/rece";
    char ip_addr[64], path[100];
    int port;

    /* %*[^:] skips the scheme, %*[:/] skips "://", %63[^:] reads the host,
     * :%d reads the port and %99s reads the rest as the path. */
    int uri_scan_status = sscanf(uri, "%*[^:]%*[:/]%63[^:]:%d%99s", ip_addr, &port, path);

    printf("[info] URI scan status : %d\n", uri_scan_status);
    /* All three fields are only valid when all three conversions matched. */
    if( uri_scan_status == 3 )
    {
        printf("[info] IP Address : '%s'\n", ip_addr);
        printf("[info] Port: '%d'\n", port);
        printf("[info] Path : '%s'\n", path);
    }

    return 0;
}