与Python相比，具有动态分配的结构数组在C中运行得非常慢_Python_C_Graph Theory

与Python相比，具有动态分配的结构数组在C中运行得非常慢

python c

与Python相比，具有动态分配的结构数组在C中运行得非常慢,python,c,graph-theory,Python,C,Graph Theory,我（在其他SO用户的帮助下）将一个将字符串标签映射到edgelist数据结构中的整数标签的小C程序“粘合”在一起。例如，对于输入文件 Mike Andrew Mike Jane John Jane 程序输出 1 2 1 3 4 3 然而，我映射了巨大的edgelist文件，不幸的是，与Python备选方案相比，该程序运行速度非常慢。下面我用C和Python粘贴了这两个程序。我恳请各位指点一下如何提高C程序的速度 #include <stdio.h> #include <st

我（在其他SO用户的帮助下）将一个将字符串标签映射到edgelist数据结构中的整数标签的小C程序“粘合”在一起。例如，对于输入文件

Mike Andrew
Mike Jane
John Jane

程序输出

1 2
1 3
4 3

然而，我映射了巨大的edgelist文件，不幸的是，与Python备选方案相比，该程序运行速度非常慢。下面我用C和Python粘贴了这两个程序。我恳请各位指点一下如何提高C程序的速度

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Initial number of input lines (edges) the program allocates room for. */
enum { MAXL = 200 };

/* One edge of the graph, stored as the integer labels of its two endpoints. */
typedef struct {
    unsigned int first, second;
} edge;

/*
 * Label table: the distinct string labels seen so far, in first-seen order.
 * Despite the name, this is a plain array that is searched linearly, not a
 * hash table.
 */
typedef struct {
    unsigned int hashed;  /* number of labels currently stored in map */
    char **map;           /* map[i] holds the label whose integer id is i + 1 */
} hash;


/*
 * Return the 1-based integer id of `entry`, adding it to `map` if it is new.
 *
 * map   - table to search and, if needed, extend. The caller must guarantee
 *         that map->map has room for at least one more pointer; this function
 *         does not know the array's capacity and cannot check it.
 * entry - NUL-terminated label to look up. A private copy is stored, so the
 *         caller may reuse its buffer afterwards.
 *
 * Returns the id (>= 1) of the label, or 0 if a new label could not be copied
 * because memory is exhausted; in that case the table is left unchanged.
 *
 * The lookup is a linear scan, so each call costs O(map->hashed) string
 * comparisons.
 */
int insertInMap(hash *map, char *entry)
{
  for (unsigned int i = 0; i < map->hashed; i++) {
    if (strcmp(map->map[i], entry) == 0) {
      return (int)(i + 1);
    }
  }

  /* Copy with malloc/memcpy rather than strdup: strdup is not ISO C, and a
     failed copy must not be stored because later scans would strcmp() it. */
  size_t len = strlen(entry) + 1;
  char *copy = malloc(len);
  if (copy == NULL) {
    return 0;
  }
  memcpy(copy, entry, len);

  map->map[map->hashed++] = copy;
  return (int)map->hashed;
}

/*
 * Read whitespace-separated label pairs from ./test.txt, replace every
 * distinct label by an integer id obtained from insertInMap(), and print each
 * edge as "<id> -- <id>".
 *
 * Returns EXIT_SUCCESS when the whole file was processed, EXIT_FAILURE when
 * the file cannot be opened or memory runs out. If memory runs out part-way
 * through, the edges read so far are still printed.
 */
int main() {
  char node1[30];
  char node2[30];
  size_t idx = 0;                      /* number of edges stored so far */
  size_t edge_cap = MAXL;              /* allocated slots in edges */
  size_t map_cap = 2 * (size_t)MAXL;   /* allocated slots in map.map; one line can add two labels */
  int status = EXIT_SUCCESS;
  FILE *fp = NULL;
  hash map;
  edge *edges = malloc(edge_cap * sizeof *edges);

  map.hashed = 0;
  map.map = malloc(map_cap * sizeof *map.map);
  if (edges == NULL || map.map == NULL) {
    fprintf(stderr, "error: malloc - virtual memory exhausted.\n");
    status = EXIT_FAILURE;
    goto cleanup;
  }

  fp = fopen("./test.txt", "r");
  if (fp == NULL) {
    perror("./test.txt");
    status = EXIT_FAILURE;
    goto cleanup;
  }

  /* %29s keeps each label inside its 30-byte buffer (29 chars + NUL). A
     longer token is split across successive reads rather than overflowing. */
  while (fscanf(fp, "%29s %29s", node1, node2) == 2) {
    if (idx == edge_cap) {
      /* Double instead of adding a constant so total copying stays linear. */
      size_t new_cap = edge_cap * 2;
      edge *tmp = realloc(edges, new_cap * sizeof *edges);
      if (tmp == NULL) {
        fprintf(stderr, "error: realloc - virtual memory exhausted.\n");
        status = EXIT_FAILURE;
        break;      /* on failure, keep and print the existing data */
      }
      edges = tmp;
      edge_cap = new_cap;
    }

    /* The table needs room for both labels of this line before they are
       inserted, since the insert itself is not handed the capacity. */
    if ((size_t)map.hashed + 2 > map_cap) {
      size_t new_cap = map_cap * 2;
      char **tmp = realloc(map.map, new_cap * sizeof *map.map);
      if (tmp == NULL) {
        fprintf(stderr, "error: realloc - virtual memory exhausted.\n");
        status = EXIT_FAILURE;
        break;
      }
      map.map = tmp;
      map_cap = new_cap;
    }

    int first = insertInMap(&map, node1);
    int second = insertInMap(&map, node2);
    /* Valid ids are >= 1; anything else means the label was not stored. */
    if (first <= 0 || second <= 0) {
      fprintf(stderr, "error: could not store label.\n");
      status = EXIT_FAILURE;
      break;
    }

    edges[idx].first = (unsigned int)first;
    edges[idx].second = (unsigned int)second;
    idx++;
  }

  for (size_t k = 0; k < idx; k++) {
    /* %u matches the unsigned int fields of edge. */
    printf("%u -- %u\n", edges[k].first, edges[k].second);
  }

cleanup:
  if (fp != NULL) {
    fclose(fp);
  }
  if (map.map != NULL) {
    /* Each stored label is a separate heap copy owned by the table. */
    for (unsigned int k = 0; k < map.hashed; k++) {
      free(map.map[k]);
    }
  }
  free(map.map);
  free(edges);

  return status;
}

编辑并添加

下面是改用GLib哈希表实现后的代码。性能有所改善，但不幸的是输出仍然有问题，正确的输出应该是

1 2
1 3
4 3

而不是

0 0
0 1
1 1

谁能看一下吗

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
#include <stdint.h>

int main() {
  GHashTable *table;
  table = g_hash_table_new(g_int_hash, g_int_equal);

  FILE *fp = NULL;
  char node1[30];
  char node2[30];

  fp = fopen("./test.txt", "r");
  int i = 0;
  while (fscanf(fp, "%s %s", &node1, &node2) == 2) {
    char *key1 = malloc(sizeof(char)*1024);
    char *key2 = malloc(sizeof(char)*1024);
    uint32_t* value = (uint32_t *)malloc(sizeof(uint32_t));
    key1 = g_strdup(node1);
    key2 = g_strdup(node2);
    *value = i;

    uint32_t *x;
    if (g_hash_table_contains(table, key1)) {
      x = (uint32_t *)g_hash_table_lookup(table, key1);
    } else {
      i++;
      g_hash_table_insert(table, (gpointer)key1, (gpointer)value);
      x = (uint32_t *)value;
    }

    uint32_t *y;
    if (g_hash_table_contains(table, key2)) {
      y = (uint32_t *)g_hash_table_lookup(table, key2);
    } else {
      g_hash_table_insert(table, (gpointer)key2, (gpointer)value);
      y = (uint32_t *)value;
    }
    printf("%d -- %d\n", *x, *y);
  }

  fclose(fp);

  g_hash_table_destroy(table);
  table = NULL;
  return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
#include <stdint.h>
int main() {
  GHashTable *table;
  table = g_hash_table_new(g_int_hash, g_int_equal);
  FILE *fp = NULL;
  char node1[30];
  char node2[30];
  fp = fopen("./test.txt", "r");
  int i = 0;
  while (fscanf(fp, "%s %s", &node1, &node2) == 2) {
    char *key1 = malloc(sizeof(char)*1024);
    char *key2 = malloc(sizeof(char)*1024);
    uint32_t* value = (uint32_t *)malloc(sizeof(uint32_t));
    key1 = g_strdup(node1);
    key2 = g_strdup(node2);
    *value = i;
    uint32_t *x;
    if (g_hash_table_contains(table, key1)) {
      x = (uint32_t *)g_hash_table_lookup(table, key1);
    } else {
      i++;
      g_hash_table_insert(table, (gpointer)key1, (gpointer)value);
      x = (uint32_t *)value;
    }
    uint32_t *y;
    if (g_hash_table_contains(table, key2)) {
      y = (uint32_t *)g_hash_table_lookup(table, key2);
    } else {
      g_hash_table_insert(table, (gpointer)key2, (gpointer)value);
      y = (uint32_t *)value;
    }
    printf("%d -- %d\n", *x, *y);
  }
  fclose(fp);
  g_hash_table_destroy(table);
  table = NULL;
  return 0;
}

你的C代码中的“哈希”操作实际上更像一个链表，插入和查找都是线性时间。另一方面，Python的字典是工业级的实现，平均插入和查找（即 in 运算符）都是O（1）。如果您要用C从头编写哈希表，就需要把大量理论付诸实践，才能在性能上开始接近它。
在我看来，最好的办法是尽可能改用C++编写代码，并使用 std::unordered_map。这样可以两全其美：所有的工作都已经为您完成了，而您不需要在性能上做出妥协。
如果你执意使用（或不得不使用）C，互联网上有相当多的资源，但我不愿意在这里发布任何链接，因为我不能保证它们的质量。这应该会是一次有教育意义的尝试。
你的C代码中的“哈希”实际上更像一个链表，插入和查找都是线性时间。另一方面，Python的字典是工业级的实现，平均插入和查找（即 in 运算符）都是O（1）。如果您要用C从头编写哈希表，就需要把大量理论付诸实践，才能在性能上开始接近它。
在我看来，最好的办法是尽可能改用C++编写代码，并使用 std::unordered_map。这样可以两全其美：所有的工作都已经为您完成了，而您不需要在性能上做出妥协。
如果你执意使用（或不得不使用）C，互联网上有相当多的资源，但我不愿意在这里发布任何链接，因为我不能保证它们的质量。这应该会是一次有教育意义的尝试。
这两个程序使用的是具有不同时间复杂性的根本不同的数据结构。python程序使用的是一个字典，它是一个高度调优的哈希表，在查找和删除时具有O（1）摊销性能
因此，python程序运行的复杂性是O（字数）
现在来看您的C程序：您创建的实际上只是一个键值对数组。在这里插入或查找一个键需要O（数组的大小），因为您可能要遍历整个数组直到最后才找到匹配项
如果你做一些数学运算，结果是O（（字数）²）
 C++有内置的哈希表实现，如果你不介意切换到C++，就可以使用它。或者查看这个问题，学习用C编写自己的哈希表。
这两个程序使用的数据结构完全不同，时间复杂度也不同。python程序使用的是一个字典，它是一个高度调优的哈希表，在查找和删除时具有O（1）摊销性能
因此，python程序运行的复杂性是O（字数）
现在来看您的C程序：您创建的实际上只是一个键值对数组。在这里插入或查找一个键需要O（数组的大小），因为您可能要遍历整个数组直到最后才找到匹配项
如果你做一些数学运算，结果是O（（字数）²）
 C++有内置的哈希表实现，如果你不介意切换到C++，就可以使用它。或者查看这个问题，学习用C编写自己的哈希表。
代码的问题是，尽管名字叫 hash，但它并不是一个真正的哈希表。你是在用很慢的线性搜索逐项遍历这个映射表。你应该做的是：

将哈希表大小设置为固定大小。避免任何基于realloc的解决方案
提出一个哈希函数来确定表索引。网上应该有大量使用字符串的代码示例
实现存储/检查索引的方法。这可以存储在下一个可用的表索引中，或者通过实现“链接”，其中每个索引都是一个链表，等等
代码的问题在于，尽管名字叫 hash，但它并不是一个真正的哈希表。你是在用线性搜索逐项遍历这个映射表
#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
#include <stdint.h>

int main() {
  GHashTable *table;
  table = g_hash_table_new(g_int_hash, g_int_equal);

  FILE *fp = NULL;
  char node1[30];
  char node2[30];

  fp = fopen("./test.txt", "r");
  int i = 0;
  while (fscanf(fp, "%s %s", &node1, &node2) == 2) {
    char *key1 = malloc(sizeof(char)*1024);
    char *key2 = malloc(sizeof(char)*1024);
    uint32_t* value = (uint32_t *)malloc(sizeof(uint32_t));
    key1 = g_strdup(node1);
    key2 = g_strdup(node2);
    *value = i;

    uint32_t *x;
    if (g_hash_table_contains(table, key1)) {
      x = (uint32_t *)g_hash_table_lookup(table, key1);
    } else {
      i++;
      g_hash_table_insert(table, (gpointer)key1, (gpointer)value);
      x = (uint32_t *)value;
    }

    uint32_t *y;
    if (g_hash_table_contains(table, key2)) {
      y = (uint32_t *)g_hash_table_lookup(table, key2);
    } else {
      g_hash_table_insert(table, (gpointer)key2, (gpointer)value);
      y = (uint32_t *)value;
    }
    printf("%d -- %d\n", *x, *y);
  }

  fclose(fp);

  g_hash_table_destroy(table);
  table = NULL;
  return 0;
}