C 哈希表排名前20的想法
我在想办法解决我的问题时有点困难。 我有一个单词计数程序,它使用哈希表进行计数 在任意数量的文件中打印所有单词,并仅打印单词 在所有文件中,以及它们的计数。我还储存了我所有的二手货 链表中的散列索引 解决了我自己的问题,我知道答案很简单。我刚刚计算出了计数最低的一个,如果我的新值大于这个值,但它位于20字结构数组中计数最低的一个的索引处 谢谢大家的帮助C 哈希表排名前20的想法,c,hashtable,C,Hashtable,我在想办法解决我的问题时有点困难。 我有一个单词计数程序,它使用哈希表进行计数 在任意数量的文件中打印所有单词,并仅打印单词 在所有文件中,以及它们的计数。我还储存了我所有的二手货 链表中的散列索引 解决了我自己的问题,我知道答案很简单。我刚刚计算出了计数最低的一个,如果我的新值大于这个值,但它位于20字结构数组中计数最低的一个的索引处 谢谢大家的帮助 #include <unistd.h> #include <stdio.h> #inclu
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
/*Structures*///////////////////////////////////////////
//A struct to hold the words in the hash tables and their
//counts
struct counter{
int count;
int printed;
char word[51];
int allfiles[101];
struct counter * next;
};
//A struct to hold the hash indexes of the already visited
//index, for easy printing
struct index{
int used;
struct index * next;
};
//A simple struct to pass arguments to the work function for
//threading
struct arguments{
void * id;
int fileid;
};
////////////////////////////////////////////////////////
/*Functions*////////////////////////////////////////////
static int hash(char * word);
static void inHash(struct counter * newWord, int hash, int FILEID);
static void indexchain(int hash);
//static void hashprint(int NUMFILES);
static void * work(struct arguments *);
static void toptwenty(int NUMFILES);
static void print();
////////////////////////////////////////////////////////
/*Global Variables*/////////////////////////////////////
struct counter * top[20] = {0};
struct counter * hashtable[6222] = {0};
struct index * head;
////////////////////////////////////////////////////////
int main(int argc, char * argv[])
{
//check for valid number of arguments
if(argc < 2)
{
fprintf(stderr, "invalid number of arguments");
return 1;
}
//set up index chain starts with a null node
head = malloc(sizeof(struct index));
head->next = NULL;
head->used = -1;
//loop through any number of files
int arg;
for(arg = 1; arg < argc; arg++)
{
struct arguments * argum = malloc(sizeof(struct arguments));
argum->fileid = arg;
argum->id = ((void*)argv[arg]);
work(argum);
}
//hashprint(argc);
toptwenty(argc);
print();
return 0;
}
/*Function Definitions*/
//this function takes a file name and counts
//the words in the file
static void * work(struct arguments * argum)
{
int FILEID = argum->fileid;
void * in = argum->id;
int fd = open((char*)in, O_RDONLY);
if (fd == -1)
{
fprintf(stderr, "can't open %s for reading!\n", (char*)in);
exit(-1);
}
int BUFLEN = (int) lseek(fd, 0, SEEK_END);
lseek(fd, 0, 0);
//A few variable
char buf[BUFLEN + 1];
int lastRead;
lastRead = read(fd, buf, BUFLEN);
if (lastRead == -1)
{
fprintf(stderr, "error reading file %s!\n", (char*)in);
exit(-1);
}
//Parse the filebuffer for words.
char newword[51];
int c;
int curindex = 0;
buf[BUFLEN + 1] = ' ';
//not doing the last space because it is eof
for(c = 0; c < BUFLEN + 1; c++)
{
if((buf[c] >= 'A' && buf[c] <= 'Z'))
{
buf[c] += 32;
}
if(buf[c] >= 'a' && buf[c] <= 'z')
{
//add the next char to the string.
newword[curindex] = buf[c];
curindex++;
}
else
{
//make a new struct for the entry, and add it to the hashtable
//add its hash to the
if(strlen(newword) >= 6)
{
struct counter * temp = malloc(sizeof(struct counter));
strcpy(temp->word,newword);
int thishash = hash(temp->word);
//Only save hash indexes if they are in the first files
if(FILEID == 1)
{
indexchain(thishash);
}
inHash(temp, thishash, FILEID);
}
int wordlength = strlen(newword);
int i;
for(i = 0;i < wordlength; i++)
{
newword[i] = 0;
}
curindex = 0;
}
}
close(fd);
return in;
}
//Bad hash function by just adding ascii values of the
//characters
static int hash(char * word)
{
int loop = strlen(word);
int i;
int hashval = 0;
for(i = 0; i < loop; i++)
hashval += word[i];
return hashval;
}
//add a new word to the hash table
static void inHash(struct counter * newWord, int hash, int FILEID)
{
int eflag = 0;
if(hashtable[hash] == NULL)
{
//if the entry isnt in the table
if(FILEID == 1)
{
newWord->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
newWord->count = 1;
newWord->next = NULL;
hashtable[hash] = newWord;
}
}
else
{
//if its not, but what if it is?
struct counter * cur = hashtable[hash];
if(strcmp(cur->word, newWord->word) == 0)
{
//is the word in the first slot?
cur->count += 1;
cur->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
eflag = 1;
}
else
{
while(cur->next != NULL)
{
cur = cur->next;
if(strcmp(cur->word, newWord->word) == 0)
{
//if the word already exsists, update the count
cur->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
cur->count += 1;
eflag = 1;
break;
}
}
}
//if its not in any bucket, make a new bucket
if(eflag == 0)
{
//Else add the new entry to the end of that list
if(FILEID == 1)
{
newWord->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
newWord->count = 1;
newWord->next = NULL;
cur->next = newWord;
}
}
}
}
//adding a value to the linked list for printing
static void indexchain(int hash)
{
struct index * p = head;
int eflag = 0;
while(p->next != NULL)
{
if(p->used != hash)
p = p->next;
else
{
eflag = 1;
break;
}
}
if(eflag == 0)
{
struct index * newValue = malloc(sizeof(struct index));
newValue->used = hash;
newValue->next = NULL;
p->next = newValue;
}
}
/*
//This function will print the values in the hash tables and their counts
//Prints based on number of files to check if words are in all files
static void hashprint(int NUMFILES)
{
struct index * p;
p = head->next;
int hash;
int i;
int printbool = 1;
while(p != NULL)
{
hash = p->used;
struct counter * ptr = hashtable[hash];
while(ptr != NULL)
{
if(ptr->printed == 0)
{
for(i = 1; i < NUMFILES; i++)
{
if(ptr->allfiles[i] == 0)
{
printbool = 0;
break;
}
else
printbool = 1;
}
if(printbool == 1)
{
ptr->printed = 1;
printf("%s %d\n", ptr->word, ptr->count);
}
}
ptr = ptr->next;
}
p = p->next;
}
}
*/
//A function to see which numbers have the top twenty highest count
static void toptwenty(int NUMFILES)
{
struct index * p;
p = head->next;
int hash;
int i;
int printbool = 1;
while(p != NULL)
{
hash = p->used;
struct counter * ptr = hashtable[hash];
while(ptr != NULL)
{
if(ptr->printed == 0)
{
for(i = 1; i < NUMFILES; i++)
{
if(ptr->allfiles[i] == 0)
{
printbool = 0;
break;
}
else
printbool = 1;
}
if(printbool == 1)
{
for(i = 0; i < 20; i++)
{
if(top[i] == NULL)
{
top[i] = ptr;
break;
}
else if(ptr->count > top[i]->count)
{
top[i] = ptr;
break;
}
}
}
}
ptr = ptr->next;
}
p = p->next;
}
}
//print the top 20 count
static void print()
{
int i;
for(i = 0; i < 20; i++)
{
if(top[i] != NULL)
{
if(top[i]->printed == 0)
{
//printf("%s\n", top[i]->word);
printf("%s %d\n", top[i]->word, top[i]->count);
top[i]->printed = 1;
}
}
else
break;
}
}
创建一个优先级队列,该队列包含20个具有顶部计数及其相应计数的哈希索引
当您计算时,如果您的新词超过它,则最小值位于队列的顶部。将其从队列O1中删除,并将新值添加到仅为Olog20的队列Ologn中 你可以提供一些关于你需要解析的文本类型的细节吗。还有,会有多少。你会发布你到目前为止的代码吗?它如何记录在哪些文件中找到这些单词?