如何在C语言中统计.txt文件中的单词数?

如何计算.txt文件中的单词数?在C中,c,lexicographic,C,Lexicographic,这是我第一次发布问题。 所以我正在做一个家庭作业计划,我有点被一些事情卡住了,我希望有人能插话帮助我。以下是我在计划中需要做的事情: 您的程序必须读取包含标点符号句子的文件 它将把句子解析成单词和标点符号 单词将被输入词典,标点符号将被输入列表。将单词添加到词典时忽略大小写。请记住,词典是按词典顺序保存的 字典和列表中的每个条目都会计算单词或标点符号在原始文本中出现的次数 阅读文本后(第一个字符为$的行终止文本),打印出字典并列出计数 您的程序接下来将读取如下格式的行:word1

这是我第一次发布问题。 所以我正在写一个家庭作业程序,我被一些事情卡住了,希望有人能帮助我。以下是我的程序需要完成的事情:

  • 您的程序必须读取包含标点符号句子的文件
  • 它将把句子解析成单词和标点符号
  • 单词将被输入词典,标点符号将被输入列表。将单词添加到词典时忽略大小写。请记住,词典是按词典顺序保存的
  • 字典和列表中的每个条目都会计算单词或标点符号在原始文本中出现的次数
  • 阅读文本后(第一个字符为$的行终止文本),打印出字典并列出计数
  • 您的程序接下来将读取如下格式的行:word1
  • 这意味着将文本中的word1替换为word2
我已经能够读入文件(hw5-input)并按字典顺序打印其中的单词,也已经把大写字母转成了小写,甚至还统计了单词计数,但无法把每个单词连同其计数打印在单独的一行上。我还需要交换单词并再次打印文件,不过带计数的打印才是我真正需要帮助的地方。以下是我到目前为止的代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define PUNCT " \n,\t!:;.-"
#define MAX_STR_LEN 2048

// One entry of the singly linked word list.
struct listNode 
{
    char *word;             // heap-allocated copy of the word (freed in deleteList)
    struct listNode *next;  // following entry, or NULL at the end of the list
    int wordCount;          // set to 1 on creation, incremented when the word is seen again
};

// Forward declarations of the list helpers defined below.
struct listNode *newListNode(const char * const);         // allocate a node holding a copy of the string
void insertWord(struct listNode *,const char * const);    // add a word to the sorted list or bump its count
void deleteList(struct listNode *);                       // free every node and its word
void printList(struct listNode *);                        // print the words that follow the head node

/*
 * Create a new list node holding a private copy of s.
 *
 * s - NUL-terminated string to copy; must not be NULL.
 *
 * Returns a heap-allocated node with next == NULL and wordCount == 1.
 * Both the node and its word must eventually be released with free().
 * If memory cannot be allocated the program reports the error on stderr
 * and terminates, so this function never returns NULL.
 */
struct listNode *newListNode(const char * const s) 
{
    struct listNode *n = calloc(1, sizeof *n);
    if (n == NULL) {
        fprintf(stderr, "Out of memory.\n");
        exit(EXIT_FAILURE);
    }

    n->word = calloc(strlen(s) + 1, sizeof *n->word);
    if (n->word == NULL) {
        free(n);
        fprintf(stderr, "Out of memory.\n");
        exit(EXIT_FAILURE);
    }

    strcpy(n->word, s);
    n->next = NULL;   // explicit: all-bits-zero is not guaranteed to be a null pointer
    n->wordCount = 1;
    return n;
}

/*
 * Insert a word into the dictionary, ignoring case and keeping the list in
 * ascending strcmp() order.
 *
 * head - node in front of the list; its own word is never compared, only
 *        the nodes reachable through head->next are.
 * s    - NUL-terminated word to add. The caller's string is not modified.
 *
 * The word is stored in lower case. If an equal word is already in the
 * list, that node's wordCount is incremented and no node is added.
 */
void insertWord(struct listNode *head,const char * const s) 
{
    struct listNode *p = head, *q;
    char *c;

    if (head == NULL || s == NULL) {
        return;
    }

    // Lower-case the node's own copy rather than writing through the
    // const-qualified parameter.
    q = newListNode(s);
    if (q == NULL) {
        return;
    }
    for (c = q->word; *c != '\0'; c++) {
        // tolower() requires a value representable as unsigned char.
        *c = (char)tolower((unsigned char)*c);
    }

    // Stop at the last node whose successor sorts before the new word.
    while ((p->next != NULL) && (strcmp(q->word, p->next->word) > 0)) {
        p = p->next;
    }

    if (p->next != NULL && strcmp(q->word, p->next->word) == 0) {
        // Duplicate: count it and release the node we no longer need.
        p->next->wordCount++;
        free(q->word);
        free(q);
    } else {
        q->next = p->next;
        p->next = q;
    }
}

/*
 * Release every node of the list, starting with head itself, together
 * with the word string each node owns. Passing NULL does nothing.
 */
void deleteList(struct listNode *head) 
{
    struct listNode *current;
    struct listNode *following;

    for (current = head; current != NULL; current = following) {
        // Remember the successor before the node is freed.
        following = current->next;
        free(current->word);
        free(current);
    }
}

/*
 * Print every word stored after the head node on a single line, each
 * followed by one space, then end the line. The head node's own word is
 * skipped.
 */
void printList(struct listNode *head) 
{
    struct listNode *node;

    for (node = head->next; node != NULL; node = node->next) {
        printf("%s ",node->word);
    }
    puts("");
}

/*
 * Ask for a file name, read the file up to the first '$' (or end of file),
 * split the text into words on the PUNCT delimiters and print the words in
 * lexicographic order.
 *
 * At most MAX_STR_LEN - 1 characters of text are processed; anything
 * beyond that is ignored and a warning is written to stderr.
 *
 * Returns 0 on success and EXIT_FAILURE if no file name could be read or
 * the file could not be opened.
 */
int main(int argc, char *argv[]) 
{
    char line[MAX_STR_LEN], *s, fileName[MAX_STR_LEN];
    struct listNode *head;
    size_t len = 0;
    int c;      // int rather than char so that EOF can be told apart from data
    FILE *p;

    (void)argc;
    (void)argv;

    printf("Enter file name: ");
    // The field width must stay below sizeof fileName (MAX_STR_LEN).
    if (scanf("%2047s", fileName) != 1) {
        fprintf(stderr, "No file name entered.\n");
        return EXIT_FAILURE;
    }

    if ((p = fopen(fileName, "r")) == NULL) {
        printf("File not found.");
        return EXIT_FAILURE;
    }

    // Copy the text, stopping at '$', at end of file, or when line is full.
    while ((c = getc(p)) != EOF && c != '$' && len < sizeof line - 1) {
        line[len++] = (char)c;
    }
    line[len] = '\0';

    // The loop ended on an ordinary character only if the buffer filled up.
    if (c != EOF && c != '$') {
        fprintf(stderr, "Warning: input truncated to %lu characters.\n",
                (unsigned long)len);
    }
    fclose(p);

    // Created only after the file is open so early exits leak nothing.
    head = newListNode("");
    if (head == NULL) {
        return EXIT_FAILURE;
    }

    for (s = strtok(line, PUNCT); s != NULL; s = strtok(NULL, PUNCT)) {
        insertWord(head, s);
    }
    printf("Lexicographical order: ");
    printList(head);
    deleteList(head);

    return 0;
}


经过进一步检查,您似乎已经在使用strtok了。真正的问题是:“用C实现字符串列表的最佳方式是什么?”

这是Microsoft/Amazon面试类型的问题:我将为您编写一段伪代码,供您用 C 实现:

    define a struct like:

    struct node{
       int count;
       char * word;
       struct node *next;
   }Node;

   open the file for read;
   for each line in the file do:
         split the line, in order to have each word separately
         for each word in the line do:
            check if the word already exists in the list
            if not, create a new node
                   node->word = word
                   node->count = 1
            else:
                   node->count += 1

   sort the list by the node->word param

具体实现留作练习!不过,如果你能使用 map,事情会简单得多。

我的单词计数功能已经可用,所以我只是修改了printList函数中的打印语句:

printf("%15s (%2d)\n",p->word, p->wordCount);
这样,它就能按照字典顺序打印出所有的单词,并显示每个单词出现的次数:

Enter file name: hw5-input
Lexicographical order:
              a ( 5)
          about ( 2)
        account ( 1)
            ago ( 1)
            all ( 1)
         almost ( 1)
             an ( 1)
            and ( 7)
             as ( 2)
           ball ( 1)
         before ( 1)
       bringing ( 1)
            but ( 1)
           call ( 1)
            can ( 1)
           cato ( 1)
        cherish ( 1)
    circulation ( 1)
         coffin ( 1)
           damp ( 1)
         degree ( 1)
   deliberately ( 1)
        driving ( 1)
        drizzly ( 1)
     especially ( 1)
          every ( 1)
       feelings ( 1)
           find ( 2)
       flourish ( 1)
            for ( 1)
           from ( 1)
        funeral ( 1)
            get ( 2)
           grim ( 1)
        growing ( 1)
           hand ( 1)
           hats ( 1)
           have ( 1)
         having ( 1)
           high ( 1)
        himself ( 1)
            his ( 1)
            how ( 1)
          hypos ( 1)
              i ( 9)
             if ( 1)
             in ( 4)
       interest ( 1)
           into ( 1)
  involuntarily ( 1)
             is ( 4)
        ishmael ( 1)
             it ( 5)
           knew ( 1)
       knocking ( 1)
         little ( 2)
           long ( 1)
             me ( 5)
           meet ( 1)
            men ( 1)
   methodically ( 1)
           mind ( 1)
          money ( 1)
          moral ( 1)
          mouth ( 1)
             my ( 4)
         myself ( 2)
         nearly ( 1)
          never ( 1)
             no ( 1)
        nothing ( 2)
       november ( 1)
          ocean ( 1)
             of ( 4)
            off ( 2)
             on ( 1)
             or ( 2)
          other ( 1)
           part ( 1)
     particular ( 1)
        pausing ( 1)
       people's ( 1)
  philosophical ( 1)
         pistol ( 1)
      precisely ( 1)
        prevent ( 1)
      principle ( 1)
          purse ( 1)
        quietly ( 1)
           rear ( 1)
     regulating ( 1)
       requires ( 1)
           sail ( 1)
           same ( 1)
            sea ( 1)
            see ( 1)
           ship ( 1)
          shore ( 1)
           some ( 2)
           soon ( 1)
           soul ( 1)
         spleen ( 1)
       stepping ( 1)
         street ( 1)
         strong ( 1)
     substitute ( 1)
           such ( 1)
     surprising ( 1)
          sword ( 1)
           take ( 1)
           that ( 1)
            the (10)
          their ( 1)
           then ( 1)
          there ( 1)
           they ( 1)
           this ( 2)
        thought ( 1)
         throws ( 1)
           time ( 2)
             to ( 5)
        towards ( 1)
             up ( 1)
           upon ( 1)
          upper ( 1)
           very ( 1)
     warehouses ( 1)
         watery ( 1)
            way ( 1)
       whenever ( 4)
           with ( 2)
          world ( 1)
          would ( 1)
          years ( 1)

不过,仍然需要在交换单词之后重新打印输入文件。有人有解决方案吗?

在谷歌搜索时,你应该寻找的关键词是“tokeniser”或“tokenizer”。几天前有人在做这件事,他们的代码可能是可用的。出于好奇:你是否被限制只能使用C?我可能会使用一些shell脚本。教授推荐使用字符串标记器(string tokenizer)来完成这样的任务。
printf("%15s (%2d)\n",p->word, p->wordCount);
Enter file name: hw5-input
Lexicographical order:
              a ( 5)
          about ( 2)
        account ( 1)
            ago ( 1)
            all ( 1)
         almost ( 1)
             an ( 1)
            and ( 7)
             as ( 2)
           ball ( 1)
         before ( 1)
       bringing ( 1)
            but ( 1)
           call ( 1)
            can ( 1)
           cato ( 1)
        cherish ( 1)
    circulation ( 1)
         coffin ( 1)
           damp ( 1)
         degree ( 1)
   deliberately ( 1)
        driving ( 1)
        drizzly ( 1)
     especially ( 1)
          every ( 1)
       feelings ( 1)
           find ( 2)
       flourish ( 1)
            for ( 1)
           from ( 1)
        funeral ( 1)
            get ( 2)
           grim ( 1)
        growing ( 1)
           hand ( 1)
           hats ( 1)
           have ( 1)
         having ( 1)
           high ( 1)
        himself ( 1)
            his ( 1)
            how ( 1)
          hypos ( 1)
              i ( 9)
             if ( 1)
             in ( 4)
       interest ( 1)
           into ( 1)
  involuntarily ( 1)
             is ( 4)
        ishmael ( 1)
             it ( 5)
           knew ( 1)
       knocking ( 1)
         little ( 2)
           long ( 1)
             me ( 5)
           meet ( 1)
            men ( 1)
   methodically ( 1)
           mind ( 1)
          money ( 1)
          moral ( 1)
          mouth ( 1)
             my ( 4)
         myself ( 2)
         nearly ( 1)
          never ( 1)
             no ( 1)
        nothing ( 2)
       november ( 1)
          ocean ( 1)
             of ( 4)
            off ( 2)
             on ( 1)
             or ( 2)
          other ( 1)
           part ( 1)
     particular ( 1)
        pausing ( 1)
       people's ( 1)
  philosophical ( 1)
         pistol ( 1)
      precisely ( 1)
        prevent ( 1)
      principle ( 1)
          purse ( 1)
        quietly ( 1)
           rear ( 1)
     regulating ( 1)
       requires ( 1)
           sail ( 1)
           same ( 1)
            sea ( 1)
            see ( 1)
           ship ( 1)
          shore ( 1)
           some ( 2)
           soon ( 1)
           soul ( 1)
         spleen ( 1)
       stepping ( 1)
         street ( 1)
         strong ( 1)
     substitute ( 1)
           such ( 1)
     surprising ( 1)
          sword ( 1)
           take ( 1)
           that ( 1)
            the (10)
          their ( 1)
           then ( 1)
          there ( 1)
           they ( 1)
           this ( 2)
        thought ( 1)
         throws ( 1)
           time ( 2)
             to ( 5)
        towards ( 1)
             up ( 1)
           upon ( 1)
          upper ( 1)
           very ( 1)
     warehouses ( 1)
         watery ( 1)
            way ( 1)
       whenever ( 4)
           with ( 2)
          world ( 1)
          would ( 1)
          years ( 1)