2012-08-06 72 views
0

我任務是創造一個詞頻分析程序讀取從文本文件的內容,併產生下列示例輸出:C:鏈表詞頻 - 排序

SUMMARY: 

27340 words 
2572 unique words 

WORD FREQUENCIES (TOP 10): 

the 1644 
and 872 
to 729 
a 632 
it 595 
she 553 
i 545 
of 514 
said 462 
you 411 

我試圖創建一個程序實現這樣的產出。我對C編程非常陌生,儘管它在某種程度上有效,但可能存在很多效率問題/缺陷。這是我寫到目前爲止:

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <ctype.h> 
#define MAX_WORD 32 
#define MAX_TEXT_LENGTH 10000 

// =========================================== 
//     STRUCTURE 
//============================================ 


typedef struct word { 
char *str;    /* Stores the word */ 
int freq;    /* Stores the frequency */ 
struct word *pNext;  /* Pointer to the next word counter in the list */ 
} Word; 

// =========================================== 
//    FUNCTION PROTOTYPES 
//============================================ 

int getNextWord(FILE *fp, char *buf, int bufsize); /* Given function to get words */ 
void addWord(char *pWord);       /* Adds a word to the list or updates exisiting word */ 
void show(Word *pWordcounter);  /* Outputs a word and its count of occurrences */ 
Word* createWordCounter(char *word); /* Creates a new WordCounter structure */ 

// =========================================== 
//    GLOBAL VARIABLES 
//============================================ 

Word *pStart = NULL;     /* Pointer to first word counter in the list */ 

int totalcount = 0;     /* Total amount of words */ 
int uniquecount = 0;    /* Amount of unique words */ 



// =========================================== 
//     MAIN 
//============================================  


int main() { 

    /* File pointer */ 
    FILE * fp; 
    /* Read text from here */ 
    fp = fopen("./test.txt","r"); 

    /* buf to hold the words */ 
    char buf[MAX_WORD]; 

    /* Size */ 
    int size = MAX_TEXT_LENGTH; 


    /* Pointer to Word counter */ 
    Word *pCounter = NULL; 


    /* Read all words from text file */ 

    while (getNextWord(fp, buf, size)) { 

     /* Add the word to the list */ 
     addWord(buf); 

     /* Increment the total words counter */ 
     totalcount++; 
    } 


    /* Loop through list and figure out the number of unique words */ 
    pCounter = pStart; 
    while(pCounter != NULL) 
    { 
     uniquecount++; 
     pCounter = pCounter->pNext; 
    } 

    /* Print Summary */ 

    printf("\nSUMMARY:\n\n"); 
    printf(" %d words\n", totalcount); /* Print total words */ 
    printf(" %d unique words\n", uniquecount); /* Print unique words */ 




    /* List the words and their counts */ 
    pCounter = pStart; 
    while(pCounter != NULL) 
    { 
     show(pCounter); 
     pCounter = pCounter->pNext; 
    } 
    printf("\n"); 


    /* Free the allocated memory*/ 
    pCounter = pStart; 
    while(pCounter != NULL) 
    { 
     free(pCounter->str);   
     pStart = pCounter;   
     pCounter = pCounter->pNext; 
     free(pStart);     
    } 

    /* Close file */ 
    fclose(fp); 

    return 0; 

} 


// =========================================== 
//     FUNCTIONS 
//============================================ 


void show(Word *pWordcounter) 
{ 
    /* output the word and it's count */ 
    printf("\n%-30s %5d", pWordcounter->str,pWordcounter->freq); 

} 

void addWord(char *word) 
{ 
    Word *pCounter = NULL; 
    Word *pLast = NULL; 

    if(pStart == NULL) 
    { 
    pStart = createWordCounter(word); 
    return; 
    } 

    /* If the word is in the list, increment its count */ 
    pCounter = pStart; 
    while(pCounter != NULL) 
    { 
    if(strcmp(word, pCounter->str) == 0) 
    { 
     ++pCounter->freq; 

     return; 
    } 
    pLast = pCounter;    
    pCounter = pCounter->pNext; 
    } 

    /* Word is not in the list, add it */ 
    pLast->pNext = createWordCounter(word); 
} 

Word* createWordCounter(char *word) 
{ 
    Word *pCounter = NULL; 
    pCounter = (Word*)malloc(sizeof(Word)); 
    pCounter->str = (char*)malloc(strlen(word)+1); 
    strcpy(pCounter->str, word); 
    pCounter->freq = 1; 
    pCounter->pNext = NULL; 
    return pCounter; 
} 

int getNextWord(FILE *fp, char *buf, int bufsize) { 
    char *p = buf; 
    char c; 


    //skip all non-word characters 
    do { 
     c = fgetc(fp); 
     if (c == EOF) 
      return 0; 
     } while (!isalpha(c)); 

    //read word chars 

    do { 
     if (p - buf < bufsize - 1) 
     *p++ = tolower(c); 
     c = fgetc(fp); 
     } while (isalpha(c)); 

     //finalize word 
     *p = '\0'; 
     return 1; 
     } 

它正確顯示摘要。單詞和獨特單詞的數量是完全正確的。然後列出文件中找到的每個唯一字,並顯示正確的出現次數。

我現在需要做的事情(以及我遇到的很多麻煩)是按降序排列我的鏈接列表的出現次數。最重要的是,它應該只顯示前10個單詞,而不是全部(這應該是可行的,一旦我有鏈接列表排序)。

我知道代碼本身現在效率很低,但我現在主要關心的是獲取正確的輸出。

如果任何人都可以幫助我排序算法,或至少指出我在正確的方向,將不勝感激。

謝謝。

回答

3

對於初學C語言的程序員來說,這個想法可能有點雄心勃勃,但是瞭解標準庫中的函數總是一個好主意。如果您知道鏈表的大小,可以使用malloc爲保存相同數據的數組分配空間。然後您可以使用qsort爲您排序數據。

函數mallocqsort是標準C庫經常使用的成員。

+0

好的,我會試試看。謝謝。 – user1300922 2012-08-06 04:29:06

2

不要對鏈表進行排序,這是非常低效且容易出錯的。將相關數據複製到數組中並使用qsort方法。

當你想讓你的算法更高效時,我建議你使用trie

+0

我明白了,謝謝。 – user1300922 2012-08-06 04:28:47

+3

實際上,在鏈表上插入排序非常簡單。 – 2012-08-07 16:05:04