word frequency analysis c program code example
Example: word frequency analysis c program
#include <stdbool.h>#include <stdio.h>#include <glib.h> typedef struct word_count_tag { const char* word; size_t count;} word_count; int compare_word_count(const void* p1, const void* p2) { const word_count* w1 = p1; const word_count* w2 = p2; if (w1->count > w2->count) return -1; if (w1->count < w2->count) return 1; return 0;} bool get_top_words(const char* filename, size_t count) { GError* error = NULL; GMappedFile* mapped_file = g_mapped_file_new(filename, FALSE, &error); if (mapped_file == NULL) { fprintf(stderr, "%s\n", error->message); g_error_free(error); return false; } const char* text = g_mapped_file_get_contents(mapped_file); if (text == NULL) { fprintf(stderr, "File %s is empty\n", filename); g_mapped_file_unref(mapped_file); return false; } gsize file_size = g_mapped_file_get_length(mapped_file); // Store word counts in a hash table GHashTable* ht = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free); GRegex* regex = g_regex_new("\\w+", 0, 0, NULL); GMatchInfo* match_info; g_regex_match_full(regex, text, file_size, 0, 0, &match_info, NULL); while (g_match_info_matches(match_info)) { char* word = g_match_info_fetch(match_info, 0); char* lower = g_utf8_strdown(word, -1); g_free(word); size_t* count = g_hash_table_lookup(ht, lower); if (count != NULL) { ++*count; g_free(lower); } else { count = g_new(size_t, 1); *count = 1; g_hash_table_insert(ht, lower, count); } g_match_info_next(match_info, NULL); } g_match_info_free(match_info); g_regex_unref(regex); g_mapped_file_unref(mapped_file); // Sort words in decreasing order of frequency size_t size = g_hash_table_size(ht); word_count* words = g_new(word_count, size); GHashTableIter iter; gpointer key, value; g_hash_table_iter_init(&iter, ht); for (size_t i = 0; g_hash_table_iter_next(&iter, &key, &value); ++i) { words[i].word = key; words[i].count = *(size_t*)value; } qsort(words, size, sizeof(word_count), compare_word_count); // Print the most common words if (count > size) count = size; printf("Top %lu words\n", count); printf("Rank\tCount\tWord\n"); for (size_t i = 0; i < count; ++i) printf("%lu\t%lu\t%s\n", i + 1, words[i].count, words[i].word); g_free(words); g_hash_table_destroy(ht); return true;} int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "usage: %s file\n", argv[0]); return EXIT_FAILURE; } if (!get_top_words(argv[1], 10)) return EXIT_FAILURE; return EXIT_SUCCESS;}