Communities

Writing
Writing
Codidact Meta
Codidact Meta
The Great Outdoors
The Great Outdoors
Photography & Video
Photography & Video
Scientific Speculation
Scientific Speculation
Cooking
Cooking
Electrical Engineering
Electrical Engineering
Judaism
Judaism
Languages & Linguistics
Languages & Linguistics
Software Development
Software Development
Mathematics
Mathematics
Christianity
Christianity
Code Golf
Code Golf
Music
Music
Physics
Physics
Linux Systems
Linux Systems
Power Users
Power Users
Tabletop RPGs
Tabletop RPGs
Community Proposals
Community Proposals
tag:snake search within a tag
answers:0 unanswered questions
user:xxxx search by author id
score:0.5 posts with 0.5+ score
"snake oil" exact phrase
votes:4 posts with 4+ votes
created:<1w created < 1 week ago
post_type:xxxx type of post
Search help
Notifications
Mark all as read See all your notifications »
Code Reviews

Welcome to Software Development on Codidact!

Will you help us build our independent community of developers helping developers? We're small and trying to grow. We welcome questions about all aspects of software development, from design to code to QA and more. Got questions? Got answers? Got code you'd like someone to review? Please join us.

Post History

66%
+2 −0
Code Reviews Trie Implementation, Graph Visualization and Auto-Completion in C

Given a list of strings (say a text file containing some C symbols: c-symbols.txt), the program can: Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a specific ...

1 answer  ·  posted 9mo ago by Melkor-1‭  ·  last activity 9mo ago by Lundin‭

#3: Post edited by user avatar Alexei‭ · 2024-02-19T07:20:01Z (9 months ago)
replaced Github image URL with raw URL to allow preview to work
  • Given a list of strings (say a text file containing some C symbols: [c-symbols.txt](https://github.com/Melkor-1/Trie/blob/main/c-symbols.txt)), the program can:
  • 1) Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a
  • specific prefix subtree) with Graphviz dot:
  • ```bash
  • » time ./trie --prefix at --svg c-symbols.txt
  • ./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total
  • ```
  • Output Image:
  • [![Output Graph][1]][1]
  • (The green ones indicate terminal nodes.)
  • 2) Suggest completions for a given prefix:
  • ```bash
  • » time ./trie -c at c-symbols.txt
  • at_quick_exit
  • atan
  • atan2
  • atan2d
  • atan2d128
  • atan2d32
  • atan2d64
  • atan2f
  • atan2l
  • atan2pi
  • atan2pid
  • atan2pid128
  • atan2pid32
  • atan2pid64
  • atan2pif
  • [94 more results...]
  • ./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total
  • ```
  • With `-s/--svg`, the transient file `graph.dot` can be saved by specifying the
  • `-k/--keep` flag.
  • ```bash
  • » cat graph.dot
  • digraph Trie {
  • node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
  • Node_167 [label=at]
  • Node_248 [label=A]
  • Node_0 -> Node_248 [label=A]
  • Node_63 [label=_]
  • Node_0 -> Node_63 [label=_]
  • Node_1 [label=a]
  • Node_0 -> Node_1 [label=a]
  • Node_2 [label=b]
  • Node_1 -> Node_2 [label=b]
  • Node_17 [label=c]
  • Node_1 -> Node_17 [label=c]
  • Node_53 [label=d]
  • ...
  • ...
  • Node_448 -> Node_449 [label=c]
  • Node_450 [label=i]
  • Node_449 -> Node_450 [label=i]
  • Node_451 [label=t,fillcolor=lightgreen]
  • Node_450 -> Node_451 [label=t]
  • }
  • ```
  • ## Code:
  • trie.c:
  • ```c
  • #undef _POSIX_C_SOURCE
  • #undef _XOPEN_SOURCE
  • #define _POSIX_C_SOURCE 200819L
  • #define _XOPEN_SOURCE 700
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <stdbool.h>
  • #include <string.h>
  • #include <stdint.h>
  • #include <errno.h>
  • #include <inttypes.h>
  • #include <getopt.h>
  • #define PROGRAM_NAME "auto-complete"
  • #define OUTPUT_DOT_FILE "graph.dot"
  • #define AC_BUFFER_CAP 1024
  • struct flags {
  • bool kflag; /* Keep the transient .DOT file. */
  • bool hflag; /* Help message. */
  • bool sflag; /* Generate a .SVG file. */
  • bool cflag; /* Suggest autocompletions. */
  • bool pflag; /* Prefix for the .DOT file. */
  • };
  • #ifdef DEBUG
  • #include "size.h"
  • #define debug_printf(fmt, ...) \
  • fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
  • __func__, __VA_ARGS__)
  • #define D(x) x
  • #else
  • #define D(x) (void) 0
  • #endif
  • static void help(FILE *sink)
  • {
  • fprintf(sink, "\nUSAGE\n"
  • "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
  • "DESCRIPTION\n"
  • "\t" PROGRAM_NAME
  • " is a program for auto-completion and graph visualization.\n\n"
  • "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
  • "\t-h, --help\t\tDisplay this help message and exit.\n"
  • "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
  • "\t-c, --complete PREFIX Suggest autocompletions for prefix.\n"
  • "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
  • exit(EXIT_SUCCESS);
  • }
  • static void usage_err(const char *prog_name)
  • {
  • fprintf(stderr, "The syntax of the command is incorrect.\n"
  • "Try %s -h for more information.\n", prog_name);
  • exit(EXIT_FAILURE);
  • }
  • static void parse_options(const struct option *restrict long_options,
  • struct flags *restrict opt_ptr, int argc, char *const argv[],
  • const char **restrict prefix)
  • {
  • int c;
  • int err_flag = 0;
  • while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
  • switch (c) {
  • case 'k':
  • ++err_flag;
  • opt_ptr->kflag = true;
  • break;
  • case 'h':
  • help(stdout);
  • break;
  • case 's':
  • if (opt_ptr->cflag) {
  • fprintf(stderr,
  • "Error: -s/--svg specified after -c/--complete.\n");
  • usage_err(argv[0]);
  • }
  • if (opt_ptr->pflag) {
  • --err_flag;
  • }
  • if (opt_ptr->kflag) {
  • --err_flag;
  • }
  • opt_ptr->sflag = true;
  • break;
  • case 'c':
  • if (opt_ptr->sflag) {
  • fprintf(stderr,
  • "Error: -c/--complete specified after -s/--svg.\n");
  • usage_err(argv[0]);
  • }
  • *prefix = optarg;
  • if (strlen(*prefix) >= AC_BUFFER_CAP) {
  • fprintf(stderr, "Error: PREFIX too long.\n");
  • exit(EXIT_FAILURE);
  • }
  • opt_ptr->cflag = true;
  • break;
  • case 'p':
  • if (!opt_ptr->sflag) {
  • err_flag = true;
  • }
  • *prefix = optarg;
  • opt_ptr->pflag = true;
  • break;
  • /* case '?' */
  • default:
  • usage_err(argv[0]);
  • }
  • }
  • /* If the -p or -k flag was specified without -s: */
  • /* Note: GNU provides an extension for optional arguments, which
  • * can be used to provide an optional prefix for -s. */
  • if (err_flag) {
  • if (opt_ptr->pflag) {
  • fputs("Error: -p specified without -s.\n", stderr);
  • }
  • if (opt_ptr->kflag) {
  • fputs("Error: -k specified without -s.\n", stderr);
  • }
  • usage_err(argv[0]);
  • }
  • }
  • /* Instead of using the whole ASCII charset, we'd only use the set of printable
  • * characters. This cuts down the struct size from 2056 bytes to 768 bytes if
  • * using 64-bit integers, or from 1028 bytes to 384 bytes if using 32-bit integers.
  • * (or so pahole outputs, there might be a slight difference in the size across
  • * different platforms).
  • */
  • #define CHILDREN_COUNT 95
  • /* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
  • #define ASCII_OFFSET ' ' /* ASCII code: 32, start of printable characters. */
  • /* This wastes half the indexing range. Perhaps we can use uint32_t instead of
  • * int32_t, and use UINT32_MAX as the sentinel value.
  • */
  • #define INVALID_OFFSET -1
  • #define INITIAL_POOL_CAP 1024
  • typedef struct Node {
  • /* TODO: This still wastes a lot of memory. Something like Pythons's dict
  • * would be better. Explore the idea of using a dynamically growing
  • * hash-table for Node.children.
  • */
  • /* We shall store indices within the `Trie.pool` array instead of storing
  • * huge word size pointers. This also simplifies serializing and
  • * deserializing the structure.
  • *
  • * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
  • */
  • int32_t children[CHILDREN_COUNT];
  • bool terminal;
  • } Node;
  • static struct {
  • Node *pool;
  • int32_t count;
  • int32_t capacity;
  • } Trie;
  • static bool init_pool(void)
  • {
  • Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);
  • if (!Trie.pool) {
  • perror("malloc()");
  • return false;
  • }
  • Trie.capacity = INITIAL_POOL_CAP;
  • return true;
  • }
  • static inline void free_pool(void)
  • {
  • free(Trie.pool);
  • }
  • static int32_t alloc_node(void)
  • {
  • if (Trie.count >= Trie.capacity) {
  • const int32_t remaining = INT32_MAX - Trie.capacity;
  • /* We can no longer add more nodes. Bail out. */
  • if (!remaining) {
  • /* Is this helpful? */
  • fputs("Error: too many nodes. Consider recompiling the program"
  • "with a greater int width for more indexing range.\n", stderr);
  • exit(EXIT_FAILURE);
  • }
  • Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
  • void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);
  • if (!tmp) {
  • perror("realloc()");
  • return INVALID_OFFSET;
  • }
  • Trie.pool = tmp;
  • }
  • Node *const new = &Trie.pool[Trie.count];
  • memset(new->children, INVALID_OFFSET, sizeof new->children);
  • Trie.pool[Trie.count].terminal = false;
  • return Trie.count++;
  • }
  • static bool insert_text(int32_t root_idx, const char *text)
  • {
  • while (*text) {
  • const int32_t idx = *text - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
  • const int32_t new = alloc_node();
  • if (new == INVALID_OFFSET) {
  • return false;
  • }
  • Trie.pool[root_idx].children[idx] = new;
  • }
  • root_idx = Trie.pool[root_idx].children[idx];
  • ++text;
  • }
  • return Trie.pool[root_idx].terminal = true;
  • }
  • static char *read_file(FILE *fp)
  • {
  • static const size_t page_size = 4096;
  • char *content = NULL;
  • size_t len = 0;
  • for (size_t rcount = 1; rcount; len += rcount) {
  • void *const tmp = realloc(content, len + page_size);
  • if (!tmp) {
  • free(content);
  • content = NULL;
  • /* ENOMEM is not a C standard error code, yet quite common. */
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • content = tmp;
  • rcount = fread(content + len, 1, page_size - 1, fp);
  • if (ferror(fp)) {
  • free(content);
  • content = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • }
  • content[len] = '\0';
  • return content;
  • }
  • static size_t split_lines(char *restrict s, char ***restrict lines)
  • {
  • const size_t chunk_size = 1024;
  • size_t capacity = 0;
  • size_t line_count = 0;
  • while (s && *s) {
  • if (line_count >= capacity) {
  • char **const new = realloc(*lines,
  • sizeof **lines * (capacity += chunk_size));
  • if (!new) {
  • free(*lines);
  • *lines = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return 0;
  • }
  • *lines = new;
  • }
  • (*lines)[line_count++] = s;
  • s = strchr(s, '\n');
  • if (s) {
  • *s++ = '\0';
  • }
  • }
  • return line_count;
  • }
  • static char ac_buffer[AC_BUFFER_CAP];
  • static size_t ac_buffer_sz;
  • static void ac_buffer_push(char ch)
  • {
  • /* No check here, for we would have exited in parse_options() if the length
  • * of the prefix was greater than AC_BUFFER_CAP.
  • */
  • ac_buffer[ac_buffer_sz++] = ch;
  • }
  • static void ac_buffer_pop(void)
  • {
  • if (ac_buffer_sz > 0) {
  • ac_buffer_sz--;
  • }
  • }
  • static int32_t find_prefix(int32_t root_idx, const char *prefix)
  • {
  • while (*prefix) {
  • const int32_t child_idx = *prefix - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
  • return INVALID_OFFSET;
  • }
  • root_idx = Trie.pool[root_idx].children[child_idx];
  • ac_buffer_push(*prefix);
  • ++prefix;
  • }
  • return root_idx;
  • }
  • static void dump_dot_prefix(FILE *sink, int32_t root_idx)
  • {
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[root_idx].children[i];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (i + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
  • (char) (i + ASCII_OFFSET));
  • }
  • fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • root_idx, child_index, (char) (i + ASCII_OFFSET));
  • dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
  • }
  • }
  • }
  • static void dump_dot_whole(FILE *sink)
  • {
  • for (int32_t i = 0; i < Trie.count; ++i) {
  • const int32_t index = i;
  • for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
  • if (Trie.pool[i].children[j] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[i].children[j];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • }
  • fprintf(sink,
  • "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • index, child_index, (char) (j + ASCII_OFFSET));
  • }
  • }
  • }
  • }
  • static void print_suggestions(int32_t root_idx)
  • {
  • if (Trie.pool[root_idx].terminal) {
  • fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
  • fputc('\n', stdout);
  • }
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • ac_buffer_push((char) (i + ASCII_OFFSET));
  • print_suggestions(Trie.pool[root_idx].children[i]);
  • ac_buffer_pop();
  • }
  • }
  • }
  • static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
  • {
  • for (size_t i = 0; i < num_lines; ++i) {
  • if (!insert_text(root_idx, lines[i])) {
  • return false;
  • }
  • }
  • return true;
  • }
  • static bool generate_graph(void)
  • {
  • if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
  • fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
  • strerror(errno));
  • return false;
  • }
  • return true;
  • }
  • static bool generate_dot(int32_t root_idx, const char *prefix)
  • {
  • FILE *const sink = fopen("graph.dot", "w");
  • if (!sink) {
  • perror("fopen()");
  • return false;
  • }
  • fprintf(sink, "digraph Trie {\n"
  • "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
  • "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");
  • root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
  • fprintf(sink, "}\n");
  • fclose(sink);
  • return true;
  • }
  • static bool process_args(int32_t root_idx,
  • const struct flags *restrict options, const char *restrict prefix,
  • const char *restrict out_file)
  • {
  • bool rv = true;
  • /* Handle --complete first as we are going to modify root later in case
  • * prefix is not a null pointer.
  • */
  • if (options->cflag) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • print_suggestions(subtree_idx);
  • }
  • if (options->sflag) {
  • if (prefix) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • root_idx = subtree_idx;
  • }
  • /* At this point, we can free the file, lines and Trie.pool, but we'd have
  • * to change the structure of the code. */
  • if (!generate_dot(root_idx, prefix)
  • || !generate_graph()) {
  • rv = !rv;
  • }
  • }
  • cleanup:
  • if (!options->kflag) {
  • remove(out_file);
  • }
  • return rv;
  • }
  • int main(int argc, char *argv[])
  • {
  • /* Sanity check. POSIX requires the invoking process to pass a non-NULL
  • * argv[0].
  • */
  • if (!argv[0]) {
  • fprintf(stderr,
  • "A NULL argv[0] was passed through an exec system call.\n");
  • return EXIT_FAILURE;
  • }
  • static const struct option long_options[] = {
  • { "keep", no_argument, NULL, 'k' },
  • { "help", no_argument, NULL, 'h' },
  • { "svg", no_argument, NULL, 's' },
  • { "complete", required_argument, NULL, 'c' },
  • { "prefix", required_argument, NULL, 'p' },
  • { NULL, 0, NULL, 0 },
  • };
  • FILE *in_file = stdin;
  • struct flags options = { false, false, false, false, false };
  • const char *search_prefix = NULL;
  • parse_options(long_options, &options, argc, argv, &search_prefix);
  • if (options.sflag || options.cflag) {
  • if ((optind + 1) == argc) {
  • in_file = fopen(argv[optind], "r");
  • if (!in_file) {
  • perror(argv[optind]);
  • return EXIT_FAILURE;
  • }
  • }
  • } else {
  • usage_err(PROGRAM_NAME);
  • }
  • if (optind > argc) {
  • usage_err(PROGRAM_NAME);
  • }
  • char *const content = read_file(in_file);
  • char **lines = NULL;
  • const size_t num_lines = split_lines(content, &lines);
  • if (!lines) {
  • free(content);
  • perror("fread()");
  • return EXIT_FAILURE;
  • }
  • bool rv = true;
  • int32_t root_idx = INVALID_OFFSET;
  • if (!init_pool()
  • || (root_idx = alloc_node()) == INVALID_OFFSET
  • || !populate_trie(root_idx, lines, num_lines)) {
  • rv = !rv;
  • goto cleanup;
  • }
  • D(
  • char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
  • char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);
  • debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
  • PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
  • "Total memory allocated: %s.\n" "Total memory used: %s.\n",
  • num_lines,
  • Trie.capacity,
  • Trie.count,
  • total,
  • used);
  • free(total);
  • free(used);
  • );
  • rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);
  • cleanup:
  • free_pool(); /* NO-OP if init_pool() returns false. */
  • free(content);
  • free(lines);
  • if (in_file != stdin) {
  • fclose(in_file);
  • }
  • return rv ? EXIT_SUCCESS : EXIT_FAILURE;
  • }
  • ```
  • size.h:
  • ```c
  • #ifndef SIZE_H
  • #define SIZE_H 1
  • #include <stdint.h>
  • char *calculate_size(uint64_t size);
  • #endif /* SIZE_H */
  • ```
  • size.c:
  • ```c
  • #include "size.h"
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <string.h>
  • #include <inttypes.h>
  • #include <stdint.h>
  • #define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))
  • static const char *const sizes[] =
  • { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
  • static const uint64_t exbibytes =
  • 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
  • /* Acknowledgements:
  • * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
  • * (https://stackoverflow.com/q/3898840/20017547)
  • */
  • char *calculate_size(uint64_t size)
  • {
  • char *const result = malloc(32);
  • if (!result) {
  • return NULL;
  • }
  • uint64_t multiplier = exbibytes;
  • for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
  • if (size < multiplier) {
  • continue;
  • }
  • if (size % multiplier == 0) {
  • sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
  • } else {
  • sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
  • sizes[i]);
  • }
  • return result;
  • }
  • return strcpy(result, "0");
  • }
  • #ifdef TEST_MAIN
  • int main(void)
  • {
  • static const uint64_t list[] = {
  • 0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
  • 1024 * 1024 * 1024 + 1024 * 1024 * 400
  • };
  • for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
  • char *const str = calculate_size(list[i]);
  • if (!str) {
  • perror("malloc()");
  • return EXIT_FAILURE;
  • }
  • printf("%18" PRId64 " = %s\n", list[i], str);
  • free(str);
  • }
  • return EXIT_SUCCESS;
  • }
  • #endif
  • ```
  • And here's the makefile for the project:
  • ```make
  • CFLAGS = -std=c17
  • CFLAGS += -no-pie
  • CFLAGS += -ftrapv
  • CFLAGS += -Wall
  • CFLAGS += -Wextra
  • CFLAGS += -Warray-bounds
  • CFLAGS += -Wconversion
  • CFLAGS += -Wmissing-braces
  • CFLAGS += -Wno-parentheses
  • CFLAGS += -Wno-format-truncation
  • CFLAGS += -Wno-type-limits
  • CFLAGS += -Wpedantic
  • CFLAGS += -Wstrict-prototypes
  • CFLAGS += -Wwrite-strings
  • CFLAGS += -Winline
  • CFLAGS += -D_FORTIFY_SOURCE=2
  • BIN := trie
  • INSTALL_PATH := /usr/local/bin
  • SRCS := trie.c
  • ifeq ($(MAKECMDGOALS),debug)
  • SRCS += size.c
  • endif
  • all: CFLAGS += -s -O2
  • all: $(BIN)
  • debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
  • debug: $(BIN)
  • $(BIN): $(SRCS)
  • install: $(BIN)
  • install $< $(INSTALL_PATH)
  • uninstall:
  • rm $(INSTALL_PATH)/$(BIN)
  • clean:
  • $(RM) $(BIN)
  • .PHONY: all debug clean install uninstall
  • .DELETE_ON_ERROR:
  • ```
  • The code was formatted with GNU Indent, compiles without any warnings with Clang,
  • GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports
  • nothing awry under valgrind.
  • Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.
  • You could also clone this github repository: [trie](https://github.com/Melkor-1/Trie)
  • ## Test run:
  • Here are the results of a sample run:
  • ```bash
  • >> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free
  • >> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
  • ./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null 0.19s user 0.57s system 97% cpu 0.775 total
  • >> wc ~/data-set/wordlists/huge.txt
  • 172819 172820 1916146
  • ```
  • ## Review Goals:
  • * Do you see any issues with the design/structure of the program?
  • * Does any part of my code exhibit undefined/implementation-defined behavior?
  • * General coding comments, style, et cetera.
  • Do you have a faster alternative to Graphviz dot to suggest?
  • [1]: https://github.com/Melkor-1/trie/blob/main/graph.dot.svg
  • Given a list of strings (say a text file containing some C symbols: [c-symbols.txt](https://github.com/Melkor-1/Trie/blob/main/c-symbols.txt)), the program can:
  • 1) Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a
  • specific prefix subtree) with Graphviz dot:
  • ```bash
  • » time ./trie --prefix at --svg c-symbols.txt
  • ./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total
  • ```
  • Output Image:
  • [![Output Graph][1]][1]
  • (The green ones indicate terminal nodes.)
  • 2) Suggest completions for a given prefix:
  • ```bash
  • » time ./trie -c at c-symbols.txt
  • at_quick_exit
  • atan
  • atan2
  • atan2d
  • atan2d128
  • atan2d32
  • atan2d64
  • atan2f
  • atan2l
  • atan2pi
  • atan2pid
  • atan2pid128
  • atan2pid32
  • atan2pid64
  • atan2pif
  • [94 more results...]
  • ./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total
  • ```
  • With `-s/--svg`, the transient file `graph.dot` can be saved by specifying the
  • `-k/--keep` flag.
  • ```bash
  • » cat graph.dot
  • digraph Trie {
  • node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
  • Node_167 [label=at]
  • Node_248 [label=A]
  • Node_0 -> Node_248 [label=A]
  • Node_63 [label=_]
  • Node_0 -> Node_63 [label=_]
  • Node_1 [label=a]
  • Node_0 -> Node_1 [label=a]
  • Node_2 [label=b]
  • Node_1 -> Node_2 [label=b]
  • Node_17 [label=c]
  • Node_1 -> Node_17 [label=c]
  • Node_53 [label=d]
  • ...
  • ...
  • Node_448 -> Node_449 [label=c]
  • Node_450 [label=i]
  • Node_449 -> Node_450 [label=i]
  • Node_451 [label=t,fillcolor=lightgreen]
  • Node_450 -> Node_451 [label=t]
  • }
  • ```
  • ## Code:
  • trie.c:
  • ```c
  • #undef _POSIX_C_SOURCE
  • #undef _XOPEN_SOURCE
  • #define _POSIX_C_SOURCE 200819L
  • #define _XOPEN_SOURCE 700
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <stdbool.h>
  • #include <string.h>
  • #include <stdint.h>
  • #include <errno.h>
  • #include <inttypes.h>
  • #include <getopt.h>
  • #define PROGRAM_NAME "auto-complete"
  • #define OUTPUT_DOT_FILE "graph.dot"
  • #define AC_BUFFER_CAP 1024
  • struct flags {
  • bool kflag; /* Keep the transient .DOT file. */
  • bool hflag; /* Help message. */
  • bool sflag; /* Generate a .SVG file. */
  • bool cflag; /* Suggest autocompletions. */
  • bool pflag; /* Prefix for the .DOT file. */
  • };
  • #ifdef DEBUG
  • #include "size.h"
  • #define debug_printf(fmt, ...) \
  • fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
  • __func__, __VA_ARGS__)
  • #define D(x) x
  • #else
  • #define D(x) (void) 0
  • #endif
  • static void help(FILE *sink)
  • {
  • fprintf(sink, "\nUSAGE\n"
  • "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
  • "DESCRIPTION\n"
  • "\t" PROGRAM_NAME
  • " is a program for auto-completion and graph visualization.\n\n"
  • "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
  • "\t-h, --help\t\tDisplay this help message and exit.\n"
  • "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
  • "\t-c, --complete PREFIX Suggest autocompletions for prefix.\n"
  • "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
  • exit(EXIT_SUCCESS);
  • }
  • static void usage_err(const char *prog_name)
  • {
  • fprintf(stderr, "The syntax of the command is incorrect.\n"
  • "Try %s -h for more information.\n", prog_name);
  • exit(EXIT_FAILURE);
  • }
  • static void parse_options(const struct option *restrict long_options,
  • struct flags *restrict opt_ptr, int argc, char *const argv[],
  • const char **restrict prefix)
  • {
  • int c;
  • int err_flag = 0;
  • while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
  • switch (c) {
  • case 'k':
  • ++err_flag;
  • opt_ptr->kflag = true;
  • break;
  • case 'h':
  • help(stdout);
  • break;
  • case 's':
  • if (opt_ptr->cflag) {
  • fprintf(stderr,
  • "Error: -s/--svg specified after -c/--complete.\n");
  • usage_err(argv[0]);
  • }
  • if (opt_ptr->pflag) {
  • --err_flag;
  • }
  • if (opt_ptr->kflag) {
  • --err_flag;
  • }
  • opt_ptr->sflag = true;
  • break;
  • case 'c':
  • if (opt_ptr->sflag) {
  • fprintf(stderr,
  • "Error: -c/--complete specified after -s/--svg.\n");
  • usage_err(argv[0]);
  • }
  • *prefix = optarg;
  • if (strlen(*prefix) >= AC_BUFFER_CAP) {
  • fprintf(stderr, "Error: PREFIX too long.\n");
  • exit(EXIT_FAILURE);
  • }
  • opt_ptr->cflag = true;
  • break;
  • case 'p':
  • if (!opt_ptr->sflag) {
  • err_flag = true;
  • }
  • *prefix = optarg;
  • opt_ptr->pflag = true;
  • break;
  • /* case '?' */
  • default:
  • usage_err(argv[0]);
  • }
  • }
  • /* If the -p or -k flag was specified without -s: */
  • /* Note: GNU provides an extension for optional arguments, which
  • * can be used to provide an optional prefix for -s. */
  • if (err_flag) {
  • if (opt_ptr->pflag) {
  • fputs("Error: -p specified without -s.\n", stderr);
  • }
  • if (opt_ptr->kflag) {
  • fputs("Error: -k specified without -s.\n", stderr);
  • }
  • usage_err(argv[0]);
  • }
  • }
  • /* Instead of using the whole ASCII charset, we'd only use the set of printable
  • * characters. This cuts down the struct size from 2056 bytes to 768 bytes if
  • * using 64-bit integers, or from 1028 bytes to 384 bytes if using 32-bit integers.
  • * (or so pahole outputs, there might be a slight difference in the size across
  • * different platforms).
  • */
  • #define CHILDREN_COUNT 95
  • /* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
  • #define ASCII_OFFSET ' ' /* ASCII code: 32, start of printable characters. */
  • /* This wastes half the indexing range. Perhaps we can use uint32_t instead of
  • * int32_t, and use UINT32_MAX as the sentinel value.
  • */
  • #define INVALID_OFFSET -1
  • #define INITIAL_POOL_CAP 1024
  • typedef struct Node {
  • /* TODO: This still wastes a lot of memory. Something like Pythons's dict
  • * would be better. Explore the idea of using a dynamically growing
  • * hash-table for Node.children.
  • */
  • /* We shall store indices within the `Trie.pool` array instead of storing
  • * huge word size pointers. This also simplifies serializing and
  • * deserializing the structure.
  • *
  • * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
  • */
  • int32_t children[CHILDREN_COUNT];
  • bool terminal;
  • } Node;
  • static struct {
  • Node *pool;
  • int32_t count;
  • int32_t capacity;
  • } Trie;
  • static bool init_pool(void)
  • {
  • Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);
  • if (!Trie.pool) {
  • perror("malloc()");
  • return false;
  • }
  • Trie.capacity = INITIAL_POOL_CAP;
  • return true;
  • }
  • static inline void free_pool(void)
  • {
  • free(Trie.pool);
  • }
  • static int32_t alloc_node(void)
  • {
  • if (Trie.count >= Trie.capacity) {
  • const int32_t remaining = INT32_MAX - Trie.capacity;
  • /* We can no longer add more nodes. Bail out. */
  • if (!remaining) {
  • /* Is this helpful? */
  • fputs("Error: too many nodes. Consider recompiling the program"
  • "with a greater int width for more indexing range.\n", stderr);
  • exit(EXIT_FAILURE);
  • }
  • Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
  • void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);
  • if (!tmp) {
  • perror("realloc()");
  • return INVALID_OFFSET;
  • }
  • Trie.pool = tmp;
  • }
  • Node *const new = &Trie.pool[Trie.count];
  • memset(new->children, INVALID_OFFSET, sizeof new->children);
  • Trie.pool[Trie.count].terminal = false;
  • return Trie.count++;
  • }
  • static bool insert_text(int32_t root_idx, const char *text)
  • {
  • while (*text) {
  • const int32_t idx = *text - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
  • const int32_t new = alloc_node();
  • if (new == INVALID_OFFSET) {
  • return false;
  • }
  • Trie.pool[root_idx].children[idx] = new;
  • }
  • root_idx = Trie.pool[root_idx].children[idx];
  • ++text;
  • }
  • return Trie.pool[root_idx].terminal = true;
  • }
  • static char *read_file(FILE *fp)
  • {
  • static const size_t page_size = 4096;
  • char *content = NULL;
  • size_t len = 0;
  • for (size_t rcount = 1; rcount; len += rcount) {
  • void *const tmp = realloc(content, len + page_size);
  • if (!tmp) {
  • free(content);
  • content = NULL;
  • /* ENOMEM is not a C standard error code, yet quite common. */
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • content = tmp;
  • rcount = fread(content + len, 1, page_size - 1, fp);
  • if (ferror(fp)) {
  • free(content);
  • content = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • }
  • content[len] = '\0';
  • return content;
  • }
  • static size_t split_lines(char *restrict s, char ***restrict lines)
  • {
  • const size_t chunk_size = 1024;
  • size_t capacity = 0;
  • size_t line_count = 0;
  • while (s && *s) {
  • if (line_count >= capacity) {
  • char **const new = realloc(*lines,
  • sizeof **lines * (capacity += chunk_size));
  • if (!new) {
  • free(*lines);
  • *lines = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return 0;
  • }
  • *lines = new;
  • }
  • (*lines)[line_count++] = s;
  • s = strchr(s, '\n');
  • if (s) {
  • *s++ = '\0';
  • }
  • }
  • return line_count;
  • }
  • static char ac_buffer[AC_BUFFER_CAP];
  • static size_t ac_buffer_sz;
  • static void ac_buffer_push(char ch)
  • {
  • /* No check here, for we would have exited in parse_options() if the length
  • * of the prefix was greater than AC_BUFFER_CAP.
  • */
  • ac_buffer[ac_buffer_sz++] = ch;
  • }
  • static void ac_buffer_pop(void)
  • {
  • if (ac_buffer_sz > 0) {
  • ac_buffer_sz--;
  • }
  • }
  • static int32_t find_prefix(int32_t root_idx, const char *prefix)
  • {
  • while (*prefix) {
  • const int32_t child_idx = *prefix - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
  • return INVALID_OFFSET;
  • }
  • root_idx = Trie.pool[root_idx].children[child_idx];
  • ac_buffer_push(*prefix);
  • ++prefix;
  • }
  • return root_idx;
  • }
  • static void dump_dot_prefix(FILE *sink, int32_t root_idx)
  • {
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[root_idx].children[i];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (i + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
  • (char) (i + ASCII_OFFSET));
  • }
  • fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • root_idx, child_index, (char) (i + ASCII_OFFSET));
  • dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
  • }
  • }
  • }
  • static void dump_dot_whole(FILE *sink)
  • {
  • for (int32_t i = 0; i < Trie.count; ++i) {
  • const int32_t index = i;
  • for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
  • if (Trie.pool[i].children[j] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[i].children[j];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • }
  • fprintf(sink,
  • "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • index, child_index, (char) (j + ASCII_OFFSET));
  • }
  • }
  • }
  • }
  • static void print_suggestions(int32_t root_idx)
  • {
  • if (Trie.pool[root_idx].terminal) {
  • fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
  • fputc('\n', stdout);
  • }
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • ac_buffer_push((char) (i + ASCII_OFFSET));
  • print_suggestions(Trie.pool[root_idx].children[i]);
  • ac_buffer_pop();
  • }
  • }
  • }
  • static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
  • {
  • for (size_t i = 0; i < num_lines; ++i) {
  • if (!insert_text(root_idx, lines[i])) {
  • return false;
  • }
  • }
  • return true;
  • }
  • static bool generate_graph(void)
  • {
  • if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
  • fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
  • strerror(errno));
  • return false;
  • }
  • return true;
  • }
  • static bool generate_dot(int32_t root_idx, const char *prefix)
  • {
  • FILE *const sink = fopen("graph.dot", "w");
  • if (!sink) {
  • perror("fopen()");
  • return false;
  • }
  • fprintf(sink, "digraph Trie {\n"
  • "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
  • "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");
  • root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
  • fprintf(sink, "}\n");
  • fclose(sink);
  • return true;
  • }
  • static bool process_args(int32_t root_idx,
  • const struct flags *restrict options, const char *restrict prefix,
  • const char *restrict out_file)
  • {
  • bool rv = true;
  • /* Handle --complete first as we are going to modify root later in case
  • * prefix is not a null pointer.
  • */
  • if (options->cflag) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • print_suggestions(subtree_idx);
  • }
  • if (options->sflag) {
  • if (prefix) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • root_idx = subtree_idx;
  • }
  • /* At this point, we can free the file, lines and Trie.pool, but we'd have
  • * to change the structure of the code. */
  • if (!generate_dot(root_idx, prefix)
  • || !generate_graph()) {
  • rv = !rv;
  • }
  • }
  • cleanup:
  • if (!options->kflag) {
  • remove(out_file);
  • }
  • return rv;
  • }
  • int main(int argc, char *argv[])
  • {
  • /* Sanity check. POSIX requires the invoking process to pass a non-NULL
  • * argv[0].
  • */
  • if (!argv[0]) {
  • fprintf(stderr,
  • "A NULL argv[0] was passed through an exec system call.\n");
  • return EXIT_FAILURE;
  • }
  • static const struct option long_options[] = {
  • { "keep", no_argument, NULL, 'k' },
  • { "help", no_argument, NULL, 'h' },
  • { "svg", no_argument, NULL, 's' },
  • { "complete", required_argument, NULL, 'c' },
  • { "prefix", required_argument, NULL, 'p' },
  • { NULL, 0, NULL, 0 },
  • };
  • FILE *in_file = stdin;
  • struct flags options = { false, false, false, false, false };
  • const char *search_prefix = NULL;
  • parse_options(long_options, &options, argc, argv, &search_prefix);
  • if (options.sflag || options.cflag) {
  • if ((optind + 1) == argc) {
  • in_file = fopen(argv[optind], "r");
  • if (!in_file) {
  • perror(argv[optind]);
  • return EXIT_FAILURE;
  • }
  • }
  • } else {
  • usage_err(PROGRAM_NAME);
  • }
  • if (optind > argc) {
  • usage_err(PROGRAM_NAME);
  • }
  • char *const content = read_file(in_file);
  • char **lines = NULL;
  • const size_t num_lines = split_lines(content, &lines);
  • if (!lines) {
  • free(content);
  • perror("fread()");
  • return EXIT_FAILURE;
  • }
  • bool rv = true;
  • int32_t root_idx = INVALID_OFFSET;
  • if (!init_pool()
  • || (root_idx = alloc_node()) == INVALID_OFFSET
  • || !populate_trie(root_idx, lines, num_lines)) {
  • rv = !rv;
  • goto cleanup;
  • }
  • D(
  • char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
  • char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);
  • debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
  • PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
  • "Total memory allocated: %s.\n" "Total memory used: %s.\n",
  • num_lines,
  • Trie.capacity,
  • Trie.count,
  • total,
  • used);
  • free(total);
  • free(used);
  • );
  • rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);
  • cleanup:
  • free_pool(); /* NO-OP if init_pool() returns false. */
  • free(content);
  • free(lines);
  • if (in_file != stdin) {
  • fclose(in_file);
  • }
  • return rv ? EXIT_SUCCESS : EXIT_FAILURE;
  • }
  • ```
  • size.h:
  • ```c
  • #ifndef SIZE_H
  • #define SIZE_H 1
  • #include <stdint.h>
  • char *calculate_size(uint64_t size);
  • #endif /* SIZE_H */
  • ```
  • size.c:
  • ```c
  • #include "size.h"
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <string.h>
  • #include <inttypes.h>
  • #include <stdint.h>
  • #define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))
  • static const char *const sizes[] =
  • { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
  • static const uint64_t exbibytes =
  • 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
  • /* Acknowledgements:
  • * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
  • * (https://stackoverflow.com/q/3898840/20017547)
  • */
  • char *calculate_size(uint64_t size)
  • {
  • char *const result = malloc(32);
  • if (!result) {
  • return NULL;
  • }
  • uint64_t multiplier = exbibytes;
  • for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
  • if (size < multiplier) {
  • continue;
  • }
  • if (size % multiplier == 0) {
  • sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
  • } else {
  • sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
  • sizes[i]);
  • }
  • return result;
  • }
  • return strcpy(result, "0");
  • }
  • #ifdef TEST_MAIN
  • int main(void)
  • {
  • static const uint64_t list[] = {
  • 0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
  • 1024 * 1024 * 1024 + 1024 * 1024 * 400
  • };
  • for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
  • char *const str = calculate_size(list[i]);
  • if (!str) {
  • perror("malloc()");
  • return EXIT_FAILURE;
  • }
  • printf("%18" PRId64 " = %s\n", list[i], str);
  • free(str);
  • }
  • return EXIT_SUCCESS;
  • }
  • #endif
  • ```
  • And here's the makefile for the project:
  • ```make
  • CFLAGS = -std=c17
  • CFLAGS += -no-pie
  • CFLAGS += -ftrapv
  • CFLAGS += -Wall
  • CFLAGS += -Wextra
  • CFLAGS += -Warray-bounds
  • CFLAGS += -Wconversion
  • CFLAGS += -Wmissing-braces
  • CFLAGS += -Wno-parentheses
  • CFLAGS += -Wno-format-truncation
  • CFLAGS += -Wno-type-limits
  • CFLAGS += -Wpedantic
  • CFLAGS += -Wstrict-prototypes
  • CFLAGS += -Wwrite-strings
  • CFLAGS += -Winline
  • CFLAGS += -D_FORTIFY_SOURCE=2
  • BIN := trie
  • INSTALL_PATH := /usr/local/bin
  • SRCS := trie.c
  • ifeq ($(MAKECMDGOALS),debug)
  • SRCS += size.c
  • endif
  • all: CFLAGS += -s -O2
  • all: $(BIN)
  • debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
  • debug: $(BIN)
  • $(BIN): $(SRCS)
  • install: $(BIN)
  • install $< $(INSTALL_PATH)
  • uninstall:
  • rm $(INSTALL_PATH)/$(BIN)
  • clean:
  • $(RM) $(BIN)
  • .PHONY: all debug clean install uninstall
  • .DELETE_ON_ERROR:
  • ```
  • The code was formatted with GNU Indent, compiles without any warnings with Clang,
  • GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports
  • nothing awry under valgrind.
  • Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.
  • You could also clone this github repository: [trie](https://github.com/Melkor-1/Trie)
  • ## Test run:
  • Here are the results of a sample run:
  • ```bash
  • >> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free
  • >> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
  • ./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null 0.19s user 0.57s system 97% cpu 0.775 total
  • >> wc ~/data-set/wordlists/huge.txt
  • 172819 172820 1916146
  • ```
  • ## Review Goals:
  • * Do you see any issues with the design/structure of the program?
  • * Does any part of my code exhibit undefined/implementation-defined behavior?
  • * General coding comments, style, et cetera.
  • Do you have a faster alternative to Graphviz dot to suggest?
  • [1]: https://raw.githubusercontent.com/Melkor-1/trie/main/graph.dot.svg
#2: Post edited by user avatar Melkor-1‭ · 2024-02-18T15:54:00Z (9 months ago)
  • Given a list of strings (say a text file containing some C symbols: [c-symbols.txt](https://github.com/Melkor-1/Trie/blob/main/c-symbols.txt)), the program can:
  • 1) Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a
  • specific prefix subtree) with Graphviz dot:
  • ```bash
  • » time ./trie --prefix at --svg c-symbols.txt
  • ./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total
  • ```
  • Output Image:
  • [![Output Graph][1]][1]
  • (The green ones indicate terminal nodes.)
  • 2) Suggest completions for a given prefix:
  • ```bash
  • » time ./trie -c at c-symbols.txt
  • at_quick_exit
  • atan
  • atan2
  • atan2d
  • atan2d128
  • atan2d32
  • atan2d64
  • atan2f
  • atan2l
  • atan2pi
  • atan2pid
  • atan2pid128
  • atan2pid32
  • atan2pid64
  • atan2pif
  • [94 more results...]
  • ./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total
  • ```
  • With `-s/--svg`, the transient file `graph.dot` can be saved by specifying the
  • `-k/--keep` flag.
  • ```bash
  • » cat graph.dot
  • digraph Trie {
  • node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
  • Node_167 [label=at]
  • Node_248 [label=A]
  • Node_0 -> Node_248 [label=A]
  • Node_63 [label=_]
  • Node_0 -> Node_63 [label=_]
  • Node_1 [label=a]
  • Node_0 -> Node_1 [label=a]
  • Node_2 [label=b]
  • Node_1 -> Node_2 [label=b]
  • Node_17 [label=c]
  • Node_1 -> Node_17 [label=c]
  • Node_53 [label=d]
  • ...
  • ...
  • Node_448 -> Node_449 [label=c]
  • Node_450 [label=i]
  • Node_449 -> Node_450 [label=i]
  • Node_451 [label=t,fillcolor=lightgreen]
  • Node_450 -> Node_451 [label=t]
  • }
  • ```
  • ## Code:
  • trie.c:
  • ```c
  • #undef _POSIX_C_SOURCE
  • #undef _XOPEN_SOURCE
  • #define _POSIX_C_SOURCE 200819L
  • #define _XOPEN_SOURCE 700
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <stdbool.h>
  • #include <string.h>
  • #include <stdint.h>
  • #include <errno.h>
  • #include <inttypes.h>
  • #include <getopt.h>
  • #define PROGRAM_NAME "auto-complete"
  • #define OUTPUT_DOT_FILE "graph.dot"
  • #define AC_BUFFER_CAP 1024
  • struct flags {
  • bool kflag; /* Keep the transient .DOT file. */
  • bool hflag; /* Help message. */
  • bool sflag; /* Generate a .SVG file. */
  • bool cflag; /* Suggest autocompletions. */
  • bool pflag; /* Prefix for the .DOT file. */
  • };
  • #ifdef DEBUG
  • #include "size.h"
  • #define debug_printf(fmt, ...) \
  • fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
  • __func__, __VA_ARGS__)
  • #define D(x) x
  • #else
  • #define D(x) (void) 0
  • #endif
  • static void help(FILE *sink)
  • {
  • fprintf(sink, "\nUSAGE\n"
  • "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
  • "DESCRIPTION\n"
  • "\t" PROGRAM_NAME
  • " is a program for auto-completion and graph visualization.\n\n"
  • "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
  • "\t-h, --help\t\tDisplay this help message and exit.\n"
  • "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
  • "\t-c, --complete PREFIX Suggest autocompletions for prefix.\n"
  • "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
  • exit(EXIT_SUCCESS);
  • }
  • static void usage_err(const char *prog_name)
  • {
  • fprintf(stderr, "The syntax of the command is incorrect.\n"
  • "Try %s -h for more information.\n", prog_name);
  • exit(EXIT_FAILURE);
  • }
  • static void parse_options(const struct option *restrict long_options,
  • struct flags *restrict opt_ptr, int argc, char *const argv[],
  • const char **restrict prefix)
  • {
  • int c;
  • int err_flag = 0;
  • while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
  • switch (c) {
  • case 'k':
  • ++err_flag;
  • opt_ptr->kflag = true;
  • break;
  • case 'h':
  • help(stdout);
  • break;
  • case 's':
  • if (opt_ptr->cflag) {
  • fprintf(stderr,
  • "Error: -s/--svg specified after -c/--complete.\n");
  • usage_err(argv[0]);
  • }
  • if (opt_ptr->pflag) {
  • --err_flag;
  • }
  • if (opt_ptr->kflag) {
  • --err_flag;
  • }
  • opt_ptr->sflag = true;
  • break;
  • case 'c':
  • if (opt_ptr->sflag) {
  • fprintf(stderr,
  • "Error: -c/--complete specified after -s/--svg.\n");
  • usage_err(argv[0]);
  • }
  • *prefix = optarg;
  • if (strlen(*prefix) >= AC_BUFFER_CAP) {
  • fprintf(stderr, "Error: PREFIX too long.\n");
  • exit(EXIT_FAILURE);
  • }
  • opt_ptr->cflag = true;
  • break;
  • case 'p':
  • if (!opt_ptr->sflag) {
  • err_flag = true;
  • }
  • *prefix = optarg;
  • opt_ptr->pflag = true;
  • break;
  • /* case '?' */
  • default:
  • usage_err(argv[0]);
  • }
  • }
  • /* If the -p or -k flag was specified without -s: */
  • /* Note: GNU provides an extension for optional arguments, which
  • * can be used to provide an optional prefix for -s. */
  • if (err_flag) {
  • if (opt_ptr->pflag) {
  • fputs("Error: -p specified without -s.\n", stderr);
  • }
  • if (opt_ptr->kflag) {
  • fputs("Error: -k specified without -s.\n", stderr);
  • }
  • usage_err(argv[0]);
  • }
  • }
  • /* Instead of using the whole ASCII charset, we'd only use the set of printable
  • * characters. This cuts down the struct size from 2056 bytes to 768 bytes if
  • * using 64-bit pointers, or from 1028 bytes to 384 bytes if using 32-bit integers.
  • * (or so pahole outputs, there might be a slight difference in the size across
  • * different platforms).
  • */
  • #define CHILDREN_COUNT 95
  • /* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
  • #define ASCII_OFFSET ' ' /* ASCII code: 32, start of printable characters. */
  • /* This wastes half the indexing range. Perhaps we can use uint32_t instead of
  • * int32_t, and use UINT32_MAX as the sentinel value.
  • */
  • #define INVALID_OFFSET -1
  • #define INITIAL_POOL_CAP 1024
  • typedef struct Node {
  • /* TODO: This still wastes a lot of memory. Something like Pythons's dict
  • * would be better. Explore the idea of using a dynamically growing
  • * hash-table for Node.children.
  • */
  • /* We shall store indices within the `Trie.pool` array instead of storing
  • * huge word size pointers. This also simplifies serializing and
  • * deserializing the structure.
  • *
  • * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
  • */
  • int32_t children[CHILDREN_COUNT];
  • bool terminal;
  • } Node;
  • static struct {
  • Node *pool;
  • int32_t count;
  • int32_t capacity;
  • } Trie;
  • static bool init_pool(void)
  • {
  • Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);
  • if (!Trie.pool) {
  • perror("malloc()");
  • return false;
  • }
  • Trie.capacity = INITIAL_POOL_CAP;
  • return true;
  • }
  • static inline void free_pool(void)
  • {
  • free(Trie.pool);
  • }
  • static int32_t alloc_node(void)
  • {
  • if (Trie.count >= Trie.capacity) {
  • const int32_t remaining = INT32_MAX - Trie.capacity;
  • /* We can no longer add more nodes. Bail out. */
  • if (!remaining) {
  • /* Is this helpful? */
  • fputs("Error: too many nodes. Consider recompiling the program"
  • "with a greater int width for more indexing range.\n", stderr);
  • exit(EXIT_FAILURE);
  • }
  • Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
  • void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);
  • if (!tmp) {
  • perror("realloc()");
  • return INVALID_OFFSET;
  • }
  • Trie.pool = tmp;
  • }
  • Node *const new = &Trie.pool[Trie.count];
  • memset(new->children, INVALID_OFFSET, sizeof new->children);
  • Trie.pool[Trie.count].terminal = false;
  • return Trie.count++;
  • }
  • static bool insert_text(int32_t root_idx, const char *text)
  • {
  • while (*text) {
  • const int32_t idx = *text - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
  • const int32_t new = alloc_node();
  • if (new == INVALID_OFFSET) {
  • return false;
  • }
  • Trie.pool[root_idx].children[idx] = new;
  • }
  • root_idx = Trie.pool[root_idx].children[idx];
  • ++text;
  • }
  • return Trie.pool[root_idx].terminal = true;
  • }
  • static char *read_file(FILE *fp)
  • {
  • static const size_t page_size = 4096;
  • char *content = NULL;
  • size_t len = 0;
  • for (size_t rcount = 1; rcount; len += rcount) {
  • void *const tmp = realloc(content, len + page_size);
  • if (!tmp) {
  • free(content);
  • content = NULL;
  • /* ENOMEM is not a C standard error code, yet quite common. */
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • content = tmp;
  • rcount = fread(content + len, 1, page_size - 1, fp);
  • if (ferror(fp)) {
  • free(content);
  • content = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • }
  • content[len] = '\0';
  • return content;
  • }
  • static size_t split_lines(char *restrict s, char ***restrict lines)
  • {
  • const size_t chunk_size = 1024;
  • size_t capacity = 0;
  • size_t line_count = 0;
  • while (s && *s) {
  • if (line_count >= capacity) {
  • char **const new = realloc(*lines,
  • sizeof **lines * (capacity += chunk_size));
  • if (!new) {
  • free(*lines);
  • *lines = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return 0;
  • }
  • *lines = new;
  • }
  • (*lines)[line_count++] = s;
  • s = strchr(s, '\n');
  • if (s) {
  • *s++ = '\0';
  • }
  • }
  • return line_count;
  • }
  • static char ac_buffer[AC_BUFFER_CAP];
  • static size_t ac_buffer_sz;
  • static void ac_buffer_push(char ch)
  • {
  • /* No check here, for we would have exited in parse_options() if the length
  • * of the prefix was greater than AC_BUFFER_CAP.
  • */
  • ac_buffer[ac_buffer_sz++] = ch;
  • }
  • static void ac_buffer_pop(void)
  • {
  • if (ac_buffer_sz > 0) {
  • ac_buffer_sz--;
  • }
  • }
  • static int32_t find_prefix(int32_t root_idx, const char *prefix)
  • {
  • while (*prefix) {
  • const int32_t child_idx = *prefix - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
  • return INVALID_OFFSET;
  • }
  • root_idx = Trie.pool[root_idx].children[child_idx];
  • ac_buffer_push(*prefix);
  • ++prefix;
  • }
  • return root_idx;
  • }
  • static void dump_dot_prefix(FILE *sink, int32_t root_idx)
  • {
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[root_idx].children[i];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (i + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
  • (char) (i + ASCII_OFFSET));
  • }
  • fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • root_idx, child_index, (char) (i + ASCII_OFFSET));
  • dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
  • }
  • }
  • }
  • static void dump_dot_whole(FILE *sink)
  • {
  • for (int32_t i = 0; i < Trie.count; ++i) {
  • const int32_t index = i;
  • for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
  • if (Trie.pool[i].children[j] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[i].children[j];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • }
  • fprintf(sink,
  • "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • index, child_index, (char) (j + ASCII_OFFSET));
  • }
  • }
  • }
  • }
  • static void print_suggestions(int32_t root_idx)
  • {
  • if (Trie.pool[root_idx].terminal) {
  • fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
  • fputc('\n', stdout);
  • }
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • ac_buffer_push((char) (i + ASCII_OFFSET));
  • print_suggestions(Trie.pool[root_idx].children[i]);
  • ac_buffer_pop();
  • }
  • }
  • }
  • static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
  • {
  • for (size_t i = 0; i < num_lines; ++i) {
  • if (!insert_text(root_idx, lines[i])) {
  • return false;
  • }
  • }
  • return true;
  • }
  • static bool generate_graph(void)
  • {
  • if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
  • fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
  • strerror(errno));
  • return false;
  • }
  • return true;
  • }
  • static bool generate_dot(int32_t root_idx, const char *prefix)
  • {
  • FILE *const sink = fopen("graph.dot", "w");
  • if (!sink) {
  • perror("fopen()");
  • return false;
  • }
  • fprintf(sink, "digraph Trie {\n"
  • "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
  • "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");
  • root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
  • fprintf(sink, "}\n");
  • fclose(sink);
  • return true;
  • }
  • static bool process_args(int32_t root_idx,
  • const struct flags *restrict options, const char *restrict prefix,
  • const char *restrict out_file)
  • {
  • bool rv = true;
  • /* Handle --complete first as we are going to modify root later in case
  • * prefix is not a null pointer.
  • */
  • if (options->cflag) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • print_suggestions(subtree_idx);
  • }
  • if (options->sflag) {
  • if (prefix) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • root_idx = subtree_idx;
  • }
  • /* At this point, we can free the file, lines and Trie.pool, but we'd have
  • * to change the structure of the code. */
  • if (!generate_dot(root_idx, prefix)
  • || !generate_graph()) {
  • rv = !rv;
  • }
  • }
  • cleanup:
  • if (!options->kflag) {
  • remove(out_file);
  • }
  • return rv;
  • }
  • int main(int argc, char *argv[])
  • {
  • /* Sanity check. POSIX requires the invoking process to pass a non-NULL
  • * argv[0].
  • */
  • if (!argv[0]) {
  • fprintf(stderr,
  • "A NULL argv[0] was passed through an exec system call.\n");
  • return EXIT_FAILURE;
  • }
  • static const struct option long_options[] = {
  • { "keep", no_argument, NULL, 'k' },
  • { "help", no_argument, NULL, 'h' },
  • { "svg", no_argument, NULL, 's' },
  • { "complete", required_argument, NULL, 'c' },
  • { "prefix", required_argument, NULL, 'p' },
  • { NULL, 0, NULL, 0 },
  • };
  • FILE *in_file = stdin;
  • struct flags options = { false, false, false, false, false };
  • const char *search_prefix = NULL;
  • parse_options(long_options, &options, argc, argv, &search_prefix);
  • if (options.sflag || options.cflag) {
  • if ((optind + 1) == argc) {
  • in_file = fopen(argv[optind], "r");
  • if (!in_file) {
  • perror(argv[optind]);
  • return EXIT_FAILURE;
  • }
  • }
  • } else {
  • usage_err(PROGRAM_NAME);
  • }
  • if (optind > argc) {
  • usage_err(PROGRAM_NAME);
  • }
  • char *const content = read_file(in_file);
  • char **lines = NULL;
  • const size_t num_lines = split_lines(content, &lines);
  • if (!lines) {
  • free(content);
  • perror("fread()");
  • return EXIT_FAILURE;
  • }
  • bool rv = true;
  • int32_t root_idx = INVALID_OFFSET;
  • if (!init_pool()
  • || (root_idx = alloc_node()) == INVALID_OFFSET
  • || !populate_trie(root_idx, lines, num_lines)) {
  • rv = !rv;
  • goto cleanup;
  • }
  • D(
  • char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
  • char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);
  • debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
  • PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
  • "Total memory allocated: %s.\n" "Total memory used: %s.\n",
  • num_lines,
  • Trie.capacity,
  • Trie.count,
  • total,
  • used);
  • free(total);
  • free(used);
  • );
  • rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);
  • cleanup:
  • free_pool(); /* NO-OP if init_pool() returns false. */
  • free(content);
  • free(lines);
  • if (in_file != stdin) {
  • fclose(in_file);
  • }
  • return rv ? EXIT_SUCCESS : EXIT_FAILURE;
  • }
  • ```
  • size.h:
  • ```c
  • #ifndef SIZE_H
  • #define SIZE_H 1
  • #include <stdint.h>
  • char *calculate_size(uint64_t size);
  • #endif /* SIZE_H */
  • ```
  • size.c:
  • ```c
  • #include "size.h"
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <string.h>
  • #include <inttypes.h>
  • #include <stdint.h>
  • #define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))
  • static const char *const sizes[] =
  • { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
  • static const uint64_t exbibytes =
  • 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
  • /* Acknowledgements:
  • * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
  • * (https://stackoverflow.com/q/3898840/20017547)
  • */
  • char *calculate_size(uint64_t size)
  • {
  • char *const result = malloc(32);
  • if (!result) {
  • return NULL;
  • }
  • uint64_t multiplier = exbibytes;
  • for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
  • if (size < multiplier) {
  • continue;
  • }
  • if (size % multiplier == 0) {
  • sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
  • } else {
  • sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
  • sizes[i]);
  • }
  • return result;
  • }
  • return strcpy(result, "0");
  • }
  • #ifdef TEST_MAIN
  • int main(void)
  • {
  • static const uint64_t list[] = {
  • 0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
  • 1024 * 1024 * 1024 + 1024 * 1024 * 400
  • };
  • for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
  • char *const str = calculate_size(list[i]);
  • if (!str) {
  • perror("malloc()");
  • return EXIT_FAILURE;
  • }
  • printf("%18" PRId64 " = %s\n", list[i], str);
  • free(str);
  • }
  • return EXIT_SUCCESS;
  • }
  • #endif
  • ```
  • And here's the makefile for the project:
  • ```make
  • CFLAGS = -std=c17
  • CFLAGS += -no-pie
  • CFLAGS += -ftrapv
  • CFLAGS += -Wall
  • CFLAGS += -Wextra
  • CFLAGS += -Warray-bounds
  • CFLAGS += -Wconversion
  • CFLAGS += -Wmissing-braces
  • CFLAGS += -Wno-parentheses
  • CFLAGS += -Wno-format-truncation
  • CFLAGS += -Wno-type-limits
  • CFLAGS += -Wpedantic
  • CFLAGS += -Wstrict-prototypes
  • CFLAGS += -Wwrite-strings
  • CFLAGS += -Winline
  • CFLAGS += -D_FORTIFY_SOURCE=2
  • BIN := trie
  • INSTALL_PATH := /usr/local/bin
  • SRCS := trie.c
  • ifeq ($(MAKECMDGOALS),debug)
  • SRCS += size.c
  • endif
  • all: CFLAGS += -s -O2
  • all: $(BIN)
  • debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
  • debug: $(BIN)
  • $(BIN): $(SRCS)
  • install: $(BIN)
  • install $< $(INSTALL_PATH)
  • uninstall:
  • rm $(INSTALL_PATH)/$(BIN)
  • clean:
  • $(RM) $(BIN)
  • .PHONY: all debug clean install uninstall
  • .DELETE_ON_ERROR:
  • ```
  • The code was formatted with GNU Indent, compiles without any warnings with Clang,
  • GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports
  • nothing awry under valgrind.
  • Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.
  • You could also clone this github repository: [trie](https://github.com/Melkor-1/Trie)
  • ## Test run:
  • Here are the results of a sample run:
  • ```bash
  • >> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free
  • >> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
  • ./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null 0.19s user 0.57s system 97% cpu 0.775 total
  • >> wc ~/data-set/wordlists/huge.txt
  • 172819 172820 1916146
  • ```
  • ## Review Goals:
  • * Do you see any issues with the design/structure of the program?
  • * Does any part of my code exhibit undefined/implementation-defined behavior?
  • * General coding comments, style, et cetera.
  • Do you have a faster alternative to Graphviz dot to suggest?
  • [1]: https://github.com/Melkor-1/trie/blob/main/graph.dot.svg
  • Given a list of strings (say a text file containing some C symbols: [c-symbols.txt](https://github.com/Melkor-1/Trie/blob/main/c-symbols.txt)), the program can:
  • 1) Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a
  • specific prefix subtree) with Graphviz dot:
  • ```bash
  • » time ./trie --prefix at --svg c-symbols.txt
  • ./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total
  • ```
  • Output Image:
  • [![Output Graph][1]][1]
  • (The green ones indicate terminal nodes.)
  • 2) Suggest completions for a given prefix:
  • ```bash
  • » time ./trie -c at c-symbols.txt
  • at_quick_exit
  • atan
  • atan2
  • atan2d
  • atan2d128
  • atan2d32
  • atan2d64
  • atan2f
  • atan2l
  • atan2pi
  • atan2pid
  • atan2pid128
  • atan2pid32
  • atan2pid64
  • atan2pif
  • [94 more results...]
  • ./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total
  • ```
  • With `-s/--svg`, the transient file `graph.dot` can be saved by specifying the
  • `-k/--keep` flag.
  • ```bash
  • » cat graph.dot
  • digraph Trie {
  • node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
  • Node_167 [label=at]
  • Node_248 [label=A]
  • Node_0 -> Node_248 [label=A]
  • Node_63 [label=_]
  • Node_0 -> Node_63 [label=_]
  • Node_1 [label=a]
  • Node_0 -> Node_1 [label=a]
  • Node_2 [label=b]
  • Node_1 -> Node_2 [label=b]
  • Node_17 [label=c]
  • Node_1 -> Node_17 [label=c]
  • Node_53 [label=d]
  • ...
  • ...
  • Node_448 -> Node_449 [label=c]
  • Node_450 [label=i]
  • Node_449 -> Node_450 [label=i]
  • Node_451 [label=t,fillcolor=lightgreen]
  • Node_450 -> Node_451 [label=t]
  • }
  • ```
  • ## Code:
  • trie.c:
  • ```c
  • #undef _POSIX_C_SOURCE
  • #undef _XOPEN_SOURCE
  • #define _POSIX_C_SOURCE 200819L
  • #define _XOPEN_SOURCE 700
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <stdbool.h>
  • #include <string.h>
  • #include <stdint.h>
  • #include <errno.h>
  • #include <inttypes.h>
  • #include <getopt.h>
  • #define PROGRAM_NAME "auto-complete"
  • #define OUTPUT_DOT_FILE "graph.dot"
  • #define AC_BUFFER_CAP 1024
  • struct flags {
  • bool kflag; /* Keep the transient .DOT file. */
  • bool hflag; /* Help message. */
  • bool sflag; /* Generate a .SVG file. */
  • bool cflag; /* Suggest autocompletions. */
  • bool pflag; /* Prefix for the .DOT file. */
  • };
  • #ifdef DEBUG
  • #include "size.h"
  • #define debug_printf(fmt, ...) \
  • fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
  • __func__, __VA_ARGS__)
  • #define D(x) x
  • #else
  • #define D(x) (void) 0
  • #endif
  • static void help(FILE *sink)
  • {
  • fprintf(sink, "\nUSAGE\n"
  • "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
  • "DESCRIPTION\n"
  • "\t" PROGRAM_NAME
  • " is a program for auto-completion and graph visualization.\n\n"
  • "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
  • "\t-h, --help\t\tDisplay this help message and exit.\n"
  • "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
  • "\t-c, --complete PREFIX Suggest autocompletions for prefix.\n"
  • "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
  • exit(EXIT_SUCCESS);
  • }
  • static void usage_err(const char *prog_name)
  • {
  • fprintf(stderr, "The syntax of the command is incorrect.\n"
  • "Try %s -h for more information.\n", prog_name);
  • exit(EXIT_FAILURE);
  • }
  • static void parse_options(const struct option *restrict long_options,
  • struct flags *restrict opt_ptr, int argc, char *const argv[],
  • const char **restrict prefix)
  • {
  • int c;
  • int err_flag = 0;
  • while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
  • switch (c) {
  • case 'k':
  • ++err_flag;
  • opt_ptr->kflag = true;
  • break;
  • case 'h':
  • help(stdout);
  • break;
  • case 's':
  • if (opt_ptr->cflag) {
  • fprintf(stderr,
  • "Error: -s/--svg specified after -c/--complete.\n");
  • usage_err(argv[0]);
  • }
  • if (opt_ptr->pflag) {
  • --err_flag;
  • }
  • if (opt_ptr->kflag) {
  • --err_flag;
  • }
  • opt_ptr->sflag = true;
  • break;
  • case 'c':
  • if (opt_ptr->sflag) {
  • fprintf(stderr,
  • "Error: -c/--complete specified after -s/--svg.\n");
  • usage_err(argv[0]);
  • }
  • *prefix = optarg;
  • if (strlen(*prefix) >= AC_BUFFER_CAP) {
  • fprintf(stderr, "Error: PREFIX too long.\n");
  • exit(EXIT_FAILURE);
  • }
  • opt_ptr->cflag = true;
  • break;
  • case 'p':
  • if (!opt_ptr->sflag) {
  • err_flag = true;
  • }
  • *prefix = optarg;
  • opt_ptr->pflag = true;
  • break;
  • /* case '?' */
  • default:
  • usage_err(argv[0]);
  • }
  • }
  • /* If the -p or -k flag was specified without -s: */
  • /* Note: GNU provides an extension for optional arguments, which
  • * can be used to provide an optional prefix for -s. */
  • if (err_flag) {
  • if (opt_ptr->pflag) {
  • fputs("Error: -p specified without -s.\n", stderr);
  • }
  • if (opt_ptr->kflag) {
  • fputs("Error: -k specified without -s.\n", stderr);
  • }
  • usage_err(argv[0]);
  • }
  • }
  • /* Instead of using the whole ASCII charset, we'd only use the set of printable
  • * characters. This cuts down the struct size from 2056 bytes to 768 bytes if
  • * using 64-bit integers, or from 1028 bytes to 384 bytes if using 32-bit integers.
  • * (or so pahole outputs, there might be a slight difference in the size across
  • * different platforms).
  • */
  • #define CHILDREN_COUNT 95
  • /* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
  • #define ASCII_OFFSET ' ' /* ASCII code: 32, start of printable characters. */
  • /* This wastes half the indexing range. Perhaps we can use uint32_t instead of
  • * int32_t, and use UINT32_MAX as the sentinel value.
  • */
  • #define INVALID_OFFSET -1
  • #define INITIAL_POOL_CAP 1024
  • typedef struct Node {
  • /* TODO: This still wastes a lot of memory. Something like Pythons's dict
  • * would be better. Explore the idea of using a dynamically growing
  • * hash-table for Node.children.
  • */
  • /* We shall store indices within the `Trie.pool` array instead of storing
  • * huge word size pointers. This also simplifies serializing and
  • * deserializing the structure.
  • *
  • * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
  • */
  • int32_t children[CHILDREN_COUNT];
  • bool terminal;
  • } Node;
  • static struct {
  • Node *pool;
  • int32_t count;
  • int32_t capacity;
  • } Trie;
  • static bool init_pool(void)
  • {
  • Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);
  • if (!Trie.pool) {
  • perror("malloc()");
  • return false;
  • }
  • Trie.capacity = INITIAL_POOL_CAP;
  • return true;
  • }
  • static inline void free_pool(void)
  • {
  • free(Trie.pool);
  • }
  • static int32_t alloc_node(void)
  • {
  • if (Trie.count >= Trie.capacity) {
  • const int32_t remaining = INT32_MAX - Trie.capacity;
  • /* We can no longer add more nodes. Bail out. */
  • if (!remaining) {
  • /* Is this helpful? */
  • fputs("Error: too many nodes. Consider recompiling the program"
  • "with a greater int width for more indexing range.\n", stderr);
  • exit(EXIT_FAILURE);
  • }
  • Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
  • void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);
  • if (!tmp) {
  • perror("realloc()");
  • return INVALID_OFFSET;
  • }
  • Trie.pool = tmp;
  • }
  • Node *const new = &Trie.pool[Trie.count];
  • memset(new->children, INVALID_OFFSET, sizeof new->children);
  • Trie.pool[Trie.count].terminal = false;
  • return Trie.count++;
  • }
  • static bool insert_text(int32_t root_idx, const char *text)
  • {
  • while (*text) {
  • const int32_t idx = *text - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
  • const int32_t new = alloc_node();
  • if (new == INVALID_OFFSET) {
  • return false;
  • }
  • Trie.pool[root_idx].children[idx] = new;
  • }
  • root_idx = Trie.pool[root_idx].children[idx];
  • ++text;
  • }
  • return Trie.pool[root_idx].terminal = true;
  • }
  • static char *read_file(FILE *fp)
  • {
  • static const size_t page_size = 4096;
  • char *content = NULL;
  • size_t len = 0;
  • for (size_t rcount = 1; rcount; len += rcount) {
  • void *const tmp = realloc(content, len + page_size);
  • if (!tmp) {
  • free(content);
  • content = NULL;
  • /* ENOMEM is not a C standard error code, yet quite common. */
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • content = tmp;
  • rcount = fread(content + len, 1, page_size - 1, fp);
  • if (ferror(fp)) {
  • free(content);
  • content = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return NULL;
  • }
  • }
  • content[len] = '\0';
  • return content;
  • }
  • static size_t split_lines(char *restrict s, char ***restrict lines)
  • {
  • const size_t chunk_size = 1024;
  • size_t capacity = 0;
  • size_t line_count = 0;
  • while (s && *s) {
  • if (line_count >= capacity) {
  • char **const new = realloc(*lines,
  • sizeof **lines * (capacity += chunk_size));
  • if (!new) {
  • free(*lines);
  • *lines = NULL;
  • #ifdef ENOMEM
  • errno = ENOMEM;
  • #endif
  • return 0;
  • }
  • *lines = new;
  • }
  • (*lines)[line_count++] = s;
  • s = strchr(s, '\n');
  • if (s) {
  • *s++ = '\0';
  • }
  • }
  • return line_count;
  • }
  • static char ac_buffer[AC_BUFFER_CAP];
  • static size_t ac_buffer_sz;
  • static void ac_buffer_push(char ch)
  • {
  • /* No check here, for we would have exited in parse_options() if the length
  • * of the prefix was greater than AC_BUFFER_CAP.
  • */
  • ac_buffer[ac_buffer_sz++] = ch;
  • }
  • static void ac_buffer_pop(void)
  • {
  • if (ac_buffer_sz > 0) {
  • ac_buffer_sz--;
  • }
  • }
  • static int32_t find_prefix(int32_t root_idx, const char *prefix)
  • {
  • while (*prefix) {
  • const int32_t child_idx = *prefix - ASCII_OFFSET;
  • if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
  • return INVALID_OFFSET;
  • }
  • root_idx = Trie.pool[root_idx].children[child_idx];
  • ac_buffer_push(*prefix);
  • ++prefix;
  • }
  • return root_idx;
  • }
  • static void dump_dot_prefix(FILE *sink, int32_t root_idx)
  • {
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[root_idx].children[i];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (i + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
  • (char) (i + ASCII_OFFSET));
  • }
  • fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • root_idx, child_index, (char) (i + ASCII_OFFSET));
  • dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
  • }
  • }
  • }
  • static void dump_dot_whole(FILE *sink)
  • {
  • for (int32_t i = 0; i < Trie.count; ++i) {
  • const int32_t index = i;
  • for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
  • if (Trie.pool[i].children[j] != INVALID_OFFSET) {
  • const int32_t child_index = Trie.pool[i].children[j];
  • if (Trie.pool[child_index].terminal) {
  • fprintf(sink,
  • "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • } else {
  • fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
  • child_index, (char) (j + ASCII_OFFSET));
  • }
  • fprintf(sink,
  • "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
  • index, child_index, (char) (j + ASCII_OFFSET));
  • }
  • }
  • }
  • }
  • static void print_suggestions(int32_t root_idx)
  • {
  • if (Trie.pool[root_idx].terminal) {
  • fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
  • fputc('\n', stdout);
  • }
  • for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
  • if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
  • ac_buffer_push((char) (i + ASCII_OFFSET));
  • print_suggestions(Trie.pool[root_idx].children[i]);
  • ac_buffer_pop();
  • }
  • }
  • }
  • static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
  • {
  • for (size_t i = 0; i < num_lines; ++i) {
  • if (!insert_text(root_idx, lines[i])) {
  • return false;
  • }
  • }
  • return true;
  • }
  • static bool generate_graph(void)
  • {
  • if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
  • fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
  • strerror(errno));
  • return false;
  • }
  • return true;
  • }
  • static bool generate_dot(int32_t root_idx, const char *prefix)
  • {
  • FILE *const sink = fopen("graph.dot", "w");
  • if (!sink) {
  • perror("fopen()");
  • return false;
  • }
  • fprintf(sink, "digraph Trie {\n"
  • "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
  • "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");
  • root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
  • fprintf(sink, "}\n");
  • fclose(sink);
  • return true;
  • }
  • static bool process_args(int32_t root_idx,
  • const struct flags *restrict options, const char *restrict prefix,
  • const char *restrict out_file)
  • {
  • bool rv = true;
  • /* Handle --complete first as we are going to modify root later in case
  • * prefix is not a null pointer.
  • */
  • if (options->cflag) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • print_suggestions(subtree_idx);
  • }
  • if (options->sflag) {
  • if (prefix) {
  • const int32_t subtree_idx = find_prefix(root_idx, prefix);
  • if (subtree_idx == INVALID_OFFSET) {
  • fprintf(stderr, "Error: Unable to find prefix.\n");
  • rv = !rv;
  • goto cleanup;
  • }
  • root_idx = subtree_idx;
  • }
  • /* At this point, we can free the file, lines and Trie.pool, but we'd have
  • * to change the structure of the code. */
  • if (!generate_dot(root_idx, prefix)
  • || !generate_graph()) {
  • rv = !rv;
  • }
  • }
  • cleanup:
  • if (!options->kflag) {
  • remove(out_file);
  • }
  • return rv;
  • }
  • int main(int argc, char *argv[])
  • {
  • /* Sanity check. POSIX requires the invoking process to pass a non-NULL
  • * argv[0].
  • */
  • if (!argv[0]) {
  • fprintf(stderr,
  • "A NULL argv[0] was passed through an exec system call.\n");
  • return EXIT_FAILURE;
  • }
  • static const struct option long_options[] = {
  • { "keep", no_argument, NULL, 'k' },
  • { "help", no_argument, NULL, 'h' },
  • { "svg", no_argument, NULL, 's' },
  • { "complete", required_argument, NULL, 'c' },
  • { "prefix", required_argument, NULL, 'p' },
  • { NULL, 0, NULL, 0 },
  • };
  • FILE *in_file = stdin;
  • struct flags options = { false, false, false, false, false };
  • const char *search_prefix = NULL;
  • parse_options(long_options, &options, argc, argv, &search_prefix);
  • if (options.sflag || options.cflag) {
  • if ((optind + 1) == argc) {
  • in_file = fopen(argv[optind], "r");
  • if (!in_file) {
  • perror(argv[optind]);
  • return EXIT_FAILURE;
  • }
  • }
  • } else {
  • usage_err(PROGRAM_NAME);
  • }
  • if (optind > argc) {
  • usage_err(PROGRAM_NAME);
  • }
  • char *const content = read_file(in_file);
  • char **lines = NULL;
  • const size_t num_lines = split_lines(content, &lines);
  • if (!lines) {
  • free(content);
  • perror("fread()");
  • return EXIT_FAILURE;
  • }
  • bool rv = true;
  • int32_t root_idx = INVALID_OFFSET;
  • if (!init_pool()
  • || (root_idx = alloc_node()) == INVALID_OFFSET
  • || !populate_trie(root_idx, lines, num_lines)) {
  • rv = !rv;
  • goto cleanup;
  • }
  • D(
  • char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
  • char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);
  • debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
  • PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
  • "Total memory allocated: %s.\n" "Total memory used: %s.\n",
  • num_lines,
  • Trie.capacity,
  • Trie.count,
  • total,
  • used);
  • free(total);
  • free(used);
  • );
  • rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);
  • cleanup:
  • free_pool(); /* NO-OP if init_pool() returns false. */
  • free(content);
  • free(lines);
  • if (in_file != stdin) {
  • fclose(in_file);
  • }
  • return rv ? EXIT_SUCCESS : EXIT_FAILURE;
  • }
  • ```
  • size.h:
  • ```c
  • #ifndef SIZE_H
  • #define SIZE_H 1
  • #include <stdint.h>
  • char *calculate_size(uint64_t size);
  • #endif /* SIZE_H */
  • ```
  • size.c:
  • ```c
  • #include "size.h"
  • #include <stdio.h>
  • #include <stdlib.h>
  • #include <string.h>
  • #include <inttypes.h>
  • #include <stdint.h>
  • #define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))
  • static const char *const sizes[] =
  • { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
  • static const uint64_t exbibytes =
  • 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;
  • /* Acknowledgements:
  • * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
  • * (https://stackoverflow.com/q/3898840/20017547)
  • */
  • char *calculate_size(uint64_t size)
  • {
  • char *const result = malloc(32);
  • if (!result) {
  • return NULL;
  • }
  • uint64_t multiplier = exbibytes;
  • for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
  • if (size < multiplier) {
  • continue;
  • }
  • if (size % multiplier == 0) {
  • sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
  • } else {
  • sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
  • sizes[i]);
  • }
  • return result;
  • }
  • return strcpy(result, "0");
  • }
  • #ifdef TEST_MAIN
  • int main(void)
  • {
  • static const uint64_t list[] = {
  • 0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
  • 1024 * 1024 * 1024 + 1024 * 1024 * 400
  • };
  • for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
  • char *const str = calculate_size(list[i]);
  • if (!str) {
  • perror("malloc()");
  • return EXIT_FAILURE;
  • }
  • printf("%18" PRId64 " = %s\n", list[i], str);
  • free(str);
  • }
  • return EXIT_SUCCESS;
  • }
  • #endif
  • ```
  • And here's the makefile for the project:
  • ```make
  • CFLAGS = -std=c17
  • CFLAGS += -no-pie
  • CFLAGS += -ftrapv
  • CFLAGS += -Wall
  • CFLAGS += -Wextra
  • CFLAGS += -Warray-bounds
  • CFLAGS += -Wconversion
  • CFLAGS += -Wmissing-braces
  • CFLAGS += -Wno-parentheses
  • CFLAGS += -Wno-format-truncation
  • CFLAGS += -Wno-type-limits
  • CFLAGS += -Wpedantic
  • CFLAGS += -Wstrict-prototypes
  • CFLAGS += -Wwrite-strings
  • CFLAGS += -Winline
  • CFLAGS += -D_FORTIFY_SOURCE=2
  • BIN := trie
  • INSTALL_PATH := /usr/local/bin
  • SRCS := trie.c
  • ifeq ($(MAKECMDGOALS),debug)
  • SRCS += size.c
  • endif
  • all: CFLAGS += -s -O2
  • all: $(BIN)
  • debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
  • debug: $(BIN)
  • $(BIN): $(SRCS)
  • install: $(BIN)
  • install $< $(INSTALL_PATH)
  • uninstall:
  • rm $(INSTALL_PATH)/$(BIN)
  • clean:
  • $(RM) $(BIN)
  • .PHONY: all debug clean install uninstall
  • .DELETE_ON_ERROR:
  • ```
  • The code was formatted with GNU Indent, compiles without any warnings with Clang,
  • GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports
  • nothing awry under valgrind.
  • Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.
  • You could also clone this github repository: [trie](https://github.com/Melkor-1/Trie)
  • ## Test run:
  • Here are the results of a sample run:
  • ```bash
  • >> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free
  • >> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
  • ./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null 0.19s user 0.57s system 97% cpu 0.775 total
  • >> wc ~/data-set/wordlists/huge.txt
  • 172819 172820 1916146
  • ```
  • ## Review Goals:
  • * Do you see any issues with the design/structure of the program?
  • * Does any part of my code exhibit undefined/implementation-defined behavior?
  • * General coding comments, style, et cetera.
  • Do you have a faster alternative to Graphviz dot to suggest?
  • [1]: https://github.com/Melkor-1/trie/blob/main/graph.dot.svg
#1: Initial revision by user avatar Melkor-1‭ · 2024-02-18T09:10:06Z (9 months ago)
Trie Implementation, Graph Visualization and Auto-Completion in C
Given a list of strings (say a text file containing some C symbols: [c-symbols.txt](https://github.com/Melkor-1/Trie/blob/main/c-symbols.txt)), the program can:

1) Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a
specific prefix subtree) with Graphviz dot:

```bash
» time ./trie --prefix at --svg c-symbols.txt  

./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total 
```

Output Image:

[![Output Graph][1]][1]

(The green ones indicate terminal nodes.)

2) Suggest completions for a given prefix:
```bash
» time ./trie -c at c-symbols.txt

at_quick_exit
atan
atan2
atan2d
atan2d128
atan2d32
atan2d64
atan2f
atan2l
atan2pi
atan2pid
atan2pid128
atan2pid32
atan2pid64
atan2pif
[94 more results...]

./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total
```

With `-s/--svg`, the transient file `graph.dot` can be saved by specifying the 
`-k/--keep` flag.

```bash
»  cat graph.dot

digraph Trie {
	node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
	Node_167 [label=at]
    Node_248 [label=A]
    Node_0 -> Node_248 [label=A]
    Node_63 [label=_]
    Node_0 -> Node_63 [label=_]
    Node_1 [label=a]
    Node_0 -> Node_1 [label=a]
    Node_2 [label=b]
    Node_1 -> Node_2 [label=b]
    Node_17 [label=c]
    Node_1 -> Node_17 [label=c]
    Node_53 [label=d]
    ...
    ...
    Node_448 -> Node_449 [label=c]
    Node_450 [label=i]
    Node_449 -> Node_450 [label=i]
    Node_451 [label=t,fillcolor=lightgreen]
    Node_450 -> Node_451 [label=t]
}
```


## Code:

trie.c:

```c
#undef _POSIX_C_SOURCE
#undef _XOPEN_SOURCE

#define _POSIX_C_SOURCE 200819L
#define _XOPEN_SOURCE 700

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <inttypes.h>

#include <getopt.h>

#define PROGRAM_NAME "auto-complete"
#define OUTPUT_DOT_FILE "graph.dot"
#define AC_BUFFER_CAP 1024

struct flags {
    bool kflag;                 /* Keep the transient .DOT file. */
    bool hflag;                 /* Help message. */
    bool sflag;                 /* Generate a .SVG file. */
    bool cflag;                 /* Suggest autocompletions. */
    bool pflag;                 /* Prefix for the .DOT file. */
};

#ifdef DEBUG
#include "size.h"
#define debug_printf(fmt, ...) \
    fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
            __func__, __VA_ARGS__)
#define D(x) x
#else
#define D(x) (void) 0
#endif

static void help(FILE *sink)
{
    fprintf(sink, "\nUSAGE\n"
        "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
        "DESCRIPTION\n"
        "\t" PROGRAM_NAME
        " is a program for auto-completion and graph visualization.\n\n"
        "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
        "\t-h, --help\t\tDisplay this help message and exit.\n"
        "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
        "\t-c, --complete PREFIX   Suggest autocompletions for prefix.\n"
        "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
    exit(EXIT_SUCCESS);
}

static void usage_err(const char *prog_name)
{
    fprintf(stderr, "The syntax of the command is incorrect.\n"
        "Try %s -h for more information.\n", prog_name);
    exit(EXIT_FAILURE);
}

static void parse_options(const struct option *restrict long_options,
    struct flags *restrict opt_ptr, int argc, char *const argv[],
    const char **restrict prefix)
{
    int c;
    int err_flag = 0;

    while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
        switch (c) {
            case 'k':
                ++err_flag;
                opt_ptr->kflag = true;
                break;
            case 'h':
                help(stdout);
                break;
            case 's':
                if (opt_ptr->cflag) {
                    fprintf(stderr,
                        "Error: -s/--svg specified after -c/--complete.\n");
                    usage_err(argv[0]);
                }

                if (opt_ptr->pflag) {
                    --err_flag;
                }

                if (opt_ptr->kflag) {
                    --err_flag;
                }

                opt_ptr->sflag = true;
                break;
            case 'c':
                if (opt_ptr->sflag) {
                    fprintf(stderr,
                        "Error: -c/--complete specified after -s/--svg.\n");
                    usage_err(argv[0]);
                }

                *prefix = optarg;

                if (strlen(*prefix) >= AC_BUFFER_CAP) {
                    fprintf(stderr, "Error: PREFIX too long.\n");
                    exit(EXIT_FAILURE);
                }

                opt_ptr->cflag = true;
                break;
            case 'p':
                if (!opt_ptr->sflag) {
                    err_flag = true;
                }

                *prefix = optarg;
                opt_ptr->pflag = true;
                break;

                /* case '?' */
            default:
                usage_err(argv[0]);
        }
    }

    /* If the -p or -k flag was specified without -s: */
    /* Note: GNU provides an extension for optional arguments, which
     *       can be used to provide an optional prefix for -s. */
    if (err_flag) {
        if (opt_ptr->pflag) {
            fputs("Error: -p specified without -s.\n", stderr);
        }

        if (opt_ptr->kflag) {
            fputs("Error: -k specified without -s.\n", stderr);
        }
        usage_err(argv[0]);
    }
}

/* Instead of using the whole ASCII charset, we'd only use the set of printable
 * characters. This cuts down the struct size from 2056 bytes to 768 bytes if 
 * using 64-bit pointers, or from 1028 bytes to 384 bytes if using 32-bit integers.
 * (or so pahole outputs, there might be a slight difference in the size across 
 * different platforms).
 */

#define CHILDREN_COUNT 95

/* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
#define ASCII_OFFSET ' '        /* ASCII code: 32, start of printable characters. */

/* This wastes half the indexing range. Perhaps we can use uint32_t instead of
 * int32_t, and use UINT32_MAX as the sentinel value.
 */
#define INVALID_OFFSET -1

#define INITIAL_POOL_CAP 1024

typedef struct Node {
    /* TODO: This still wastes a lot of memory. Something like Pythons's dict 
     * would be better. Explore the idea of using a dynamically growing 
     * hash-table for Node.children. 
     */

    /* We shall store indices within the `Trie.pool` array instead of storing
     * huge word size pointers. This also simplifies serializing and
     * deserializing the structure.
     *
     * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
     */
    int32_t children[CHILDREN_COUNT];
    bool terminal;
} Node;

static struct {
    Node *pool;
    int32_t count;
    int32_t capacity;
} Trie;

static bool init_pool(void)
{
    Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);

    if (!Trie.pool) {
        perror("malloc()");
        return false;
    }

    Trie.capacity = INITIAL_POOL_CAP;
    return true;
}

static inline void free_pool(void)
{
    free(Trie.pool);
}

static int32_t alloc_node(void)
{
    if (Trie.count >= Trie.capacity) {
        const int32_t remaining = INT32_MAX - Trie.capacity;

        /* We can no longer add more nodes. Bail out. */
        if (!remaining) {
            /* Is this helpful? */
            fputs("Error: too many nodes. Consider recompiling the program"
                "with a greater int width for more indexing range.\n", stderr);
            exit(EXIT_FAILURE);
        }

        Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
        void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);

        if (!tmp) {
            perror("realloc()");
            return INVALID_OFFSET;
        }

        Trie.pool = tmp;
    }

    Node *const new = &Trie.pool[Trie.count];

    memset(new->children, INVALID_OFFSET, sizeof new->children);
    Trie.pool[Trie.count].terminal = false;
    return Trie.count++;
}

static bool insert_text(int32_t root_idx, const char *text)
{
    while (*text) {
        const int32_t idx = *text - ASCII_OFFSET;

        if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
            const int32_t new = alloc_node();

            if (new == INVALID_OFFSET) {
                return false;
            }

            Trie.pool[root_idx].children[idx] = new;
        }
        root_idx = Trie.pool[root_idx].children[idx];
        ++text;
    }

    return Trie.pool[root_idx].terminal = true;
}

static char *read_file(FILE *fp)
{
    static const size_t page_size = 4096;
    char *content = NULL;
    size_t len = 0;

    for (size_t rcount = 1; rcount; len += rcount) {
        void *const tmp = realloc(content, len + page_size);

        if (!tmp) {
            free(content);
            content = NULL;
/* ENOMEM is not a C standard error code, yet quite common. */
#ifdef ENOMEM
            errno = ENOMEM;
#endif
            return NULL;
        }
        content = tmp;
        rcount = fread(content + len, 1, page_size - 1, fp);

        if (ferror(fp)) {
            free(content);
            content = NULL;
#ifdef ENOMEM
            errno = ENOMEM;
#endif
            return NULL;
        }
    }
    content[len] = '\0';
    return content;
}

static size_t split_lines(char *restrict s, char ***restrict lines)
{
    const size_t chunk_size = 1024;
    size_t capacity = 0;
    size_t line_count = 0;

    while (s && *s) {
        if (line_count >= capacity) {
            char **const new = realloc(*lines,
                sizeof **lines * (capacity += chunk_size));

            if (!new) {
                free(*lines);
                *lines = NULL;
#ifdef ENOMEM
                errno = ENOMEM;
#endif
                return 0;
            }
            *lines = new;
        }
        (*lines)[line_count++] = s;
        s = strchr(s, '\n');

        if (s) {
            *s++ = '\0';
        }
    }
    return line_count;
}

static char ac_buffer[AC_BUFFER_CAP];
static size_t ac_buffer_sz;

static void ac_buffer_push(char ch)
{
    /* No check here, for we would have exited in parse_options() if the length
     * of the prefix was greater than AC_BUFFER_CAP.
     */
    ac_buffer[ac_buffer_sz++] = ch;
}

static void ac_buffer_pop(void)
{
    if (ac_buffer_sz > 0) {
        ac_buffer_sz--;
    }
}

static int32_t find_prefix(int32_t root_idx, const char *prefix)
{
    while (*prefix) {
        const int32_t child_idx = *prefix - ASCII_OFFSET;

        if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
            return INVALID_OFFSET;
        }

        root_idx = Trie.pool[root_idx].children[child_idx];
        ac_buffer_push(*prefix);
        ++prefix;
    }
    return root_idx;
}

static void dump_dot_prefix(FILE *sink, int32_t root_idx)
{
    for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
        if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
            const int32_t child_index = Trie.pool[root_idx].children[i];

            if (Trie.pool[child_index].terminal) {
                fprintf(sink,
                    "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
                    child_index, (char) (i + ASCII_OFFSET));
            } else {
                fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
                    (char) (i + ASCII_OFFSET));
            }

            fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
                root_idx, child_index, (char) (i + ASCII_OFFSET));
            dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
        }
    }
}

static void dump_dot_whole(FILE *sink)
{
    for (int32_t i = 0; i < Trie.count; ++i) {
        const int32_t index = i;

        for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
            if (Trie.pool[i].children[j] != INVALID_OFFSET) {
                const int32_t child_index = Trie.pool[i].children[j];

                if (Trie.pool[child_index].terminal) {
                    fprintf(sink,
                        "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
                        child_index, (char) (j + ASCII_OFFSET));
                } else {
                    fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
                        child_index, (char) (j + ASCII_OFFSET));
                }

                fprintf(sink,
                    "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
                    index, child_index, (char) (j + ASCII_OFFSET));
            }
        }
    }
}

static void print_suggestions(int32_t root_idx)
{
    if (Trie.pool[root_idx].terminal) {
        fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
        fputc('\n', stdout);
    }

    for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
        if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
            ac_buffer_push((char) (i + ASCII_OFFSET));
            print_suggestions(Trie.pool[root_idx].children[i]);
            ac_buffer_pop();
        }
    }
}

static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
{
    for (size_t i = 0; i < num_lines; ++i) {
        if (!insert_text(root_idx, lines[i])) {
            return false;
        }
    }
    return true;
}

static bool generate_graph(void)
{
    if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
        fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
            strerror(errno));
        return false;
    }

    return true;
}

static bool generate_dot(int32_t root_idx, const char *prefix)
{
    FILE *const sink = fopen("graph.dot", "w");

    if (!sink) {
        perror("fopen()");
        return false;
    }

    fprintf(sink, "digraph Trie {\n"
        "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
        "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");

    root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
    fprintf(sink, "}\n");
    fclose(sink);
    return true;
}

static bool process_args(int32_t root_idx,
    const struct flags *restrict options, const char *restrict prefix,
    const char *restrict out_file)
{
    bool rv = true;

    /* Handle --complete first as we are going to modify root later in case
     * prefix is not a null pointer. 
     */
    if (options->cflag) {
        const int32_t subtree_idx = find_prefix(root_idx, prefix);

        if (subtree_idx == INVALID_OFFSET) {
            fprintf(stderr, "Error: Unable to find prefix.\n");
            rv = !rv;
            goto cleanup;
        }

        print_suggestions(subtree_idx);
    }

    if (options->sflag) {
        if (prefix) {
            const int32_t subtree_idx = find_prefix(root_idx, prefix);

            if (subtree_idx == INVALID_OFFSET) {
                fprintf(stderr, "Error: Unable to find prefix.\n");
                rv = !rv;
                goto cleanup;
            }
            root_idx = subtree_idx;
        }

        /* At this point, we can free the file, lines and Trie.pool, but we'd have
         * to change the structure of the code. */
        if (!generate_dot(root_idx, prefix)
            || !generate_graph()) {
            rv = !rv;
        }
    }

  cleanup:
    if (!options->kflag) {
        remove(out_file);
    }

    return rv;
}

int main(int argc, char *argv[])
{

    /* Sanity check. POSIX requires the invoking process to pass a non-NULL 
     * argv[0]. 
     */
    if (!argv[0]) {
        fprintf(stderr,
            "A NULL argv[0] was passed through an exec system call.\n");
        return EXIT_FAILURE;
    }

    static const struct option long_options[] = {
        { "keep", no_argument, NULL, 'k' },
        { "help", no_argument, NULL, 'h' },
        { "svg", no_argument, NULL, 's' },
        { "complete", required_argument, NULL, 'c' },
        { "prefix", required_argument, NULL, 'p' },
        { NULL, 0, NULL, 0 },
    };

    FILE *in_file = stdin;
    struct flags options = { false, false, false, false, false };
    const char *search_prefix = NULL;

    parse_options(long_options, &options, argc, argv, &search_prefix);

    if (options.sflag || options.cflag) {
        if ((optind + 1) == argc) {
            in_file = fopen(argv[optind], "r");
            if (!in_file) {
                perror(argv[optind]);
                return EXIT_FAILURE;
            }
        }
    } else {
        usage_err(PROGRAM_NAME);
    }

    if (optind > argc) {
        usage_err(PROGRAM_NAME);
    }

    char *const content = read_file(in_file);
    char **lines = NULL;
    const size_t num_lines = split_lines(content, &lines);

    if (!lines) {
        free(content);
        perror("fread()");
        return EXIT_FAILURE;
    }

    bool rv = true;
    int32_t root_idx = INVALID_OFFSET;

    if (!init_pool()
        || (root_idx = alloc_node()) == INVALID_OFFSET
        || !populate_trie(root_idx, lines, num_lines)) {
        rv = !rv;
        goto cleanup;

    }

    D(
        char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
        char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);

        debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
                    PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
                    "Total memory allocated: %s.\n" "Total memory used: %s.\n",
                    num_lines, 
                    Trie.capacity, 
                    Trie.count, 
                    total, 
                    used); 
        free(total);
        free(used);
    );
    
    rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);

  cleanup:
    free_pool();                /* NO-OP if init_pool() returns false. */
    free(content);
    free(lines);

    if (in_file != stdin) {
        fclose(in_file);
    }

    return rv ? EXIT_SUCCESS : EXIT_FAILURE;
}
```

size.h:

```c
#ifndef SIZE_H
#define SIZE_H 1

#include <stdint.h>

char *calculate_size(uint64_t size);

#endif                          /* SIZE_H */
```

size.c:

```c
#include "size.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdint.h>

#define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))

static const char *const sizes[] =
    { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
static const uint64_t exbibytes =
    1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;

/* Acknowledgements:
 * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
 * (https://stackoverflow.com/q/3898840/20017547)
 */
char *calculate_size(uint64_t size)
{
    char *const result = malloc(32);

    if (!result) {
        return NULL;
    }

    uint64_t multiplier = exbibytes;

    for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
        if (size < multiplier) {
            continue;
        }

        if (size % multiplier == 0) {
            sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
        } else {
            sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
                sizes[i]);
        }
        return result;
    }

    return strcpy(result, "0");
}

#ifdef TEST_MAIN
int main(void)
{
    static const uint64_t list[] = {
        0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
        1024 * 1024 * 1024 + 1024 * 1024 * 400
    };

    for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
        char *const str = calculate_size(list[i]);

        if (!str) {
            perror("malloc()");
            return EXIT_FAILURE;
        }

        printf("%18" PRId64 " = %s\n", list[i], str);
        free(str);
    }

    return EXIT_SUCCESS;
}
#endif
```

And here's the makefile for the project:
```make
CFLAGS 	= -std=c17
CFLAGS 	+= -no-pie
CFLAGS	+= -ftrapv
CFLAGS 	+= -Wall
CFLAGS 	+= -Wextra
CFLAGS 	+= -Warray-bounds
CFLAGS 	+= -Wconversion
CFLAGS 	+= -Wmissing-braces
CFLAGS 	+= -Wno-parentheses
CFLAGS 	+= -Wno-format-truncation
CFLAGS	+= -Wno-type-limits
CFLAGS 	+= -Wpedantic
CFLAGS 	+= -Wstrict-prototypes
CFLAGS 	+= -Wwrite-strings
CFLAGS 	+= -Winline
CFLAGS 	+= -D_FORTIFY_SOURCE=2

BIN 		 := trie
INSTALL_PATH := /usr/local/bin
SRCS		 := trie.c 

ifeq ($(MAKECMDGOALS),debug)
SRCS += size.c
endif

all: CFLAGS += -s -O2
all: $(BIN)

debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
debug: $(BIN)

$(BIN): $(SRCS)

install: $(BIN)
	install $< $(INSTALL_PATH)

uninstall:
	rm $(INSTALL_PATH)/$(BIN)

clean:
	$(RM) $(BIN) 

.PHONY: all debug clean install uninstall 
.DELETE_ON_ERROR:
```

The code was formatted with GNU Indent, compiles without any warnings with Clang,
GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports
nothing awry under valgrind.

Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.

You could also clone this github repository: [trie](https://github.com/Melkor-1/Trie)
## Test run:

Here are the results of a sample run:
```bash
>> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free

>> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null  0.19s user 0.57s system 97% cpu 0.775 total

>> wc ~/data-set/wordlists/huge.txt
172819  172820 1916146 
```



## Review Goals:

* Do you see any issues with the design/structure of the program?
* Does any part of my code exhibit undefined/implementation-defined behavior?
* General coding comments, style, et cetera.

Do you have a faster alternative to Graphviz dot to suggest?

[1]: https://github.com/Melkor-1/trie/blob/main/graph.dot.svg