Welcome to Software Development on Codidact!

Will you help us build our independent community of developers helping developers? We're small and trying to grow. We welcome questions about all aspects of software development, from design to code to QA and more. Got questions? Got answers? Got code you'd like someone to review? Please join us.

Trie Implementation, Graph Visualization and Auto-Completion in C

−0

Given a list of strings (say a text file containing some C symbols: c-symbols.txt), the program can:

Generate a graph of the underlying trie (-p/--prefix can be specified to inspect a specific prefix subtree) with Graphviz dot:

» time ./trie --prefix at --svg c-symbols.txt  

./trie --prefix at --svg c-symbols.txt 0.74s user 0.10s system 95% cpu 0.879 total

Output Image:

(The green ones indicate terminal nodes.)

Suggest completions for a given prefix:

» time ./trie -c at c-symbols.txt

at_quick_exit
atan
atan2
atan2d
atan2d128
atan2d32
atan2d64
atan2f
atan2l
atan2pi
atan2pid
atan2pid128
atan2pid32
atan2pid64
atan2pif
[94 more results...]

./trie -c at c-symbols.txt 0.00s user 0.01s system 76% cpu 0.018 total

With -s/--svg, the transient file graph.dot can be saved by specifying the -k/--keep flag.

»  cat graph.dot

digraph Trie {
	node [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]
	Node_167 [label=at]
    Node_248 [label=A]
    Node_0 -> Node_248 [label=A]
    Node_63 [label=_]
    Node_0 -> Node_63 [label=_]
    Node_1 [label=a]
    Node_0 -> Node_1 [label=a]
    Node_2 [label=b]
    Node_1 -> Node_2 [label=b]
    Node_17 [label=c]
    Node_1 -> Node_17 [label=c]
    Node_53 [label=d]
    ...
    ...
    Node_448 -> Node_449 [label=c]
    Node_450 [label=i]
    Node_449 -> Node_450 [label=i]
    Node_451 [label=t,fillcolor=lightgreen]
    Node_450 -> Node_451 [label=t]
}

Code:

trie.c:

#undef _POSIX_C_SOURCE
#undef _XOPEN_SOURCE

#define _POSIX_C_SOURCE 200819L
#define _XOPEN_SOURCE 700

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>
#include <errno.h>
#include <inttypes.h>

#include <getopt.h>

#define PROGRAM_NAME "auto-complete"
#define OUTPUT_DOT_FILE "graph.dot"
#define AC_BUFFER_CAP 1024

struct flags {
    bool kflag;                 /* Keep the transient .DOT file. */
    bool hflag;                 /* Help message. */
    bool sflag;                 /* Generate a .SVG file. */
    bool cflag;                 /* Suggest autocompletions. */
    bool pflag;                 /* Prefix for the .DOT file. */
};

#ifdef DEBUG
#include "size.h"
#define debug_printf(fmt, ...) \
    fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, \
            __func__, __VA_ARGS__)
#define D(x) x
#else
#define D(x) (void) 0
#endif

static void help(FILE *sink)
{
    fprintf(sink, "\nUSAGE\n"
        "\t" PROGRAM_NAME " [OPTIONS] [filename]\n\n"
        "DESCRIPTION\n"
        "\t" PROGRAM_NAME
        " is a program for auto-completion and graph visualization.\n\n"
        "OPTIONS:\n" "\t-k, --keep\t\tKeep the transient .DOT file.\n"
        "\t-h, --help\t\tDisplay this help message and exit.\n"
        "\t-s, --svg \t\tGenerate a .SVG file (with optional prefix).\n"
        "\t-c, --complete PREFIX   Suggest autocompletions for prefix.\n"
        "\t-p, --prefix PREFIX\tPrefix for the .DOT file.\n\n");
    exit(EXIT_SUCCESS);
}

static void usage_err(const char *prog_name)
{
    fprintf(stderr, "The syntax of the command is incorrect.\n"
        "Try %s -h for more information.\n", prog_name);
    exit(EXIT_FAILURE);
}

static void parse_options(const struct option *restrict long_options,
    struct flags *restrict opt_ptr, int argc, char *const argv[],
    const char **restrict prefix)
{
    int c;
    int err_flag = 0;

    while ((c = getopt_long(argc, argv, "shkc:p:", long_options, NULL)) != -1) {
        switch (c) {
            case 'k':
                ++err_flag;
                opt_ptr->kflag = true;
                break;
            case 'h':
                help(stdout);
                break;
            case 's':
                if (opt_ptr->cflag) {
                    fprintf(stderr,
                        "Error: -s/--svg specified after -c/--complete.\n");
                    usage_err(argv[0]);
                }

                if (opt_ptr->pflag) {
                    --err_flag;
                }

                if (opt_ptr->kflag) {
                    --err_flag;
                }

                opt_ptr->sflag = true;
                break;
            case 'c':
                if (opt_ptr->sflag) {
                    fprintf(stderr,
                        "Error: -c/--complete specified after -s/--svg.\n");
                    usage_err(argv[0]);
                }

                *prefix = optarg;

                if (strlen(*prefix) >= AC_BUFFER_CAP) {
                    fprintf(stderr, "Error: PREFIX too long.\n");
                    exit(EXIT_FAILURE);
                }

                opt_ptr->cflag = true;
                break;
            case 'p':
                if (!opt_ptr->sflag) {
                    err_flag = true;
                }

                *prefix = optarg;
                opt_ptr->pflag = true;
                break;

                /* case '?' */
            default:
                usage_err(argv[0]);
        }
    }

    /* If the -p or -k flag was specified without -s: */
    /* Note: GNU provides an extension for optional arguments, which
     *       can be used to provide an optional prefix for -s. */
    if (err_flag) {
        if (opt_ptr->pflag) {
            fputs("Error: -p specified without -s.\n", stderr);
        }

        if (opt_ptr->kflag) {
            fputs("Error: -k specified without -s.\n", stderr);
        }
        usage_err(argv[0]);
    }
}

/* Instead of using the whole ASCII charset, we'd only use the set of printable
 * characters. This cuts down the struct size from 2056 bytes to 768 bytes if 
 * using 64-bit integers, or from 1028 bytes to 384 bytes if using 32-bit integers.
 * (or so pahole outputs, there might be a slight difference in the size across 
 * different platforms).
 */

#define CHILDREN_COUNT 95

/* In the children array, 32 should map to 0, 65 should map to 33, and so on. */
#define ASCII_OFFSET ' '        /* ASCII code: 32, start of printable characters. */

/* This wastes half the indexing range. Perhaps we can use uint32_t instead of
 * int32_t, and use UINT32_MAX as the sentinel value.
 */
#define INVALID_OFFSET -1

#define INITIAL_POOL_CAP 1024

typedef struct Node {
    /* TODO: This still wastes a lot of memory. Something like Pythons's dict 
     * would be better. Explore the idea of using a dynamically growing 
     * hash-table for Node.children. 
     */

    /* We shall store indices within the `Trie.pool` array instead of storing
     * huge word size pointers. This also simplifies serializing and
     * deserializing the structure.
     *
     * See also: A case where int32_t wasn't sufficient - bbc.com/news/world-asia-30288542
     */
    int32_t children[CHILDREN_COUNT];
    bool terminal;
} Node;

static struct {
    Node *pool;
    int32_t count;
    int32_t capacity;
} Trie;

static bool init_pool(void)
{
    Trie.pool = malloc(sizeof *Trie.pool * INITIAL_POOL_CAP);

    if (!Trie.pool) {
        perror("malloc()");
        return false;
    }

    Trie.capacity = INITIAL_POOL_CAP;
    return true;
}

static inline void free_pool(void)
{
    free(Trie.pool);
}

static int32_t alloc_node(void)
{
    if (Trie.count >= Trie.capacity) {
        const int32_t remaining = INT32_MAX - Trie.capacity;

        /* We can no longer add more nodes. Bail out. */
        if (!remaining) {
            /* Is this helpful? */
            fputs("Error: too many nodes. Consider recompiling the program"
                "with a greater int width for more indexing range.\n", stderr);
            exit(EXIT_FAILURE);
        }

        Trie.capacity += remaining < INITIAL_POOL_CAP ? remaining : INITIAL_POOL_CAP;
        void *const tmp = realloc(Trie.pool, sizeof *Trie.pool * (size_t) Trie.capacity);

        if (!tmp) {
            perror("realloc()");
            return INVALID_OFFSET;
        }

        Trie.pool = tmp;
    }

    Node *const new = &Trie.pool[Trie.count];

    memset(new->children, INVALID_OFFSET, sizeof new->children);
    Trie.pool[Trie.count].terminal = false;
    return Trie.count++;
}

static bool insert_text(int32_t root_idx, const char *text)
{
    while (*text) {
        const int32_t idx = *text - ASCII_OFFSET;

        if (Trie.pool[root_idx].children[idx] == INVALID_OFFSET) {
            const int32_t new = alloc_node();

            if (new == INVALID_OFFSET) {
                return false;
            }

            Trie.pool[root_idx].children[idx] = new;
        }
        root_idx = Trie.pool[root_idx].children[idx];
        ++text;
    }

    return Trie.pool[root_idx].terminal = true;
}

static char *read_file(FILE *fp)
{
    static const size_t page_size = 4096;
    char *content = NULL;
    size_t len = 0;

    for (size_t rcount = 1; rcount; len += rcount) {
        void *const tmp = realloc(content, len + page_size);

        if (!tmp) {
            free(content);
            content = NULL;
/* ENOMEM is not a C standard error code, yet quite common. */
#ifdef ENOMEM
            errno = ENOMEM;
#endif
            return NULL;
        }
        content = tmp;
        rcount = fread(content + len, 1, page_size - 1, fp);

        if (ferror(fp)) {
            free(content);
            content = NULL;
#ifdef ENOMEM
            errno = ENOMEM;
#endif
            return NULL;
        }
    }
    content[len] = '\0';
    return content;
}

static size_t split_lines(char *restrict s, char ***restrict lines)
{
    const size_t chunk_size = 1024;
    size_t capacity = 0;
    size_t line_count = 0;

    while (s && *s) {
        if (line_count >= capacity) {
            char **const new = realloc(*lines,
                sizeof **lines * (capacity += chunk_size));

            if (!new) {
                free(*lines);
                *lines = NULL;
#ifdef ENOMEM
                errno = ENOMEM;
#endif
                return 0;
            }
            *lines = new;
        }
        (*lines)[line_count++] = s;
        s = strchr(s, '\n');

        if (s) {
            *s++ = '\0';
        }
    }
    return line_count;
}

static char ac_buffer[AC_BUFFER_CAP];
static size_t ac_buffer_sz;

static void ac_buffer_push(char ch)
{
    /* No check here, for we would have exited in parse_options() if the length
     * of the prefix was greater than AC_BUFFER_CAP.
     */
    ac_buffer[ac_buffer_sz++] = ch;
}

static void ac_buffer_pop(void)
{
    if (ac_buffer_sz > 0) {
        ac_buffer_sz--;
    }
}

static int32_t find_prefix(int32_t root_idx, const char *prefix)
{
    while (*prefix) {
        const int32_t child_idx = *prefix - ASCII_OFFSET;

        if (Trie.pool[root_idx].children[child_idx] == INVALID_OFFSET) {
            return INVALID_OFFSET;
        }

        root_idx = Trie.pool[root_idx].children[child_idx];
        ac_buffer_push(*prefix);
        ++prefix;
    }
    return root_idx;
}

static void dump_dot_prefix(FILE *sink, int32_t root_idx)
{
    for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
        if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
            const int32_t child_index = Trie.pool[root_idx].children[i];

            if (Trie.pool[child_index].terminal) {
                fprintf(sink,
                    "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
                    child_index, (char) (i + ASCII_OFFSET));
            } else {
                fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n", child_index,
                    (char) (i + ASCII_OFFSET));
            }

            fprintf(sink, "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
                root_idx, child_index, (char) (i + ASCII_OFFSET));
            dump_dot_prefix(sink, Trie.pool[root_idx].children[i]);
        }
    }
}

static void dump_dot_whole(FILE *sink)
{
    for (int32_t i = 0; i < Trie.count; ++i) {
        const int32_t index = i;

        for (size_t j = 0; j < CHILDREN_COUNT; ++j) {
            if (Trie.pool[i].children[j] != INVALID_OFFSET) {
                const int32_t child_index = Trie.pool[i].children[j];

                if (Trie.pool[child_index].terminal) {
                    fprintf(sink,
                        "\tNode_%" PRId32 " [label=%c,fillcolor=lightgreen]\n",
                        child_index, (char) (j + ASCII_OFFSET));
                } else {
                    fprintf(sink, "\tNode_%" PRId32 " [label=%c]\n",
                        child_index, (char) (j + ASCII_OFFSET));
                }

                fprintf(sink,
                    "\tNode_%" PRId32 " -> Node_%" PRId32 " [label=%c]\n",
                    index, child_index, (char) (j + ASCII_OFFSET));
            }
        }
    }
}

static void print_suggestions(int32_t root_idx)
{
    if (Trie.pool[root_idx].terminal) {
        fwrite(ac_buffer, ac_buffer_sz, 1, stdout);
        fputc('\n', stdout);
    }

    for (size_t i = 0; i < CHILDREN_COUNT; ++i) {
        if (Trie.pool[root_idx].children[i] != INVALID_OFFSET) {
            ac_buffer_push((char) (i + ASCII_OFFSET));
            print_suggestions(Trie.pool[root_idx].children[i]);
            ac_buffer_pop();
        }
    }
}

static bool populate_trie(int32_t root_idx, char **lines, size_t num_lines)
{
    for (size_t i = 0; i < num_lines; ++i) {
        if (!insert_text(root_idx, lines[i])) {
            return false;
        }
    }
    return true;
}

static bool generate_graph(void)
{
    if (system("dot -Tsvg " OUTPUT_DOT_FILE " -O")) {
        fprintf(stderr, "Error: failed to generate the .SVG file, %s.\n",
            strerror(errno));
        return false;
    }

    return true;
}

static bool generate_dot(int32_t root_idx, const char *prefix)
{
    FILE *const sink = fopen("graph.dot", "w");

    if (!sink) {
        perror("fopen()");
        return false;
    }

    fprintf(sink, "digraph Trie {\n"
        "\tnode [fillcolor=lightblue,style=filled,arrowhead=vee,color=black]\n"
        "\tNode_%" PRId32 " [label=%s]\n", root_idx, prefix ? prefix : "root");

    root_idx == 0 ? dump_dot_whole(sink) : dump_dot_prefix(sink, root_idx);
    fprintf(sink, "}\n");
    fclose(sink);
    return true;
}

static bool process_args(int32_t root_idx,
    const struct flags *restrict options, const char *restrict prefix,
    const char *restrict out_file)
{
    bool rv = true;

    /* Handle --complete first as we are going to modify root later in case
     * prefix is not a null pointer. 
     */
    if (options->cflag) {
        const int32_t subtree_idx = find_prefix(root_idx, prefix);

        if (subtree_idx == INVALID_OFFSET) {
            fprintf(stderr, "Error: Unable to find prefix.\n");
            rv = !rv;
            goto cleanup;
        }

        print_suggestions(subtree_idx);
    }

    if (options->sflag) {
        if (prefix) {
            const int32_t subtree_idx = find_prefix(root_idx, prefix);

            if (subtree_idx == INVALID_OFFSET) {
                fprintf(stderr, "Error: Unable to find prefix.\n");
                rv = !rv;
                goto cleanup;
            }
            root_idx = subtree_idx;
        }

        /* At this point, we can free the file, lines and Trie.pool, but we'd have
         * to change the structure of the code. */
        if (!generate_dot(root_idx, prefix)
            || !generate_graph()) {
            rv = !rv;
        }
    }

  cleanup:
    if (!options->kflag) {
        remove(out_file);
    }

    return rv;
}

int main(int argc, char *argv[])
{

    /* Sanity check. POSIX requires the invoking process to pass a non-NULL 
     * argv[0]. 
     */
    if (!argv[0]) {
        fprintf(stderr,
            "A NULL argv[0] was passed through an exec system call.\n");
        return EXIT_FAILURE;
    }

    static const struct option long_options[] = {
        { "keep", no_argument, NULL, 'k' },
        { "help", no_argument, NULL, 'h' },
        { "svg", no_argument, NULL, 's' },
        { "complete", required_argument, NULL, 'c' },
        { "prefix", required_argument, NULL, 'p' },
        { NULL, 0, NULL, 0 },
    };

    FILE *in_file = stdin;
    struct flags options = { false, false, false, false, false };
    const char *search_prefix = NULL;

    parse_options(long_options, &options, argc, argv, &search_prefix);

    if (options.sflag || options.cflag) {
        if ((optind + 1) == argc) {
            in_file = fopen(argv[optind], "r");
            if (!in_file) {
                perror(argv[optind]);
                return EXIT_FAILURE;
            }
        }
    } else {
        usage_err(PROGRAM_NAME);
    }

    if (optind > argc) {
        usage_err(PROGRAM_NAME);
    }

    char *const content = read_file(in_file);
    char **lines = NULL;
    const size_t num_lines = split_lines(content, &lines);

    if (!lines) {
        free(content);
        perror("fread()");
        return EXIT_FAILURE;
    }

    bool rv = true;
    int32_t root_idx = INVALID_OFFSET;

    if (!init_pool()
        || (root_idx = alloc_node()) == INVALID_OFFSET
        || !populate_trie(root_idx, lines, num_lines)) {
        rv = !rv;
        goto cleanup;

    }

    D(
        char *const total = calculate_size((size_t) Trie.capacity * sizeof *Trie.pool);
        char *const used = calculate_size((size_t) Trie.count * sizeof *Trie.pool);

        debug_printf("Total lines read: %zu.\n" "Total nodes allocated: %"
                    PRId32 ".\n" "Total nodes used: %" PRId32 ".\n"
                    "Total memory allocated: %s.\n" "Total memory used: %s.\n",
                    num_lines, 
                    Trie.capacity, 
                    Trie.count, 
                    total, 
                    used); 
        free(total);
        free(used);
    );
    
    rv = process_args(root_idx, &options, search_prefix, OUTPUT_DOT_FILE);

  cleanup:
    free_pool();                /* NO-OP if init_pool() returns false. */
    free(content);
    free(lines);

    if (in_file != stdin) {
        fclose(in_file);
    }

    return rv ? EXIT_SUCCESS : EXIT_FAILURE;
}

size.h:

#ifndef SIZE_H
#define SIZE_H 1

#include <stdint.h>

char *calculate_size(uint64_t size);

#endif                          /* SIZE_H */

size.c:

#include "size.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdint.h>

#define ARRAY_CARDINALITY(x) (sizeof(x) / sizeof (*(x)))

static const char *const sizes[] =
    { "EiB", "PiB", "TiB", "GiB", "MiB", "KiB", "B" };
static const uint64_t exbibytes =
    1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL * 1024ULL;

/* Acknowledgements:
 * @Jonathan Leffer - [Converting a number of bytes into a file size in C]
 * (https://stackoverflow.com/q/3898840/20017547)
 */
char *calculate_size(uint64_t size)
{
    char *const result = malloc(32);

    if (!result) {
        return NULL;
    }

    uint64_t multiplier = exbibytes;

    for (size_t i = 0; i < ARRAY_CARDINALITY(sizes); ++i, multiplier /= 1024) {
        if (size < multiplier) {
            continue;
        }

        if (size % multiplier == 0) {
            sprintf(result, "%" PRId64 " %s", size / multiplier, sizes[i]);
        } else {
            sprintf(result, "%.1lf %s", (double) size / (double) multiplier,
                sizes[i]);
        }
        return result;
    }

    return strcpy(result, "0");
}

#ifdef TEST_MAIN
int main(void)
{
    static const uint64_t list[] = {
        0, 1, 2, 34, 900, 1023, 1024, 1025, 2048, 1024 * 1024,
        1024 * 1024 * 1024 + 1024 * 1024 * 400
    };

    for (size_t i = 0; i < ARRAY_CARDINALITY(list); ++i) {
        char *const str = calculate_size(list[i]);

        if (!str) {
            perror("malloc()");
            return EXIT_FAILURE;
        }

        printf("%18" PRId64 " = %s\n", list[i], str);
        free(str);
    }

    return EXIT_SUCCESS;
}
#endif

And here's the makefile for the project:

CFLAGS 	= -std=c17
CFLAGS 	+= -no-pie
CFLAGS	+= -ftrapv
CFLAGS 	+= -Wall
CFLAGS 	+= -Wextra
CFLAGS 	+= -Warray-bounds
CFLAGS 	+= -Wconversion
CFLAGS 	+= -Wmissing-braces
CFLAGS 	+= -Wno-parentheses
CFLAGS 	+= -Wno-format-truncation
CFLAGS	+= -Wno-type-limits
CFLAGS 	+= -Wpedantic
CFLAGS 	+= -Wstrict-prototypes
CFLAGS 	+= -Wwrite-strings
CFLAGS 	+= -Winline
CFLAGS 	+= -D_FORTIFY_SOURCE=2

BIN 		 := trie
INSTALL_PATH := /usr/local/bin
SRCS		 := trie.c 

ifeq ($(MAKECMDGOALS),debug)
SRCS += size.c
endif

all: CFLAGS += -s -O2
all: $(BIN)

debug: CFLAGS += -g3 -ggdb -fanalyzer -DDEBUG
debug: $(BIN)

$(BIN): $(SRCS)

install: $(BIN)
	install $< $(INSTALL_PATH)

uninstall:
	rm $(INSTALL_PATH)/$(BIN)

clean:
	$(RM) $(BIN) 

.PHONY: all debug clean install uninstall 
.DELETE_ON_ERROR:

The code was formatted with GNU Indent, compiles without any warnings with Clang, GCC, and MinGW on FreeBSD 14.0, Linux Mint 21.2, and Windows 10.0, and reports nothing awry under valgrind.

Although I did have to use gmake on FreeBSD because of some non-standard GNU extensions.

You could also clone this github repository: trie

Test run:

Here are the results of a sample run:

>> free && sync && echo 3 >| /proc/sys/vm/drop_caches && free

>> time ./trie --complete tran ~/data-set/wordlists/huge.txt 1> /dev/null
./trie --complete tran ~/data-set/wordlists/huge.txt > /dev/null  0.19s user 0.57s system 97% cpu 0.775 total

>> wc ~/data-set/wordlists/huge.txt
172819  172820 1916146

Review Goals:

Do you see any issues with the design/structure of the program?
Does any part of my code exhibit undefined/implementation-defined behavior?
General coding comments, style, et cetera.

Do you have a faster alternative to Graphviz dot to suggest?

c Trie graph-visualization autocomplete

posted over 1 year ago

CC BY-NC-SA 4.0

1y ago by Alexei‭

Melkor-1‭

134 reputation 7 0 26 17

Raw

Markdown

History

is a duplicate

This question has been asked before and has already been answered. It should be marked as a duplicate.

Please enter the URL of the proposed duplicate in the details field below.

not constructive

This question cannot be answered in a way that is helpful to anyone. It's not possible to learn something from possible answers, except for the solution for the specific problem of the asker.

0 comment threads

1 answer

Score Active Age

−0

Worked for Melkor-1‭

The following users marked this post as Works for me:

User	Comment	Date
Melkor-1‭	(no comment)	Mar 7, 2024 at 08:56

General/program design:

I would have expected a public header file with the API for the whole thing, rather than having some main() test case calling static functions from the same file. Sure, this program is small, but having everything in a single file makes things less maintainable.
A unified error type/handling throughout the whole library would be nice. Rather than handling errors in many different ways: returning null pointers, -1, bools on a case by case basis. Normally done with an enum instead. Document which error codes each function may return. You can use the same error type internally as you use for the exposed API to the user.
Weird use of restrict. It doesn't make sense to use restrict for example here:
```
static void parse_options(const struct option *restrict long_options,
 struct flags *restrict opt_ptr, int argc, char *const argv[],
 const char **restrict prefix)
```
None of these types may alias and you have no file scope variables of those types either which could confuse the optimizer. Just drop restrict. You are using it at a couple of other strange places as well. A char* won't alias with a char*** and so on, so no use for restrict.
The code isn't thread-safe because of Trie and other file scope variables/statics, but maybe that's not a concern. One possible improvement if you'd want to add a GUI in the future would otherwise be to outsource stuff like file handling to separate threads, not to lag up the rest of the application.
You could make Trie use a flexible array member instead and allocate the whole struct dynamically. Probably won't improve performance in any notable way though.
I don't recommend compiling with -Wconversion. That one is prone to false positives. Use it temporarily for code review purposes, treat the results with skepticism and then turn it off again.

Performance:

Calling realloc repeatedly in a loop gives bad performance. You do this at a couple of places. Probably dwarfed by the file I/O etc, but still.

You could consider using a common reallocation pattern where we start by allocating a "large enough" chunk, often at some suitable aligned size (divisible by 8 on 64 bit systems etc). Then keep track of how much of that memory that is actually used. When running out/low of it, realloc twice that amount. And next time you run out of it, twice the amount again, making it grow exponentially. Never realloc to a smaller size. This is a speed over memory use optimization.

Bad practices:

You should avoid assignment inside conditions like the plague, since it is such a well-known source of bugs (and usually results in compiler warnings because of it).

Consider something like this instead:
```
int c=0;
while(c != -1)
{
  c = getopt_long(argc, argv, "shkc:p:", long_options, NULL);
  ...
  switch(c)
  ...
}
```
Or maybe even while(1) ... case -1: ... if that makes more sense.
You use a lot of *const qualified pointers which is just odd and don't really add anything except potential pointer conversion issues. For example char *const argv[] boils down to char*const* which is probably not what anyone expected.

"Three star programming". There are a few valid uses for char*** but generally they are just code smell. split_lines in particular is a seriously evil-looking function, I would recommend to rewrite it. Maybe something like this? (not tried, obviously, so treat it as pseudo code)

static char** split_lines(char* s, size_t* line_count)
{
    char** lines = NULL;
    const size_t chunk_size = 1024;
    size_t capacity = 0;
    line_count = 0;

    for(; s != NULL && *s != '\0'; s++)
    {
        if (line_count >= capacity) 
        { 
          ... 
        }

        lines[*line_count] = s;
        (*line_count)++;

        s = strchr(s, '\n');
        if (s != NULL) 
        {
            *s = '\0';
        }
    }

    return lines;   
}

Similarly, do not design a "leaky API" where your lib does some allocation internally and then leave it the caller to clean up. Such designs probably stand for the majority of all memory leaks in C, rather than leaks through bugs.

Instead do like the free_pool case, where the one responsible for the allocation (your lib) is also responsible for the clean-up through a provided function. The above rewrite of split_lines should come with a corresponding clean-up function.
"Magic numbers". Don't type out stuff like 1024 in the middle of the code, where did that number come from? Place it behind a named identifier/macro with a meaningful name instead. Some times you do this properly, some times you don't.

Impl.defined behavior (minor):

ULL does not necessarily correspond to uint64_t, that's implementation-defined behavior and non-portable. If portability is important, you could instead use the standard C constant UINT64_C(1024) which results in a portable integer constant of type uint_least64_t.

Style (minor):

For long function declarations/definitions with a lot of parameters, consider different formatting for readability. This style is common:

static void parse_options (const struct option*  long_options,
                           struct flags*         opt_ptr, 
                           int                   argc, 
                           char *const           argv[],   
                           const char**          prefix)

You keep mixing "K&R { brace" style with { on a line of its own style. Both of these styles are fine but pick one and use it consistently.
while (*text) should ideally be replaced with explicit, self-documenting style while (*text != '\0') to avoid mix-ups with cases where you compare a pointer against null etc. This program here has a lot of null termination checks and a lot of null pointer checks, so you should be explicit each time.
new is a bad identifier name since it is a reserved keyword in C++. Even if you never care for porting/compatibility with C++, you should still avoid naming identifiers after keywords from that language, since IDE style formatting tends to have a "C/C++" setting and then it will format new in weird ways.

posted over 1 year ago

CC BY-NC-SA 4.0

Lundin‭

9429 reputation 40 128 1088 177

Copy Link

Raw