Skip to main content

Stack Exchange Network

Stack Exchange network consists of 183 Q&A communities including Stack Overflow, the largest, most trusted online community for developers to learn, share their knowledge, and build their careers.

Visit Stack Exchange
Asked
Modified 13 days ago
Viewed 362 times
3
\$\begingroup\$

I started implementing my own hashmap in C after taking inspiration from Git's hashmap API. As far as I know, the closest thing to templates in C are macros. I've seen that they are used in uthash or khash single header files, but I don't know how and when to use them correctly. I'm currently stuck, my code is a mess, I rewrote it many times, I really don't know what to do. Any help would be appreciated

hashmap.h:

#ifndef HASHMAP_H
#define HASHMAP_H

#include <string.h>

typedef struct hashmap_entry {
    void *key;
    void *data;
    struct hashmap_entry *next;
} hashmap_entry_t;

typedef unsigned int (*hashmap_hash_fn)(const void *key);
typedef int (*hashmap_cmp_fn)(const void *key1, const void *key2);

typedef struct {
    hashmap_entry_t **table;
    size_t size;
    size_t count;
    size_t grow_at;
    size_t shrink_at;
    hashmap_hash_fn hash_fn;
    hashmap_cmp_fn cmp_fn;
} hashmap_t;

unsigned int fnv1a_hash_str(const void *key);
unsigned int fnv1a_hash_int(const void *key);

void hashmap_init(hashmap_t *map, size_t initial_size, hashmap_hash_fn hash_fn,
          hashmap_cmp_fn cmp_fn);
void hashmap_release(hashmap_t *map);
int hashmap_add(hashmap_t *map, void *key, void *data);
void hashmap_rehash(hashmap_t *map, size_t newsize);
hashmap_entry_t *hashmap_get_entry(hashmap_t *map, const void *key);
void *hashmap_get(hashmap_t *map, const void *key);
void *hashmap_remove_entry(hashmap_t *map, hashmap_entry_t *entry);
void *hashmap_remove(hashmap_t *map, const void *key);
int hashmap_cleanremove(hashmap_t *map, const void *key);

static inline int hashmap_cmp_str(const void *key1, const void *key2) {
    return strcmp((const char *)key1, (const char *)key2) == 0;
}

static inline int hashmap_cmp_int(const void *key1, const void *key2) {
    return *(const int *)key1 == *(const int *)key2;
}

#endif

hashmap.c:

#include "hashmap.h"

#include <stdlib.h>
#include <string.h>

#define FNV32_PRIME 0x01000193
#define FNV32_BASE 0x811c9dc5

#define HASHMAP_INITIAL_SIZE 64
#define HASHMAP_LOAD_FACTOR 70

unsigned int fnv1a_hash_str(const void *key) {
    unsigned int hash = FNV32_BASE;
    for (const unsigned char *c = (const unsigned char *)key; *c; c++)
        hash = (hash ^ *c) * FNV32_PRIME;
    return hash;
}

unsigned int fnv1a_hash_int(const void *key) {
    unsigned int hash = FNV32_BASE;
    const unsigned char *b = (const unsigned char *)key;
    for (size_t i = 0, size = sizeof(int); i < size; i++)
        hash = (hash ^ b[i]) * FNV32_PRIME;
    return hash;
}

static void alloc_table(hashmap_t *map, size_t size) {
    memset(map, 0, sizeof(*map));
    map->size = size;
    map->table = calloc(size, sizeof(hashmap_entry_t *));
    if (!map->table) {
        free(map->table);
        // die
    }
    map->grow_at = size * HASHMAP_LOAD_FACTOR / 100;
    if (size > HASHMAP_INITIAL_SIZE)
        map->shrink_at = map->grow_at / 3 + 1;
}

void hashmap_init(hashmap_t *map, size_t initial_size, hashmap_hash_fn hash_fn, hashmap_cmp_fn cmp_fn) {
    if (!initial_size)
        initial_size = HASHMAP_INITIAL_SIZE;
    alloc_table(map, initial_size);
    map->hash_fn = hash_fn;
    map->cmp_fn = cmp_fn;
}

void hashmap_release(hashmap_t *map) {
    for (size_t i = 0; i < map->size; i++) {
        hashmap_entry_t *e = map->table[i];
        while (e) {
            hashmap_entry_t *next = e->next;
            free(e->key);
            free(e->data);
            free(e);
            e = next;
        }
    }
    free(map->table);
}

static void hashmap_add_entry(hashmap_t *map, hashmap_entry_t *entry) {
    size_t b = map->hash_fn(entry->key) % map->size;
    entry->next = map->table[b];
    map->table[b] = entry;

    map->count++;
    if (map->count == map->grow_at)
        hashmap_rehash(map, map->size * 2);
}

int hashmap_add(hashmap_t *map, void *key, void *data) {
    size_t b = map->hash_fn(key) % map->size;
    hashmap_entry_t *e = map->table[b];
    while (e) {
        if (map->cmp_fn(e->key, key)) {
            return 1;
        }
        e = e->next;
    }

    e = malloc(sizeof(hashmap_entry_t));
    if (!e) {
        free(e);
        // die
    }
    memset(e, 0, sizeof(*e));
    e->key = (void *)key;
    e->data = data;
    e->next = map->table[b];
    map->table[b] = e;

    map->count++;
    if (map->count == map->grow_at)
        hashmap_rehash(map, map->size * 2);
    return 0;
}

void hashmap_rehash(hashmap_t *map, size_t newsize) {
    size_t oldsize = map->size;
    hashmap_entry_t **oldtable = map->table;
    hashmap_hash_fn oldhash_fn = map->hash_fn;
    hashmap_cmp_fn oldcmp_fn = map->cmp_fn;

    alloc_table(map, newsize);
    map->hash_fn = oldhash_fn;
    map->cmp_fn = oldcmp_fn;
    for (size_t i = 0; i < oldsize; i++) {
        hashmap_entry_t *e = oldtable[i];
        while (e) {
            hashmap_entry_t *next = e->next;
            e->next = NULL;
            hashmap_add_entry(map, e);
            e = next;
        }
    }
    free(oldtable);
}

hashmap_entry_t *hashmap_get_entry(hashmap_t *map, const void *key) {
    size_t b = map->hash_fn(key) % map->size;
    hashmap_entry_t *e = map->table[b];
    while (e) {
        if (map->cmp_fn(e->key, key))
            return e;
        e = e->next;
    }
    return NULL;
}

void *hashmap_get(hashmap_t *map, const void *key) {
    hashmap_entry_t *e = hashmap_get_entry(map, key);
    return e ? e->data : NULL;
}

void *hashmap_remove_entry(hashmap_t *map, hashmap_entry_t *entry) {
    void *data = entry->data;
    free(entry->key);
    free(entry);
    entry = NULL;

    map->count--;
    if (map->shrink_at == map->count)
        hashmap_rehash(map, map->size / 2);
    return data;
}

void *hashmap_remove(hashmap_t *map, const void *key) {
    size_t b = map->hash_fn(key) % map->size;
    hashmap_entry_t *e = map->table[b];
    hashmap_entry_t *prev = NULL;
    while (e) {
        if (map->cmp_fn(e->key, key)) {
            if (prev)
                prev->next = e->next;
            else
                map->table[b] = e->next;
            return hashmap_remove_entry(map, e);
        }
        prev = e;
        e = e->next;
    }
    return NULL;
}

int hashmap_cleanremove(hashmap_t *map, const void *key) {
    void *data = hashmap_remove(map, key);
    if (!data)
        return 1;
    free(data);
    return 0;
}
\$\endgroup\$

2 Answers 2

4
\$\begingroup\$

This error handling (in hashmap_add()) is incomplete:

    e = malloc(sizeof(hashmap_entry_t));
    if (!e) {
        free(e);
        // die
    }

Given that e is a null pointer inside that block, then free(e) is a no-op and can safely be omitted. On the other hand, the die comment isn't a very effective strategy. We need to tell the caller that we couldn't perform the requested action; we would normally do that by returning immediately with a suitable error code (in this function, probably any non-zero value - but the function documentation should tell users what that means).


The big concern with this code is that it's not at all templated, contrary to the title. So we're using void* instead of specific types. One solution to that is to keep the code you have written and tested, but then write a macro that defines a set of functions for working with a specific type. You can then use the macro once for each type you want a hash table for.

Example (untested, as I'm about to head home for the day):

#define DECLARE_HASHMAP(key_type, value_type, hash_fn, cmp_fun) \
    int hashmap_##key_type##_##value_type##_init(hashmap_t *map, size_t initial_size) \
        { return hashmap_init(map, initial_size, hash_fn, cmp_fun); }      \
    int hashmap_##key_type##_##value_type##_add(hashmap_t *map, key_type const *key, value_type const *data) \
        { return hashmap_add(map, key, data); }
    // ... more functions with key_type and value_type in the name

A bit better would be to use a different hashmap_t for each pair of types.

\$\endgroup\$
3
\$\begingroup\$

Any help would be appreciated

There is much to address before templates.

Consider handling the item to store in a generic way (perhaps use opaque structure pointer for now). Get that hash table interface right right and then move to templates.


.h review

.h vs. .c

I'd expect to understand a great deal of the overall coding goal by examining the .h file. This is the external interface users see and high level descriptions and comments belong there. As this code lacks comments this first thing is to provide high level and meaningful comments there.

I'd expect functions descriptions to accompany the declarations.

I assume the user never sees the .c file and all necessary info for the user and for code to interface this function set are in the .h file.

Scale to the system memory

The hash function returns an unsigned which scales (approximately) with the processor width. Yet size_t scales more directly with the memory and its the memory that is what hash tables are mostly about (along with the low O()). unsigned is not bad, yet I would consider returning an size_t instead. This is more important on implementations where the unsigned is widest and size_t not so wide - or vise-versa.

Unneeded cast

Cast not needed going from const void * to const char *

//return strcmp((const char *)key1, (const char *)key2) == 0;
return strcmp((const char *)key1, (const char *)key2) == 0;

Naming

Good use of hashmap_ to prefix objects to restrict use of namespace to a narrow area.

Content hiding

hashmap_entry_t and hashmap_entry_t definitions are not needed in the .h file. It is sufficient to have their declarations like typedef hashmap_entry_t; and hide the structure internals from the higher level user. This may oblige some helper function to access select structure members.

.c review

Includes

Good use of first including "hashmap.h" as this tests that the .h does all the includes it needs.

Unneeded cast

Going from const void * to const unsigned char * does not oblige a cast.

// const unsigned char *b = (const unsigned char *)key;
const unsigned char *b = key;

unsigned may be 16 bit

Consider uint_least32_t or the like for fnv1a_hash_str(), fnv1a_hash_int(), ...

Consider sizing to the referenced object and not type

Easier to code right, review and maintain.

// map->table = calloc(size, sizeof(hashmap_entry_t *));
map->table = calloc(size, sizeof map->table[0]);

Better to die with an exit()

Instead of a comment // die, use exit(EXIT_FAILURE);.

Consider returning a pointer

Rather than burden the user with knowing the hashmap_t size, just return a pointer (or NULL on failure) to maintain information hiding.

Consider edge cases

Good that hashmap_init(&map, 0, ...) is well handled.

Debug tip

When in debug mode, consider zeroing (or randomizing) the data before freeing it. This more likely exhibits errors of using data after a free().

Some compilers will optimize out a memset() before a free(). If so, consider C23's memset_explicit() or an equivalent helper function

// Sample idea
#ifdef NDEBUG
  #define my_memzero(p, size)
#else
  void my_memzero(void *p, size_t size) {
    #if __STDC_VERSION__ > 202311
      memset_explicit(p, size, 0);
    #else
      volatile unsigned char *ptr = a;
      for (size_t i == 0; i < size; i++) {
        ptr[i] = 0;
      }
    #endif
  #endif
#endif

Now some deeper thoughts

size_t b = map->hash_fn(key) % map->size; is weak. Note that the default size is 64 and doubles at each growth point. This effectively makes % map->size a mask of the lower bits of the result of map->hash_fn(key). It is better to use % map->size with some prime number to allow all bits of map->hash_fn(key) to contribute to the hash.

I recommend coding a table of primes, each just under a power of 2 and using an index into this table for the hash table size. On growth, move that index up to the next odd index. On shrinkage, go down to the next even index.

 static const size_t ht_size[[] = { 0, 1, 3, 7, 13, 31, 61, ... };

Deeper: I'd go with { 0, 13, 31, 61, ... just to get off thrashing of small hash table that grow/shrink. Also some allocators work better with a value under of a power of 2 minus some offset for the allocation overhead. Thus once the size is at least a few dozen byte, finding a prime less that a power of 2 minus 2 pointer widths is useful. This is an advanced optimization.

Empty table

When a table is empty, minimize that memory impact. An application may have many (thousands, millions) of hash tables, many of which are empty).

Note that some advanced memory management often does not truly allocate physical memory until memory is used, so this is less of an issue. Still I like collections of data, like hash tables, to be minimal until the first item is added.

\$\endgroup\$

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.

Morty Proxy This is a proxified and sanitized view of the page, visit original site.