13 Aug 2025
Say you want to write an integer hash-set implementation in C. You might start like this:
/* a hash-set implemented with a flat open-addressed array with linear probing */
struct set {
int *entries;
size_t N;
};
void set_init(struct set *set, size_t initial) {
/* make sure N is a power of 2, with enough capacity to minimize collisions */
for (set->N = 8; set->N < initial * 2; set->N *= 2) ;
set->entries = calloc(set->N, sizeof *set->entries);
}
unsigned inthash(unsigned x) { ... }
bool set_contains(struct set *set, int x) {
int h = inthash(x) & (set->N - 1), i = h;
do {
----> if (entry i is empty) <----
return 0;
else if (set->entries[i] == x)
return 1;
i = (i + 1) & (set->N - 1);
} while (i != h);
/* set is full, looped around */
return 0;
}
void _set_grow(struct set *set) { ... } // omitted for brevity
void set_put(struct set *set, int x) {
int h = inthash(x) & (set->N - 1), i = h;
do {
----> if (entry i is empty) <----
set->entries[i] = x;
else if (set->entries[i] == x)
return; /* nothing to do */
i = (i + 1) & (set->N - 1);
} while (i != h);
/* full set, looped around */
_set_grow(set);
set_put(set, x);
}
The problem is, if every array item can hold a key, how do you mark an entry as empty? There's two ways you can go about this:
bool *hasentry
also of size N,
which for every index stores whether the corresponding entry is used or
empty. Initialized to zeroes and updated when new entries are added in
set_put
. Instead of 1 byte per entry with a bool array, you can be more
efficient using a bit array. This is the generalized approach that is useful
if the values the set is storing are truly arbitrary.So the easy case is if your sentinel value is zero, then it's a matter of changing
if (entry i is empty)
into
if (!set->entries[i])
and adding a check like
void set_put(struct set *set, int x) {
assert(x != 0 && "illegal value");
...
}
Note that in set_init
(and _set_grow
), the use of calloc
already zeroes
the whole array, marking it initially as all empty, exactly as we want.
But in practice, very often the integers you might wanna store in the set are not arbitrary (so you can use the sentinel technique), but zero is a value you would like to be able to store, thus not a good sentinel. In this case you have to fallback to a non-zero sentinel, for example
#define _SET_EMPTY INT_MIN
void set_init(struct set *set, size_t initial) {
...
set->entries = malloc(set->N);
for (int i = 0; i < set->N; ++i) set->entries[i] = _SET_EMPTY;
}
bool set_contains(struct set *set, int x) {
...
if (set->entries[i] == _SET_EMPTY)
return 0;
...
}
void _set_grow(struct set *set) {
int *old_entries = set->entries;
size_t old_N = set->N;
set->entries = malloc(set->N *= 2);
for (int i = 0; i < set->N; ++i) set->entries[i] = _SET_EMPTY;
...
}
void set_put(struct set *set, int x) {
assert(x != _SET_EMPTY && "illegal value");
...
if (set->entries[i] == _SET_EMPTY)
set->entries[i] = x;
...
}
This works fine, but we lose the nicety of calloc
.
My trick combines the benefits of the zero-sentinel (initialization for free) and an arbitrary sentinel by simply storing the value XORed with the sentinel.
calloc
s for zero initializationif (entry i is empty)
becomes if (!set->entries[i])
Thus:
bool set_contains(struct set *set, int x) {
int h = inthash(x) & (set->N - 1), i = h;
do {
if (!set->entries[i])
return 0;
else if ((set->entries[i] ^ _SET_EMPTY) == x) // <--------
return 1;
i = (i + 1) & (set->N - 1);
} while (i != h);
/* set is full, looped around */
return 0;
}
void set_put(struct set *set, int x) {
int h = inthash(x) & (set->N - 1), i = h;
assert(x != _SET_EMPTY && "illegal value");
do {
if (!set->entries[i])
set->entries[i] = x ^ _SET_EMPTY; // <--------
else if ((set->entries[i] ^ _SET_EMPTY) == x) // <-------
return; /* nothing to do */
i = (i + 1) & (set->N - 1);
} while (i != h);
/* full set, looped around */
_set_grow(set);
set_put(set, x);
}
This works because of the mathematical properties of bitwise XOR:
x^y == 0
if and only if x == y
,x^y = z
then z^y = x
and x^z = y
(XOR is reversible)In situations where calloc
is more efficient than malloc
+ manual
initialization, this is a potentially more efficient solution, especially if
the size of your set is unpredictable and could grow multiple times.