There is very efficient assoc. array C language implementation used in php source code.
/*
* HashTable Data Layout
* =====================
*
* +=============================+
* | HT_HASH(ht, ht->nTableMask) |
* | ... |
* | HT_HASH(ht, -1) |
* +-----------------------------+
* ht->arData ---> | Bucket[0] |
* | ... |
* | Bucket[ht->nTableSize-1] |
* +=============================+
*/
Structures:
typedef struct _Bucket {
zval val;
zend_ulong h; /* hash value (or numeric index) */
zend_string *key; /* string key or NULL for numerics */
} Bucket;
typedef struct _zend_array HashTable;
struct _zend_array {
zend_refcounted_h gc;
union {
struct {
ZEND_ENDIAN_LOHI_4(
zend_uchar flags,
zend_uchar _unused,
zend_uchar nIteratorsCount,
zend_uchar _unused2)
} v;
uint32_t flags;
} u;
uint32_t nTableMask;
Bucket *arData;
uint32_t nNumUsed;
uint32_t nNumOfElements;
uint32_t nTableSize;
uint32_t nInternalPointer;
zend_long nNextFreeElement;
dtor_func_t pDestructor;
};
example function:
static zend_always_inline Bucket *zend_hash_find_bucket(const HashTable *ht, zend_string *key)
{
zend_ulong h;
uint32_t nIndex;
uint32_t idx;
Bucket *p, *arData;
h = zend_string_hash_val(key);
arData = ht->arData;
nIndex = h | ht->nTableMask; //index calculation
idx = HT_HASH_EX(arData, nIndex);
while (EXPECTED(idx != HT_INVALID_IDX)) {
p = HT_HASH_TO_BUCKET_EX(arData, idx);
if (EXPECTED(p->key == key)) { /* check for the same interned string */
return p;
} else if (EXPECTED(p->h == h) &&
EXPECTED(p->key) &&
EXPECTED(ZSTR_LEN(p->key) == ZSTR_LEN(key)) &&
EXPECTED(memcmp(ZSTR_VAL(p->key), ZSTR_VAL(key), ZSTR_LEN(key)) == 0)) {
return p;
}
idx = Z_NEXT(p->val);
}
return NULL;
}
h is a big integer returned by a hash function.
The question is: Why index calculating this way?
nIndex = h | ht->nTableMask; //index calculation
Why not simple remainder of the division h integer on hashtable size?
nIndex = h & (ht->nTableSize - 1); //analog: nIndex = h % ht->nTableSize
This is to make the number negative. The layout of the hash table is really brain-dead (Zend/zend_types.h):
/*
* HashTable Data Layout
* =====================
*
* +=============================+
* | HT_HASH(ht, ht->nTableMask) |
* | ... |
* | HT_HASH(ht, -1) |
* +-----------------------------+
* ht->arData ---> | Bucket[0] |
* | ... |
* | Bucket[ht->nTableSize-1] |
* +=============================+
*/
The ht->nTableMask
is an integer that interpreted as 2's complement is negative, the intent is that by ORring with this, and converting to int32_t
you get a negative offset from ht->arData
. Then the ht->arData
which is of type pointer to Bucket is cast to pointer to uint32_t
and that pointer is indexed using negative indexes. I.e. all this dubious trickery exists to not need to have 2 pointers per hash table but using 1 that points to the middle of the data structure.
A modulo using AND and subtracting from ht->arData
would have sufficed, and resulted in identical operation - it seems that this has been hand-optimized to be fast on some bad compiler.