Search code examples
calgorithmtreeassociative-arrayb-tree

Implementation of in-memory B-Tree insertion


I've been creating an in-memory C implementation of a B-tree because I found none online which were readable (ie. This horrid code: http://www.freewebs.com/attractivechaos/kbtree.h.html).

It doesn't quite work because occasionally when inserting elements it will fail to find previously inserted elements. Also I'm not sure if my general implementation is very good and that I'm doing the insertion in a wise way. Can anyone criticise what I've done and find out why it doesn't work all the time?

Apparently B-Trees can be more efficient than Red-black or AVL trees, because elements are stored together in memory for each node. That got me interested.

Please note that the "order" is the number of elements and not the number of children pointers. The reason is simply because it made more sense to me.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define CB_BTREE_ORDER 8
#define CB_BTREE_HALF_ORDER CB_BTREE_ORDER/2

typedef struct{
    void * parent;
    void * children[CB_BTREE_ORDER + 1];
    unsigned char numElements;
} CBBTreeNode;

typedef struct{
    unsigned char found;
    CBBTreeNode * node;
    unsigned char pos;
} CBFindResult;

typedef struct{
    unsigned char keySize;
    unsigned char dataSize;
    int nodeSize;
    CBBTreeNode * root;
} CBAssociativeArray;

CBFindResult CBAssociativeArrayFind(CBAssociativeArray * self, unsigned char * key);
void CBAssociativeArrayInsert(CBAssociativeArray * self, unsigned char * key, void * data, CBFindResult pos, CBBTreeNode * right);
CBFindResult CBBTreeNodeBinarySearch(CBBTreeNode * self, unsigned char * key, unsigned char keySize);
void CBInitAssociativeArray(CBAssociativeArray * self, unsigned char keySize, unsigned char dataSize);

CBFindResult CBAssociativeArrayFind(CBAssociativeArray * self, unsigned char * key){
    CBFindResult result;
    CBBTreeNode * node = self->root;
    for (;;) {
        result = CBBTreeNodeBinarySearch(node, key, self->keySize);
        if (result.found){
            result.node = node;
            return result;
        }else{
            if (node->children[result.pos])
                node = node->children[result.pos];
            else{
                result.node = node;
                return result;
            }
        }
    }
}
void CBAssociativeArrayInsert(CBAssociativeArray * self, unsigned char * key, void * data, CBFindResult pos, CBBTreeNode * right){
    // See if we can insert data in this node
    unsigned char * keys = (unsigned char *)(pos.node + 1);
    unsigned char * dataElements = keys + self->keySize * CB_BTREE_ORDER;
    if (pos.node->numElements < CB_BTREE_ORDER) {
        if (pos.node->numElements > pos.pos){
            memmove(keys + (pos.pos + 1) * self->keySize, keys + pos.pos * self->keySize, (pos.node->numElements - pos.pos) * self->keySize);
            memmove(dataElements + (pos.pos + 1) * self->dataSize, dataElements + pos.pos * self->dataSize, (pos.node->numElements - pos.pos) * self->dataSize);
            memmove(pos.node->children + pos.pos + 2, pos.node->children + pos.pos + 1, (pos.node->numElements - pos.pos) *  sizeof(*pos.node->children));
        }
        memcpy(keys + pos.pos * self->keySize, key, self->keySize);
        memcpy(dataElements + pos.pos * self->dataSize, data, self->dataSize);
        pos.node->children[pos.pos + 1] = right;
        pos.node->numElements++;
    }else{
        CBBTreeNode * new = malloc(self->nodeSize);
        unsigned char * newKeys = (unsigned char *)(new + 1);
        unsigned char * newData = newKeys + self->keySize * CB_BTREE_ORDER;
        new->numElements = CB_BTREE_HALF_ORDER;
        pos.node->numElements = CB_BTREE_HALF_ORDER;
        unsigned char * midKey;
        unsigned char * midVal;
        if (pos.pos >= CB_BTREE_HALF_ORDER) {
            if (pos.pos == CB_BTREE_HALF_ORDER) {
                memcpy(newKeys, keys + CB_BTREE_HALF_ORDER * self->keySize, CB_BTREE_HALF_ORDER * self->keySize);
                memcpy(newData, dataElements + CB_BTREE_HALF_ORDER * self->dataSize, CB_BTREE_HALF_ORDER * self->dataSize);
                memcpy(new->children + 1, pos.node->children + CB_BTREE_HALF_ORDER + 1, CB_BTREE_HALF_ORDER * sizeof(*new->children));
                new->children[0] = right;
                midKey = key;
                midVal = data;
            }else{
                if (pos.pos > CB_BTREE_HALF_ORDER + 1){
                    memcpy(newKeys, keys + (CB_BTREE_HALF_ORDER + 1) * self->keySize, (pos.pos - CB_BTREE_HALF_ORDER - 1) * self->keySize);
                    memcpy(newData, dataElements + (CB_BTREE_HALF_ORDER + 1) * self->dataSize, (pos.pos - CB_BTREE_HALF_ORDER - 1)  * self->dataSize);
                }
                memcpy(newKeys + (pos.pos - CB_BTREE_HALF_ORDER - 1) * self->keySize, key, self->keySize);
                memcpy(newData + (pos.pos - CB_BTREE_HALF_ORDER - 1) * self->dataSize, data, self->dataSize);
                memcpy(newKeys + (pos.pos - CB_BTREE_HALF_ORDER) * self->keySize, keys + pos.pos * self->keySize, (CB_BTREE_ORDER - pos.pos) * self->keySize);
                memcpy(newData + (pos.pos - CB_BTREE_HALF_ORDER) * self->dataSize, dataElements + pos.pos * self->dataSize, (CB_BTREE_ORDER - pos.pos) * self->dataSize); // o 0 i 1 ii 2 iii 3 iv
                memcpy(new->children, pos.node->children + CB_BTREE_HALF_ORDER + 1, (pos.pos - CB_BTREE_HALF_ORDER) * sizeof(*new->children));
                new->children[pos.pos - CB_BTREE_HALF_ORDER] = right;
                if (CB_BTREE_ORDER > pos.pos)
                    memcpy(new->children + pos.pos - CB_BTREE_HALF_ORDER + 1, pos.node->children + pos.pos + 1, (CB_BTREE_ORDER - pos.pos) * sizeof(*new->children));
                midKey = keys + CB_BTREE_HALF_ORDER * self->keySize;
                midVal = dataElements + CB_BTREE_HALF_ORDER * self->dataSize;
            }
        }else{
            memcpy(newKeys, keys + CB_BTREE_HALF_ORDER * self->keySize, CB_BTREE_HALF_ORDER * self->keySize);
            memcpy(newData, dataElements + CB_BTREE_HALF_ORDER * self->dataSize, CB_BTREE_HALF_ORDER * self->dataSize);
            memcpy(new->children, pos.node->children + CB_BTREE_HALF_ORDER, (CB_BTREE_HALF_ORDER + 1) * sizeof(*new->children));
            memmove(keys + (pos.pos + 1) * self->keySize, keys + pos.pos * self->keySize, (CB_BTREE_HALF_ORDER - pos.pos) * self->keySize);
            memmove(dataElements + (pos.pos + 1) * self->dataSize, dataElements + pos.pos * self->dataSize, (CB_BTREE_HALF_ORDER - pos.pos) * self->dataSize);
            if (CB_BTREE_HALF_ORDER > 1 + pos.pos)
                memmove(pos.node->children + pos.pos + 2, pos.node->children + pos.pos + 1, (CB_BTREE_HALF_ORDER - pos.pos - 1) * self->dataSize);
            memcpy(keys + pos.pos * self->keySize, key, self->keySize);
            memcpy(dataElements + pos.pos * self->dataSize, data, self->dataSize);
            pos.node->children[pos.pos + 1] = right;
            midKey = keys + CB_BTREE_HALF_ORDER * self->keySize;
            midVal = dataElements + CB_BTREE_HALF_ORDER * self->dataSize;
        }
        if ( ! pos.node->parent) {
            self->root = malloc(self->nodeSize);
            self->root->numElements = 0;
            self->root->parent = NULL;
            pos.node->parent = self->root;
            self->root->children[0] = pos.node;
        }
        new->parent = pos.node->parent;
        CBFindResult res = CBBTreeNodeBinarySearch(pos.node->parent, midKey, self->keySize);
        res.node = pos.node->parent;
        return CBAssociativeArrayInsert(self, midKey, midVal, res, new);
    }
}
CBFindResult CBBTreeNodeBinarySearch(CBBTreeNode * self, unsigned char * key, unsigned char keySize){
    CBFindResult res;
    res.found = 0;
    if ( ! self->numElements) {
        res.pos = 0;
        return res;
    }
    unsigned char left = 0;
    unsigned char right = self->numElements - 1;
    unsigned char * keys = (unsigned char *)(self + 1);
    int cmp;
    while (left <= right) {
        res.pos = (right+left)/2;
        cmp = memcmp(key, keys + res.pos * keySize, keySize);
        if (cmp == 0) {
            res.found = 1;
            break;
        }else if (cmp < 0){
            if ( ! res.pos)
                break;
            right = res.pos - 1;
        }else
            left = res.pos + 1;
    }
    if (cmp > 0)
        res.pos++;
    return res;
}
void CBInitAssociativeArray(CBAssociativeArray * self, unsigned char keySize, unsigned char dataSize){
    self->keySize = keySize;
    self->dataSize = dataSize;
    self->nodeSize = sizeof(*self->root) + (keySize + dataSize) * CB_BTREE_ORDER;
    self->root = malloc(self->nodeSize);
    self->root->parent = NULL;
    self->root->numElements = 0;
    for (unsigned char x = 0; x < CB_BTREE_ORDER + 1; x++)
        self->root->children[x] = NULL;
}

int main(){
    srand(1);
    CBAssociativeArray array;
    CBInitAssociativeArray(&array, 10, 10);
    int size = CB_BTREE_ORDER * (CB_BTREE_ORDER + 2) * 10;;
    unsigned char * keys = malloc(size);
    for (int x = 0; x < size; x++) {
        keys[x] = rand();
    }
    for (int x = 0; x < size; x += 10) {
        CBAssociativeArrayInsert(&array, keys + x, keys + x, CBAssociativeArrayFind(&array, keys + x), NULL);
        for (int y = 0; y <= x; y += 10) {
            if ( ! CBAssociativeArrayFind(&array, keys + y).found) {
                printf("RANDOM FIND FAIL %u - %u\n", y, x);
                return 1;
            }
        }
    }
    return 0;
}

Thanks.


Solution

  • The problem is nothing to do with the memcpy or memmove operations. I forgot to change the parent pointers when splitting the nodes.

    Adding this after the node has been split will make it work:

    if (new->children[0])
        for (unsigned char x = 0; x < CB_BTREE_HALF_ORDER + 1; x++) {
            CBBTreeNode * child = new->children[x];
            child->parent = new;
        }
    

    Forgetting to add this was very foolish. The algorithm now works with many random insertions.