Search code examples
dictionarypython-c-api

A problem in manually reproducing dict underlying code:the value can't be object


I tried to manually reproduce python dict underlying code(it can only implement setitem and getitem), but I met a problem: The hashmap I wrote can only work well when the value type is basic data types(like int,str etc).If the value's type is object, the hashmap can only set item, the python will crash when getting the object type's value.

The error message is " Fatal error: GC object already tracked"

I guess there are several possible problems:

  1. something wrong in the (PyTypeObject)PyHashMap_Type definition
  2. The getitem method return's PyObject*, but the python cannot resolve the object's pointer

Here are some cases I have test

case1:the key and value types are all basic data types, it works well

case2:the key's type is object,but the value type is basic data type , it works well

case3:the key'type is basic data type, but the value type is object, the python crash when getting item, even though the value is the an empty object

case4:the key and value types are all object, the result is the same as case3

// the PyTypeObject
static PyTypeObject PyHashMap_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "hashmap",
    sizeof(PyMapObject),
    0,
    (destructor)PyHashMap_dealloc,              /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    (reprfunc)repr_func,                        /* tp_repr */
    0,                                          /* tp_as_number */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    0,                                          /* tp_hash */
    0,                                          /* tp_call */
    0,                                          /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_DICT_SUBCLASS,         /* tp_flags */
    0,                              /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    0,                              /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    0,                                          /* tp_iter */
    0,                                          /* tp_iternext */
    0,                               /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    0,                                          /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                  /* tp_init */
    PyType_GenericAlloc,                        /* tp_alloc */
    _HashMap_New,                                   /* tp_new */
    PyObject_GC_Del,                            /* tp_free */
};

// the hashmap struct
typedef struct _mapkeysobject PyMapKeysObject;

typedef struct {
    Py_hash_t me_hash;                          
    PyObject *me_key;                           
    PyObject *me_value;                         
} PyMapKeyEntry;

struct _mapkeysobject {
    Py_ssize_t dk_size;                         
    Py_ssize_t dk_usable;
    PyMapKeyEntry dk_entries[1];                
};

typedef struct {
    PyObject_HEAD
    Py_ssize_t ma_used;
    PyMapKeysObject *ma_keys;
    PyObject **ma_values;
} PyMapObject;
// get item methods, the python call GET_ITEM_WRAPPER
static PyObject* GET_ITEM_WRAPPER(PyObject* self, PyObject* args){
    PyObject * o = NULL;
    PyObject * key = NULL;
    if (!PyArg_ParseTuple(args, "OO",&o,&key)) {
        printf("error: arg list error");
        Py_RETURN_NONE;
    }
    PyObject* value = PyMap_GetItem(o, key);
    if (value == NULL) Py_RETURN_NONE;
    return value;
}

static PyObject* PyMap_GetItem(PyObject* o, PyObject* key){
    PyMapObject *mp;
    Py_hash_t hash;
    mp = (PyMapObject *)o;

    hash = PyObject_Hash(key);  
    if (hash == -1)
        return NULL;
    return searchmap(mp, key, hash);
}

static PyObject* searchmap(PyMapObject* mp, PyObject* key, Py_hash_t hash){
    PyObject **value_addr;  
    PyMapKeyEntry *ep;

    ep = lookup_function(mp, key, hash, &value_addr);

    if (ep == NULL) return NULL;
    return ep->me_value;
}

PyMapKeyEntry* lookup_function(PyMapObject* mp, PyObject* key, Py_hash_t hash, PyObject ***value_addr){
    size_t i;
    size_t perturb;
    size_t mask = DK_MASK(mp->ma_keys);                                
    PyMapKeyEntry *ep0 = &mp->ma_keys->dk_entries[0];                 
    PyMapKeyEntry *ep;

    i = (size_t)hash & mask;                                           
    ep = &ep0[i];
    if (ep->me_key == NULL || ep->me_key == key) {                     
        *value_addr = &ep->me_value;                                    
        return ep;                                                      
    }
    for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {                    
        i = (i << 2) + i + perturb + 1;
        ep = &ep0[i & mask];
        if (ep->me_key == NULL || ep->me_key == key) {                  
            *value_addr = &ep->me_value;                                
            return ep;                                                  
        }
    }
    assert(0);          /* NOT REACHED */
    return 0;
}

I expect the value type can be object


Solution

  • I believe you've messed up the reference counting (although it's hard to be absolutely certain with incomplete code).

    Your reference count for the stored values should be at least one (to represent the fact that you hold a reference to it as part of your dictionary). However, at the point you return it then it should be two (one for the reference that you hold, and one for the reference to the return value).

    I suspect in PyMap_GetItem you should do:

    PyObject *ret_val = searchmap(mp, key, hash);
    Py_XINCREF(ret_val);
    return ret_val;
    

    (although I you don't seem to actually set the sequence/mapping methods in PyHashMap_Type so who knows if PyMap_GetItem is actually used...)

    The reason this will appear to work for ints and string is that Python "interns" small ints and short strings, essentially keeping a constant reference to them. If you kept looking them up then it'd break in the end though (each time you do it you lose a reference).