Search code examples
Tags: python, c, cpython, c-api

C char array from python string


I have a list of strings in python which I'm trying to pass down to a C extension for character analysis. I've gotten so far as to have the list broken up into their individual string PyObjects. Next, I'm hoping to split these strings into their individual characters so that every string PyObject is now a corresponding C-type character array. I can't seem to figure out how to do this though.

Here's what I have so far. Currently, after building the .pyd file, the extension returns a list of 1's to Python as filler (so everything else works). I just don't know how to split a string PyObject into a C-type character array.

--- cExt.c ---

#include <Python.h>
#include <stdio.h>

/*
 * Analyse a single command string.
 *
 * commandString       - the Python string object to analyse (borrowed
 *                       reference; not modified here).
 * commandStringLength - length of commandString as reported by the caller.
 *
 * Returns an int result. The return type is plain int (not int *) because the
 * function returns an integer value and the caller stores it in an int.
 * The analysis itself is not implemented yet, so the result is always the
 * placeholder value 1.
 */
static int CitemCheck(PyObject *commandString, int commandStringLength) {
    // TODO: convert commandString to a C char array and analyse its characters.
    // Both parameters are unused until that is written.
    (void)commandString;
    (void)commandStringLength;

    return 1; // TODO: RETURN PROPER RESULTANT
}

/*
 * Build a new Python list holding one integer result per entry of commandList.
 *
 * commandList       - Python list whose items are handed to CitemCheck
 *                     (borrowed reference).
 * commandListLength - number of items of commandList to process.
 *
 * Returns a new reference to the result list, or NULL with a Python exception
 * set if any C-API call fails.
 */
static PyObject *ClistCheck(PyObject *commandList, int commandListLength) {

    PyObject *results = PyList_New(commandListLength);
    if (results == NULL) {
        return NULL;
    }

    for (int index = 0; index < commandListLength; index++) {
        // Borrowed reference: owned by commandList, so no DECREF here.
        PyObject *commandString = PyList_GetItem(commandList, index);
        if (commandString == NULL) {
            goto fail;
        }

        // PyObject_Length reports failure (e.g. item has no length) as -1.
        Py_ssize_t commandStringLength = PyObject_Length(commandString);
        if (commandStringLength < 0) {
            goto fail;
        }

        // CitemCheck should take string PyObject and its length as int
        int x = CitemCheck(commandString, (int)commandStringLength);

        PyObject *pyItem = Py_BuildValue("i", x);
        if (pyItem == NULL) {
            goto fail;
        }

        // PyList_SetItem takes over the reference to pyItem, even on failure,
        // so pyItem must not be released here.
        if (PyList_SetItem(results, index, pyItem) < 0) {
            goto fail;
        }
    }
    return results;

fail:
    // Releasing a partially filled list is safe; unset slots are NULL.
    Py_DECREF(results);
    return NULL;
}

/*
 * Python-facing entry point: listCheck(commandList).
 *
 * self - module object (unused).
 * args - positional argument tuple; must contain exactly one list.
 *
 * Returns a new reference to the list produced by ClistCheck, or NULL with a
 * Python exception set (TypeError for a non-list argument, OverflowError if
 * the list is too long for the int-based helpers, or whatever ClistCheck set).
 */
static PyObject *parseListCheck(PyObject *self, PyObject *args) {
    PyObject *commandList;

    // "O!" type-checks the argument: ClistCheck uses PyList_GetItem, which
    // only works on real list objects.
    if (!PyArg_ParseTuple(args, "O!", &PyList_Type, &commandList)) {
        return NULL;
    }

    Py_ssize_t commandListLength = PyObject_Length(commandList);
    if (commandListLength < 0) {
        return NULL;
    }
    if (commandListLength > INT_MAX) {
        PyErr_SetString(PyExc_OverflowError, "list is too long");
        return NULL;
    }

    // ClistCheck already returns a new reference (or NULL on error). Wrapping
    // it in Py_BuildValue("O", ...) would add a second reference that nobody
    // releases, leaking the result list on every call.
    return ClistCheck(commandList, (int)commandListLength);
}

/* Docstring exposed to Python as listCheck.__doc__. */
static char listCheckDocs[] =
    "listCheck(commandList)\n"
    "\n"
    "Run the per-string check on every item of commandList and return a list\n"
    "containing one integer result per item.";

static PyMethodDef listCheck[] = {
 {"listCheck", (PyCFunction) parseListCheck, METH_VARARGS, listCheckDocs},
 {NULL,NULL,0,NULL}
};

static struct PyModuleDef DCE = {
    PyModuleDef_HEAD_INIT,
    "listCheck",
    NULL,
    -1,
    listCheck
};

PyMODINIT_FUNC PyInit_cExt(void){
    return PyModule_Create(&DCE);
}

For reference, here is my temporary extension build file:

--- _c_setup.py --- 
(located in same folder as cExt.c)
"""
to build C files, pass:

python _c_setup.py build_ext --inplace clean --all

in a command prompt whose working directory is this file's directory
"""
import glob
from setuptools import setup, Extension, find_packages
from os import path

# Absolute path of the directory that contains this setup script.
here = path.abspath(path.dirname(__file__))

# Bare file names (no directory part) of the C sources found in that directory.
c_pattern = path.join(here, '**.c')
files = [path.basename(found) for found in glob.glob(c_pattern)]

# One extension per C file, named after the file without its extension.
extensions = [
    Extension(path.splitext(source)[0], [source])
    for source in files
]

setup(ext_modules=extensions)

Solution

  • You can use PyUnicode_AsEncodedString, which the documentation describes as follows:

    Encode a Unicode object and return the result as Python bytes object. encoding and errors have the same meaning as the parameters of the same name in the Unicode encode() method. The codec to be used is looked up using the Python codec registry. Return NULL if an exception was raised by the codec.

    see https://docs.python.org/3/c-api/unicode.html#c.PyUnicode_AsEncodedString

    Then with PyBytes_AsString you get a pointer to the internal buffer, which ends with a terminating NUL byte. This buffer must neither be deallocated nor modified. If you need a copy you could use e.g. strdup.

    see https://docs.python.org/3/c-api/bytes.html#c.PyBytes_AsString

    Slightly modifying your code it could look like this:

    // Encode the str object into a temporary bytes object (new reference).
    // A NULL result means the codec raised an exception.
    PyObject *encodedString = PyUnicode_AsEncodedString(commandString, "UTF-8", "strict");
    if (encodedString != NULL) {
        // Points into encodedString's internal NUL-terminated buffer:
        // it must not be freed or modified, and is only valid until the DECREF below.
        char *commandChars = PyBytes_AsString(encodedString);
        if (commandChars != NULL) {
            printf("the string '%s' consists of the following chars:\n", commandChars);
            // Walk the buffer one byte at a time up to the terminating NUL.
            for (const char *cursor = commandChars; *cursor != '\0'; cursor++) {
                printf("%c ", *cursor);
            }
            printf("\n");
        }
        // Release the temporary bytes object; commandChars dangles after this.
        Py_DECREF(encodedString);
    }
    

    If one would test with:

    # Import the compiled extension module.
    import cExt
    
    # Sample input: a list of strings to pass to the extension.
    fruits = ["apple", "pears", "cherry", "pear", "blueberry", "strawberry"]         
    # Call the extension function with the list and show what it returns.
    res = cExt.listCheck(fruits)
    print(res)
    

    The output would be:

    the string 'apple' consists of the following chars:
    a p p l e 
    the string 'pears' consists of the following chars:
    p e a r s 
    the string 'cherry' consists of the following chars:
    c h e r r y 
    the string 'pear' consists of the following chars:
    p e a r 
    the string 'blueberry' consists of the following chars:
    b l u e b e r r y 
    the string 'strawberry' consists of the following chars:
    s t r a w b e r r y 
    [1, 1, 1, 1, 1, 1]
    

    Side note not directly related to the question: Your CitemCheck function returns a pointer to int, but if looking at how it is called, it seems that you want to return an int value. The function signature should look more like this:

    static int CitemCheck(PyObject *commandString, int commandStringLength)
    

    (note the removed * after int).