python performance numpy numba python-cffi

How to wrap a CFFI function in Numba taking Pointers

It should be an easy task, but I can't find a way to pass a pointer to a scalar value to a CFFI function from within a Numba function. Passing a pointer to an array works without problems using ffi.from_buffer.

Example function

import cffi

# Build the CFFI extension module "foo" that exposes the C function foo_f.
ffi = cffi.FFI()
# C declaration made visible to Python; it must match the definition in `source`.
defs="void foo_f(int a,double *b);"
ffi.cdef(defs, override=True)
# NOTE: there is no ';' after the #include line. A preprocessor directive ends
# at the end of the line; a trailing ';' is a stray token that compilers
# diagnose.
source="""
#include <stdio.h>
void foo_f(int a,double *b){
  printf("%i",a);
  printf("   ");
  printf("%f",b[0]);
  }

"""
ffi.set_source(module_name="foo",source=source)
# Generate and compile the extension so that `import foo` works afterwards.
ffi.compile()

Passing a pointer to an array

import numpy as np
import numba as nb
import cffi
ffi = cffi.FFI()
import numpy as np
import ctypes
import foo
# Register the compiled CFFI module with Numba so that its functions can be
# called from nopython-mode code.
nb.cffi_support.register_module(foo)
# Module-level alias: the jitted function below calls the C function by this name.
foo_f = foo.lib.foo_f

@nb.njit()
def Test(a,b):
    """Call the C function ``foo_f`` with ``a`` as an int32 and ``b`` as a double buffer.

    ``a`` is converted with ``np.int32``; ``b`` is converted to a float64
    array whose buffer pointer is handed to ``foo_f``.
    """
    count = np.int32(a)
    # Passing an array works: from_buffer yields a pointer to the data of
    # the float64 array produced by astype.
    data_ptr = ffi.from_buffer(b.astype(np.float64))
    foo_f(count, data_ptr)


# Example call: `a` is given as a float and converted to int32 inside Test;
# `b` is a five-element array of ones.
a=64.
b=np.ones(5)
Test(a,b)

This works without problems, but how can I modify the Test function to take a scalar value b=5. without modifying the CFFI-function itself?

Solution

Pass scalar values by reference using Numba

To get useful timings I have modified the wrapped function a bit. The function simply adds a scalar a (passed by value) to a scalar b (passed by reference).

Pros and cons of the approach using intrinsics

Only works in nopython mode
Faster for C or Fortran functions with short runtime (real-world example)

Example function

import cffi

# Build the CFFI module "foo" containing the modified foo_f used for the timings.
ffi = cffi.FFI()
# `a` is passed by value, `b` through a pointer.
defs="void foo_f(double a,double *b);"
ffi.cdef(defs, override=True)
# C implementation: adds a to the double that b points to.
source="""
void foo_f(double a,double *b){
  b[0]+=a;
  }
"""
ffi.set_source(module_name="foo",source=source)
# Generate and compile the extension so that `import foo` works afterwards.
ffi.compile()

Wrapper using a temporary array

This is quite straightforward, but it requires allocating an array of size one, which is quite slow.

import numpy as np
import numba as nb
from numba import cffi_support
import cffi
ffi = cffi.FFI()
import foo

# Make the compiled CFFI module known to Numba, using the cffi_support name
# imported from numba above, then alias the C function for the jitted wrappers.
cffi_support.register_module(foo)
foo_f = foo.lib.foo_f

@nb.njit("float64(float64,float64)")
def method_using_arrays(a,b):
    """Call ``foo_f(a, &b)`` using a one-element array as storage for ``b``.

    Parameters
    ----------
    a : float64
        Passed to ``foo_f`` by value.
    b : float64
        Copied into a temporary array whose pointer is passed to ``foo_f``.

    Returns
    -------
    float64
        The content of the temporary array after the call to ``foo_f``.
    """
    # foo_f expects a double*; a one-element array provides addressable
    # storage for the scalar b.
    b_arr=np.empty(1,dtype=np.float64)
    b_arr[0]=b
    b_arr_ptr=ffi.from_buffer(b_arr)
    foo_f(a,b_arr_ptr)
    # Read the value back from the array that foo_f received a pointer to.
    return b_arr[0]

Wrapper using intrinsics

from numba import types
from numba.extending import intrinsic
from numba import cgutils

@intrinsic
def ptr_from_val(typingctx, data):
    """Intrinsic: store a value in a stack slot and return a pointer to it.

    For an argument of Numba type ``data`` the result is typed as
    ``types.CPointer(data)``.
    """
    def codegen(context, builder, signature, args):
        (value,) = args
        # alloca_once_value allocates a slot and stores `value` into it.
        return cgutils.alloca_once_value(builder, value)

    return types.CPointer(data)(data), codegen

@intrinsic
def val_from_ptr(typingctx, data):
    """Intrinsic: load and return the value a pointer refers to.

    The result is typed as ``data.dtype``, the ``dtype`` attribute of the
    argument's Numba type.
    """
    def codegen(context, builder, signature, args):
        (pointer,) = args
        return builder.load(pointer)

    return data.dtype(data), codegen

@nb.njit("float64(float64,float64)")
def method_using_intrinsics(a,b):
    """Call ``foo_f(a, &b)`` without a temporary array and return the updated value.

    ``b`` is placed behind a pointer with ``ptr_from_val``, the pointer is
    passed to ``foo_f``, and the value is read back with ``val_from_ptr``.
    """
    slot = ptr_from_val(b)
    foo_f(a, slot)
    return val_from_ptr(slot)

Timings

# Call the intrinsics-based wrapper 1000 times, feeding each result back in as b
@nb.njit()
def timing_method_using_intrinsics(a,b):
    acc = b
    for _ in range(1000):
        acc = method_using_intrinsics(a, acc)
    return acc

# Call the array-based wrapper 1000 times, feeding each result back in as b
@nb.njit()
def timing_method_using_arrays(a,b):
    acc = b
    for _ in range(1000):
        acc = method_using_arrays(a, acc)
    return acc

# Inputs for the benchmark.
a=1.
b=1.

# NOTE: %timeit is an IPython/Jupyter magic; the lines below are not plain
# Python. Each measured call runs the wrapped function 1000 times.
%timeit timing_method_using_intrinsics(a,b)
#5.15 µs ± 33.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit timing_method_using_arrays(a,b)
#121 µs ± 601 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)