通过ctypes调用numpy的sse2

Question

简单来说，我想从Python中调用一个共享库，具体来说，是从numpy中调用。这个共享库是用C语言写的，使用了sse2指令。当我开启优化（也就是用 -O2 或 -O1 编译这个库）时，通过ctypes调用共享库时会出现奇怪的段错误（segfault）。但是如果关闭优化（用 -O0），一切都正常，就像直接把库链接到C程序一样（无论是否优化）。下面是我在系统上遇到的情况。开启优化后，gdb报告在 emmintrin.h 的第113行出现了段错误，__P的值被报告为优化掉了。

test.c:

#include <emmintrin.h>
#include <complex.h>
void test(const int m, const double* x, double complex* y) {

    int i;
    __m128d _f, _x, _b;
    double complex f __attribute__( (aligned(16)) );
    double complex b __attribute__( (aligned(16)) );
    __m128d* _p;

    b = 1;
    _b = _mm_loadu_pd( (double *) &b );

    _p = (__m128d*) y;

    for(i=0; i<m; ++i) {
        f = cexp(-I*x[i]);
        _f = _mm_loadu_pd( (double *) &f );
        _x = _mm_loadu_pd( (double *) &x[i] );      
        _f = _mm_shuffle_pd(_f, _f, 1);
        *_p = _mm_add_pd(*_p, _f);
        *_p = _mm_add_pd(*_p, _x); 
        *_p = _mm_mul_pd(*_p,_b);
        _p++;
    }
    return;
}

编译器选项：
gcc -o libtest.so -shared -std=c99 -msse2 -fPIC -O2 -g -lm test.c

test.py:

import numpy as np
import os

def zerovec_aligned(nr, dtype=np.float64, boundary=16):
    '''Create an aligned array of zeros.
    '''
    size = nr * np.dtype(dtype).itemsize
    tmp = np.zeros(size + boundary, dtype=np.uint8)
    address = tmp.__array_interface__['data'][0]
    offset = boundary - address % boundary
    return tmp[offset:offset + size].view(dtype=dtype)


lib = np.ctypeslib.load_library('libtest', '.' )
lib.test.restype = None
lib.test.argtypes = [np.ctypeslib.ctypes.c_int,
                     np.ctypeslib.ndpointer(np.float64, flags=('C', 'A') ),
                     np.ctypeslib.ndpointer(np.complex128, flags=('C', 'A', 'W') )]


n = 13
y = zerovec_aligned(n, dtype=np.complex128)
x = np.ones(n, dtype=np.float64)
# x = zerovec_aligned(n, dtype=np.float64)
# x[:] = 1.

lib.test(n,x,y)

从C调用test的结果是正常的：

call_from_c.c:

#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <emmintrin.h>

void test(const int m, const double* x, double complex* y);

int main() {

    int i; 
    const int n = 11;
    double complex *y = (double complex*) _mm_malloc(n*sizeof(double complex), 16);
    double *x = (double *) malloc(n*sizeof(double));
    for(i=0; i<n; ++i) {
        x[i] = 1;
        y[i] = 0;
    }

    test(n, x, y);
    for(i=0; i<n; ++i)
            printf("[%f %f]\n", creal(y[i]), cimag(y[i]));

    return 1;

}

编译并调用：
gcc -std=c99 -otestc -msse2 -L. -ltest call_from_c.c
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:.
./testc
... 一切正常。

我的系统信息：

Ubuntu Linux i686 2.6.31-22-generic
编译器：gcc (Ubuntu 4.4.1-4ubuntu9)
Python：Python 2.6.4 (r264:75706, 2009年12月7日) [GCC 4.4.1]
Numpy：1.4.0

我已经采取了一些措施（参考Python代码），确保y是对齐的，而x的对齐应该没问题（我认为；不过明确对齐x并没有解决问题）。

另外，我在加载b和f时使用了_mm_loadu_pd而不是_mm_load_pd。对于仅C的版本，_mm_load_pd是可以正常工作的（如预期）。但是，当通过ctypes调用这个函数时，使用_mm_load_pd总是会出现段错误（与优化无关）。

我尝试了好几天来解决这个问题，但一直没有成功……我快要忍不住想把显示器砸了。欢迎任何建议。
Daniel

gcc numpy ctypes optimization shared library segfault memory alignment sse2

通过ctypes调用numpy的sse2

3 个回答

撰写回答