在Python中运行C扩展比普通C更快

#include "mmult.h"

/*
 * Multiply two 32x32 int32 matrices stored row-major in flat arrays of
 * 1024 elements each: c = a * b.
 *
 * The multiplication is bracketed by two gettimeofday() calls and the
 * difference, in microseconds, is printed to stdout before returning.
 *
 * a, b: input matrices (only read here)
 * c:    output matrix; all 1024 elements are overwritten
 */
void mmult(int32_t a[1024],int32_t b[1024],int32_t c[1024])
{
    struct timeval start, stop;

    gettimeofday(&start, NULL);

    for (int row = 0; row < 32; ++row) {
        /* Row `row` of a and of c both begin at offset row*32. */
        const int32_t *a_row = a + row*32;
        int32_t *c_row = c + row*32;

        for (int col = 0; col < 32; ++col) {
            int32_t acc = 0;

            /* Dot product of row `row` of a with column `col` of b. */
            for (int k = 0; k < 32; ++k) {
                acc += a_row[k] * b[k*32 + col];
            }
            c_row[col] = acc;
        }
    }

    gettimeofday(&stop, NULL);

    /* Whole seconds scaled to microseconds plus the sub-second remainder. */
    double elapsedTime = (stop.tv_sec - start.tv_sec)*1000000
                       + (stop.tv_usec - start.tv_usec);
    printf("elapsed time: %fus\n",elapsedTime);
}

#include <stdio.h> #include <stdlib.h> #include <sys/time.h> #include "mmult.h" int main() { int* a = (int*)malloc(sizeof(int)*1024); int* b = (int*)malloc(sizeof(int)*1024); int* c = (int*)malloc(sizeof(int)*1024); for(int i=0; i<1024; i++) { a[i]=i+1; b[i]=i+1; c[i]=0; } struct timeval t1, t2; gettimeofday(&t1, NULL); mmult(a,b,c); gettimeofday(&t2, NULL); double elapsedTime = (t2.tv_usec - t1.tv_usec) + (t2.tv_sec - t1.tv_sec)*1000000; printf("elapsed time: %fus\n",elapsedTime); free(a); free(b); free(c); return 0; }

#include <Python.h>
#include <numpy/arrayobject.h>
#include "mmult.h"

/* mmult() indexes exactly 1024 int32 values (a flattened 32x32 matrix). */
#define MMULT_ELEMS 1024

/*
 * Verify that `arr` can safely be passed to mmult() as a raw int32_t buffer.
 *
 * arr:        array to check
 * name:       argument name used in the error message
 * need_write: non-zero if mmult() will write into the array
 *
 * Returns 1 when the array is usable. Otherwise sets a Python exception
 * (TypeError for a wrong dtype, ValueError for a wrong size or memory
 * layout) and returns 0.
 */
static int check_matrix(PyArrayObject* arr, const char* name, int need_write)
{
    /* EquivTypenums treats same-size aliases (e.g. int vs long) as int32. */
    if (!PyArray_EquivTypenums(PyArray_TYPE(arr), NPY_INT32)) {
        PyErr_Format(PyExc_TypeError, "%s must have dtype int32", name);
        return 0;
    }
    if (PyArray_SIZE(arr) != MMULT_ELEMS) {
        PyErr_Format(PyExc_ValueError,
                     "%s must contain exactly %d elements", name, MMULT_ELEMS);
        return 0;
    }
    /* mmult() walks the buffer linearly, so it must be C-contiguous. */
    if (need_write) {
        if (!PyArray_ISCARRAY(arr)) {
            PyErr_Format(PyExc_ValueError,
                         "%s must be a writeable, aligned, C-contiguous array",
                         name);
            return 0;
        }
    } else if (!PyArray_ISCARRAY_RO(arr)) {
        PyErr_Format(PyExc_ValueError,
                     "%s must be an aligned, C-contiguous array", name);
        return 0;
    }
    return 1;
}

/*
 * Python entry point: mmult_wrapper(a, b, c) -> None
 *
 * a, b: NumPy int32 arrays with 1024 elements (inputs)
 * c:    writeable NumPy int32 array with 1024 elements; receives the result
 *
 * Returns None, or NULL with a Python exception set when an argument is not
 * a NumPy array or fails the checks in check_matrix().
 */
static PyObject* mmult_wrapper(PyObject* self, PyObject* args)
{
    PyArrayObject* a_obj = NULL;
    PyArrayObject* b_obj = NULL;
    PyArrayObject* c_obj = NULL;

    /* "O!" rejects anything that is not an ndarray before it is used as one. */
    if (!PyArg_ParseTuple(args, "O!O!O!",
                          &PyArray_Type, &a_obj,
                          &PyArray_Type, &b_obj,
                          &PyArray_Type, &c_obj))
        return NULL;

    if (!check_matrix(a_obj, "a", 0) ||
        !check_matrix(b_obj, "b", 0) ||
        !check_matrix(c_obj, "c", 1))
        return NULL;

    int32_t* a = (int32_t*) PyArray_DATA(a_obj);
    int32_t* b = (int32_t*) PyArray_DATA(b_obj);
    int32_t* c = (int32_t*) PyArray_DATA(c_obj);

    /* call function */
    mmult(a,b,c);

    Py_RETURN_NONE;
}

/* define functions in module */
static PyMethodDef TheMethods[] = {
    {"mmult_wrapper", mmult_wrapper, METH_VARARGS, "your c function"},
    {NULL, NULL, 0, NULL}
};

/*
 * Module definition. The module name matches the PyInit_c_module symbol
 * below and the "c_module" extension name used by the build script.
 */
static struct PyModuleDef cModPyDem = {
    PyModuleDef_HEAD_INIT,
    "c_module",
    "Some documentation",
    -1,
    TheMethods
};

/*
 * Module initialisation function looked up by the import machinery.
 * Returns the new module object, or NULL with an exception set on failure.
 */
PyMODINIT_FUNC PyInit_c_module(void)
{
    /*
     * import_array() returns NULL from this function when NumPy cannot be
     * loaded; doing it before the module is created means there is nothing
     * to leak on that path.
     */
    import_array();
    return PyModule_Create(&cModPyDem);
}

import os
import numpy
from distutils.core import setup, Extension

# Directory containing this setup script; passed to the compiler as an
# include directory together with NumPy's header directory.
here = os.path.dirname(os.path.realpath(__file__))

# Extension module "c_module", built from the wrapper and the multiply source.
extension = Extension(
    "c_module",
    sources=["wrapper.cpp", "mmult.cpp"],
    include_dirs=[here, numpy.get_include()],
)

setup(ext_modules=[extension])

import c_module
import time
import numpy as np

if __name__ == "__main__":
    shape = (32, 32)

    # Two independent 32x32 int32 ramps holding 1..1024, and a zeroed output.
    ramp_a = np.linspace(1, 1024, 1024, dtype='int32').reshape(shape)
    ramp_b = np.linspace(1, 1024, 1024, dtype='int32').reshape(shape)
    zeros = np.zeros(shape, dtype='int32')

    # Wrap each buffer in an ndarray of the same shape and dtype.
    a = np.ndarray(shape, dtype='int32', buffer=ramp_a)
    b = np.ndarray(shape, dtype='int32', buffer=ramp_b)
    c = np.ndarray(shape, dtype='int32', buffer=zeros)

    # The extension writes the product into c.
    c_module.mmult_wrapper(a, b, c)

1条回答

网友
1楼 · 发布于 2024-05-23 16:51:14

85 微秒的耗时太短，无法可靠、可重复地测量。例如，CPU 缓存效应（或上下文切换、内存分页）可能主导计算时间（并使测得的时间变得毫无意义）。
（我猜您使用的是 Linux/x86-64。）
根据经验，应让被测代码至少运行半秒，并多次重复基准测试。您还可以使用 time(1) 进行测量。
另见 time(7)。时间有多种概念（流逝的“真实”时间、单调时间、进程 CPU 时间、线程 CPU 时间等等）。您可以考虑使用 clock(3) 或 clock_gettime(2) 来测量时间。
顺便说一句，您可以使用较新版本的 GCC 进行编译（2017 年 11 月时为 GCC 7，几周后将发布 GCC 8），并且在做基准测试时应使用 gcc -march=native -O3 编译。还可以尝试其他优化选项和调优参数，也可以尝试其他编译器，例如 Clang/LLVM。
另请参阅一个相关问题的答案（关于并行化）。numpy 包可能在内部使用了类似的技术（在 Python GIL 之外），因此可能比 C 语言中朴素的顺序矩阵乘法代码更快。

相关问题更多 >

编程相关推荐

热门问题

热门文章