我第一次尝试使用cython。并尝试将函数从使用纯numpy转换为cython
以下是两个功能:
from __future__ import division
import numpy as np
cimport numpy as np
DTYPEf = np.float64
ctypedef np.float64_t DTYPEf_t
DTYPEi = np.int64
ctypedef np.int64_t DTYPEi_t
DTYPEu = np.uint8
ctypedef np.uint8_t DTYPEu_t
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def twodcitera(np.ndarray[DTYPEf_t, ndim=3] data, int res, int indexl, int indexu, float radius1, float radius2, output, float height1, float height2 ):
'''
Function to return correlation for fixed radius using Cython
'''
cdef float sum_mask = 0
cdef int i,j,k
cdef int a, b, c
cdef np.ndarray[DTYPEi_t, ndim=3] x
cdef np.ndarray[DTYPEi_t, ndim=3] y
cdef np.ndarray[DTYPEi_t, ndim=3] z
cdef np.ndarray[DTYPEu_t, ndim=3, cast=True] R
a,b,c = res//2,res//2,res//2
x,y,z = np.ogrid[-a:a,-b:b,-c:c]
for i in xrange(indexl,indexu):
for j in xrange(1):
for k in xrange(1):
R = np.roll(np.roll(np.roll(np.logical_and(np.logical_or(np.logical_and(z>height1,z<=height2), np.logical_and(z<-height1,z>=-height2)), np.logical_and(x**2 + y**2<= radius2**2, x**2 + y**2 > radius1**2)), (i-a), axis =0), (j-a), axis =1), (k-a), axis =2)
sum_mask += (data[i][j][k] * np.average(data[R]))
output.put(sum_mask)
对于numpy实现:
def no_twodcitera(data, res, indexl, indexu, radius1, radius2, output, height1, height2 ):
'''
Function to return correlation for fixed radius
'''
a,b,c = res/2,res/2,res/2
x,y,z = np.ogrid[-a:a,-b:b,-c:c]
sum_mask = 0
for i in xrange(indexl,indexu):
for j in xrange(1):
for k in xrange(1):
R = np.roll(np.roll(np.roll(np.logical_and(np.logical_or(np.logical_and(z>height1,z<=height2), np.logical_and(z<-height1,z>=-height2)), np.logical_and(x**2 + y**2<= radius2**2, x**2 + y**2 > radius1**2)), (i-a), axis =0), (j-a), axis =1), (k-a), axis =2)
sum_mask += (data[i][j][k] * np.average(data[R]))
output.put(sum_mask)
这两个函数实际上给了我相同的时间来完成。
%timeit -n200 -r10 twodcitera(dd, tes_res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])
200 loops, best of 10: 1.57 ms per loop
%timeit -n200 -r10 no_twodcitera(dd, tes_res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])
200 loops, best of 10: 1.57 ms per loop
我想知道我做错了什么或者在尝试实现cython时我没有正确理解。输入是:
dd = np.random.randn(64,64,64)
res = 64
r = np.arange(0,21,2)
in1 = 0
in2 = 1
l = 5
k = 7
output = mp.Queue()
谢谢你,如果你能在这里指出我的误解。
答案 0 :(得分:1)
在不知道您的输入和输出之后,在cython之后为我编译了以下guide如果您解释如何创建测试输入,我可以提供更多帮助。
编辑:我的第一个想法是,cython编译可能还有一些东西。但我找不到任何有用的东西。因此,这个答案对改善速度问题并没有多大帮助。无论如何,我把它留给那些对测试和理解感兴趣的人。将代码放入test.pyx
cimport cython
import numpy as np
cimport numpy as np
DTYPEf = np.float64
ctypedef np.float64_t DTYPEf_t
DTYPEi = np.int64
ctypedef np.int64_t DTYPEi_t
DTYPEu = np.uint8
ctypedef np.uint8_t DTYPEu_t
@cython.boundscheck(False)
@cython.wraparound(False)
def twodcitera(np.ndarray[DTYPEf_t, ndim=3] data, int res, int indexl, int indexu, float radius1, float radius2, output, float height1, float height2 ):
'''
Function to return correlation for fixed radius using Cython
'''
cdef float sum_mask = 0
cdef int i,j,k
cdef int a, b, c
cdef np.ndarray[DTYPEi_t, ndim=3] x
cdef np.ndarray[DTYPEi_t, ndim=3] y
cdef np.ndarray[DTYPEi_t, ndim=3] z
cdef np.ndarray[DTYPEu_t, ndim=3, cast=True] R
a,b,c = res//2,res//2,res//2
x,y,z = np.ogrid[-a:a,-b:b,-c:c]
for i in xrange(indexl,indexu):
for j in xrange(1):
for k in xrange(1):
R = np.roll(np.roll(np.roll(np.logical_and(np.logical_or(np.logical_and(z>height1,z<=height2), np.logical_and(z<-height1,z>=-height2)), np.logical_and(x**2 + y**2<= radius2**2, x**2 + y**2 > radius1**2)), (i-a), axis =0), (j-a), axis =1), (k-a), axis =2)
sum_mask += (data[i][j][k] * np.average(data[R]))
output.put(sum_mask)
创建一个make文件setup.py并放入
from distutils.core import setup
from Cython.Build import cythonize
setup(
name = "testapp",
ext_modules = cythonize('test.pyx'), # accepts a glob pattern
)
转到shell并编译它:
$python setup.py build_ext --inplace
转到ipython并尝试导入:
from test import *
让我跑步。
速度测试显示:
In [28]: %timeit -n200 -r10 no_twodcitera(dd, res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])
200 loops, best of 10: 1.29 ms per loop
In [29]: %timeit -n200 -r10 test.twodcitera(dd, res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])
200 loops, best of 10: 1.31 ms per loop
所以结果是一样的,差别不大。我进一步进行了一个cProfile研究,看看调用堆栈的运行时是否有东西出现。必须承认,cProfile在达到ms秒速度时难以解释!但我们试一试。
In [34]: cProfile.run("""no_twodcitera(dd, res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])""")
82 function calls in 0.004 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.001 0.001 0.004 0.004 <ipython-input-27-663e142d15fb>:1(no_twodcitera)
1 0.000 0.000 0.004 0.004 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 _methods.py:43(_count_reduce_items)
1 0.000 0.000 0.000 0.000 _methods.py:53(_mean)
1 0.000 0.000 0.000 0.000 function_base.py:436(average)
1 0.000 0.000 0.000 0.000 index_tricks.py:151(__getitem__)
3 0.000 0.000 0.002 0.001 numeric.py:1279(roll)
1 0.000 0.000 0.000 0.000 numeric.py:394(asarray)
4 0.000 0.000 0.000 0.000 numeric.py:464(asanyarray)
1 0.000 0.000 0.000 0.000 queues.py:99(put)
1 0.000 0.000 0.000 0.000 threading.py:299(_is_owned)
1 0.000 0.000 0.000 0.000 threading.py:372(notify)
1 0.000 0.000 0.000 0.000 threading.py:63(_note)
1 0.000 0.000 0.000 0.000 {hasattr}
18 0.000 0.000 0.000 0.000 {isinstance}
1 0.000 0.000 0.000 0.000 {issubclass}
5 0.000 0.000 0.000 0.000 {len}
3 0.000 0.000 0.000 0.000 {math.ceil}
1 0.000 0.000 0.000 0.000 {method 'acquire' of '_multiprocessing.SemLock' objects}
2 0.000 0.000 0.000 0.000 {method 'acquire' of 'thread.lock' objects}
1 0.000 0.000 0.000 0.000 {method 'append' of 'collections.deque' objects}
3 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1 0.000 0.000 0.000 0.000 {method 'mean' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}
1 0.000 0.000 0.000 0.000 {method 'release' of 'thread.lock' objects}
3 0.002 0.001 0.002 0.001 {method 'take' of 'numpy.ndarray' objects}
9 0.000 0.000 0.000 0.000 {numpy.core.multiarray.arange}
5 0.000 0.000 0.000 0.000 {numpy.core.multiarray.array}
3 0.000 0.000 0.000 0.000 {numpy.core.multiarray.concatenate}
4 0.000 0.000 0.000 0.000 {range}
1 0.000 0.000 0.000 0.000 {zip}
In [35]: cProfile.run("""test.twodcitera(dd, res,in1,in2,r[k],r[k+1], output, r[l], r[l+1])""")
82 function calls in 0.003 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.003 0.003 <string>:1(<module>)
1 0.000 0.000 0.000 0.000 _methods.py:43(_count_reduce_items)
1 0.000 0.000 0.000 0.000 _methods.py:53(_mean)
1 0.000 0.000 0.000 0.000 function_base.py:436(average)
1 0.000 0.000 0.000 0.000 index_tricks.py:151(__getitem__)
3 0.000 0.000 0.001 0.000 numeric.py:1279(roll)
1 0.000 0.000 0.000 0.000 numeric.py:394(asarray)
4 0.000 0.000 0.000 0.000 numeric.py:464(asanyarray)
1 0.000 0.000 0.000 0.000 queues.py:99(put)
1 0.000 0.000 0.000 0.000 threading.py:299(_is_owned)
1 0.000 0.000 0.000 0.000 threading.py:372(notify)
1 0.000 0.000 0.000 0.000 threading.py:63(_note)
1 0.000 0.000 0.000 0.000 {hasattr}
18 0.000 0.000 0.000 0.000 {isinstance}
1 0.000 0.000 0.000 0.000 {issubclass}
5 0.000 0.000 0.000 0.000 {len}
3 0.000 0.000 0.000 0.000 {math.ceil}
1 0.000 0.000 0.000 0.000 {method 'acquire' of '_multiprocessing.SemLock' objects}
2 0.000 0.000 0.000 0.000 {method 'acquire' of 'thread.lock' objects}
1 0.000 0.000 0.000 0.000 {method 'append' of 'collections.deque' objects}
3 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1 0.000 0.000 0.000 0.000 {method 'mean' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects}
1 0.000 0.000 0.000 0.000 {method 'release' of 'thread.lock' objects}
3 0.001 0.000 0.001 0.000 {method 'take' of 'numpy.ndarray' objects}
9 0.000 0.000 0.000 0.000 {numpy.core.multiarray.arange}
5 0.000 0.000 0.000 0.000 {numpy.core.multiarray.array}
3 0.000 0.000 0.000 0.000 {numpy.core.multiarray.concatenate}
4 0.000 0.000 0.000 0.000 {range}
1 0.001 0.001 0.003 0.003 {test.twodcitera}
1 0.000 0.000 0.000 0.000 {zip}
可悲的是,没有任何东西出现。我会得出结论,原因可能是numpy已经很好地实现了,并且大部分时间都没有在嵌套循环中丢失。此外,cPython主要受益于静态类型。由于我们在这里使用numpy,这可能不是一个很大的好处。