Python GPU编程之NumbaPro入门

Python 2015-12-29

  相信如果你使用过Python Numpy包,一定了解NumPy(Numeric Python)提供了许多高级的数值编程工具,如:矩阵数据类型、矢量处理,以及精密的运算库。它专为进行严格的数字处理而产生。被很多大型金融公司使用,以及核心的科学计算组织如:Lawrence Livermore,NASA用其处理一些本来使用C++,Fortran或Matlab等所做的任务。   但是由于复杂的计算,Numpy的计算效率难免受到影响,因此我们对它进行了许多优化,用于优化的包有PyPy、Numba 与 Cython,而NumbaPro就是建立在Numba和cuda基础上的高级优化方法。   下面我们一起来看。   使用NumbaPro,我们可以对Numpy中的方法进行优化,使Python代码可以动态编译为机器码,并在运行中加载,使得GPU充分发挥多线程的优势。针对GPU,Numbapro也可以自动完成工作,并优化GPU体系结构的代码。另外,基于CUDA API编写的Python代码也可以有效地利用硬件。   说了这么多,下面就让我们从简单的示例开始学习。

 from numbapro import vectorize
@vectorize(['float32(float32, float32)'], target='cpu')
def sum(a, b):
    """Element-wise addition ufunc, compiled for the CPU target."""
    total = a + b
    return total

  如果需要使用GPU来运行,只需要将第二行改成@vectorize(['float32(float32, float32)'], target='gpu')

  对于更复杂的操作,可以使用Just-In-Time (JIT)来编译。

 from numbapro import cuda

@cuda.jit('void(float32[:], float32[:], float32[:])')
def sum(a, b, result):
    """CUDA kernel computing result[i] = a[i] + b[i], one thread per element.

    The grid is usually rounded up to a whole number of blocks, so threads
    whose global index falls past the end of the arrays must do nothing.
    """
    i = cuda.grid(1)  # equivalent to threadIdx.x + blockIdx.x * blockDim.x
    # Bounds guard for the partial last block (the original kernel omitted
    # this and would read/write out of range; black_scholes_cuda below
    # already guards the same way).
    if i >= result.shape[0]:
        return
    result[i] = a[i] + b[i]

# 调用: sum[grid_dim, block_dim](big_input_1, big_input_2, result_array)

  下面继续看一个具体的应用:

import numpy as np
import math
import time
from numba import *
from numbapro import cuda
from blackscholes_numba import black_scholes, black_scholes_numba
#import logging; logging.getLogger().setLevel(0)

# Market-model parameters shared by all three implementations.
RISKFREE = 0.02  # annual risk-free interest rate
VOLATILITY = 0.30  # annual volatility of the underlying

# Polynomial coefficients of the Abramowitz & Stegun approximation
# to the cumulative normal distribution (used by cnd_cuda below).
A1 = 0.31938153
A2 = -0.356563782
A3 = 1.781477937
A4 = -1.821255978
A5 = 1.330274429
RSQRT2PI = 0.39894228040143267793994605993438  # 1 / sqrt(2 * pi)

@cuda.jit(argtypes=(double,), restype=double, device=True, inline=True)
def cnd_cuda(d):
    """Cumulative normal distribution N(d), evaluated on the GPU.

    Uses the classic Abramowitz & Stegun polynomial approximation in
    Horner form (coefficients A1..A5, RSQRT2PI at module level).
    Device-inlined helper; callable only from other CUDA functions.
    """
    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d))
    # Polynomial approximates the tail probability for |d|.
    ret_val = (RSQRT2PI * math.exp(-0.5 * d * d) *
               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
    # The approximation gives the upper tail; reflect it for positive d.
    if d > 0:
        ret_val = 1.0 - ret_val
    return ret_val

@cuda.jit(argtypes=(double[:], double[:], double[:], double[:], double[:],
                    double, double))
def black_scholes_cuda(callResult, putResult, S, X,
                       T, R, V):
    """CUDA kernel: Black-Scholes pricing of European calls and puts.

    One thread prices one option and writes its results in place:
      S = stock prices, X = option strikes, T = option years,
      R = risk-free rate, V = volatility.
    """
    # Global thread index: one thread per option.
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    # Guard against the partial last block of the grid.
    if i >= S.shape[0]:
        return
    sqrtT = math.sqrt(T[i])
    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
    d2 = d1 - V * sqrtT
    cndd1 = cnd_cuda(d1)
    cndd2 = cnd_cuda(d2)

    # Discount factor e^(-R*T).
    expRT = math.exp((-1. * R) * T[i])
    # Black-Scholes closed-form prices for call and put.
    callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
    putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))

def randfloat(rand_var, low, high):
    """Linearly map uniform samples in [0, 1] onto the range [low, high].

    Works element-wise for NumPy arrays as well as scalars.
    """
    return rand_var * high + (1.0 - rand_var) * low

def main(*args):
    """Benchmark Black-Scholes pricing with NumPy, Numba and NumbaPro/CUDA.

    args: optional command-line arguments; args[0], when given, overrides
    the number of timing iterations (default 10). Prints per-iteration
    timings for each backend and the error of each accelerated result
    against the NumPy reference.
    """
    OPT_N = 4000000  # number of options to price
    iterations = 10
    # Bug fix: the original checked len(args) >= 2, so a single
    # command-line argument was silently ignored.
    if len(args) >= 1:
        iterations = int(args[0])

    callResultNumpy = np.zeros(OPT_N)
    putResultNumpy = -np.ones(OPT_N)
    stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
    optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
    optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)
    callResultNumba = np.zeros(OPT_N)
    putResultNumba = -np.ones(OPT_N)
    callResultNumbapro = np.zeros(OPT_N)
    putResultNumbapro = -np.ones(OPT_N)

    # --- Pure NumPy reference implementation ---
    time0 = time.time()
    for i in range(iterations):
        black_scholes(callResultNumpy, putResultNumpy, stockPrice,
                      optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numpy Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    # --- Numba JIT (CPU) implementation ---
    time0 = time.time()
    for i in range(iterations):
        black_scholes_numba(callResultNumba, putResultNumba, stockPrice,
                            optionStrike, optionYears, RISKFREE, VOLATILITY)
    time1 = time.time()
    print("Numba Time: %f msec" %
          ((1000 * (time1 - time0)) / iterations))

    # --- NumbaPro CUDA implementation ---
    time0 = time.time()
    blockdim = 1024, 1
    # Round the grid up so every option gets a thread; the kernel guards
    # against the overhang of the last block.
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1
    stream = cuda.stream()
    d_callResult = cuda.to_device(callResultNumbapro, stream)
    d_putResult = cuda.to_device(putResultNumbapro, stream)
    d_stockPrice = cuda.to_device(stockPrice, stream)
    d_optionStrike = cuda.to_device(optionStrike, stream)
    d_optionYears = cuda.to_device(optionYears, stream)
    time1 = time.time()
    for i in range(iterations):
        black_scholes_cuda[griddim, blockdim, stream](
            d_callResult, d_putResult, d_stockPrice, d_optionStrike,
            d_optionYears, RISKFREE, VOLATILITY)
        d_callResult.to_host(stream)
        d_putResult.to_host(stream)
        stream.synchronize()
    time2 = time.time()
    # The host->device transfers were timed once (time1 - time0); charge that
    # setup cost once per iteration. Bug fix: the original multiplied by a
    # hard-coded 10, which was wrong for any other iteration count.
    dt = (time1 - time0) * iterations + (time2 - time1)
    print("numbapro.cuda time: %f msec" % ((1000 * dt) / iterations))

    # Accuracy of the Numba result relative to the NumPy reference.
    delta = np.abs(callResultNumpy - callResultNumba)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm: %E" % L1norm)
    print("Max absolute error: %E" % delta.max())

    # Accuracy of the CUDA result relative to the NumPy reference.
    delta = np.abs(callResultNumpy - callResultNumbapro)
    L1norm = delta.sum() / np.abs(callResultNumpy).sum()
    print("L1 norm (Numbapro): %E" % L1norm)
    print("Max absolute error (Numbapro): %E" % delta.max())

if __name__ == "__main__":
    import sys
    main(*sys.argv[1:])

  运行结果是:

Numpy Time: 1178.500009 msec
Numba Time: 424.500012 msec
numbapro.cuda time: 138.099957 msec

  可以看出,该程序是通过引入cuda对象并使用即时编译方法进行加速。   比较运行时间可以发现,运用numbapro方式加速效果明显。

  总结:1.可以通过GPU编写数据并行处理的程序来加快处理速度。 2.可以使用CUDA库,例如cuRAND、cuBLAS、cuFFT。 3.Python CUDA程序也可以最大化地使用硬件资源。


本文由 Tony 创作,采用 知识共享署名 3.0,可自由转载、引用,但需署名作者且注明文章出处。

如果对您有用,您的支持将鼓励我继续创作!