Amir_D_Shadow 2021-06-16 11:29 采纳率: 0%
浏览 49

三维卷积 CUDA PYTHON

 我试着用 GPU 做三维卷积,但是 CPU 计算出来的结果跟 GPU 的不一样(np.array_equal(k1, k2) 返回 False)。求大佬指教 orz

 

 

import numpy as np
import math
from numba import cuda,float64,int64
import time

    
@cuda.jit("float64[:,:,:,:],float64[:,:,:],float64[:,:,:,:],float64[:,:,:],int64,int64,int64,int64")
def conv_step_forward3D(W, img, b, Z, stride, xlim, ylim, zlim):
    """
    Strided multi-channel convolution of one image; one thread per output element.

    The kernel is meant to be launched on a 3-D grid whose x/y/z axes map to
    the output height / width / channel axes.

    W      -- filters, (fH, fW, n_C_prev, n_C)
    img    -- input image, (n_H_prev, n_W_prev, n_C_prev)
    b      -- bias, read as b[0, 0, 0, channel]
    Z      -- output, (n_H, n_W, n_C); the result is ADDED to the values
              already stored in Z, so pass a zero-filled array for a plain
              convolution
    stride -- step between consecutive windows along height and width
    xlim, ylim, zlim -- number of valid output rows / columns / channels;
              threads whose index falls outside these limits do nothing
    """

    # Absolute position of this thread in the launch grid.
    out_h, out_w, out_c = cuda.grid(3)

    # Guard against the padding threads of partially filled edge blocks.
    if out_h >= xlim or out_w >= ylim or out_c >= zlim:
        return

    fH = W.shape[0]
    fW = W.shape[1]
    n_C_prev = img.shape[2]

    # Top-left corner of the input window feeding this output element.
    row0 = out_h * stride
    col0 = out_w * stride

    # Accumulate in a local variable and write Z once, instead of doing a
    # global-memory read-modify-write on every inner iteration.
    acc = Z[out_h, out_w, out_c]

    for h in range(fH):
        for w in range(fW):
            for c in range(n_C_prev):
                acc += W[h, w, c, out_c] * img[row0 + h, col0 + w, c]

    # No cuda.syncthreads() is needed: every thread owns exactly one element
    # of Z and never reads another thread's result. A barrier placed inside
    # the bounds check would also be reached by only part of a block, which
    # is undefined behaviour and can hang the kernel.
    Z[out_h, out_w, out_c] = acc + b[0, 0, 0, out_c]
    


if __name__ == "__main__":
    # Benchmark script: run the same strided convolution on the GPU (Numba
    # CUDA kernel) and on the CPU (NumPy reference loop), time both, and check
    # that the two results agree.

    # ---- Problem setup ---------------------------------------------------
    W = np.random.randn(3, 3, 3, 16)
    b = np.random.randn(1, 1, 1, 16)
    Img = np.random.randn(1, 1080, 1920, 3)

    m, n_H_prev, n_W_prev, n_C_prev = Img.shape
    fH, fW = W.shape[0], W.shape[1]

    stride = 2
    n_H = (n_H_prev - fH) // stride + 1
    n_W = (n_W_prev - fW) // stride + 1
    # Number of output channels comes from the filter bank, not a literal.
    n_C = W.shape[3]

    # ---- GPU ---------------------------------------------------------------
    Z = np.zeros((n_H, n_W, n_C))

    threadsperblock = (8, 8, 2)
    # Round up so that the grid covers every output element; the kernel
    # discards the surplus threads using the limits passed below.
    blockspergrid = (
        int(math.ceil(Z.shape[0] / threadsperblock[0])),
        int(math.ceil(Z.shape[1] / threadsperblock[1])),
        int(math.ceil(Z.shape[2] / threadsperblock[2])),
    )

    W_device = cuda.to_device(W)
    Img_device = cuda.to_device(Img[0, :, :, :])
    Z_device = cuda.to_device(Z)
    b_device = cuda.to_device(b)

    # Make sure the host-to-device copies are finished before timing starts.
    cuda.synchronize()

    gpu_time = time.perf_counter()
    conv_step_forward3D[blockspergrid, threadsperblock](
        W_device, Img_device, b_device, Z_device, stride, n_H, n_W, n_C
    )
    cuda.synchronize()
    k1 = Z_device.copy_to_host()
    print(f"With GPU:{time.perf_counter()-gpu_time}")

    # ---- CPU reference -----------------------------------------------------
    cpu_time = time.perf_counter()
    Z_cpu = np.zeros((n_H, n_W, n_C))

    # Take the single sample of the batch.
    a_prev = Img[0, :, :, :]

    for h in range(n_H):
        vert_start = h * stride
        vert_end = vert_start + fH

        for w in range(n_W):
            hori_start = w * stride
            hori_end = hori_start + fW

            # Input window for this output position.
            a_slice_prev = a_prev[vert_start:vert_end, hori_start:hori_end, :]

            for c in range(n_C):
                # Index the bias as a scalar; float() on a (1, 1, 1) array is
                # deprecated in recent NumPy releases.
                Z_cpu[h, w, c] = np.sum(a_slice_prev * W[:, :, :, c]) + b[0, 0, 0, c]

    k2 = Z_cpu.copy()
    print(f"With CPU:{time.perf_counter()-cpu_time}")

    # Floating-point addition is not associative: np.sum and the kernel's
    # sequential loop add the 27 products in different orders, so the results
    # differ in the last few bits. Compare with a tolerance instead of
    # demanding bit-exact equality (np.array_equal).
    print(np.allclose(k1, k2))
  • 写回答

1条回答 默认 最新

  • 有问必答小助手 2021-06-21 11:52
    关注

    你好,我是有问必答小助手,非常抱歉,本次您提出的有问必答问题,目前超出我们的服务范围,暂时无法为您解答。

    首次提问人员可免费体验一次有问必答服务。目前首次提问的问题服务范围为:编程语言、Java开发、python、数据库、前端开发 领域专业技术问题,为您提供问题的解决思路和指导。不提供源码代写、项目文档代写、论文代写、作业代写、安装包资源发送或安装、软件使用指导等服务。

    我们后续会持续优化,扩大我们的服务范围,为您带来更好地服务。

    评论

报告相同问题?

悬赏问题

  • ¥15 树莓派与pix飞控通信
  • ¥15 自动转发微信群信息到另外一个微信群
  • ¥15 outlook无法配置成功
  • ¥30 这是哪个作者做的宝宝起名网站
  • ¥60 版本过低apk如何修改可以兼容新的安卓系统
  • ¥25 由IPR导致的DRIVER_POWER_STATE_FAILURE蓝屏
  • ¥50 有数据,怎么建立模型求影响全要素生产率的因素
  • ¥50 有数据,怎么用matlab求全要素生产率
  • ¥15 TI的insta-spin例程
  • ¥15 完成下列问题完成下列问题