Go中的并行saxpy实现无法在内核之间很好地扩展

I'm trying to write an implementation of saxpy that is both blocked and can be computed in parallel using the 8 cores available on my machine. I started with the assumption that vectors x and y small enough to fit into the L1 cache of my machine (256kB, split into 128kB data and 128kB code) can be computed in serial. To test this assumption, I wrote two implementations of saxpy: a blocked serial version (BSS) and a blocked parallel version (BPS). The blocking algorithm is used only when the vectors are longer than 4096 elements. The following are the implementations:

// cachecap is the number of float64 elements processed per block:
// 32*1024 divided by 8 bytes per float64 (presumably a 32 KiB cache budget).
const cachecap = 32 * 1024 / 8 // 4096

// blocked_serial_saxpy computes z = a*x + b*y on strided vectors, one block of
// at most cachecap logical elements at a time, on the calling goroutine.
//
// The number of logical elements is len(z)/incz, exactly as in serial_saxpy.
// Logical element i lives at x[i*incx], y[i*incy] and z[i*incz], so each
// vector is sliced with its own stride; slicing all three with z's slot
// indices is only correct when every increment is 1.
//
// Inputs with len(z) <= cachecap, and inputs with a non-positive increment,
// are handed to serial_saxpy unchanged (which panics on bad increments).
func blocked_serial_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {
	if len(z) <= cachecap || incx <= 0 || incy <= 0 || incz <= 0 {
		serial_saxpy(a, x, incx, b, y, incy, z, incz)
		return
	}

	n := len(z) / incz
	for lo := 0; lo < n; lo += cachecap {
		hi := lo + cachecap
		if hi > n {
			hi = n
		}
		// The last element of the block is hi-1, hence the (hi-1)*inc+1
		// upper bound on the read-only operands. z is cut at hi*incz so that
		// serial_saxpy derives exactly hi-lo elements from len(zb)/incz.
		xb := x[lo*incx : (hi-1)*incx+1]
		yb := y[lo*incy : (hi-1)*incy+1]
		zb := z[lo*incz : hi*incz]
		serial_saxpy(a, xb, incx, b, yb, incy, zb, incz)
	}
}
// blocked_parallel_saxpy computes z = a*x + b*y on strided vectors by cutting
// the work into blocks of at most cachecap logical elements and feeding them
// through a channel to a pool of worker goroutines.
//
// The number of logical elements is len(z)/incz, as in serial_saxpy. Logical
// element i lives at x[i*incx], y[i*incy] and z[i*incz]; a block carries
// logical indices and every vector is sliced with its own stride, so workers
// write disjoint regions of z.
//
// Inputs with len(z) <= cachecap are computed directly by serial_saxpy.
// Non-positive increments are delegated as well, so that the resulting panic
// is raised on the caller's goroutine instead of inside a worker.
func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {
	if len(z) <= cachecap || incx <= 0 || incy <= 0 || incz <= 0 {
		serial_saxpy(a, x, incx, b, y, incy, z, incz)
		return
	}

	n := len(z) / incz
	// Ceiling division: no empty trailing block when n is a multiple of cachecap.
	nblocks := (n + cachecap - 1) / cachecap
	nworkers := runtime.GOMAXPROCS(0) // argument 0 only queries the setting
	if nblocks < nworkers {
		nworkers = nblocks
	}

	// Buffered for every block so the producer loop below never blocks.
	sendchan := make(chan block, nblocks)

	var wg sync.WaitGroup
	wg.Add(nworkers)
	for w := 0; w < nworkers; w++ {
		go func() {
			defer wg.Done()
			for blk := range sendchan {
				lo, hi := blk.beg, blk.end
				serial_saxpy(
					a, x[lo*incx:(hi-1)*incx+1], incx,
					b, y[lo*incy:(hi-1)*incy+1], incy,
					z[lo*incz:hi*incz], incz,
				)
			}
		}()
	}

	for lo := 0; lo < n; lo += cachecap {
		hi := lo + cachecap
		if hi > n {
			hi = n
		}
		sendchan <- block{lo, hi}
	}
	close(sendchan) // lets the workers' range loops terminate
	wg.Wait()
}

// serial_saxpy computes z = a*x + b*y element-wise on strided vectors using
// the calling goroutine only.
//
// The number of elements processed is n = len(z)/incz (integer division).
// Element i is read from x[i*incx] and y[i*incy] and written to z[i*incz];
// slots of z between strided positions are left untouched.
//
// The coefficient pairs (1,1), (0,1), (1,0) and (0,0) take shortcuts that skip
// the multiplications. For (0,0) the result is written as zeros: z is a pure
// output, so returning without touching it would leave stale data behind.
//
// It panics if any increment is zero or negative.
func serial_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {
	if incx <= 0 || incy <= 0 || incz <= 0 {
		panic("AxpBy: zero or negative increments not supported")
	}

	n := len(z) / incz

	// Fast path: all vectors contiguous.
	if incx == 1 && incy == 1 && incz == 1 {
		switch {
		case a == 1 && b == 1:
			for i := 0; i < n; i++ {
				z[i] = x[i] + y[i]
			}
		case a == 0 && b == 1:
			copy(z, y)
		case a == 1 && b == 0:
			copy(z, x)
		case a == 0 && b == 0:
			for i := range z {
				z[i] = 0
			}
		default:
			for i := 0; i < n; i++ {
				z[i] = a*x[i] + b*y[i]
			}
		}
		return
	}

	// Unequal increments, or equal increments != 1.
	ix, iy, iz := 0, 0, 0
	switch {
	case a == 1 && b == 1:
		for i := 0; i < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {
			z[iz] = x[ix] + y[iy]
		}
	case a == 0 && b == 1:
		for i := 0; i < n; i, iy, iz = i+1, iy+incy, iz+incz {
			z[iz] = y[iy]
		}
	case a == 1 && b == 0:
		for i := 0; i < n; i, ix, iz = i+1, ix+incx, iz+incz {
			z[iz] = x[ix]
		}
	case a == 0 && b == 0:
		for i := 0; i < n; i, iz = i+1, iz+incz {
			z[iz] = 0
		}
	default:
		for i := 0; i < n; i, ix, iy, iz = i+1, ix+incx, iy+incy, iz+incz {
			z[iz] = a*x[ix] + b*y[iy]
		}
	}
}

I then wrote benchmarks for the three functions blocked_serial_saxpy, blocked_parallel_saxpy and serial_saxpy. The following image shows the results of the benchmarks with vector sizes 1e3, 1e4, 1e5, 2e5, 3e5, 4e5, 6e5, 8e5 and 1e6 respectively:

To help me visualize the performance of the blocked_parallel_saxpy implementation, I plotted the results and this is what I obtained: Looking at the plot makes me wonder why I am not seeing a parallel speedup, even though all the CPUs are in use and at 100% during the blocked_parallel_saxpy benchmark. The image from Task Manager is below:

Could someone help me understand what's going on here? Is what I'm seeing a symptom of a problem, or is it the way it should be? If it's the former, is there a way to fix it?

Edit: I have modified the blocked_parallel_saxpy code to the following. I divide the total number of blocks (nblocks) so that nworkers goroutines compute nworkers blocks in parallel at a time. In addition, I have removed the channel. I have benchmarked the code and it performs identically to the parallel implementation with the channel, which is why I haven't attached the benchmarks.

func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {
    zn := len(z)
    if zn <= cachecap {
        serial_saxpy(a, x, incx, b, y, incy, z, incz)
        return
    }

    nblocks := zn/cachecap + 1
    nworkers := runtime.GOMAXPROCS(0)
    if nblocks < nworkers {
        nworkers = nblocks
    }

    var wg sync.WaitGroup
    for i := 0; i < nworkers; i++ {
        for j := 0; j < nworkers && (i+j) < nblocks; j++ {
            wg.Add(1)
            go func(i, j int) {
                defer wg.Done()
                a, b := a, b
                incx, incy, incz := incx, incy, incz
                k := i + j
                beg := k * cachecap
                end := (k + 1) * cachecap
                if end >= zn {
                    end = zn
                }
                serial_saxpy(a, x[beg:end], incx, b, y[beg:end], incy, z[beg:end], incz)
            }(i, j)
        }
    wg.Wait()
}

Edit 2: I have written another version of blocked_parallel_saxpy, again without channels. This time, I spawn NumCPU goroutines, each processing nblocks/nworkers + 1 blocks, where each block is cachecap elements long. Even here, the code performs identically to the previous two implementations.

func blocked_parallel_saxpy(a float64, x []float64, incx int, b float64, y []float64, incy int, z []float64, incz int) {
    zn := len(z)
    if zn <= cachecap {
        serial_saxpy(a, x, incx, b, y, incy, z, incz)
        return
    }

    nblocks := zn/cachecap + 1
    nworkers := runtime.GOMAXPROCS(runtime.NumCPU())
    if nblocks < nworkers {
        nworkers = nblocks
    }

    k := nblocks/nworkers + 1
    var wg sync.WaitGroup
    wg.Add(nworkers)
    for i := 0; i < nworkers; i++ {
        go func(i int) {
            defer wg.Done()
            for j := 0; j < k && (j+i*k) < nblocks; j++ {
                beg := (j + i*k) * cachecap
                end := beg + cachecap
                if end > zn {
                    end = zn
                }
                //fmt.Printf("i:%d, j:%d, k:%d, [beg:end]=[%d:%d]
", i, j, k, beg, end)
                serial_saxpy(a, x[beg:end], incx, b, y[beg:end], incy, z[beg:end], incz)
            }
        }(i)
    }

    wg.Wait()
}

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除结题
收藏举报

2条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
ds355020 2017-09-10 14:06
关注
I'd try a parallel version without channels, where each worker computes every 8th block, without coordination.

解决无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

saxpy:符号聚合近似的Python实现
2021-04-27 14:16

python中符号集合近似的实现。基于论文时间序列的符号表示，对流算法有影响一般使用： from saxpy import SAX s = SAX(wordSize, alphabetSize, epsilon) 您可以选择指定字号，字母大小和epsilon 如果要比较x1...
learn-gpgpu:在CUDA中实现的算法+有关GPGPU的资源
2021-05-02 23:40

例子CUDA线性代数-最简单的快速一维向量加法[ ] 中元素的总和-中元素的并行总和[ ] 用cuBlas实现SAXPY [ ]图像处理-2D朴素实现[ ] -具有任意大小内核的中值过滤器[ CUDA ] Sobel边缘检测滤波器-用于图像处理的Sobel ...
单片机与DSP中的循环缓冲变量和滤波器
2020-11-14 23:30

DSP算法的特性意味着DSP处理器应能频繁而有效地进行SAXPY类操作。这类操仵常常用于变量和滤波器系数的相乘。可以认为采样数据和系数是从一个循环缓砷中狻出的，如图1所示。数据通过一个指针被访问，在程序运行中，这...
单片机与DSP中的滤波器算法误差
2020-11-15 11:07

在不同MAC单元中这种缩减操作造成的舍人不同。如果T＞S，则一般将其中的累加器称为具有扩展精度的累加器。T bit累加器的输出最终将被缩减成一个M bit的SAXPY输出，其中，M≤T。如果M，一般设舍人误差为　其中，...
saxpy_glsl2.rar_OpenGL_Visual_C++_
2021-08-11 17:40

GPU通用编程的示范代码。GPU片断编程采用OpenGL2.0语言。
caffe小问题大烦恼——无法解析的外部符号 cblas_saxpy，cblas_ddot
2018-05-23 09:54

gaohang_hdu的博客原因：没有添加cblas库解决方法：第一个方法，下载cblas，添加头文件、库文件；第二个方法，添加caffe下载的NugetPackages包中OpenBLAS包中的头文件、库文件，其中库文件是libopenblas.dll.a...
CUDA：多块协作组在GPU上实现共轭梯度解算器实例
2022-12-21 17:29

源代码大师的博客 CUDA：多块协作组在GPU上实现共轭梯度解算器实例
CUDA：统一内存优化的预取和使用提示在多个GPU上实现共轭梯度解算器实例
2022-12-21 17:32

源代码大师的博客 CUDA：统一内存优化的预取和使用提示在多个GPU上实现共轭梯度解算器实例
CUDA - 如何在CUDA C/C++中实现性能度量
2023-06-06 23:41

pla66的博客如何在CUDA C/C++中实现性能度量
c cuda 指定gpu_[译]在CUDA C/C++中如何衡量代码性能
2020-12-19 22:27

weixin_39759589的博客本文翻译自NVIDIA官方博客Parallel Forall，内容仅供参考，如有疑问请访问原网站:https://devblogs.nvidia.com/p....在这个系列的第一篇文章中，我们通过用CUDA C/C++实现SAXPY，学习了CUDA C/C++编程的基本要素。在...
《矩阵计算》chapter 01 在 Octave 中整理运行示例算法
2022-10-31 17:47

Eloudy的博客 --------------------- mc_1_1_2_algo_saxpy.m function y = mc_1_1_2_algo_saxpy (n, a, x, y) for i=1:n y(i) = a*x(i) + y(i) endfor endfunction >> x1=[1;2;3] x1 = 1 2 3 >> y1=[1;2;3] y1 = 1 2 3 >> y1=mc_1...
caffe 中 BLOB的实现
2016-07-12 16:33

junmuzi的博客 original url: ... ...等着caffe没有膨胀到很大的程度把caffe的代码理一理...（1）第一次阅读Caffe的源码，给人的印象就是里面大量使用了gtest，确实也简化了不少代码，看起来很清晰。（2）caffe的文档是使用doxygen
cblas_saxpy catlas_sset
2014-11-25 14:54

weixin_30763455的博客 cblas_saxpy ( const int __N , 　 const float __alpha , 　 const float * __X , 　 const int __incX , 　 float * __Y , 　 const int __incY 　 ); ...
使用Grid-Stride循环编写灵活的CUDA内核
2022-05-19 15:38

扫地的小何尚的博客 CUDA 编程中最常见的任务之一是使用内核并行化循环。举个例子，让我们用我们的老朋友 SAXPY。这是使用 for 循环的基本顺序实现。为了有效地并行化，我们需要启动足够多的线程来充分利用 GPU。 void saxpy(int n, ...
cuda中的Grid-Stride Loops (网格跨步循环) 详解
2019-01-05 18:35

超级代码搬运工的博客最近在学习cuda编程的时候遇到了不少问题，其中有一个问题很费解的就是为什么cuda中循环的步长是一个网格中容纳的线程的数量。代码如下所示： __global__ void add(int n, float *x, float *y) [添加链接描述]...
子程序中的数组（Arrays in subprograms）
2022-02-24 08:40

木苏州的博客这就会节省很多内存空间。不需要额外的存储空间，当调用subroutine时。只是传进去一个Array的地址。并不复制这个数组，这也就就意味着不会再占用一个数组的内存空间。一、可变长度数组最基础的向量操作叫做saxpy...
[译]在CUDA C/C++中如何衡量代码性能
2017-03-08 20:22

Innerpeace_yu的博客 cudacc++ Fighting_Bird 2016年11月21日发布 1 推荐 4 收藏，667 浏览本文翻译自NVIDIA官方博客Parallel Forall，内容仅供参考，如有...在这个系列的第一篇文章中，
Kay-2023.4.26
2023-04-27 00:54

Aristokay002的博客 1）今日已完成任务列表 4,2,3 VIM编辑器 2）遇到的问题及解决方案 4.2.3编写程序saxpy现了单精度浮点型数据的向量加法运算检查错误，发现.h文件忘加分号函数声明类型错误！ 3）任务完成详细笔记 4）对自己的表现...
thrust快速入门指南（并行算法库，类似C++的STL）
2018-12-12 12:50

qccz123456的博客 Thrust提供了丰富的数据并行原语集合，例如扫描，排序和缩减，它们可以组合在一起，通过简洁易读的源代码实现复杂的算法。通过根据这些高级抽象描述您的计算，您可以为Thrust提供自动选择最有效实现的自由。因此...
没有解决我的问题, 去提问

悬赏问题

¥15 素材场景中光线烘焙后灯光失效
¥15 请教一下各位，为什么我这个没有实现模拟点击
¥15 执行 virtuoso 命令后，界面没有，cadence 启动不起来
¥50 comfyui下连接animatediff节点生成视频质量非常差的原因
¥20 有关区间dp的问题求解
¥15 多电路系统共用电源的串扰问题
¥15 slam rangenet++配置
¥15 有没有研究水声通信方面的帮我改俩matlab代码
¥15 ubuntu子系统密码忘记
¥15 保护模式-系统加载-段寄存器

Go中的并行saxpy实现无法在内核之间很好地扩展

2条回答 默认 最新

悬赏问题

2条回答默认最新