Bug#881054: libarrayfire-opencl3: "INTERNAL KERNEL BUILD ERROR" from af::matmulTN

Tue Nov 7 13:53:17 UTC 2017

Package: libarrayfire-opencl3
Version: 3.3.2+dfsg1-4
Severity: important

Dear Maintainer,

   * What led up to the situation?

As a trivial example, I am trying to implement an OLS solver based on arrayfire.

   * What exactly did you do (or not do) that was effective (or
     ineffective)?

Consider the following C++ file (fastLm.cpp):

#include <arrayfire.h>
#include <iostream>

// Minimal reproducer: forms the normal equations (X'X and X'y) for an
// ordinary-least-squares fit with ArrayFire and solves them, printing the
// dimensions of each intermediate array along the way.
int main() {
  // Problem size: X is created with n rows and p columns.
  dim_t n = 100000;
  dim_t p = 40;
  // Backend switch, left disabled here; the report says that enabling it
  // (CPU backend) makes the program run to completion.
  //  af::setBackend(AF_BACKEND_CPU);
  // Random design matrix.
  af::array X = af::randn(n, p);
  // Response: X times a length-p vector of 0.5 plus random noise.
  // NOTE(review): this is a matrix-vector product and the posted output shows
  // no "X:" line before the kernel error, so the failing gemv kernel may be
  // triggered here rather than in matmulTN -- confirm.
  af::array y = af::matmul(X, af::constant(0.5, p)) + af::randn(n);
  std::cout << "X: " << X.dims() << std::endl;
  std::cout << "y: " << y.dims() << std::endl;
  // X'X; reported as 40 x 40 with the CPU backend.
  af::array tXX = af::matmulTN(X, X);
  std::cout << "tXX: " << tXX.dims() << std::endl;
  // X'y; reported as 40 x 1 with the CPU backend.
  af::array tXy = af::matmulTN(X, y);
  std::cout << "tXy: " << tXy.dims() << std::endl;
  // Solve (X'X) coef = X'y for the coefficient vector.
  af::array coef = af::solve(tXX, tXy);
  std::cout << "coef: " << coef.dims() << std::endl;
  return 0;
}

I compiled this file with

$ g++ -laf -o fastLm fastLm.cpp

   * What was the outcome of this action?

The following error was displayed:

$ ./fastLm 

========================================================

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = Intel(R) HD Graphics Skylake ULT GT2
error = -11
memory pattern = Cached global memory based block gemv, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 1, dims[0].y = 32, dims[0].x = 1, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 1, dims[1].y = 4, dims[1].x = 1, dims[1].bwidth = 4; ; 
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 789680
Source:

typedef union GPtr {
    __global float *f;
    __global float2 *f2v;
    __global float4 *f4v;
    __global float8 *f8v;
    __global float16 *f16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local float2 *f2v;
    __local float4 *f4v;
    __local float8 *f8v;
    __local float16 *f16v;
} LPtr;

typedef union PPtr {
    float *f;
    float2 *f2v;
    float4 *f4v;
    float8 *f8v;
    float16 *f16v;
} PPtr;

__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
sgemv(
    uint M,
    uint N,
    const float alpha,
    const __global float *restrict A,
    const __global float *restrict X,
    __global float *Y,
    uint lda)
{
    // M always denotes length of Y and N denotes length of X in the kernel
    float4 a0;
    float4 x0;
    float4 y0;
    y0 = 0;

    __local float4 localRes[64][1];
    uint coordA = (get_group_id(0) * 8 + get_local_id(0) % 8) * 4;
    uint k0 = (get_local_id(0) / 8) * 4;

    if (coordA < M && k0 < N) {
        const GPtr Ag = {(__global float*)A};
        const GPtr Xg = {(__global float*)X};

        uint Ntail = N % 4;
        N -= Ntail;

        uint k = k0;
        for (; k < N; k += 32) {
            const uint xk = k / 4;
            x0 = Xg.f4v[xk + 0];
            /* -- Tiles multiplier -- */
            const uint ay = (uint)(coordA >> 2);
            const uint4 ak = {mad24(k, (lda >> 2), 0u), mad24(k + 1, (lda >> 2), 0u), mad24(k + 2, (lda >> 2), 0u),
            		mad24(k + 3, (lda >> 2), 0u)};

            a0 = Ag.f4v[ay + ak.s0];
            y0 += a0 * x0.s0;

            a0 = Ag.f4v[ay + ak.s1];
            y0 += a0 * x0.s1;

            a0 = Ag.f4v[ay + ak.s2];
            y0 += a0 * x0.s2;

            a0 = Ag.f4v[ay + ak.s3];

            y0 += a0 * x0.s3;
            /* ---------------------- */
        }
        N += Ntail;
        if (k < N) {
            x0.s0 = X[k + 0 < N ? k : 0];
            x0.s1 = X[k + 1 < N ? k + 1 : 0];
            x0.s2 = X[k + 2 < N ? k + 2 : 0];
            x0.s3 = X[k + 3 < N ? k + 3 : 0];
            x0.s0 = k + 0 < N ? x0.s0 : 0;
            x0.s1 = k + 1 < N ? x0.s1 : 0;
            x0.s2 = k + 2 < N ? x0.s2 : 0;
            x0.s3 = k + 3 < N ? x0.s3 : 0;
            /* -- Tiles multiplier -- */
            const uint ay = (uint)(coordA >> 2);
            const uint4 ak = {mad24(k % N, (lda >> 2), 0u), mad24((k + 1) % N, (lda >> 2), 0u), mad24((k + 2) % N, (lda >> 2), 0u),
            		mad24((k + 3) % N, (lda >> 2), 0u)};

            a0 = Ag.f4v[ay + ak.s0];

            y0 += a0 * x0.s0;

            a0 = Ag.f4v[ay + ak.s1];

            y0 += a0 * x0.s1;

            a0 = Ag.f4v[ay + ak.s2];

            y0 += a0 * x0.s2;

            a0 = Ag.f4v[ay + ak.s3];

            y0 += a0 * x0.s3;
            /* ---------------------- */
        }
    }
    localRes[get_local_id(0)][0] = y0;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(0) < 8 && coordA < M && k0 < N) {
        for (uint i = 1; i < 8; i++) {
            y0 += localRes[get_local_id(0) + i*8][0];
        }
        Y += coordA;
        float4 r0;
        GPtr uC;
        uC.f = Y;
        r0 = uC.f4v[0];
        r0 = alpha * y0;
        uC.f4v[0] = r0;
    }
}

--------------------------------------------------------

Build log:

error: unknown argument: '-g'

========================================================

Speicherzugriffsfehler

   * What outcome did you expect instead?

The same output as I get from the CPU backend after uncommenting
"af::setBackend", i.e.:

$ ./fastLm 
X: 100000 40 1 1
y: 100000 1 1 1
tXX: 40 40 1 1
tXy: 40 1 1 1
coef: 40 1 1 1

When I install the upstream version 3.5.1 via upstream's installer in a
Docker image, fastLm.cpp works with both OpenCL and CPU backend.

Greetings
Ralf

-- System Information:
Debian Release: 9.1
  APT prefers stable
  APT policy: (500, 'stable')
Architecture: amd64 (x86_64)

Kernel: Linux 4.9.0-4-amd64 (SMP w/4 CPU cores)
Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8), LANGUAGE=de_DE.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)

Versions of packages libarrayfire-opencl3 depends on:
ii  libblas3 [libblas.so.3]          3.7.0-2
ii  libc6                            2.24-11+deb9u1
ii  libclblas2                       2.12-1
ii  libclfft2                        2.12.2-1+b1
ii  libfreeimage3                    3.17.0+ds1-5
ii  libgcc1                          1:6.3.0-18
ii  liblapacke                       3.7.0-2
ii  libopenblas-base [libblas.so.3]  0.2.19-3
ii  libstdc++6                       6.3.0-18
ii  ocl-icd-libopencl1 [libopencl1]  2.2.11-1

libarrayfire-opencl3 recommends no packages.

libarrayfire-opencl3 suggests no packages.

-- no debconf information