Bug#881054: libarrayfire-opencl3: "INTERNAL KERNEL BUILD ERROR" from af::matmulTN
Ralf Stubner
ralf.stubner at r-institute.com
Tue Nov 7 13:53:17 UTC 2017
Package: libarrayfire-opencl3
Version: 3.3.2+dfsg1-4
Severity: important
Dear Maintainer,
* What led up to the situation?
As a trivial example, I am trying to implement an OLS solver based on arrayfire.
* What exactly did you do (or not do) that was effective (or
ineffective)?
Consider the following C++ file (fastLm.cpp):
#include <arrayfire.h>
#include <iostream>
int main() {
dim_t n = 100000;
dim_t p = 40;
// af::setBackend(AF_BACKEND_CPU);
af::array X = af::randn(n, p);
af::array y = af::matmul(X, af::constant(0.5, p)) + af::randn(n);
std::cout << "X: " << X.dims() << std::endl;
std::cout << "y: " << y.dims() << std::endl;
af::array tXX = af::matmulTN(X, X);
std::cout << "tXX: " << tXX.dims() << std::endl;
af::array tXy = af::matmulTN(X, y);
std::cout << "tXy: " << tXy.dims() << std::endl;
af::array coef = af::solve(tXX, tXy);
std::cout << "coef: " << coef.dims() << std::endl;
return 0;
}
I compiled this file with
$ g++ -laf -o fastLm fastLm.cpp
* What was the outcome of this action?
The following error was displayed:
$ ./fastLm
========================================================
AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = Intel(R) HD Graphics Skylake ULT GT2
error = -11
memory pattern = Cached global memory based block gemv, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 1, dims[0].y = 32, dims[0].x = 1, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 1, dims[1].y = 4, dims[1].x = 1, dims[1].bwidth = 4; ;
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 789680
Source:
typedef union GPtr {
__global float *f;
__global float2 *f2v;
__global float4 *f4v;
__global float8 *f8v;
__global float16 *f16v;
} GPtr;
typedef union LPtr {
__local float *f;
__local float2 *f2v;
__local float4 *f4v;
__local float8 *f8v;
__local float16 *f16v;
} LPtr;
typedef union PPtr {
float *f;
float2 *f2v;
float4 *f4v;
float8 *f8v;
float16 *f16v;
} PPtr;
__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
sgemv(
uint M,
uint N,
const float alpha,
const __global float *restrict A,
const __global float *restrict X,
__global float *Y,
uint lda)
{
// M always denotes length of Y and N denotes length of X in the kernel
float4 a0;
float4 x0;
float4 y0;
y0 = 0;
__local float4 localRes[64][1];
uint coordA = (get_group_id(0) * 8 + get_local_id(0) % 8) * 4;
uint k0 = (get_local_id(0) / 8) * 4;
if (coordA < M && k0 < N) {
const GPtr Ag = {(__global float*)A};
const GPtr Xg = {(__global float*)X};
uint Ntail = N % 4;
N -= Ntail;
uint k = k0;
for (; k < N; k += 32) {
const uint xk = k / 4;
x0 = Xg.f4v[xk + 0];
/* -- Tiles multiplier -- */
const uint ay = (uint)(coordA >> 2);
const uint4 ak = {mad24(k, (lda >> 2), 0u), mad24(k + 1, (lda >> 2), 0u), mad24(k + 2, (lda >> 2), 0u),
mad24(k + 3, (lda >> 2), 0u)};
a0 = Ag.f4v[ay + ak.s0];
y0 += a0 * x0.s0;
a0 = Ag.f4v[ay + ak.s1];
y0 += a0 * x0.s1;
a0 = Ag.f4v[ay + ak.s2];
y0 += a0 * x0.s2;
a0 = Ag.f4v[ay + ak.s3];
y0 += a0 * x0.s3;
/* ---------------------- */
}
N += Ntail;
if (k < N) {
x0.s0 = X[k + 0 < N ? k : 0];
x0.s1 = X[k + 1 < N ? k + 1 : 0];
x0.s2 = X[k + 2 < N ? k + 2 : 0];
x0.s3 = X[k + 3 < N ? k + 3 : 0];
x0.s0 = k + 0 < N ? x0.s0 : 0;
x0.s1 = k + 1 < N ? x0.s1 : 0;
x0.s2 = k + 2 < N ? x0.s2 : 0;
x0.s3 = k + 3 < N ? x0.s3 : 0;
/* -- Tiles multiplier -- */
const uint ay = (uint)(coordA >> 2);
const uint4 ak = {mad24(k % N, (lda >> 2), 0u), mad24((k + 1) % N, (lda >> 2), 0u), mad24((k + 2) % N, (lda >> 2), 0u),
mad24((k + 3) % N, (lda >> 2), 0u)};
a0 = Ag.f4v[ay + ak.s0];
y0 += a0 * x0.s0;
a0 = Ag.f4v[ay + ak.s1];
y0 += a0 * x0.s1;
a0 = Ag.f4v[ay + ak.s2];
y0 += a0 * x0.s2;
a0 = Ag.f4v[ay + ak.s3];
y0 += a0 * x0.s3;
/* ---------------------- */
}
}
localRes[get_local_id(0)][0] = y0;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < 8 && coordA < M && k0 < N) {
for (uint i = 1; i < 8; i++) {
y0 += localRes[get_local_id(0) + i*8][0];
}
Y += coordA;
float4 r0;
GPtr uC;
uC.f = Y;
r0 = uC.f4v[0];
r0 = alpha * y0;
uC.f4v[0] = r0;
}
}
--------------------------------------------------------
Build log:
error: unknown argument: '-g'
========================================================
Speicherzugriffsfehler
* What outcome did you expect instead?
The same output as I get from the CPU backend after uncommenting
"af::setBackend", i.e.:
$ ./fastLm
X: 100000 40 1 1
y: 100000 1 1 1
tXX: 40 40 1 1
tXy: 40 1 1 1
coef: 40 1 1 1
When I install the upstream version 3.5.1 via upstream's installer in a
Docker image, fastLm.cpp works with both the OpenCL and the CPU backend.
Greetings
Ralf
-- System Information:
Debian Release: 9.1
APT prefers stable
APT policy: (500, 'stable')
Architecture: amd64 (x86_64)
Kernel: Linux 4.9.0-4-amd64 (SMP w/4 CPU cores)
Locale: LANG=de_DE.UTF-8, LC_CTYPE=de_DE.UTF-8 (charmap=UTF-8), LANGUAGE=de_DE.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)
Versions of packages libarrayfire-opencl3 depends on:
ii libblas3 [libblas.so.3] 3.7.0-2
ii libc6 2.24-11+deb9u1
ii libclblas2 2.12-1
ii libclfft2 2.12.2-1+b1
ii libfreeimage3 3.17.0+ds1-5
ii libgcc1 1:6.3.0-18
ii liblapacke 3.7.0-2
ii libopenblas-base [libblas.so.3] 0.2.19-3
ii libstdc++6 6.3.0-18
ii ocl-icd-libopencl1 [libopencl1] 2.2.11-1
libarrayfire-opencl3 recommends no packages.
libarrayfire-opencl3 suggests no packages.
-- no debconf information
More information about the debian-science-maintainers
mailing list