Bug#920497: clblas: *ger out of bounds memory access under pocl

Sat Jan 26 10:16:47 GMT 2019

Package: libclblas2,libpocl2
Version: 2.12-1,1.2-3
(libpocl2 1.2-2 is also affected, but the output below is from 1.2-3)

libgpuarray's test_ger (DEVICE=opencl0:0 POCL_KERNEL_CACHE=0 nosetests3 
-v pygpu.tests.test_blas:test_ger, requires python3-pygpu, python3-nose, 
python3-scipy, ocl-icd-opencl-dev, libclblas-dev) crashes with memory 
corruption errors (e.g. "double free" - exact message varies) under 
clblas+pocl.  Changing the test matrix size (default 4x5) to 64x64 (i.e. 
a whole number of clblas blocks - 16x64 or 64x16 depending on array 
order) makes it stop crashing.

clblas' own sample program, 
https://sources.debian.org/src/clblas/2.12-1/src/samples/example_sger.c/ 
(5x5 matrix, with CL_DEVICE_TYPE_GPU changed to CL_DEVICE_TYPE_ALL), 
doesn't actually crash, but Valgrind reports that the kernel is reading 
and writing out-of-bounds memory.

The kernel source has what look like proper bounds checks for the 
edge-of-matrix blocks 
(https://sources.debian.org/src/clblas/2.12-1/src/library/blas/gens/clTemplates/ger.cl/#L263), 
but disassembling the kernel suggests these aren't there in the binary:

# POCL_KERNEL_CACHE=0 (no kernel cache) is to avoid #919824
$ POCL_KERNEL_CACHE=0 valgrind --track-origins=yes --vgdb=yes 
--vgdb-error=0 ./example_sger &
$ gdb ./example_sger

(relevant part - => is current position)
    0x0000000004853a0f <+2303>:  mov    $0x1,%eax # eax isn't 1 so we 
didn't arrive straight down from here - the next 4 jumps are the only 
ones into here
--Type <RET> for more, q to quit, c to continue without paging--
    0x0000000004853a14 <+2308>:  mov    0x20(%rbx),%rcx
    0x0000000004853a18 <+2312>:  nopl   0x0(%rax,%rax,1)
    0x0000000004853a20 <+2320>:  mov    %rdi,0x10(%rbx)
    0x0000000004853a24 <+2324>:  mov    %rsi,0x28(%rbx)
    0x0000000004853a28 <+2328>:  mov    %rsi,%r9
    0x0000000004853a2b <+2331>:  mov    %rdx,0x30(%rbx)
    0x0000000004853a2f <+2335>:  mov    %rdx,%r11
    0x0000000004853a32 <+2338>:  mov    %r10,0x38(%rbx)
    0x0000000004853a36 <+2342>:  mov    %r10,%rdi
    0x0000000004853a39 <+2345>:  mov    %rcx,0x20(%rbx)
    0x0000000004853a3d <+2349>:  xor    %r10d,%r10d
    0x0000000004853a40 <+2352>:  cmp    %r14,%rax
    0x0000000004853a43 <+2355>:  mov    0x40(%rbp),%r15
    0x0000000004853a47 <+2359>:  mov    0x30(%rbp),%r13d
    0x0000000004853a4b <+2363>:  mov    0x48(%rbx),%r8
    0x0000000004853a4f <+2367>:  jae    0x4853a9d 
<_pocl_launcher_Sger_R_kernel+2445>
    0x0000000004853a51 <+2369>:  nopw   %cs:0x0(%rax,%rax,1)
    0x0000000004853a5b <+2379>:  nopl   0x0(%rax,%rax,1)
    0x0000000004853a60 <+2384>:  mov    (%r9,%rax,4),%edx
    0x0000000004853a64 <+2388>:  mov    (%r11,%rax,4),%esi
    0x0000000004853a68 <+2392>:  shl    $0x4,%rsi
    0x0000000004853a6c <+2396>:  vmulps (%r12,%rsi,1),%xmm0,%xmm1 # temp 
=  yRegS *  alpha ;
    0x0000000004853a72 <+2402>:  mov    (%rdi,%rax,4),%esi # row index 
to esi
--Type <RET> for more, q to quit, c to continue without paging--
    0x0000000004853a75 <+2405>:  imul   %r13d,%esi # esi = row*lda, lda 
in r13d
    0x0000000004853a79 <+2409>:  lea    (%r8,%rsi,4),%rsi #row start to 
rsi , r8 is base of A
    0x0000000004853a7d <+2413>:  vbroadcastss (%r15,%rdx,4),%xmm2 # load 
xreg to xmm2 - r15 = localX base, tIDy in rdx here
    0x0000000004853a83 <+2419>:  mov    (%rcx,%rax,4),%edx # column 
index to edx
    0x0000000004853a86 <+2422>:  vmulps %xmm1,%xmm2,%xmm1 # * of mad
=> 0x0000000004853a8a <+2426>:  vaddps (%rsi,%rdx,4),%xmm1,%xmm1 #vload 
(out of bounds read - edx (col) is too big) and + of mad
    0x0000000004853a8f <+2431>:  vmovups %xmm1,(%rsi,%rdx,4) #vstore
    0x0000000004853a94 <+2436>:  add    $0x1,%rax
    0x0000000004853a98 <+2440>:  cmp    %rax,%r14 # r14 is group size 
... is this the loop over workitems, with rax = local ID and (..,rax,4) 
= private variables?  and if it is, where are the bounds checks? 
(...which is the bug...)
    0x0000000004853a9b <+2443>:  jne    0x4853a60 
<_pocl_launcher_Sger_R_kernel+2384> # must have arrived from here
    0x0000000004853a9d <+2445>:  add    $0x1,%r10
    0x0000000004853aa1 <+2449>:  mov    0xc8(%rbx),%rax
    0x0000000004853aa8 <+2456>:  add    %rax,%rcx
    0x0000000004853aab <+2459>:  add    %rax,%rdi
    0x0000000004853aae <+2462>:  add    %rax,%r11
    0x0000000004853ab1 <+2465>:  add    %rax,%r9
    0x0000000004853ab4 <+2468>:  mov    $0x0,%eax
    0x0000000004853ab9 <+2473>:  cmp    0x70(%rbx),%r10
    0x0000000004853abd <+2477>:  jb     0x4853a40 
<_pocl_launcher_Sger_R_kernel+2352>
    0x0000000004853abf <+2479>:  mov    0x10(%rbx),%rdi
    0x0000000004853ac3 <+2483>:  add    $0x1,%rdi
    0x0000000004853ac7 <+2487>:  mov    0x20(%rbx),%rcx
--Type <RET> for more, q to quit, c to continue without paging--
    0x0000000004853acb <+2491>:  mov    0xf8(%rbx),%rax
    0x0000000004853ad2 <+2498>:  add    %rax,%rcx
    0x0000000004853ad5 <+2501>:  mov    0x38(%rbx),%r10
    0x0000000004853ad9 <+2505>:  add    %rax,%r10
    0x0000000004853adc <+2508>:  mov    0x30(%rbx),%rdx
    0x0000000004853ae0 <+2512>:  add    %rax,%rdx
    0x0000000004853ae3 <+2515>:  mov    0x28(%rbx),%rsi
    0x0000000004853ae7 <+2519>:  add    %rax,%rsi
    0x0000000004853aea <+2522>:  mov    $0x0,%eax
    0x0000000004853aef <+2527>:  cmp    0x68(%rbx),%rdi
    0x0000000004853af3 <+2531>:  jb     0x4853a20 
<_pocl_launcher_Sger_R_kernel+2320>

(gdb) info all-registers
rax            0x8                 8 work item localID?
rbx            0xc3bd9c0           205248960 on stack?
rcx            0xc3bd4c0           205247680 on stack?
rdx            0x20                32 current column number
rsi            0xe3c1e00           238820864 current row start (=r8 so 
row 0)
rdi            0xc3bccc0           205245632 on stack?
rbp            0xc3bdb40           0xc3bdb40 on stack?
rsp            0xc3bc0c0           0xc3bc0c0 stack ptr
r8             0xe3c1e00           238820864 base of 128byte block A
r9             0xc3bc4c0           205243584 on stack?
r10            0x0                 0
r11            0xc3bc8c0           205244608 on stack?
r12            0x11bc5100          297554176
r13            0x5                 5 stride (in entries)
r14            0x100               256 workgroup size
r15            0x11bc5080          297554048 base of localX
rip            0x4853a8a           0x4853a8a 
<_pocl_launcher_Sger_R_kernel+2426>
eflags         0x10                [ AF ]
cs             0x0                 0
ss             0x0                 0
ds             0x0                 0
es             0x0                 0
fs             0x0                 0
--Type <RET> for more, q to quit, c to continue without paging--
gs             0x0                 0
st0            0                   (raw 0x00000000000000000000)
st1            0                   (raw 0x00000000000000000000)
st2            0                   (raw 0x00000000000000000000)
st3            0                   (raw 0x00000000000000000000)
st4            0                   (raw 0x00000000000000000000)
st5            0                   (raw 0x00000000000000000000)
st6            0                   (raw 0x00000000000000000000)
st7            0                   (raw 0x00000000000000000000)
fctrl          0x37f               895
fstat          0x0                 0
ftag           0xffff              65535
fiseg          0x0                 0
fioff          0x0                 0
foseg          0x0                 0
fooff          0x0                 0
fop            0x0                 0
mxcsr          0x1f80              [ IM DM ZM OM UM PM ]
ymm0           {v8_float = {0xa, 0xa, 0xa, 0xa, 0x0, 0x0, 0x0, 0x0}, 
v4_double = {0x80000, 0x80000, 0x0, 0x0} # 10 = alpha or one-off-end of 
Y ; should be 4 successive entries of Y
ymm1           {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
ymm2           {v8_float = {0xb, 0xb, 0xb, 0xb, 0x0, 0x0, 0x0, 0x0}, #11 
= first entry of X

This suggests:
- either the kernel has undefined behaviour in the bounds-check-hit case 
(compilers often remove such code to make the defined case faster - 
http://blog.llvm.org/2011/05/what-every-c-programmer-should-know_14.html 
), though I can't see where
- or, there is a bug in pocl; as this kernel has multiple barriers, 
possibly related to https://github.com/pocl/pocl/issues/553 and/or 
https://github.com/pocl/pocl/issues/683

https://github.com/clMathLibraries/clBLAS/issues/108 was a vaguely 
similar issue in the same kernel, but we already have the fix for that one.