Bug#920497: clblas: *ger out of bounds memory access under pocl
Rebecca N. Palmer
rebecca_palmer at zoho.com
Sat Jan 26 10:16:47 GMT 2019
Package: libclblas2,libpocl2
Version: 2.12-1,1.2-3
(libpocl2 1.2-2 is also affected, but the output below is from 1.2-3)
libgpuarray's test_ger (DEVICE=opencl0:0 POCL_KERNEL_CACHE=0 nosetests3
-v pygpu.tests.test_blas:test_ger, requires python3-pygpu, python3-nose,
python3-scipy, ocl-icd-opencl-dev, libclblas-dev) crashes with memory
corruption errors (e.g. "double free" - exact message varies) under
clblas+pocl. Changing the test matrix size (default 4x5) to 64x64 (i.e.
a whole number of clblas blocks - 16x64 or 64x16 depending on array
order) makes it stop crashing.
clblas' own sample program example_sger.c
(https://sources.debian.org/src/clblas/2.12-1/src/samples/example_sger.c/),
which uses a 5x5 matrix, run with CL_DEVICE_TYPE_GPU changed to
CL_DEVICE_TYPE_ALL, doesn't actually crash, but Valgrind reports that the
kernel is reading and writing out-of-bounds memory.
The kernel source has what look like proper bounds checks for the
edge-of-matrix blocks
(https://sources.debian.org/src/clblas/2.12-1/src/library/blas/gens/clTemplates/ger.cl/#L263),
but disassembling the kernel suggests these aren't there in the binary:
# POCL_KERNEL_CACHE=0 (no kernel cache) is used to avoid Bug#919824
$ POCL_KERNEL_CACHE=0 valgrind --track-origins=yes --vgdb=yes
--vgdb-error=0 ./example_sger &
$ gdb ./example_sger
(relevant part - => is current position)
0x0000000004853a0f <+2303>: mov $0x1,%eax # eax isn't 1, so we didn't arrive straight down from here - the next 4 jumps are the only ones into here
--Type <RET> for more, q to quit, c to continue without paging--
0x0000000004853a14 <+2308>: mov 0x20(%rbx),%rcx
0x0000000004853a18 <+2312>: nopl 0x0(%rax,%rax,1)
0x0000000004853a20 <+2320>: mov %rdi,0x10(%rbx)
0x0000000004853a24 <+2324>: mov %rsi,0x28(%rbx)
0x0000000004853a28 <+2328>: mov %rsi,%r9
0x0000000004853a2b <+2331>: mov %rdx,0x30(%rbx)
0x0000000004853a2f <+2335>: mov %rdx,%r11
0x0000000004853a32 <+2338>: mov %r10,0x38(%rbx)
0x0000000004853a36 <+2342>: mov %r10,%rdi
0x0000000004853a39 <+2345>: mov %rcx,0x20(%rbx)
0x0000000004853a3d <+2349>: xor %r10d,%r10d
0x0000000004853a40 <+2352>: cmp %r14,%rax
0x0000000004853a43 <+2355>: mov 0x40(%rbp),%r15
0x0000000004853a47 <+2359>: mov 0x30(%rbp),%r13d
0x0000000004853a4b <+2363>: mov 0x48(%rbx),%r8
0x0000000004853a4f <+2367>: jae 0x4853a9d
<_pocl_launcher_Sger_R_kernel+2445>
0x0000000004853a51 <+2369>: nopw %cs:0x0(%rax,%rax,1)
0x0000000004853a5b <+2379>: nopl 0x0(%rax,%rax,1)
0x0000000004853a60 <+2384>: mov (%r9,%rax,4),%edx
0x0000000004853a64 <+2388>: mov (%r11,%rax,4),%esi
0x0000000004853a68 <+2392>: shl $0x4,%rsi
0x0000000004853a6c <+2396>: vmulps (%r12,%rsi,1),%xmm0,%xmm1 # temp = yRegS * alpha ;
0x0000000004853a72 <+2402>: mov (%rdi,%rax,4),%esi # row index to esi
--Type <RET> for more, q to quit, c to continue without paging--
0x0000000004853a75 <+2405>: imul %r13d,%esi # esi = row*lda, lda in r13d
0x0000000004853a79 <+2409>: lea (%r8,%rsi,4),%rsi # row start to rsi, r8 is base of A
0x0000000004853a7d <+2413>: vbroadcastss (%r15,%rdx,4),%xmm2 # load xreg to xmm2 - r15 = localX base, tIDy in rdx here
0x0000000004853a83 <+2419>: mov (%rcx,%rax,4),%edx # column index to edx
0x0000000004853a86 <+2422>: vmulps %xmm1,%xmm2,%xmm1 # * of mad
=> 0x0000000004853a8a <+2426>: vaddps (%rsi,%rdx,4),%xmm1,%xmm1 # vload (out of bounds read - edx (col) is too big) and + of mad
0x0000000004853a8f <+2431>: vmovups %xmm1,(%rsi,%rdx,4) #vstore
0x0000000004853a94 <+2436>: add $0x1,%rax
0x0000000004853a98 <+2440>: cmp %rax,%r14 # r14 is group size ... is this the loop over workitems, with rax = local ID and (..,rax,4) = private variables? And if it is, where are the bounds checks? (...which is the bug...)
0x0000000004853a9b <+2443>: jne 0x4853a60 <_pocl_launcher_Sger_R_kernel+2384> # must have arrived from here
0x0000000004853a9d <+2445>: add $0x1,%r10
0x0000000004853aa1 <+2449>: mov 0xc8(%rbx),%rax
0x0000000004853aa8 <+2456>: add %rax,%rcx
0x0000000004853aab <+2459>: add %rax,%rdi
0x0000000004853aae <+2462>: add %rax,%r11
0x0000000004853ab1 <+2465>: add %rax,%r9
0x0000000004853ab4 <+2468>: mov $0x0,%eax
0x0000000004853ab9 <+2473>: cmp 0x70(%rbx),%r10
0x0000000004853abd <+2477>: jb 0x4853a40
<_pocl_launcher_Sger_R_kernel+2352>
0x0000000004853abf <+2479>: mov 0x10(%rbx),%rdi
0x0000000004853ac3 <+2483>: add $0x1,%rdi
0x0000000004853ac7 <+2487>: mov 0x20(%rbx),%rcx
--Type <RET> for more, q to quit, c to continue without paging--
0x0000000004853acb <+2491>: mov 0xf8(%rbx),%rax
0x0000000004853ad2 <+2498>: add %rax,%rcx
0x0000000004853ad5 <+2501>: mov 0x38(%rbx),%r10
0x0000000004853ad9 <+2505>: add %rax,%r10
0x0000000004853adc <+2508>: mov 0x30(%rbx),%rdx
0x0000000004853ae0 <+2512>: add %rax,%rdx
0x0000000004853ae3 <+2515>: mov 0x28(%rbx),%rsi
0x0000000004853ae7 <+2519>: add %rax,%rsi
0x0000000004853aea <+2522>: mov $0x0,%eax
0x0000000004853aef <+2527>: cmp 0x68(%rbx),%rdi
0x0000000004853af3 <+2531>: jb 0x4853a20
<_pocl_launcher_Sger_R_kernel+2320>
(gdb) info all-registers
rax 0x8 8 work item localID?
rbx 0xc3bd9c0 205248960 on stack?
rcx 0xc3bd4c0 205247680 on stack?
rdx 0x20 32 current column number
rsi 0xe3c1e00 238820864 current row start (=r8 so row 0)
rdi 0xc3bccc0 205245632 on stack?
rbp 0xc3bdb40 0xc3bdb40 on stack?
rsp 0xc3bc0c0 0xc3bc0c0 stack ptr
r8 0xe3c1e00 238820864 base of 128byte block A
r9 0xc3bc4c0 205243584 on stack?
r10 0x0 0
r11 0xc3bc8c0 205244608 on stack?
r12 0x11bc5100 297554176
r13 0x5 5 stride (in entries)
r14 0x100 256 workgroup size
r15 0x11bc5080 297554048 base of localX
rip 0x4853a8a 0x4853a8a
<_pocl_launcher_Sger_R_kernel+2426>
eflags 0x10 [ AF ]
cs 0x0 0
ss 0x0 0
ds 0x0 0
es 0x0 0
fs 0x0 0
--Type <RET> for more, q to quit, c to continue without paging--
gs 0x0 0
st0 0 (raw 0x00000000000000000000)
st1 0 (raw 0x00000000000000000000)
st2 0 (raw 0x00000000000000000000)
st3 0 (raw 0x00000000000000000000)
st4 0 (raw 0x00000000000000000000)
st5 0 (raw 0x00000000000000000000)
st6 0 (raw 0x00000000000000000000)
st7 0 (raw 0x00000000000000000000)
fctrl 0x37f 895
fstat 0x0 0
ftag 0xffff 65535
fiseg 0x0 0
fioff 0x0 0
foseg 0x0 0
fooff 0x0 0
fop 0x0 0
mxcsr 0x1f80 [ IM DM ZM OM UM PM ]
ymm0 {v8_float = {0xa, 0xa, 0xa, 0xa, 0x0, 0x0, 0x0, 0x0}, v4_double = {0x80000, 0x80000, 0x0, 0x0} # 10 = alpha or one-off-end of Y ; should be 4 successive entries of Y
ymm1 {v8_float = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
ymm2 {v8_float = {0xb, 0xb, 0xb, 0xb, 0x0, 0x0, 0x0, 0x0}, # 11 = first entry of X
This suggests:
- either the kernel has undefined behaviour in the bounds-check-hit case
(compilers often remove such code to make the defined case faster - see
http://blog.llvm.org/2011/05/what-every-c-programmer-should-know_14.html ),
though I can't see where that undefined behaviour would be
- or, there is a bug in pocl; as this kernel has multiple barriers,
possibly related to https://github.com/pocl/pocl/issues/553 and/or
https://github.com/pocl/pocl/issues/683
https://github.com/clMathLibraries/clBLAS/issues/108 was a vaguely
similar issue in the same kernel, but we already have the fix for that one.
More information about the debian-science-maintainers
mailing list