[Pkg-opencl-devel] [beignet] 51/66: Imported Upstream version 0.2+git20130816+cc09b26
Andreas Beckmann <anbe at moszumanska.debian.org>
Fri Oct 31 07:27:08 UTC 2014
This is an automated email from the git hooks/post-receive script.
anbe pushed a commit to branch master in repository beignet.
commit 1a4f8602d9416f95483cd941ecacabef0684e335
Author: Simon Richter <sjr at debian.org>
Date:   Fri Aug 16 12:07:35 2013 +0200

    Imported Upstream version 0.2+git20130816+cc09b26
---
backend/src/backend/gen_context.cpp | 508 ++++++++++++++++-----
backend/src/backend/gen_context.hpp | 9 +
backend/src/backend/gen_encoder.cpp | 6 -
.../src/backend/gen_insn_gen7_schedule_info.hxx | 5 +
backend/src/backend/gen_insn_scheduling.cpp | 21 +-
backend/src/backend/gen_insn_selection.cpp | 225 +++++++--
backend/src/backend/gen_insn_selection.hxx | 21 +-
backend/src/backend/gen_reg_allocation.cpp | 3 +-
backend/src/backend/gen_register.hpp | 7 +
backend/src/gen_builtin_vector.py | 4 +-
backend/src/ir/instruction.cpp | 4 +-
backend/src/ir/instruction.hpp | 2 +
backend/src/ir/instruction.hxx | 1 +
backend/src/llvm/llvm_gen_backend.cpp | 9 +
backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
backend/src/ocl_stdlib.tmpl.h | 167 ++++++-
kernels/builtin_acos_asin.cl | 10 +
kernels/compiler_async_copy.cl | 16 +
kernels/compiler_async_stride_copy.cl | 16 +
kernels/compiler_event.cl | 6 +
kernels/compiler_load_bool_imm.cl | 12 +
kernels/compiler_long_asr.cl | 7 +
kernels/compiler_long_cmp.cl | 29 ++
kernels/compiler_long_convert.cl | 7 +
kernels/compiler_long_mult.cl | 7 +
kernels/compiler_long_shl.cl | 7 +
kernels/compiler_long_shr.cl | 7 +
kernels/compiler_upsample_long.cl | 4 +
kernels/compiler_vector_load_store.cl | 6 +-
src/CMakeLists.txt | 1 +
src/cl_api.c | 456 +++++++++---------
src/cl_command_queue.c | 7 +
src/cl_command_queue.h | 3 +
src/cl_command_queue_gen7.c | 2 -
src/cl_context.h | 2 +
src/cl_driver.h | 31 ++
src/cl_driver_defs.c | 5 +
src/cl_enqueue.c | 336 ++++++++++++++
src/cl_enqueue.h | 63 +++
src/cl_event.c | 388 +++++++++++++++-
src/cl_event.h | 66 ++-
src/cl_internals.h | 1 +
src/cl_utils.h | 14 +-
src/intel/intel_gpgpu.c | 109 ++++-
utests/CMakeLists.txt | 12 +
utests/builtin_acos_asin.cpp | 87 ++++
utests/compiler_async_copy.cpp | 39 ++
utests/compiler_async_stride_copy.cpp | 45 ++
utests/compiler_load_bool_imm.cpp | 29 ++
utests/compiler_long_asr.cpp | 41 ++
utests/compiler_long_cmp.cpp | 117 +++++
utests/compiler_long_convert.cpp | 67 +++
utests/compiler_long_mult.cpp | 49 ++
utests/compiler_long_shl.cpp | 41 ++
utests/compiler_long_shr.cpp | 41 ++
utests/compiler_smoothstep.cpp | 2 +-
utests/compiler_upsample_long.cpp | 38 ++
utests/compiler_vector_load_store.cpp | 4 +-
utests/runtime_event.cpp | 61 +++
utests/utest_helper.hpp | 10 +-
60 files changed, 2873 insertions(+), 421 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 621e7be..1a012fe 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -151,13 +151,146 @@ namespace gbe
}
}
+ void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction &insn) {
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister src = ra->genReg(insn.src(0));
+ GenRegister tmp = ra->genReg(insn.dst(1));
+ switch (insn.opcode) {
+ case SEL_OP_LOAD_DF_IMM:
+ p->LOAD_DF_IMM(dst, tmp, src.value.df);
+ break;
+ case SEL_OP_MOV_DF:
+ p->MOV_DF(dst, src, tmp);
+ break;
+ case SEL_OP_CONVI_TO_I64: {
+ GenRegister middle;
+ if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
+ middle = tmp;
+ middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
+ p->MOV(middle, src);
+ } else {
+ middle = src;
+ }
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->MOV(dst.bottom_half(), middle);
+ if(middle.is_signed_int())
+ p->ASR(dst.top_half(), middle, GenRegister::immud(31));
+ else
+ p->MOV(dst.top_half(), GenRegister::immd(0));
+ dst = GenRegister::suboffset(dst, 4);
+ middle = GenRegister::suboffset(middle, 4);
+ }
+ p->pop();
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction &insn) {
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister tmp = ra->genReg(insn.dst(1));
+ switch (insn.opcode) {
+ case SEL_OP_I64ADD: {
+ GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
+ y = GenRegister::suboffset(x, p->curr.execWidth);
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ addWithCarry(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ p->ADD(x, x, y);
+ loadTopHalf(y, src1);
+ p->ADD(x, x, y);
+ storeTopHalf(dst, x);
+ break;
+ }
+ case SEL_OP_I64SUB: {
+ GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
+ y = GenRegister::suboffset(x, p->curr.execWidth);
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ subWithBorrow(x, x, y);
+ loadTopHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeTopHalf(dst, x);
+ break;
+ }
+ case SEL_OP_MUL_HI: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+ p->curr.accWrEnable = 1;
+ p->MACH(tmp, src0, src1);
+ p->pop();
+ p->curr.quarterControl = i;
+ p->MOV(dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ case SEL_OP_HADD: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->curr.quarterControl = i;
+ p->ADDC(dst, src0, src1);
+ p->SHR(dst, dst, GenRegister::immud(1));
+ p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+ p->OR(dst, dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ case SEL_OP_RHADD: {
+ int w = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int i = 0; i < w / 8; i ++) {
+ p->curr.quarterControl = i;
+ p->ADDC(dst, src0, src1);
+ p->ADD(dst, dst, GenRegister::immud(1));
+ p->SHR(dst, dst, GenRegister::immud(1));
+ p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+ p->OR(dst, dst, tmp);
+ dst = GenRegister::Qn(dst, 1);
+ src0 = GenRegister::Qn(src0, 1);
+ src1 = GenRegister::Qn(src1, 1);
+ }
+ p->pop();
+ break;
+ }
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
switch (insn.opcode) {
- case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
- case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
case SEL_OP_SEL_INT64:
{
@@ -248,10 +381,219 @@ namespace gbe
case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
+ case SEL_OP_UPSAMPLE_LONG:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->MOV(xdst.top_half(), xsrc0.bottom_half());
+ p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
+ xdst = GenRegister::suboffset(xdst, 4);
+ xsrc0 = GenRegister::suboffset(xsrc0, 4);
+ xsrc1 = GenRegister::suboffset(xsrc1, 4);
+ }
+ p->pop();
+ }
+ break;
default: NOT_IMPLEMENTED;
}
}
+ void GenContext::collectShifter(GenRegister dest, GenRegister src) {
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->AND(dest, src.bottom_half(), GenRegister::immud(63));
+ dest = GenRegister::suboffset(dest, 4);
+ src = GenRegister::suboffset(src, 4);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ GenRegister zero = GenRegister::immud(0);
+ switch(insn.opcode) {
+ case SEL_OP_I64SHL:
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHR(b, e, GenRegister::negate(a));
+ p->SHL(c, e, a);
+ p->SHL(d, f, a);
+ p->OR(e, d, b);
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, zero);
+ p->pop();
+ storeBottomHalf(dest, c);
+ storeTopHalf(dest, d);
+ break;
+ case SEL_OP_I64SHR:
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHL(b, f, GenRegister::negate(a));
+ p->SHR(c, f, a);
+ p->SHR(d, e, a);
+ p->OR(e, d, b);
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, zero);
+ p->pop();
+ storeBottomHalf(dest, d);
+ storeTopHalf(dest, c);
+ break;
+ case SEL_OP_I64ASR:
+ f.type = GEN_TYPE_D;
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ collectShifter(a, y);
+ loadBottomHalf(e, x);
+ loadTopHalf(f, x);
+ p->SHL(b, f, GenRegister::negate(a));
+ p->ASR(c, f, a);
+ p->SHR(d, e, a);
+ p->OR(e, d, b);
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, e);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->AND(a, a, GenRegister::immud(32));
+ p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->SEL(d, d, c);
+ p->SEL(c, c, GenRegister::immd(-1));
+ p->pop();
+ storeBottomHalf(dest, d);
+ storeTopHalf(dest, c);
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
+
+ void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
+ p->push();
+ p->curr.execWidth = 1;
+ p->MOV(dest, GenRegister::flag(flag, subFlag));
+ p->pop();
+ }
+
+ void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister tmp0 = ra->genReg(insn.dst(0));
+ GenRegister tmp1 = ra->genReg(insn.dst(1));
+ GenRegister tmp2 = ra->genReg(insn.dst(2));
+ tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+ tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+ int flag = p->curr.flag, subFlag = p->curr.subFlag;
+ GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
+ f2 = GenRegister::suboffset(f1, 1),
+ f3 = GenRegister::suboffset(f1, 2);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.flag = 0, p->curr.subFlag = 1;
+ loadTopHalf(tmp0, src0);
+ loadTopHalf(tmp1, src1);
+ switch(insn.extra.function) {
+ case GEN_CONDITIONAL_L:
+ case GEN_CONDITIONAL_LE:
+ case GEN_CONDITIONAL_G:
+ case GEN_CONDITIONAL_GE:
+ {
+ int cmpTopHalf = insn.extra.function;
+ if(insn.extra.function == GEN_CONDITIONAL_LE)
+ cmpTopHalf = GEN_CONDITIONAL_L;
+ if(insn.extra.function == GEN_CONDITIONAL_GE)
+ cmpTopHalf = GEN_CONDITIONAL_G;
+ p->CMP(cmpTopHalf, tmp0, tmp1);
+ }
+ saveFlag(f1, 0, 1);
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f2, 0, 1);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(insn.extra.function, tmp0, tmp1);
+ saveFlag(f3, 0, 1);
+ p->AND(f2, f2, f3);
+ p->OR(f1, f1, f2);
+ break;
+ case GEN_CONDITIONAL_EQ:
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f1, 0, 1);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+ saveFlag(f2, 0, 1);
+ p->AND(f1, f1, f2);
+ break;
+ case GEN_CONDITIONAL_NEQ:
+ p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+ saveFlag(f1, 0, 1);
+ tmp0.type = tmp1.type = GEN_TYPE_UD;
+ loadBottomHalf(tmp0, src0);
+ loadBottomHalf(tmp1, src1);
+ p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+ saveFlag(f2, 0, 1);
+ p->OR(f1, f1, f2);
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ saveFlag(f2, flag, subFlag);
+ p->AND(f1, f1, f2);
+ p->MOV(GenRegister::flag(flag, subFlag), f1);
+ p->pop();
+ }
+
void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
int execWidth = p->curr.execWidth;
src = src.top_half();
@@ -352,113 +694,59 @@ namespace gbe
p->pop();
}
+ void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+ GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for(int i = 0; i < execWidth; i += 8) {
+ p->MUL(acc, src0, src1);
+ p->curr.accWrEnable = 1;
+ p->MACH(high, src0, src1);
+ p->curr.accWrEnable = 0;
+ p->MOV(low, acc);
+ src0 = GenRegister::suboffset(src0, 8);
+ src1 = GenRegister::suboffset(src1, 8);
+ high = GenRegister::suboffset(high, 8);
+ low = GenRegister::suboffset(low, 8);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ I32FullMult(GenRegister::null(), e, b, c);
+ I32FullMult(GenRegister::null(), f, a, d);
+ p->ADD(e, e, f);
+ I32FullMult(f, a, b, d);
+ p->ADD(e, e, f);
+ p->pop();
+ storeTopHalf(dest, e);
+ storeBottomHalf(dest, a);
+ }
+
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
const GenRegister src2 = ra->genReg(insn.src(2));
switch (insn.opcode) {
- case SEL_OP_I64ADD:
- {
- GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
- y = GenRegister::suboffset(x, p->curr.execWidth);
- loadBottomHalf(x, src0);
- loadBottomHalf(y, src1);
- addWithCarry(x, x, y);
- storeBottomHalf(dst, x);
- loadTopHalf(x, src0);
- p->ADD(x, x, y);
- loadTopHalf(y, src1);
- p->ADD(x, x, y);
- storeTopHalf(dst, x);
- }
- break;
- case SEL_OP_I64SUB:
- {
- GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
- y = GenRegister::suboffset(x, p->curr.execWidth);
- loadBottomHalf(x, src0);
- loadBottomHalf(y, src1);
- subWithBorrow(x, x, y);
- storeBottomHalf(dst, x);
- loadTopHalf(x, src0);
- subWithBorrow(x, x, y);
- loadTopHalf(y, src1);
- subWithBorrow(x, x, y);
- storeTopHalf(dst, x);
- }
- break;
- case SEL_OP_MUL_HI:
- {
- int w = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = 0;
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
- p->curr.accWrEnable = 1;
- p->MACH(src2, src0, src1);
- p->curr.accWrEnable = 0;
- p->pop();
- p->MOV(dst, src2);
- if (w == 16) {
- p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
- p->curr.accWrEnable = 1;
- p->MACH(src2, GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
- p->curr.accWrEnable = 0;
- p->pop();
- p->curr.quarterControl = 1;
- p->MOV(GenRegister::Qn(dst, 1), src2);
- }
- p->pop();
- break;
- }
case SEL_OP_MAD: p->MAD(dst, src0, src1, src2); break;
- case SEL_OP_HADD:
- {
- int w = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = 0;
- p->ADDC(dst, src0, src1);
- p->SHR(dst, dst, GenRegister::immud(1));
- p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
- p->OR(dst, dst, src2);
- if (w == 16) {
- p->curr.quarterControl = 1;
- p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
- p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
- p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
- p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
- }
- p->pop();
- break;
- }
- case SEL_OP_RHADD:
- {
- int w = p->curr.execWidth;
- p->push();
- p->curr.execWidth = 8;
- p->curr.quarterControl = 0;
- p->ADDC(dst, src0, src1);
- p->ADD(dst, dst, GenRegister::immud(1));
- p->SHR(dst, dst, GenRegister::immud(1));
- p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
- p->OR(dst, dst, src2);
- if (w == 16) {
- p->curr.quarterControl = 1;
- p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
- p->ADD(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
- p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
- p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
- p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
- }
- p->pop();
- break;
- }
default: NOT_IMPLEMENTED;
}
}
@@ -553,11 +841,12 @@ namespace gbe
}
void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
+ p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.execWidth = 8;
p->curr.noMask = 1;
- p->EOT(0);
+ p->EOT(112);
p->pop();
}
@@ -601,10 +890,10 @@ namespace gbe
void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
const uint32_t elemNum = insn.extra.elem;
const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
- const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
- const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister tempAddr = ra->genReg(insn.dst(0));
+ const GenRegister dst = ra->genReg(insn.dst(tmpRegSize + 1));
+ const GenRegister tmp = ra->genReg(insn.dst(1));
const GenRegister src = ra->genReg(insn.src(0));
- const GenRegister tempAddr = ra->genReg(insn.src(1));
const uint32_t bti = insn.extra.function;
p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
}
@@ -621,11 +910,12 @@ namespace gbe
// then follow the real destination registers.
// For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
- const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister src = ra->genReg(insn.dst(0));
const uint32_t elemNum = insn.extra.elem;
- const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
- const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+ const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
+ const GenRegister data = ra->genReg(insn.src(1));
const uint32_t bti = insn.extra.function;
+ p->MOV(src, addr);
p->WRITE64(src, data, bti, elemNum);
}
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 694ae98..8b481d0 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -76,6 +76,7 @@ namespace gbe
return this->liveness->getLiveOut(bb);
}
+ void collectShifter(GenRegister dest, GenRegister src);
void loadTopHalf(GenRegister dest, GenRegister src);
void storeTopHalf(GenRegister dest, GenRegister src);
@@ -84,12 +85,18 @@ namespace gbe
void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+ void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+ void saveFlag(GenRegister dest, int flag, int subFlag);
/*! Final Gen ISA emission helper functions */
void emitLabelInstruction(const SelectionInstruction &insn);
void emitUnaryInstruction(const SelectionInstruction &insn);
+ void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
void emitBinaryInstruction(const SelectionInstruction &insn);
+ void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
void emitTernaryInstruction(const SelectionInstruction &insn);
+ void emitI64ShiftInstruction(const SelectionInstruction &insn);
+ void emitI64CompareInstruction(const SelectionInstruction &insn);
void emitCompareInstruction(const SelectionInstruction &insn);
void emitJumpInstruction(const SelectionInstruction &insn);
void emitIndirectMoveInstruction(const SelectionInstruction &insn);
@@ -111,8 +118,10 @@ namespace gbe
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+ void emitI64MULInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 4d6aa34..1a459e1 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -412,12 +412,6 @@ namespace gbe
curr.noMask = originMask;
this->UNTYPED_WRITE(msg, bti, elemNum);
}
- /* Let's restore the original message(addr) register. */
- /* XXX could be optimized if we don't allocate the address to the header
- position of the message. */
- curr.predicate = GEN_PREDICATE_NONE;
- curr.noMask = GEN_MASK_DISABLE;
- ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4));
pop();
}
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index da8f2a2..2204837 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,9 +1,13 @@
// Family Latency SIMD16 SIMD8
DECL_GEN7_SCHEDULE(Label, 0, 0, 0)
DECL_GEN7_SCHEDULE(Unary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp, 20, 4, 2)
DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp, 20, 4, 2)
DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64Shift, 20, 4, 2)
DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64Compare, 20, 4, 2)
DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
DECL_GEN7_SCHEDULE(IndirectMove, 20, 2, 2)
DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
@@ -24,3 +28,4 @@ DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
+DECL_GEN7_SCHEDULE(I64MUL, 20, 4, 2)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 0b720b7..f1f5775 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -165,11 +165,15 @@ namespace gbe
}
return false;
}
+ /*! Owns the tracker */
+ SelectionScheduler &scheduler;
/*! Add a new dependency "node0 depends on node set for register reg" */
- INLINE void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
+ INLINE void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
if (this->ignoreDependency(reg) == false) {
const uint32_t index = this->getIndex(reg);
this->addDependency(node0, index);
+ if (reg.isdf() || reg.isint64())
+ this->addDependency(node0, index + 1);
}
}
/*! Add a new dependency "node set for register reg depends on node0" */
@@ -177,6 +181,8 @@ namespace gbe
if (this->ignoreDependency(reg) == false) {
const uint32_t index = this->getIndex(reg);
this->addDependency(index, node0);
+ if (reg.isdf() || reg.isint64())
+ this->addDependency(index + 1, node0);
}
}
/*! Make the node located at insnID a barrier */
@@ -187,8 +193,6 @@ namespace gbe
static const uint32_t MAX_FLAG_REGISTER = 8u;
/*! Maximum number of *physical* accumulators registers */
static const uint32_t MAX_ACC_REGISTER = 1u;
- /*! Owns the tracker */
- SelectionScheduler &scheduler;
/*! Stores the last node that wrote to a register / memory ... */
vector<ScheduleDAGNode*> nodes;
/*! Stores the nodes per instruction */
@@ -328,11 +332,13 @@ namespace gbe
if (this->ignoreDependency(dst) == false) {
const uint32_t index = this->getIndex(dst);
this->nodes[index] = node;
+ if (dst.isdf() || dst.isint64())
+ this->nodes[index + 1] = node;
}
}
// Track writes in predicates
- if (insn.opcode == SEL_OP_CMP) {
+ if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
const uint32_t index = this->getIndex(getFlag(insn));
this->nodes[index] = node;
}
@@ -454,7 +460,7 @@ namespace gbe
tracker.addDependency(node, insn.dst(dstID));
// write-after-write for predicate
- if (insn.opcode == SEL_OP_CMP)
+ if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP)
tracker.addDependency(node, getFlag(insn));
// write-after-write for accumulators
@@ -467,6 +473,11 @@ namespace gbe
tracker.addDependency(node, index);
}
+ // write-after-write in scratch memory
+ if (insn.opcode == SEL_OP_SPILL_REG) {
+ const uint32_t index = tracker.getIndex(0xff);
+ tracker.addDependency(node, index);
+ }
// Consider barriers and wait are writing memory (local and global)
if (insn.opcode == SEL_OP_BARRIER ||
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1a3af68..8e4cd8f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -410,13 +410,19 @@ namespace gbe
#define ALU1(OP) \
INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
+#define ALU1WithTemp(OP) \
+ INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
#define ALU2(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
+#define ALU2WithTemp(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
#define ALU3(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
+#define I64Shift(OP) \
+ INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
ALU1(MOV)
- ALU2(MOV_DF)
- ALU2(LOAD_DF_IMM)
+ ALU1WithTemp(MOV_DF)
+ ALU1WithTemp(LOAD_DF_IMM)
ALU1(LOAD_INT64_IMM)
ALU1(RNDZ)
ALU1(RNDE)
@@ -435,8 +441,8 @@ namespace gbe
ALU2(RSL)
ALU2(ASR)
ALU2(ADD)
- ALU3(I64ADD)
- ALU3(I64SUB)
+ ALU2WithTemp(I64ADD)
+ ALU2WithTemp(I64SUB)
ALU2(MUL)
ALU1(FRC)
ALU1(RNDD)
@@ -444,16 +450,28 @@ namespace gbe
ALU2(MACH)
ALU1(LZD)
ALU3(MAD)
- ALU3(MUL_HI)
+ ALU2WithTemp(MUL_HI)
ALU1(FBH)
ALU1(FBL)
- ALU3(HADD)
- ALU3(RHADD)
+ ALU2WithTemp(HADD)
+ ALU2WithTemp(RHADD)
ALU2(UPSAMPLE_SHORT)
ALU2(UPSAMPLE_INT)
+ ALU2(UPSAMPLE_LONG)
+ ALU1WithTemp(CONVI_TO_I64)
+ I64Shift(I64SHL)
+ I64Shift(I64SHR)
+ I64Shift(I64ASR)
#undef ALU1
+#undef ALU1WithTemp
#undef ALU2
+#undef ALU2WithTemp
#undef ALU3
+#undef I64Shift
+ /*! Shift a 64-bit integer */
+ void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+ /*! Compare 64-bit integer */
+ void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
/*! Encode a barrier instruction */
void BARRIER(GenRegister src);
/*! Encode a barrier instruction */
@@ -479,7 +497,7 @@ namespace gbe
/*! Read 64 bits float/int array */
void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
/*! Write 64 bits float/int array */
- void WRITE64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+ void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
@@ -494,8 +512,12 @@ namespace gbe
void MATH(Reg dst, uint32_t function, Reg src);
/*! Encode unary instructions */
void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
+ /*! Encode unary with temp reg instructions */
+ void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
/*! Encode binary instructions */
void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
+ /*! Encode binary with temp reg instructions */
+ void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
/*! Encode ternary instructions */
void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
/*! Encode sample instructions */
@@ -504,6 +526,8 @@ namespace gbe
void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
/*! Get image information */
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+ /*! Multiply 64-bit integers */
+ void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
@@ -629,12 +653,15 @@ namespace gbe
for (auto &block : blockList)
for (auto &insn : block.insnList) {
+ // spill / unspill insn should be skipped when do spilling
+ if(insn.opcode == SEL_OP_SPILL_REG || insn.opcode == SEL_OP_UNSPILL_REG) continue;
+
const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const GenRegister selReg = insn.src(srcID);
const ir::Register reg = selReg.reg();
- if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+ if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
GBE_ASSERT(srcID < 5);
SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
unspill->state = GenInstructionState(simdWidth);
@@ -653,7 +680,7 @@ namespace gbe
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const GenRegister selReg = insn.dst(dstID);
const ir::Register reg = selReg.reg();
- if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+ if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
GBE_ASSERT(dstID < 5);
SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
spill->state = GenInstructionState(simdWidth);
@@ -825,28 +852,29 @@ namespace gbe
/* elemNum contains all the temporary register and the
real destination registers.*/
void Selection::Opaque::READ64(Reg addr,
- Reg tempAddr,
- const GenRegister *dst,
- uint32_t elemNum,
- uint32_t valueNum,
- uint32_t bti)
+ Reg tempAddr,
+ const GenRegister *dst,
+ uint32_t elemNum,
+ uint32_t valueNum,
+ uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 2);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
+ /* temporary addr register is to be modified, set it to dst registers.*/
+ insn->dst(0) = tempAddr;
// Regular instruction to encode
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- insn->dst(elemID) = dst[elemID];
+ insn->dst(elemID + 1) = dst[elemID];
insn->src(0) = addr;
- insn->src(1) = tempAddr;
insn->extra.function = bti;
insn->extra.elem = valueNum;
// Only the temporary registers need contiguous allocation
dstVector->regNum = elemNum - valueNum;
dstVector->isSrc = 0;
- dstVector->reg = &insn->dst(0);
+ dstVector->reg = &insn->dst(1);
// Source cannot be scalar (yet)
srcVector->regNum = 1;
@@ -883,24 +911,27 @@ namespace gbe
/* elemNum contains all the temporary register and the
real data registers.*/
void Selection::Opaque::WRITE64(Reg addr,
- const GenRegister *src,
- uint32_t elemNum,
- uint32_t valueNum,
- uint32_t bti)
+ const GenRegister *src,
+ uint32_t srcNum,
+ const GenRegister *dst,
+ uint32_t dstNum,
+ uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, elemNum+1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
SelectionVector *vector = this->appendVector();
// Regular instruction to encode
insn->src(0) = addr;
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- insn->src(elemID+1) = src[elemID];
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(elemID + 1) = src[elemID];
+ for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
insn->extra.function = bti;
- insn->extra.elem = valueNum;
+ insn->extra.elem = srcNum;
// Only the addr + temporary registers need to be contiguous.
- vector->regNum = (elemNum - valueNum) + 1;
- vector->reg = &insn->src(0);
+ vector->regNum = dstNum;
+ vector->reg = &insn->dst(0);
vector->isSrc = 1;
}
@@ -977,12 +1008,28 @@ namespace gbe
insn->extra.function = function;
}
+ void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 6; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
insn->dst(0) = dst;
insn->src(0) = src;
}
+ void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 2, 1);
+ insn->dst(0) = dst;
+ insn->src(0) = src;
+ insn->dst(1) = temp;
+ }
+
void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 2);
insn->dst(0) = dst;
@@ -990,6 +1037,14 @@ namespace gbe
insn->src(1) = src1;
}
+ void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 2, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ insn->dst(1) = temp;
+ }
+
void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 3);
insn->dst(0) = dst;
@@ -998,6 +1053,24 @@ namespace gbe
insn->src(2) = src2;
}
+ void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i=0; i<3; i++)
+ insn->dst(i) = tmp[i];
+ insn->extra.function = conditional;
+ }
+
+ void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 6; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
// Boiler plate to initialize the selection library at c++ pre-main
static SelectionLibrary *selLib = NULL;
static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
@@ -1524,9 +1597,33 @@ namespace gbe
sel.ADD(dst, src0, GenRegister::negate(src1));
sel.pop();
break;
- case OP_SHL: sel.SHL(dst, src0, src1); break;
- case OP_SHR: sel.SHR(dst, src0, src1); break;
- case OP_ASR: sel.ASR(dst, src0, src1); break;
+ case OP_SHL:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64SHL(dst, src0, src1, tmp);
+ } else
+ sel.SHL(dst, src0, src1);
+ break;
+ case OP_SHR:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64SHR(dst, src0, src1, tmp);
+ } else
+ sel.SHR(dst, src0, src1);
+ break;
+ case OP_ASR:
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i ++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64ASR(dst, src0, src1, tmp);
+ } else
+ sel.ASR(dst, src0, src1);
+ break;
case OP_MUL_HI: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
sel.MUL_HI(dst, src0, src1, temp);
@@ -1536,12 +1633,14 @@ namespace gbe
if (type == TYPE_U32 || type == TYPE_S32) {
sel.pop();
return false;
- }
- else {
- GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
+ } else if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64MUL(dst, src0, src1, tmp);
+ } else
sel.MUL(dst, src0, src1);
- }
- break;
+ break;
case OP_HADD: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
sel.HADD(dst, src0, src1, temp);
@@ -1558,6 +1657,9 @@ namespace gbe
case OP_UPSAMPLE_INT:
sel.UPSAMPLE_INT(dst, src0, src1);
break;
+ case OP_UPSAMPLE_LONG:
+ sel.UPSAMPLE_LONG(dst, src0, src1);
+ break;
default: NOT_IMPLEMENTED;
}
sel.pop();
@@ -1666,7 +1768,6 @@ namespace gbe
sel.push();
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.execWidth = simdWidth;
- sel.curr.physicalFlag = 0;
sel.SEL_CMP(genCmp, tmp, src0, src1);
sel.pop();
@@ -1861,15 +1962,32 @@ namespace gbe
const Type type = insn.getType();
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
+ GenRegister flagReg;
sel.push();
if (sel.isScalarOrBool(insn.getDst(0)) == true) {
sel.curr.execWidth = 1;
+ if(type == TYPE_BOOL) {
+ if(imm.data.b) {
+ if(sel.curr.predicate == GEN_PREDICATE_NONE)
+ flagReg = GenRegister::immuw(0xffff);
+ else {
+ if(sel.curr.physicalFlag)
+ flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
+ else
+ flagReg = sel.selReg(Register(sel.curr.flagIndex), TYPE_U16);
+ }
+ } else
+ flagReg = GenRegister::immuw(0x0);
+ }
sel.curr.predicate = GEN_PREDICATE_NONE;
sel.curr.noMask = 1;
}
switch (type) {
+ case TYPE_BOOL:
+ sel.MOV(dst, flagReg);
+ break;
case TYPE_U32:
case TYPE_S32:
case TYPE_FLOAT:
@@ -1985,7 +2103,7 @@ namespace gbe
dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
dst[dstID] = sel.selReg(insn.getValue(valueID));
- sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti);
+ sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
}
void emitByteGather(Selection::Opaque &sel,
@@ -2085,13 +2203,16 @@ namespace gbe
addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
// The first 16 DWORD register space is for temporary usage at encode stage.
uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
- GenRegister src[valueNum + tmpRegNum];
+ GenRegister src[valueNum];
+ GenRegister dst[tmpRegNum + 1];
+ /* dst 0 is for the temporary address register. */
+ dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
for (srcID = 0; srcID < tmpRegNum; ++srcID)
- src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+ dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
- for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
- src[srcID] = sel.selReg(insn.getValue(valueID));
- sel.WRITE64(addr, src, valueNum + tmpRegNum, valueNum, bti);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ src[valueID] = sel.selReg(insn.getValue(valueID));
+ sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
}
void emitByteScatter(Selection::Opaque &sel,
@@ -2187,7 +2308,13 @@ namespace gbe
sel.push();
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(dst);
- sel.CMP(genCmp, src0, src1);
+ if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[3];
+ for(int i=0; i<3; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64CMP(genCmp, src0, src1, tmp);
+ } else
+ sel.CMP(genCmp, src0, src1);
sel.pop();
return true;
}
@@ -2223,6 +2350,14 @@ namespace gbe
} else if (dst.isdf()) {
ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
sel.MOV_DF(dst, src, sel.selReg(r));
+ } else if (dst.isint64()) {
+ switch(src.type) {
+ case GEN_TYPE_F:
+ case GEN_TYPE_DF:
+ NOT_IMPLEMENTED;
+ default:
+ sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+ }
} else
sel.MOV(dst, src);
return true;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index eeca9af..32c7a05 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,7 +1,7 @@
DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
-DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
-DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
+DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
@@ -23,13 +23,18 @@ DECL_SELECTION_IR(SHL, BinaryInstruction)
DECL_SELECTION_IR(RSR, BinaryInstruction)
DECL_SELECTION_IR(RSL, BinaryInstruction)
DECL_SELECTION_IR(ASR, BinaryInstruction)
+DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
+DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
+DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
DECL_SELECTION_IR(ADD, BinaryInstruction)
-DECL_SELECTION_IR(I64ADD, TernaryInstruction)
-DECL_SELECTION_IR(I64SUB, TernaryInstruction)
+DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
DECL_SELECTION_IR(CMP, CompareInstruction)
+DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
DECL_SELECTION_IR(MAD, TernaryInstruction)
DECL_SELECTION_IR(JMPI, JumpInstruction)
@@ -51,10 +56,12 @@ DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
-DECL_SELECTION_IR(MUL_HI, TernaryInstruction)
+DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
DECL_SELECTION_IR(FBH, UnaryInstruction)
DECL_SELECTION_IR(FBL, UnaryInstruction)
-DECL_SELECTION_IR(HADD, TernaryInstruction)
-DECL_SELECTION_IR(RHADD, TernaryInstruction)
+DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index ccbc0da..a765917 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -136,6 +136,7 @@ namespace gbe
const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset;
RA.insert(std::make_pair(reg, offset));
this->intervals[reg].minID = 0;
+ this->intervals[reg].maxID = 0;
}
}
@@ -456,7 +457,7 @@ namespace gbe
// Compare instructions update the flags so we must copy it back to
// the GRF
- if (insn.opcode == SEL_OP_CMP) {
+ if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1);
mov1->state = mov0->state;
mov1->dst(0) = mov0->src(0);
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 2cad4c0..ea1bc06 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -270,9 +270,16 @@ namespace gbe
GenRegister r = *this;
r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
r.hstride = GEN_HORIZONTAL_STRIDE_2;
+ r.vstride = GEN_VERTICAL_STRIDE_16;
return r;
}
+ INLINE bool is_signed_int(void) const {
+ if ((type == GEN_TYPE_B || type == GEN_TYPE_W || type == GEN_TYPE_D || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
INLINE bool isdf(void) const {
if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
return true;
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index b073682..35e3a2a 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -260,9 +260,7 @@ class builtinProto():
# XXX FIXME now skip all double vector, as we don't
# defined those scalar version's prototype.
if ptype[0].find('double') != -1 or \
- vtype[0].find('double') != -1 or \
- ptype[0].find('long') != -1 or \
- vtype[0].find('long') != -1 :
+ vtype[0].find('double') != -1:
return
if (n == 0):
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 45095db..48e83b4 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -861,7 +861,8 @@ namespace ir {
return false;
if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
return false;
- CHECK_TYPE(this->type, allButBool);
+ //Support all type IMM, disable check
+ //CHECK_TYPE(this->type, allButBool);
return true;
}
@@ -1334,6 +1335,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
DECL_EMIT_FUNCTION(MUL_HI)
DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
DECL_EMIT_FUNCTION(UPSAMPLE_INT)
+ DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
DECL_EMIT_FUNCTION(DIV)
DECL_EMIT_FUNCTION(REM)
DECL_EMIT_FUNCTION(SHL)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 48e6963..40a3d40 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -523,6 +523,8 @@ namespace ir {
Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
/*! upsample_int.type dst src */
Instruction UPSAMPLE_INT(Type type, Register dst, Register src0, Register src1);
+ /*! upsample_long.type dst src */
+ Instruction UPSAMPLE_LONG(Type type, Register dst, Register src0, Register src1);
/*! fbh.type dst src */
Instruction FBH(Type type, Register dst, Register src);
/*! fbl.type dst src */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b9f0e73..c15e912 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -79,3 +79,4 @@ DECL_INSN(HADD, BinaryInstruction)
DECL_INSN(RHADD, BinaryInstruction)
DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 18448cf..12d809d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1827,6 +1827,7 @@ namespace gbe
case GEN_OCL_MUL_HI_UINT:
case GEN_OCL_UPSAMPLE_SHORT:
case GEN_OCL_UPSAMPLE_INT:
+ case GEN_OCL_UPSAMPLE_LONG:
case GEN_OCL_SADD_SAT_CHAR:
case GEN_OCL_SADD_SAT_SHORT:
case GEN_OCL_SADD_SAT_INT:
@@ -2223,6 +2224,14 @@ namespace gbe
ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_UPSAMPLE_LONG:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.UPSAMPLE_LONG(getType(ctx, I.getType()), dst, src0, src1);
+ break;
+ }
case GEN_OCL_SADD_SAT_CHAR:
case GEN_OCL_SADD_SAT_SHORT:
case GEN_OCL_SADD_SAT_INT:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8e940bc..b712860 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -135,3 +135,4 @@ DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 84f15ca..8d4220c 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -123,6 +123,7 @@ typedef size_t __event_t;
#define FLT_MIN_10_EXP -37
#define FLT_MIN_EXP -125
#define FLT_RADIX 2
+#define FLT_ONE 1.0000000000e+00 /* 0x3F800000 */
#define FLT_MAX 0x1.fffffep127f
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
@@ -310,6 +311,14 @@ INLINE_OVERLOADABLE uint clz(uint x) {
return __gen_ocl_fbh(x);
}
+INLINE_OVERLOADABLE long clz(long x) {
+ return 0;
+}
+
+INLINE_OVERLOADABLE ulong clz(ulong x) {
+ return 0;
+}
+
OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
@@ -318,6 +327,12 @@ INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE long mul_hi(long x, long y) {
+ return 0;
+}
+INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
+ return 0;
+}
#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
DEF(char)
@@ -326,6 +341,8 @@ DEF(short)
DEF(ushort)
DEF(int)
DEF(uint)
+DEF(long)
+DEF(ulong)
#undef DEF
INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
@@ -366,12 +383,27 @@ INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
return x;
}
-/* XXX not implemented. */
INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
- return 0;
+ long x = (long)a * (long)b + (long)c;
+ if (x > 0x7FFFFFFF)
+ x = 0x7FFFFFFF;
+ else if (x < -0x7FFFFFFF-1)
+ x = -0x7FFFFFFF-1;
+ return (int)x;
}
INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+ ulong x = (ulong)a * (ulong)b + (ulong)c;
+ if (x > 0xFFFFFFFFu)
+ x = 0xFFFFFFFFu;
+ return (uint)x;
+}
+
+INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
+ return 0;
+}
+
+INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
return 0;
}
@@ -389,13 +421,26 @@ DEF(ushort, 15)
DEF(int, 31)
DEF(uint, 31)
#undef DEF
+INLINE_OVERLOADABLE long rotate(long x, long y) {
+ return 0;
+}
+INLINE_OVERLOADABLE ulong rotate(ulong x, ulong y) {
+ return 0;
+}
OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE long upsample(int hi, uint lo) {
+ return __gen_ocl_upsample((long)hi, (long)lo);
+}
+INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
+ return __gen_ocl_upsample((long)hi, (long)lo);
+}
PURE CONST uint __gen_ocl_hadd(uint x, uint y);
PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
@@ -411,6 +456,18 @@ INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 &
INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+INLINE_OVERLOADABLE long hadd(long x, long y) {
+ return 0;
+}
+INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
+ return 0;
+}
+INLINE_OVERLOADABLE long rhadd(long x, long y) {
+ return 0;
+}
+INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
+ return 0;
+}
int __gen_ocl_abs(int x);
#define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
@@ -418,11 +475,13 @@ DEC(int)
DEC(short)
DEC(char)
#undef DEC
+INLINE_OVERLOADABLE ulong abs(long x) { return x < 0 ? -x : x; }
/* For unsigned types, do nothing. */
#define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
DEC(uint)
DEC(ushort)
DEC(uchar)
+DEC(ulong)
#undef DEC
/* Char and short type abs diff */
@@ -448,6 +507,13 @@ INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
return (abs(x) + abs(y));
}
+INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
+ return 0;
+}
+INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
+ return 0;
+}
+
/////////////////////////////////////////////////////////////////////////////
// Work Items functions (see 6.11.1 of OCL 1.1 spec)
/////////////////////////////////////////////////////////////////////////////
@@ -561,7 +627,36 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
float y = native_exp(-2 * x);
return (1 - y) / (1 + y);
}
+
+typedef union
+{
+ float value;
+ int word;
+} ieee_float_shape_type;
+
+#ifndef GET_FLOAT_WORD
+#define GET_FLOAT_WORD(i,d) \
+do { \
+ ieee_float_shape_type gf_u; \
+ gf_u.value = (d); \
+ (i) = gf_u.word; \
+} while (0)
+#endif
+
INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+ int hx, ix;
+ GET_FLOAT_WORD(hx,x);
+ ix = hx&0x7fffffff;
+ if(ix == 0x3f800000) {
+ return x * M_PI_2_F; /* asin(|1|)=+-pi/2 with inexact */
+ }
+ if(ix > 0x3f800000) { /* |x|>= 1 */
+ return (x-x) / (x-x); /* asin(|x|>1) is NaN */
+ }
+ if(ix < 0x32000000) { /* if |x| < 2**-27 */
+ if(HUGE_VALF + x > FLT_ONE) return x; /* return x with inexact if x!=0*/
+ }
+ /* 1 > |x| >= 2**-27 */
float sum = x, c = x, m = 1.0;
int n = 1;
do
@@ -733,6 +828,8 @@ DECL_MIN_MAX_CLAMP(char)
DECL_MIN_MAX_CLAMP(uint)
DECL_MIN_MAX_CLAMP(unsigned short)
DECL_MIN_MAX_CLAMP(unsigned char)
+DECL_MIN_MAX_CLAMP(long)
+DECL_MIN_MAX_CLAMP(ulong)
#undef DECL_MIN_MAX_CLAMP
#define BODY \
@@ -1196,6 +1293,72 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
}
/////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+ uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+ uint count = num / size; \
+ uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1); \
+ offset = offset * get_local_size(0) + get_local_id(0); \
+ for(uint i=0; i<count; i+=1) { \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ offset += size; \
+ } \
+ if(offset < num) \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, size_t src_stride, event_t event) { \
+ BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, size_t dst_stride, event_t event) { \
+ BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+ DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+INLINE void wait_group_events (int num_events, event_t *event_list) {
+ barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
// Atomic functions
/////////////////////////////////////////////////////////////////////////////
OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
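The ocl_stdlib.tmpl.h hunk above adds 64-bit integer overloads (abs, min/max/clamp, upsample, hadd/rhadd, abs_diff) together with work-group async copies and prefetch. As a rough illustration only (this kernel is not part of the commit), the new long overloads can be exercised like this:

  kernel void example_long_builtins(global int *hi, global uint *lo, global long *dst) {
    int i = get_global_id(0);
    long v = upsample(hi[i], lo[i]);   /* new (int, uint) -> long overload */
    dst[i] = clamp(v, -255L, 255L);    /* provided by DECL_MIN_MAX_CLAMP(long) */
  }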
diff --git a/kernels/builtin_acos_asin.cl b/kernels/builtin_acos_asin.cl
new file mode 100644
index 0000000..bba2d21
--- /dev/null
+++ b/kernels/builtin_acos_asin.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_acos_asin(__global float *dst, __global float *src, __global int *max_func) {
+ int i = get_global_id(0);
+ float x = src[i];
+
+ dst[i * (*max_func) + 0] = acos(x);
+ dst[i * (*max_func) + 1] = acosh(x);
+ dst[i * (*max_func) + 2] = asin(x);
+ dst[i * (*max_func) + 3] = asinh(x);
+ dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
new file mode 100644
index 0000000..a2432a4
--- /dev/null
+++ b/kernels/compiler_async_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_copy(__global int2 *dst, __global int2 *src, __local int2 *localBuffer, int copiesPerWorkItem)
+{
+ event_t event;
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+ int i;
+ event = async_work_group_copy((__local int2*)localBuffer, (__global const int2*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 );
+ wait_group_events( 1, &event );
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (int2)(3, 3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event = async_work_group_copy((__global int2*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const int2*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 );
+ wait_group_events( 1, &event );
+}
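For context, a hedged sketch of the host-side setup such a kernel needs: the __local buffer argument is sized on the host and set by passing a NULL value to clSetKernelArg. The variable names below are assumptions; the actual harness is utests/compiler_async_copy.cpp.

  /* sketch: queue, kernel, dst_mem and src_mem are assumed to exist */
  size_t local_sz  = 64;
  size_t global_sz = local_sz * 4;
  int copiesPerWorkItem = 32;
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &dst_mem);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_mem);
  clSetKernelArg(kernel, 2, local_sz * copiesPerWorkItem * sizeof(cl_int2), NULL);
  clSetKernelArg(kernel, 3, sizeof(int), &copiesPerWorkItem);
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_sz, &local_sz, 0, NULL, NULL);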
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
new file mode 100644
index 0000000..a926588
--- /dev/null
+++ b/kernels/compiler_async_stride_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride)
+{
+ event_t event;
+ int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+ int i;
+ event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+ wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_event.cl b/kernels/compiler_event.cl
new file mode 100644
index 0000000..a901b05
--- /dev/null
+++ b/kernels/compiler_event.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_event(__global int *dst, int value)
+{
+ int id = (int)get_global_id(0);
+ dst[id] += value;
+}
diff --git a/kernels/compiler_load_bool_imm.cl b/kernels/compiler_load_bool_imm.cl
new file mode 100644
index 0000000..fda49b9
--- /dev/null
+++ b/kernels/compiler_load_bool_imm.cl
@@ -0,0 +1,12 @@
+__kernel void
+compiler_load_bool_imm(__global int *dst, __local int *localBuffer, int copiesPerWorkItem )
+{
+ int i;
+ for(i=0; i<copiesPerWorkItem; i++)
+ localBuffer[get_local_id(0)*copiesPerWorkItem+i] = copiesPerWorkItem;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for(i=0; i<copiesPerWorkItem; i++)
+ dst[get_global_id(0)*copiesPerWorkItem + i] = localBuffer[get_local_id(0)*copiesPerWorkItem+i];
+ barrier(CLK_LOCAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_long_asr.cl b/kernels/compiler_long_asr.cl
new file mode 100644
index 0000000..901630b
--- /dev/null
+++ b/kernels/compiler_long_asr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_asr(global long *src, global long *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] >> i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_cmp.cl b/kernels/compiler_long_cmp.cl
new file mode 100644
index 0000000..90dfb60
--- /dev/null
+++ b/kernels/compiler_long_cmp.cl
@@ -0,0 +1,29 @@
+kernel void compiler_long_cmp_l(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] < src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_le(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] <= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_g(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] > src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_ge(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] >= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_eq(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] == src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_neq(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = (src1[i] != src2[i]) ? 3 : 4;
+}
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
new file mode 100644
index 0000000..f22914f
--- /dev/null
+++ b/kernels/compiler_long_convert.cl
@@ -0,0 +1,7 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_long_convert(global char *src1, global short *src2, global int *src3, global long *dst1, global long *dst2, global long *dst3) {
+ int i = get_global_id(0);
+ dst1[i] = src1[i];
+ dst2[i] = src2[i];
+ dst3[i] = src3[i];
+}
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ if(i < 3)
+ dst[i] = src1[i] + src2[i];
+ else
+ dst[i] = src1[i] * src2[i];
+}
diff --git a/kernels/compiler_long_shl.cl b/kernels/compiler_long_shl.cl
new file mode 100644
index 0000000..3786b77
--- /dev/null
+++ b/kernels/compiler_long_shl.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shl(global long *src, global long *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] << i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_shr.cl b/kernels/compiler_long_shr.cl
new file mode 100644
index 0000000..d4e859c
--- /dev/null
+++ b/kernels/compiler_long_shr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shr(global ulong *src, global ulong *dst) {
+ int i = get_global_id(0);
+ if(i > 7)
+ dst[i] = src[i] >> i;
+ else
+ dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
new file mode 100644
index 0000000..16f806b
--- /dev/null
+++ b/kernels/compiler_upsample_long.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_int(global int *src1, global uint *src2, global long *dst) {
+ int i = get_global_id(0);
+ dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 320194e..964f5e7 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -25,11 +25,11 @@ __kernel void test_##type ##n(__global type *pin, \
TEST_TYPE(int,n) \
TEST_TYPE(uint,n) \
TEST_TYPE(float,n) \
- TEST_TYPE(double,n)
+ TEST_TYPE(double,n)\
+ TEST_TYPE(long,n) \
+ TEST_TYPE(ulong,n)
#if 0
- TEST_TYPE(long,n)
- TEST_TYPE(ulong,n)
TEST_TYPE(half,n)
#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cea78c0..58d23cb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set(OPENCL_SRC
cl_program.c
cl_sampler.c
cl_event.c
+ cl_enqueue.c
cl_image.c
cl_mem.c
cl_platform_id.c
diff --git a/src/cl_api.c b/src/cl_api.c
index 146c010..4f048ee 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -18,9 +18,11 @@
*/
#include "cl_platform_id.h"
-#include "cl_device_id.h"
+#include "cl_device_id.h"
#include "cl_context.h"
#include "cl_command_queue.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
#include "cl_program.h"
#include "cl_kernel.h"
#include "cl_mem.h"
@@ -36,6 +38,7 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>
+#include <unistd.h>
#ifndef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
@@ -59,6 +62,23 @@ typedef intptr_t cl_device_partition_property;
return RET; \
} while(0)
+inline cl_int
+handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
+ cl_event* event, enqueue_data* data, cl_command_type type)
+{
+ cl_int status = cl_event_wait_events(num, wait_list);
+ cl_event e;
+ if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
+ e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+ if(event != NULL)
+ *event = e;
+ if(status == CL_ENQUEUE_EXECUTE_DEFER) {
+ cl_event_new_enqueue_callback(e, data, num, wait_list);
+ }
+ }
+ return status;
+}
+
static cl_int
cl_check_device_type(cl_device_type device_type)
{
@@ -987,8 +1007,20 @@ cl_int
clWaitForEvents(cl_uint num_events,
const cl_event * event_list)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ cl_context ctx = NULL;
+
+ if(num_events > 0 && event_list)
+ ctx = event_list[0]->ctx;
+
+ TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
+
+ while(cl_event_wait_events(num_events, event_list) == CL_ENQUEUE_EXECUTE_DEFER) {
+    usleep(8000); //sleep 8ms to wait for the other thread
+ }
+
+error:
+ return err;
}
cl_int
@@ -998,38 +1030,94 @@ clGetEventInfo(cl_event event,
void * param_value,
size_t * param_value_size_ret)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ CHECK_EVENT(event);
+
+ if (param_name == CL_EVENT_COMMAND_QUEUE) {
+ if(event->queue == NULL) {
+ param_value_size_ret = 0;
+ param_value = NULL;
+ return err;
+ }
+ FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_CONTEXT) {
+ FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+ FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+ cl_event_update_status(event);
+ FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
+ } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+ cl_uint ref = event->ref_n;
+ FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+ } else {
+ return CL_INVALID_VALUE;
+ }
+
+error:
+ return err;
+
}
cl_event
clCreateUserEvent(cl_context context,
cl_int * errcode_ret)
{
- NOT_IMPLEMENTED;
- return NULL;
+ cl_int err = CL_SUCCESS;
+ cl_event event = NULL;
+ CHECK_CONTEXT(context);
+
+ TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
+
+error:
+ if(errcode_ret)
+ *errcode_ret = err;
+ return event;
}
cl_int
clRetainEvent(cl_event event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ cl_event_add_ref(event);
+
+error:
+ return err;
}
cl_int
clReleaseEvent(cl_event event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ cl_event_delete(event);
+
+error:
+ return err;
}
cl_int
clSetUserEventStatus(cl_event event,
cl_int execution_status)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ if(execution_status > CL_COMPLETE) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ if(event->status != CL_SUBMITTED) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ cl_event_set_status(event, execution_status);
+error:
+ return err;
}
cl_int
@@ -1038,8 +1126,20 @@ clSetEventCallback(cl_event event,
void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
void * user_data)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_EVENT(event);
+ if((pfn_notify == NULL) ||
+ (command_exec_callback_type > CL_SUBMITTED) ||
+ (command_exec_callback_type < CL_COMPLETE)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+
+error:
+ return err;
+
}
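Taken together, these entry points enable the deferred-execution path set up by handle_events() above: an enqueue whose wait list contains an unsignalled user event is recorded via cl_event_new_enqueue_callback() and only executed once the user event completes. A hedged host-side sketch, not part of this commit (ctx, queue, buf, size and host_ptr are assumed to exist):

  cl_int err;
  cl_event user_ev = clCreateUserEvent(ctx, &err);
  cl_event read_ev;
  err = clEnqueueReadBuffer(queue, buf, CL_FALSE, 0, size, host_ptr,
                            1, &user_ev, &read_ev);      /* deferred */
  err = clSetUserEventStatus(user_ev, CL_COMPLETE);      /* triggers the read */
  err = clWaitForEvents(1, &read_ev);
  clReleaseEvent(user_ev);
  clReleaseEvent(read_ev);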
cl_int
@@ -1087,8 +1187,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
cl_event * event)
{
cl_int err = CL_SUCCESS;
- void* src_ptr;
-
+ enqueue_data *data, defer_enqueue_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_MEM(buffer);
if (command_queue->ctx != buffer->ctx) {
@@ -1109,14 +1208,20 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
goto error;
}
- if (!(src_ptr = cl_mem_map_auto(buffer))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
- memcpy(ptr, (char*)src_ptr + offset, size);
+ data = &defer_enqueue_data;
+ data->type = EnqueueReadBuffer;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->offset = offset;
+ data->size = size;
- err = cl_mem_unmap_auto(buffer);
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
error:
return err;
@@ -1154,7 +1259,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
cl_event * event)
{
cl_int err = CL_SUCCESS;
- void* dst_ptr;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_MEM(buffer);
@@ -1176,16 +1281,22 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
goto error;
}
- if (!(dst_ptr = cl_mem_map_auto(buffer))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
- memcpy((char*)dst_ptr + offset, ptr, size);
+ data = &no_wait_data;
+ data->type = EnqueueWriteBuffer;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->offset = offset;
+ data->size = size;
- err = cl_mem_unmap_auto(buffer);
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
-error:
+ error:
return err;
}
@@ -1257,7 +1368,7 @@ clEnqueueReadImage(cl_command_queue command_queue,
cl_event * event)
{
cl_int err = CL_SUCCESS;
- void* src_ptr;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_IMAGE(image);
@@ -1304,35 +1415,22 @@ clEnqueueReadImage(cl_command_queue command_queue,
goto error;
}
- if (!(src_ptr = cl_mem_map_auto(image))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
-
- size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- src_ptr = (char*)src_ptr + offset;
-
- if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
- (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
- {
- memcpy(ptr, src_ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueReadImage;
+ data->mem_obj = image;
+ data->ptr = ptr;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
}
- else {
- cl_uint y, z;
- for (z = 0; z < region[2]; z++) {
- const char* src = src_ptr;
- char* dst = ptr;
- for (y = 0; y < region[1]; y++) {
- memcpy(dst, src, image->bpp*region[0]);
- src += image->row_pitch;
- dst += row_pitch;
- }
- src_ptr = (char*)src_ptr + image->slice_pitch;
- ptr = (char*)ptr + slice_pitch;
- }
- }
-
- err = cl_mem_unmap_auto(image);
error:
return err;
@@ -1352,7 +1450,7 @@ clEnqueueWriteImage(cl_command_queue command_queue,
cl_event * event)
{
cl_int err = CL_SUCCESS;
- void* dst_ptr;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_IMAGE(image);
@@ -1399,35 +1497,22 @@ clEnqueueWriteImage(cl_command_queue command_queue,
goto error;
}
- if (!(dst_ptr = cl_mem_map_auto(image))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
-
- size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- dst_ptr = (char*)dst_ptr + offset;
-
- if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
- (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
- {
- memcpy(dst_ptr, ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueWriteImage;
+ data->mem_obj = image;
+ data->const_ptr = ptr;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = row_pitch;
+ data->slice_pitch = slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
}
- else {
- cl_uint y, z;
- for (z = 0; z < region[2]; z++) {
- const char* src = ptr;
- char* dst = dst_ptr;
- for (y = 0; y < region[1]; y++) {
- memcpy(dst, src, image->bpp*region[0]);
- src += row_pitch;
- dst += image->row_pitch;
- }
- ptr = (char*)ptr + slice_pitch;
- dst_ptr = (char*)dst_ptr + image->slice_pitch;
- }
- }
-
- err = cl_mem_unmap_auto(image);
error:
return err;
@@ -1490,10 +1575,8 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
cl_event * event,
cl_int * errcode_ret)
{
- void *ptr = NULL;
- void *mem_ptr = NULL;
cl_int err = CL_SUCCESS;
- int slot = -1;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_MEM(buffer);
@@ -1519,73 +1602,25 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
goto error;
}
- if (!(ptr = cl_mem_map_auto(buffer))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
- ptr = (char*)ptr + offset;
+ data = &no_wait_data;
+ data->type = EnqueueMapBuffer;
+ data->mem_obj = buffer;
+ data->offset = offset;
+ data->size = size;
+ data->map_flags = map_flags;
- if(buffer->flags & CL_MEM_USE_HOST_PTR) {
- assert(buffer->host_ptr);
- memcpy(buffer->host_ptr + offset, ptr, size);
- mem_ptr = buffer->host_ptr + offset;
- } else {
- mem_ptr = ptr;
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
}
- /* Record the mapped address. */
- if (!buffer->mapped_ptr_sz) {
- buffer->mapped_ptr_sz = 16;
- buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
- if (!buffer->mapped_ptr) {
- cl_mem_unmap_auto (buffer);
- err = CL_OUT_OF_HOST_MEMORY;
- ptr = NULL;
- goto error;
- }
-
- memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = 0;
- } else {
- int i = 0;
- for (; i < buffer->mapped_ptr_sz; i++) {
- if (buffer->mapped_ptr[i].ptr == NULL) {
- slot = i;
- break;
- }
- }
-
- if (i == buffer->mapped_ptr_sz) {
- cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
- if (!new_ptr) {
- cl_mem_unmap_auto (buffer);
- err = CL_OUT_OF_HOST_MEMORY;
- ptr = NULL;
- goto error;
- }
- memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- memcpy(new_ptr, buffer->mapped_ptr,
- buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = buffer->mapped_ptr_sz;
- buffer->mapped_ptr_sz *= 2;
- free(buffer->mapped_ptr);
- buffer->mapped_ptr = new_ptr;
- }
- }
-
- assert(slot != -1);
- buffer->mapped_ptr[slot].ptr = mem_ptr;
- buffer->mapped_ptr[slot].v_ptr = ptr;
- buffer->mapped_ptr[slot].size = size;
- buffer->map_ref++;
-
error:
if (errcode_ret)
*errcode_ret = err;
- return mem_ptr;
+ return data->ptr;
}
void *
@@ -1602,8 +1637,8 @@ clEnqueueMapImage(cl_command_queue command_queue,
cl_event * event,
cl_int * errcode_ret)
{
- void *ptr = NULL;
cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_IMAGE(image);
@@ -1638,18 +1673,27 @@ clEnqueueMapImage(cl_command_queue command_queue,
goto error;
}
- if (!(ptr = cl_mem_map_auto(image))) {
- err = CL_MAP_FAILURE;
- goto error;
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueMapImage;
+ data->mem_obj = image;
+ data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = *image_row_pitch;
+ data->slice_pitch = *image_slice_pitch;
+ data->map_flags = map_flags;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
}
- size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- ptr = (char*)ptr + offset;
-
error:
if (errcode_ret)
*errcode_ret = err;
- return ptr;
+ return data->ptr; //TODO: map and unmap first
}
cl_int
@@ -1661,9 +1705,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
cl_event * event)
{
cl_int err = CL_SUCCESS;
- int i;
- size_t mapped_size = 0;
- void * v_ptr = NULL;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_MEM(memobj);
@@ -1672,54 +1714,17 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
goto error;
}
- assert(memobj->mapped_ptr_sz >= memobj->map_ref);
- INVALID_VALUE_IF(!mapped_ptr);
- for (i = 0; i < memobj->mapped_ptr_sz; i++) {
- if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
- memobj->mapped_ptr[i].ptr = NULL;
- mapped_size = memobj->mapped_ptr[i].size;
- v_ptr = memobj->mapped_ptr[i].v_ptr;
- memobj->mapped_ptr[i].size = 0;
- memobj->mapped_ptr[i].v_ptr = NULL;
- memobj->map_ref--;
- break;
- }
- }
- /* can not find a mapped address? */
- INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
-
- if (memobj->flags & CL_MEM_USE_HOST_PTR) {
- assert(mapped_ptr >= memobj->host_ptr &&
- mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
- /* Sync the data. */
- memcpy(v_ptr, mapped_ptr, mapped_size);
- } else {
- assert(v_ptr == mapped_ptr);
- }
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
- cl_mem_unmap_auto(memobj);
+ data = &no_wait_data;
+ data->type = EnqueueUnmapMemObject;
+ data->mem_obj = memobj;
+ data->ptr = mapped_ptr;
- /* shrink the mapped slot. */
- if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
- int j = 0;
- cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
- if (!new_ptr) {
- /* Just do nothing. */
- goto error;
- }
- memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
-
- for (i = 0; i < memobj->mapped_ptr_sz; i++) {
- if (memobj->mapped_ptr[i].ptr) {
- new_ptr[j] = memobj->mapped_ptr[i];
- j++;
- assert(j < memobj->mapped_ptr_sz/2);
- }
- }
- memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
- free(memobj->mapped_ptr);
- memobj->mapped_ptr = new_ptr;
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
}
error:
@@ -1742,6 +1747,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
size_t fixed_local_sz[] = {1,1,1};
cl_int err = CL_SUCCESS;
cl_uint i;
+ enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
CHECK_KERNEL(kernel);
@@ -1774,8 +1780,8 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
}
/* Local sizes must be non-null and divide global sizes */
- if (local_work_size != NULL)
- for (i = 0; i < work_dim; ++i)
+ if (local_work_size != NULL)
+ for (i = 0; i < work_dim; ++i)
if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
err = CL_INVALID_WORK_GROUP_SIZE;
goto error;
@@ -1789,9 +1795,9 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
}
/* XXX No event right now */
- FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
- FATAL_IF(event_wait_list != NULL, "Events are not supported");
- FATAL_IF(event != NULL, "Events are not supported");
+ //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
+ //FATAL_IF(event_wait_list != NULL, "Events are not supported");
+ //FATAL_IF(event != NULL, "Events are not supported");
if (local_work_size != NULL)
for (i = 0; i < work_dim; ++i)
@@ -1810,6 +1816,17 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
fixed_global_off,
fixed_global_sz,
fixed_local_sz);
+ if(err != CL_SUCCESS)
+ goto error;
+
+ data = &no_wait_data;
+ data->type = EnqueueNDRangeKernel;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_command_queue_flush(command_queue);
+ }
error:
return err;
@@ -1855,8 +1872,12 @@ clEnqueueWaitForEvents(cl_command_queue command_queue,
cl_uint num_events,
const cl_event * event_list)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ CHECK_QUEUE(command_queue);
+ err = clWaitForEvents(num_events, event_list);
+
+error:
+ return err;
}
cl_int
@@ -1864,6 +1885,7 @@ clEnqueueBarrier(cl_command_queue command_queue)
{
NOT_IMPLEMENTED;
return 0;
+ //return clFinish(command_queue);
}
#define EXTFUNC(x) \
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index b296dd7..e82f75c 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -413,6 +413,13 @@ error:
}
LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+ cl_gpgpu_flush(queue->gpgpu);
+ return CL_SUCCESS;
+}
+
+LOCAL cl_int
cl_command_queue_finish(cl_command_queue queue)
{
cl_gpgpu_sync(queue->gpgpu);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 4f6f987..135d659 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -65,6 +65,9 @@ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
/* Fulsim will dump this buffer (mostly to check its consistency) */
cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
+/* Flush for the command queue */
+extern cl_int cl_command_queue_flush(cl_command_queue);
+
/* Wait for the completion of the command queue */
extern cl_int cl_command_queue_finish(cl_command_queue);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index e58433f..1d415d4 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -266,8 +266,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Close the batch buffer and submit it */
cl_gpgpu_batch_end(gpgpu, 0);
- cl_gpgpu_flush(gpgpu);
-
error:
return err;
}
diff --git a/src/cl_context.h b/src/cl_context.h
index 80bf777..718d589 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -62,10 +62,12 @@ struct _cl_context {
cl_program programs; /* All programs currently allocated */
cl_mem buffers; /* All memory object currently allocated */
cl_sampler samplers; /* All sampler object currently allocated */
+ cl_event events; /* All event object currently allocated */
pthread_mutex_t queue_lock; /* To allocate and deallocate queues */
pthread_mutex_t program_lock; /* To allocate and deallocate programs */
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
+ pthread_mutex_t event_lock; /* To allocate and deallocate events */
uint32_t ver; /* Gen version */
struct _cl_context_prop props;
cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 673985d..1a0ec38 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -46,6 +46,9 @@ typedef struct _cl_driver *cl_driver;
/* Encapsulates the gpgpu stream of commands */
typedef struct _cl_gpgpu *cl_gpgpu;
+/* Encapsulates the event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
typedef struct _cl_context_prop *cl_context_prop;
typedef struct _cl_sampler *cl_sampler;
@@ -86,6 +89,13 @@ typedef enum cl_cache_control {
cc_llc_l3 = 0x3
} cl_cache_control;
+typedef enum gpu_command_status {
+ command_queued = 3,
+ command_submitted = 2,
+ command_running = 1,
+ command_complete = 0
+} gpu_command_status;
+
/* Use this structure to bind kernels in the gpgpu state */
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
@@ -179,6 +189,27 @@ extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
typedef void (cl_gpgpu_flush_cb)(cl_gpgpu);
extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
+/* Create a new event for a batch buffer */
+typedef cl_gpgpu_event (cl_gpgpu_event_new_cb)(cl_gpgpu);
+extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
+
+/* Update the status of a batch buffer event */
+typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
+extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
+
+/* Mark a batch buffer event as pending */
+typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event);
+extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending;
+
+/* Resume a pending batch buffer event */
+typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
+
+/* Delete a batch buffer event */
+typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
+
+
/* Will spawn all threads */
typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
uint32_t simd_sz,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 9aa926e..e7412de 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -63,4 +63,9 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
+LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
+LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
+LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
+LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
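These slots stay NULL until the GPU backend installs its implementations at driver-initialization time. A purely illustrative wiring sketch follows; the intel_gpgpu_event_* names are assumptions, and the real assignments presumably live in src/intel/intel_gpgpu.c (only partially shown at the end of this diff):

  /* hypothetical hook installation inside the intel backend's setup code */
  cl_gpgpu_event_new           = (cl_gpgpu_event_new_cb *) intel_gpgpu_event_new;
  cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *) intel_gpgpu_event_update_status;
  cl_gpgpu_event_pending       = (cl_gpgpu_event_pending_cb *) intel_gpgpu_event_pending;
  cl_gpgpu_event_resume        = (cl_gpgpu_event_resume_cb *) intel_gpgpu_event_resume;
  cl_gpgpu_event_delete        = (cl_gpgpu_event_delete_cb *) intel_gpgpu_event_delete;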
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
new file mode 100644
index 0000000..a112cc4
--- /dev/null
+++ b/src/cl_enqueue.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+#include "cl_enqueue.h"
+#include "cl_image.h"
+#include "cl_driver.h"
+#include "cl_utils.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+
+cl_int cl_enqueue_read_buffer(enqueue_data* data)
+{
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+
+ if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ memcpy(data->ptr, (char*)src_ptr + data->offset, data->size);
+
+ err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_write_buffer(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* dst_ptr;
+
+ if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ memcpy((char*)dst_ptr + data->offset, data->const_ptr, data->size);
+
+ err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+
+ cl_mem image = data->mem_obj;
+ const size_t* origin = data->origin;
+ const size_t* region = data->region;
+
+ if (!(src_ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ src_ptr = (char*)src_ptr + offset;
+
+ if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+ {
+ memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = src_ptr;
+ char* dst = data->ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, image->bpp*region[0]);
+ src += image->row_pitch;
+ dst += data->row_pitch;
+ }
+ src_ptr = (char*)src_ptr + image->slice_pitch;
+ data->ptr = (char*)data->ptr + data->slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(image);
+
+error:
+ return err;
+
+}
+
+cl_int cl_enqueue_write_image(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ void* dst_ptr;
+
+ cl_mem image = data->mem_obj;
+ const size_t *origin = data->origin;
+ const size_t *region = data->region;
+
+ if (!(dst_ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ dst_ptr = (char*)dst_ptr + offset;
+
+ if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+ {
+ memcpy(dst_ptr, data->ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = data->const_ptr;
+ char* dst = dst_ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, image->bpp*region[0]);
+ src += data->row_pitch;
+ dst += image->row_pitch;
+ }
+ data->ptr = (char*)data->ptr + data->slice_pitch;
+ dst_ptr = (char*)dst_ptr + image->slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(image);
+
+error:
+ return err;
+
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
+
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+ void *mem_ptr = NULL;
+ cl_int slot = -1;
+ cl_mem buffer = data->mem_obj;
+
+ if (!(ptr = cl_mem_map_auto(buffer))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+ ptr = (char*)ptr + data->offset;
+
+ if(buffer->flags & CL_MEM_USE_HOST_PTR) {
+ assert(buffer->host_ptr);
+ memcpy(buffer->host_ptr + data->offset, ptr, data->size);
+ mem_ptr = buffer->host_ptr + data->offset;
+ } else {
+ mem_ptr = ptr;
+ }
+
+ /* Record the mapped address. */
+ if (!buffer->mapped_ptr_sz) {
+ buffer->mapped_ptr_sz = 16;
+ buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
+ if (!buffer->mapped_ptr) {
+ cl_mem_unmap_auto (buffer);
+ err = CL_OUT_OF_HOST_MEMORY;
+ ptr = NULL;
+ goto error;
+ }
+
+ memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = 0;
+ } else {
+ int i = 0;
+ for (; i < buffer->mapped_ptr_sz; i++) {
+ if (buffer->mapped_ptr[i].ptr == NULL) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (i == buffer->mapped_ptr_sz) {
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
+ if (!new_ptr) {
+ cl_mem_unmap_auto (buffer);
+ err = CL_OUT_OF_HOST_MEMORY;
+ ptr = NULL;
+ goto error;
+ }
+ memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ memcpy(new_ptr, buffer->mapped_ptr,
+ buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = buffer->mapped_ptr_sz;
+ buffer->mapped_ptr_sz *= 2;
+ free(buffer->mapped_ptr);
+ buffer->mapped_ptr = new_ptr;
+ }
+ }
+
+ assert(slot != -1);
+ buffer->mapped_ptr[slot].ptr = mem_ptr;
+ buffer->mapped_ptr[slot].v_ptr = ptr;
+ buffer->mapped_ptr[slot].size = data->size;
+ buffer->map_ref++;
+
+ data->ptr = mem_ptr;
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_map_image(enqueue_data *data)
+{
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+
+ cl_mem image = data->mem_obj;
+ const size_t *origin = data->origin;
+
+ if (!(ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ data->ptr = (char*)ptr + offset;
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+{
+ cl_int err = CL_SUCCESS;
+ int i;
+ size_t mapped_size = 0;
+ void * v_ptr = NULL;
+ void * mapped_ptr = data->ptr;
+ cl_mem memobj = data->mem_obj;
+
+ assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+ INVALID_VALUE_IF(!mapped_ptr);
+ for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+ if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+ memobj->mapped_ptr[i].ptr = NULL;
+ mapped_size = memobj->mapped_ptr[i].size;
+ v_ptr = memobj->mapped_ptr[i].v_ptr;
+ memobj->mapped_ptr[i].size = 0;
+ memobj->mapped_ptr[i].v_ptr = NULL;
+ memobj->map_ref--;
+ break;
+ }
+ }
+ /* can not find a mapped address? */
+ INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
+
+ if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mapped_ptr >= memobj->host_ptr &&
+ mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+ /* Sync the data. */
+ memcpy(v_ptr, mapped_ptr, mapped_size);
+ } else {
+ assert(v_ptr == mapped_ptr);
+ }
+
+ cl_mem_unmap_auto(memobj);
+
+ /* shrink the mapped slot. */
+ if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+ int j = 0;
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+ if (!new_ptr) {
+ /* Just do nothing. */
+ goto error;
+ }
+ memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+
+ for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+ if (memobj->mapped_ptr[i].ptr) {
+ new_ptr[j] = memobj->mapped_ptr[i];
+ j++;
+ assert(j < memobj->mapped_ptr_sz/2);
+ }
+ }
+ memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+ free(memobj->mapped_ptr);
+ memobj->mapped_ptr = new_ptr;
+ }
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_handle(enqueue_data* data)
+{
+ switch(data->type) {
+ case EnqueueReadBuffer:
+ return cl_enqueue_read_buffer(data);
+ case EnqueueWriteBuffer:
+ return cl_enqueue_write_buffer(data);
+ case EnqueueReadImage:
+ return cl_enqueue_read_image(data);
+ case EnqueueWriteImage:
+ return cl_enqueue_write_image(data);
+ case EnqueueMapBuffer:
+ return cl_enqueue_map_buffer(data);
+ case EnqueueMapImage:
+ return cl_enqueue_map_image(data);
+ case EnqueueUnmapMemObject:
+ return cl_enqueue_unmap_mem_object(data);
+ case EnqueueNDRangeKernel:
+      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); //fall through to the default case
+ default:
+ return CL_SUCCESS;
+ }
+}
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
new file mode 100644
index 0000000..7dc8ceb
--- /dev/null
+++ b/src/cl_enqueue.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#ifndef __CL_ENQUEUE_H__
+#define __CL_ENQUEUE_H__
+
+#include "cl_mem.h"
+#include "cl_command_queue.h"
+#include "cl_internals.h"
+#include "CL/cl.h"
+
+typedef enum {
+ EnqueueReadBuffer = 0,
+ EnqueueReadBufferRect,
+ EnqueueWriteBuffer,
+ EnqueueWriteBufferRect,
+ EnqueueCopyBuffer,
+ EnqueueCopyBufferRect,
+ EnqueueReadImage,
+ EnqueueWriteImage,
+ EnqueueCopyImage,
+ EnqueueCopyImageToBuffer,
+ EnqueueCopyBufferToImage,
+ EnqueueMapBuffer,
+ EnqueueMapImage,
+ EnqueueUnmapMemObject,
+ EnqueueNDRangeKernel,
+ EnqueueInvalid
+} enqueue_type;
+
+typedef struct _enqueue_data {
+ enqueue_type type; /* Command type */
+ cl_mem mem_obj; /* Enqueue's cl_mem */
+ cl_command_queue queue; /* Command queue */
+ size_t offset; /* Mem object's offset */
+ size_t size; /* Size */
+ size_t origin[3]; /* Origin */
+ size_t region[3]; /* Region */
+ size_t row_pitch; /* Row pitch */
+ size_t slice_pitch; /* Slice pitch */
+ cl_map_flags map_flags; /* Map flags */
+ const void * const_ptr; /* Const ptr for memory read */
+ void * ptr; /* ptr for write and return value */
+} enqueue_data;
+
+/* Do real enqueue commands */
+cl_int cl_enqueue_handle(enqueue_data* data);
+#endif /* __CL_ENQUEUE_H__ */
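A condensed sketch of the pattern the runtime uses with this structure, mirroring the clEnqueueReadBuffer path in cl_api.c above (local variable names assumed):

  enqueue_data data = { 0 };
  data.type    = EnqueueReadBuffer;
  data.mem_obj = buffer;
  data.ptr     = host_ptr;
  data.offset  = offset;
  data.size    = size;
  if (handle_events(queue, num_events, wait_list, event, &data,
                    CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM)
    err = cl_enqueue_handle(&data);   /* run the copy immediately */
  /* otherwise the data is captured by cl_event_new_enqueue_callback()
     and cl_enqueue_handle() runs later from cl_event_set_status() */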
diff --git a/src/cl_event.c b/src/cl_event.c
index 6539b05..e882c7c 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -14,7 +14,389 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Author: Rong Yang <rong.r.yang at intel.com>
*/
-struct empty {int dummy;};
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+{
+ cl_event event = NULL;
+
+  /* Allocate and initialize the structure itself */
+ TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
+ SET_ICD(event->dispatch)
+ event->magic = CL_MAGIC_EVENT_HEADER;
+ event->ref_n = 1;
+
+ /* Append the event in the context event list */
+ pthread_mutex_lock(&ctx->event_lock);
+ event->next = ctx->events;
+ if (ctx->events != NULL)
+ ctx->events->prev = event;
+ ctx->events = event;
+ pthread_mutex_unlock(&ctx->event_lock);
+ event->ctx = ctx;
+ cl_context_add_ref(ctx);
+
+ /* Initialize all members and create GPGPU event object */
+ event->queue = queue;
+ event->type = type;
+ event->gpgpu_event = NULL;
+ if(type == CL_COMMAND_USER) {
+ event->status = CL_SUBMITTED;
+ }
+ else {
+ event->status = CL_QUEUED;
+ event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+ }
+ cl_event_add_ref(event); //dec when complete
+ event->user_cb = NULL;
+ event->enqueue_cb = NULL;
+ event->waits_head = NULL;
+ event->emplict = emplict;
+
+exit:
+ return event;
+error:
+ cl_event_delete(event);
+ event = NULL;
+ goto exit;
+}
+
+void cl_event_delete(cl_event event)
+{
+ if (UNLIKELY(event == NULL))
+ return;
+
+ if (atomic_dec(&event->ref_n) > 1)
+ return;
+
+  /* Call any user callbacks that have not been executed yet */
+ user_callback *cb = event->user_cb;
+ while(event->user_cb) {
+ cb = event->user_cb;
+ if(cb->executed == CL_FALSE) {
+ cb->pfn_notify(event, event->status, cb->user_data);
+ }
+ event->user_cb = cb->next;
+ cl_free(cb);
+ }
+
+ /* delete gpgpu event object */
+ if(event->gpgpu_event)
+ cl_gpgpu_event_delete(event->gpgpu_event);
+
+ /* Remove it from the list */
+ assert(event->ctx);
+ pthread_mutex_lock(&event->ctx->event_lock);
+ if (event->prev)
+ event->prev->next = event->next;
+ if (event->next)
+ event->next->prev = event->prev;
+  if (event->ctx->events == event)
+    event->ctx->events = event->next;
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ cl_context_delete(event->ctx);
+
+ cl_free(event);
+}
+
+void cl_event_add_ref(cl_event event)
+{
+ assert(event);
+ atomic_inc(&event->ref_n);
+}
+
+cl_int cl_event_set_callback(cl_event event ,
+ cl_int command_exec_callback_type,
+ EVENT_NOTIFY pfn_notify,
+ void* user_data)
+{
+ assert(event);
+ assert(pfn_notify);
+
+ cl_int err = CL_SUCCESS;
+ user_callback *cb;
+ TRY_ALLOC(cb, CALLOC(user_callback));
+
+ cb->pfn_notify = pfn_notify;
+ cb->user_data = user_data;
+ cb->status = command_exec_callback_type;
+ cb->executed = CL_FALSE;
+
+ cb->next = event->user_cb;
+ event->user_cb = cb;
+
+exit:
+ return err;
+error:
+ err = CL_OUT_OF_HOST_MEMORY;
+ cl_free(cb);
+ goto exit;
+};
+
+cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,cl_context ctx)
+{
+ cl_int err = CL_SUCCESS;
+ cl_int i;
+ /* check the event_wait_list and num_events_in_wait_list */
+ if((event_wait_list == NULL) &&
+ (num_events_in_wait_list > 0))
+ goto exit;
+
+ if ((event_wait_list != NULL) &&
+ (num_events_in_wait_list == 0)){
+ goto error;
+ }
+
+ /* check the event and context */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ CHECK_EVENT(event_wait_list[i]);
+ if(event_wait_list[i]->status < CL_COMPLETE) {
+ err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+ goto exit;
+ }
+ if(event && *event == event_wait_list[i])
+ goto error;
+ if(event_wait_list[i]->ctx != ctx)
+ goto error;
+ }
+
+exit:
+ return err;
+error:
+ err = CL_INVALID_EVENT_WAIT_LIST; //reset error
+ goto exit;
+}
+
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list)
+{
+ cl_int i, j;
+  /* Check whether we need to wait on user events */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+    /* Need to wait on a user event: return and defer the enqueue */
+ if((event_wait_list[i]->type == CL_COMMAND_USER) ||
+ (event_wait_list[i]->enqueue_cb &&
+ (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
+ for(j=0; j<num_events_in_wait_list; j++)
+        cl_event_add_ref(event_wait_list[j]); //take a reference for the deferred enqueue's wait list
+ return CL_ENQUEUE_EXECUTE_DEFER;
+ }
+ }
+
+  /* No user events (or all user events finished): wait for all enqueued events to finish */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+    //the enqueue callback has not finished in another thread yet, keep waiting
+ if(event_wait_list[i]->enqueue_cb != NULL)
+ return CL_ENQUEUE_EXECUTE_DEFER;
+ cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+ cl_event_set_status(event_wait_list[i], CL_COMPLETE); //Execute user's callback
+ }
+ return CL_ENQUEUE_EXECUTE_IMM;
+}
+
+void cl_event_new_enqueue_callback(cl_event event,
+ enqueue_data *data,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list)
+{
+ enqueue_callback *cb, *node;
+ user_event *user_events, *u_ev;
+ cl_int i;
+
+  /* Allocate and initialize the structure itself */
+ TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
+ cb->num_events = num_events_in_wait_list;
+ cb->wait_list = event_wait_list;
+ cb->event = event;
+ cb->next = NULL;
+ cb->wait_user_events = NULL;
+
+  /* Find all user events that the events in event_wait_list wait on */
+ for(i=0; i<num_events_in_wait_list; i++) {
+ if(event_wait_list[i]->status <= CL_COMPLETE)
+ continue;
+
+ if(event_wait_list[i]->type == CL_COMMAND_USER) {
+ /* Insert the enqueue_callback to user event list */
+ node = event_wait_list[i]->waits_head;
+ if(node == NULL)
+ event_wait_list[i]->waits_head = cb;
+ else {
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) //wait on dup user event
+ continue;
+ node->next = cb;
+ }
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+ u_ev->event = event_wait_list[i];
+ u_ev->next = cb->wait_user_events;
+ cb->wait_user_events = u_ev;
+ } else if(event_wait_list[i]->enqueue_cb != NULL) {
+ user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
+ while(user_events != NULL) {
+ /* Insert the enqueue_callback to user event's waits_tail */
+ node = user_events->event->waits_head;
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) { //wait on dup user event
+ user_events = user_events->next;
+ continue;
+ }
+ node->next = cb;
+
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+ u_ev->event = user_events->event;
+ u_ev->next = cb->wait_user_events;
+ cb->wait_user_events = u_ev;
+ user_events = user_events->next;
+ }
+ }
+ }
+ if(data->queue != NULL) {
+ assert(event->gpgpu_event);
+ cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
+ data->ptr = (void *)event->gpgpu_event;
+ }
+ cb->data = *data;
+ event->enqueue_cb = cb;
+
+exit:
+ return;
+error:
+ if(cb) {
+ while(cb->wait_user_events) {
+ u_ev = cb->wait_user_events;
+ cb->wait_user_events = cb->wait_user_events->next;
+ cl_free(u_ev);
+ }
+ cl_free(cb);
+ }
+ goto exit;
+}
+
+void cl_event_set_status(cl_event event, cl_int status)
+{
+ user_callback *user_cb;
+ user_event *u_ev, *u_ev_next;
+ cl_int ret, i;
+ cl_event evt;
+
+ pthread_mutex_lock(&event->ctx->event_lock);
+ if(status >= event->status) {
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ return;
+ }
+ if(event->status <= CL_COMPLETE) {
+    event->status = status;   //the enqueue already ran, or is running in another thread
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ return;
+ }
+
+ if(status <= CL_COMPLETE) {
+ if(event->enqueue_cb) {
+ cl_enqueue_handle(&event->enqueue_cb->data);
+      event->status = status;  //change the event status after the enqueue and before unlocking
+
+ pthread_mutex_unlock(&event->ctx->event_lock);
+ for(i=0; i<event->enqueue_cb->num_events; i++)
+ cl_event_delete(event->enqueue_cb->wait_list[i]);
+ pthread_mutex_lock(&event->ctx->event_lock);
+
+ cl_free(event->enqueue_cb);
+ event->enqueue_cb = NULL;
+ }
+ }
+ if(event->status >= status) //maybe changed in other threads
+ event->status = status;
+ pthread_mutex_unlock(&event->ctx->event_lock);
+
+ if(event->status <= CL_COMPLETE)
+ cl_event_delete(event);
+
+ /* Call user callback */
+ user_cb = event->user_cb;
+ while(user_cb) {
+ if(user_cb->status >= status) {
+ user_cb->pfn_notify(event, event->status, user_cb->user_data);
+ user_cb->executed = CL_TRUE;
+ }
+ user_cb = user_cb->next;
+ }
+
+ if(event->type != CL_COMMAND_USER)
+ return;
+
+  /* Check all deferred enqueues */
+ enqueue_callback *cb, *enqueue_cb = event->waits_head;
+ while(enqueue_cb) {
+ /* Remove this user event in enqueue_cb */
+ while(enqueue_cb->wait_user_events &&
+ enqueue_cb->wait_user_events->event == event) {
+ u_ev = enqueue_cb->wait_user_events;
+ enqueue_cb->wait_user_events = enqueue_cb->wait_user_events->next;
+ cl_free(u_ev);
+ }
+
+ u_ev = enqueue_cb->wait_user_events;
+ while(u_ev) {
+ u_ev_next = u_ev->next;
+ if(u_ev_next && u_ev_next->event == event) {
+ u_ev->next = u_ev_next->next;
+ cl_free(u_ev_next);
+ } else
+ u_ev->next = u_ev_next;
+ }
+
+ /* Still wait on other user events */
+ if(enqueue_cb->wait_user_events != NULL) {
+ enqueue_cb = enqueue_cb->next;
+ continue;
+ }
+
+ /* All user events complete, now wait enqueue events */
+ ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list);
+ assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
+
+ cb = enqueue_cb;
+ enqueue_cb = enqueue_cb->next;
+
+ /* Call the pending operation */
+ evt = cb->event;
+ cl_event_set_status(cb->event, CL_COMPLETE);
+ if(cb->event->emplict == CL_FALSE) {
+ cl_event_delete(evt);
+ }
+ }
+ event->waits_head = NULL;
+}
+
+void cl_event_update_status(cl_event event)
+{
+ if(event->status <= CL_COMPLETE)
+ return;
+ if((event->gpgpu_event) &&
+ (cl_gpgpu_event_update_status(event->gpgpu_event, 0) == command_complete))
+ cl_event_set_status(event, CL_COMPLETE);
+}
diff --git a/src/cl_event.h b/src/cl_event.h
index 23378e8..c921cb2 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -20,9 +20,73 @@
#ifndef __CL_EVENT_H__
#define __CL_EVENT_H__
+#include <semaphore.h>
+
+#include "cl_enqueue.h"
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+
+#define CL_ENQUEUE_EXECUTE_IMM 0
+#define CL_ENQUEUE_EXECUTE_DEFER 1
+
+typedef struct _user_event {
+ cl_event event; /* The user event */
+ struct _user_event* next; /* Next user event in list */
+} user_event;
+
+typedef struct _enqueue_callback {
+ cl_event event; /* The event relative this enqueue callback */
+  enqueue_data        data;             /* Holds all of this enqueue callback's information */
+ cl_uint num_events; /* num events in wait list */
+  const cl_event*     wait_list;        /* The list of events this callback waits on */
+  user_event*         wait_user_events; /* Head of the list of user events this callback waits on */
+ struct _enqueue_callback* next; /* The next enqueue callback in wait list */
+} enqueue_callback;
+
+typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+
+typedef struct _user_callback {
+ cl_int status; /* The execution status */
+  cl_bool            executed;    /* Indicates whether the callback has already been called */
+ EVENT_NOTIFY pfn_notify; /* Callback function */
+ void* user_data; /* Callback user data */
+ struct _user_callback* next; /* Next event callback in list */
+} user_callback;
+
struct _cl_event {
DEFINE_ICD(dispatch)
+  uint64_t           magic;       /* To identify it as an event object */
+ volatile int ref_n; /* We reference count this object */
+ cl_context ctx; /* The context associated with event */
+  cl_event           prev, next;  /* We chain the events in the context together */
+ cl_command_queue queue; /* The command queue associated with event */
+ cl_command_type type; /* The command type associated with event */
+ cl_int status; /* The execution status */
+  cl_gpgpu_event     gpgpu_event; /* The event object used to communicate with the hardware */
+ user_callback* user_cb; /* The event callback functions */
+ enqueue_callback* enqueue_cb; /* This event's enqueue */
+  enqueue_callback*  waits_head;  /* Head of the list of enqueues waiting on this event */
+  cl_bool            emplict;     /* Whether this event was created explicitly through the API */
};
+/* Create a new event object */
+cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
+/* Unreference the object and delete it once no reference remains */
+void cl_event_delete(cl_event);
+/* Add one more reference to this object */
+void cl_event_add_ref(cl_event);
+/* Register a user callback function for a specific command execution status */
+cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Check the event wait list for enqueue commands */
+cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
+/* Wait for all events in the wait list to complete */
+cl_int cl_event_wait_events(cl_uint, const cl_event *);
+/* Create a new deferred (suspended) enqueue task */
+void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
+/* Set the event status and call all callbacks */
+void cl_event_set_status(cl_event, cl_int);
+/* Check and update event status */
+void cl_event_update_status(cl_event);
#endif /* __CL_EVENT_H__ */
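
As a reading aid for the list structures declared above, here is a hedged sketch (not part of this commit) of how the singly linked user_event chain hanging off an enqueue_callback is walked; it uses only the types from this header.

  #include "cl_event.h"

  /* Sketch: return 1 if enqueue callback `cb` is still waiting on user event `ev`. */
  static int enqueue_cb_waits_on(const enqueue_callback *cb, cl_event ev)
  {
    const user_event *u = cb->wait_user_events;
    while (u) {
      if (u->event == ev)
        return 1;
      u = u->next;
    }
    return 0;
  }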
diff --git a/src/cl_internals.h b/src/cl_internals.h
index b2b25b2..693de1d 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -28,6 +28,7 @@
#define CL_MAGIC_PROGRAM_HEADER 0x34560ab12789cdefLL
#define CL_MAGIC_QUEUE_HEADER 0x83650a12b79ce4dfLL
#define CL_MAGIC_SAMPLER_HEADER 0x686a0ecba79ce33fLL
+#define CL_MAGIC_EVENT_HEADER 0x8324a9c810ebf90fLL
#define CL_MAGIC_MEM_HEADER 0x381a27b9ce6504dfLL
#define CL_MAGIC_DEAD_HEADER 0xdeaddeaddeaddeadLL
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 59b7a2b..bfe418d 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -147,6 +147,18 @@ do { \
} \
} while (0)
+#define CHECK_EVENT(EVENT) \
+ do { \
+ if (UNLIKELY(EVENT == NULL)) { \
+ err = CL_INVALID_EVENT; \
+ goto error; \
+ } \
+ if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) { \
+ err = CL_INVALID_EVENT; \
+ goto error; \
+ } \
+ } while (0)
+
#define CHECK_SAMPLER(SAMPLER) \
do { \
if (UNLIKELY(SAMPLER == NULL)) { \
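
The new CHECK_EVENT macro follows the same err/goto-error pattern as the existing CHECK_* macros. Below is a minimal sketch of how it would sit in an API entry point; the function itself is hypothetical, and only the macro and cl_event_add_ref come from this patch set.

  #include "cl_event.h"   /* cl_event_add_ref() */
  #include "cl_utils.h"   /* CHECK_EVENT() */

  static cl_int example_retain_event(cl_event event)
  {
    cl_int err = CL_SUCCESS;
    CHECK_EVENT(event);      /* on failure: err = CL_INVALID_EVENT, goto error */
    cl_event_add_ref(event);
  error:
    return err;
  }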
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index e553a55..1301b66 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -57,6 +57,12 @@ typedef struct surface_heap {
char surface[256][sizeof(gen6_surface_state_t)];
} surface_heap_t;
+typedef struct intel_event {
+ intel_batchbuffer_t *batch;
+ drm_intel_bo* buffer;
+ int status;
+} intel_event_t;
+
#define MAX_IF_DESC 32
/* We can bind only a limited number of buffers */
@@ -106,8 +112,8 @@ typedef struct intel_gpgpu intel_gpgpu_t;
static void
intel_gpgpu_sync(intel_gpgpu_t *gpgpu)
{
- if (gpgpu->batch->last_bo)
- drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
+ if (gpgpu->batch->last_bo)
+ drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
}
static void
@@ -225,7 +231,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
}
static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
@@ -243,7 +249,7 @@ intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
}
static void
-intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
@@ -256,7 +262,7 @@ intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
static const uint32_t gpgpu_l3_config_reg1[] = {
0x00080040, 0x02040040, 0x00800040, 0x01000038,
0x02000030, 0x01000038, 0x00000038, 0x00000040,
- 0x0A140091, 0x09100091, 0x08900091, 0x08900091
+ 0x0A140091, 0x09100091, 0x08900091, 0x08900091
};
static const uint32_t gpgpu_l3_config_reg2[] = {
@@ -369,6 +375,7 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
static void
intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
{
+ intel_batchbuffer_emit_mi_flush(gpgpu->batch);
intel_batchbuffer_flush(gpgpu->batch);
intel_gpgpu_check_binded_buf_address(gpgpu);
}
@@ -404,7 +411,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
/* surface state */
if(gpgpu->surface_heap_b.bo)
dri_bo_unreference(gpgpu->surface_heap_b.bo);
- bo = dri_bo_alloc(bufmgr,
+ bo = dri_bo_alloc(bufmgr,
"SURFACE_HEAP",
sizeof(surface_heap_t),
32);
@@ -416,7 +423,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
/* Interface descriptor remap table */
if(gpgpu->idrt_b.bo)
dri_bo_unreference(gpgpu->idrt_b.bo);
- bo = dri_bo_alloc(bufmgr,
+ bo = dri_bo_alloc(bufmgr,
"IDRT",
MAX_IF_DESC * sizeof(struct gen6_interface_descriptor),
32);
@@ -431,7 +438,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
/* sampler state */
if (gpgpu->sampler_state_b.bo)
dri_bo_unreference(gpgpu->sampler_state_b.bo);
- bo = dri_bo_alloc(gpgpu->drv->bufmgr,
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr,
"SAMPLER_STATE",
GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
32);
@@ -478,7 +485,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
ss1->ss2.width = ss0->ss2.width = 127; /* bits 6:0 of sz */
ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
ss0->ss3.depth = 1023; /* bits 30:21 of sz */
- ss1->ss3.depth = 510; /* bits 30:21 of sz */
+ ss1->ss3.depth = 1023; /* bits 30:21 of sz */
ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
heap->binding_table[0] = offsetof(surface_heap_t, surface);
heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
@@ -830,6 +837,83 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
ADVANCE_BATCH(gpgpu->batch);
}
+static intel_event_t*
+intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
+{
+ intel_event_t *event = NULL;
+ TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
+
+ event->status = command_queued;
+ event->batch = NULL;
+ event->buffer = gpgpu->batch->buffer;
+ if(event->buffer != NULL)
+ drm_intel_bo_reference(event->buffer);
+
+exit:
+ return event;
+error:
+ cl_free(event);
+ event = NULL;
+ goto exit;
+}
+
+static int
+intel_gpgpu_event_update_status(intel_event_t *event, int wait)
+{
+ if(event->status == command_complete)
+ return event->status;
+
+ if (event->buffer &&
+      event->batch == NULL && //already flushed
+ !drm_intel_bo_busy(event->buffer)) {
+ event->status = command_complete;
+ drm_intel_bo_unreference(event->buffer);
+ event->buffer = NULL;
+ return event->status;
+ }
+
+ if(wait == 0)
+ return event->status;
+
+ if (event->buffer) {
+ drm_intel_bo_wait_rendering(event->buffer);
+ event->status = command_complete;
+ drm_intel_bo_unreference(event->buffer);
+ event->buffer = NULL;
+ }
+ return event->status;
+}
+
+static void
+intel_gpgpu_event_pending(intel_gpgpu_t *gpgpu, intel_event_t *event)
+{
+  assert(event->buffer); //This is a GPU enqueue command
+  assert(event->batch == NULL); //This command has not been made pending yet.
+ event->batch = intel_batchbuffer_new(gpgpu->drv);
+ assert(event->batch);
+ *event->batch = *gpgpu->batch;
+ if(event->batch->buffer)
+ drm_intel_bo_reference(event->batch->buffer);
+}
+
+static void
+intel_gpgpu_event_resume(intel_event_t *event)
+{
+  assert(event->batch); //This command has been made pending.
+ intel_batchbuffer_flush(event->batch);
+ intel_batchbuffer_delete(event->batch);
+ event->batch = NULL;
+}
+
+static void
+intel_gpgpu_event_delete(intel_event_t *event)
+{
+ assert(event->batch == NULL); //This command must have been flushed.
+ if(event->buffer)
+ drm_intel_bo_unreference(event->buffer);
+ cl_free(event);
+}
+
LOCAL void
intel_set_gpgpu_callbacks(void)
{
@@ -851,5 +935,10 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
+ cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+ cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
+ cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
+ cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
+ cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
}
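
The five callbacks registered above give the runtime a batch-buffer-backed event lifecycle. A hedged lifecycle sketch follows (not from this commit; it assumes a valid cl_gpgpu handle and the callback signatures declared in cl_driver.h).

  #include "cl_driver.h"

  static void gpgpu_event_lifecycle_sketch(cl_gpgpu gpgpu)
  {
    cl_gpgpu_event ev = cl_gpgpu_event_new(gpgpu);  /* snapshot the current batch buffer   */
    cl_gpgpu_event_pending(gpgpu, ev);              /* copy the batch and defer submission */
    /* ... later, once every event in the wait list has completed ... */
    cl_gpgpu_event_resume(ev);                      /* flush the saved batch to the GPU    */
    cl_gpgpu_event_update_status(ev, 1);            /* wait=1 blocks until completion      */
    cl_gpgpu_event_delete(ev);                      /* drop the batch buffer reference     */
  }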
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b205c67..97b7519 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -74,6 +74,7 @@ set (utests_sources
compiler_uint16_copy.cpp
compiler_uint3_unaligned_copy.cpp
compiler_upsample_int.cpp
+ compiler_upsample_long.cpp
compiler_unstructured_branch0.cpp
compiler_unstructured_branch1.cpp
compiler_unstructured_branch2.cpp
@@ -84,9 +85,12 @@ set (utests_sources
compiler_switch.cpp
compiler_math.cpp
compiler_atomic_functions.cpp
+ compiler_async_copy.cpp
+ compiler_async_stride_copy.cpp
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
compiler_insn_selection_masked_min_max.cpp
+ compiler_load_bool_imm.cpp
compiler_global_memory_barrier.cpp
compiler_local_memory_two_ptr.cpp
compiler_local_memory_barrier.cpp
@@ -115,14 +119,22 @@ set (utests_sources
builtin_global_id.cpp
builtin_num_groups.cpp
builtin_local_id.cpp
+ builtin_acos_asin.cpp
runtime_createcontext.cpp
runtime_null_kernel_arg.cpp
+ runtime_event.cpp
compiler_double.cpp
compiler_double_2.cpp
compiler_double_3.cpp
compiler_double_4.cpp
compiler_long.cpp
compiler_long_2.cpp
+ compiler_long_convert.cpp
+ compiler_long_shl.cpp
+ compiler_long_shr.cpp
+ compiler_long_asr.cpp
+ compiler_long_mult.cpp
+ compiler_long_cmp.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
new file mode 100644
index 0000000..0187226
--- /dev/null
+++ b/utests/builtin_acos_asin.cpp
@@ -0,0 +1,87 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+ printf("\033[1m\033[40;31m");\
+ printf( __VA_ARGS__ );\
+ printf("\033[0m");\
+}
+
+const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+ const float x = *src;
+
+ dst[0] = acos(x);
+ dst[1] = acosh(x);
+ dst[2] = asin(x);
+ dst[3] = asinh(x);
+ dst[4] = x;
+}
+
+static void builtin_acos_asin(void)
+{
+ // Setup kernel and buffers
+ int k, i, index_cur;
+ float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+ OCL_CREATE_KERNEL("builtin_acos_asin");
+
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ globals[0] = count_input;
+ locals[0] = 1;
+
+ clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+ clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+ // Run the kernel
+ OCL_NDRANGE( 1 );
+
+ clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+ for (k = 0; (uint)k < count_input; k++)
+ {
+ cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+ for (i = 0; i < max_function; i++)
+ {
+ index_cur = k * max_function + i;
+#if udebug
+ if (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else if (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else if(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-3f){
+ printf_c("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+ else
+ printf("%d/%d: %f -> gpu:%f cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+ if (isinf(cpu_data[index_cur]))
+ OCL_ASSERT(isinf(gpu_data[index_cur]));
+ else if (isnan(cpu_data[index_cur]))
+ OCL_ASSERT(isnan(gpu_data[index_cur]));
+ else
+ {
+ OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+ }
+#endif
+ }
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
new file mode 100644
index 0000000..9384f85
--- /dev/null
+++ b/utests/compiler_async_copy.cpp
@@ -0,0 +1,39 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_copy(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 32;
+ const int copiesPerWorkItem = 5;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_async_copy");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(int)*2, NULL);
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; ++i)
+ ((int*)buf_data[1])[i] = rand();
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ int *dst = (int*)buf_data[0];
+ int *src = (int*)buf_data[1];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; i++)
+ OCL_ASSERT(dst[i] == src[i] + 3);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy);
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
new file mode 100644
index 0000000..132f917
--- /dev/null
+++ b/utests/compiler_async_stride_copy.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_stride_copy(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 128;
+ const int copiesPerWorkItem = 5;
+ const int stride =3;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_async_stride_copy");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL);
+ OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+ OCL_SET_ARG(4, sizeof(int), &stride);
+
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
+ ((char*)buf_data[1])[i] = rand() && 0xff;
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ char *dst = (char*)buf_data[0];
+ char *src = (char*)buf_data[1];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
+ OCL_ASSERT(dst[i + 0] == src[i + 0] + 3);
+ OCL_ASSERT(dst[i + 1] == src[i + 1] + 3);
+ OCL_ASSERT(dst[i + 2] == src[i + 2] + 3);
+ OCL_ASSERT(dst[i + 3] == src[i + 3] + 3);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy);
diff --git a/utests/compiler_load_bool_imm.cpp b/utests/compiler_load_bool_imm.cpp
new file mode 100644
index 0000000..d060daf
--- /dev/null
+++ b/utests/compiler_load_bool_imm.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_load_bool_imm(void)
+{
+ const size_t n = 1024;
+ const size_t local_size = 16;
+ const int copiesPerWorkItem = 5;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_load_bool_imm");
+ OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, local_size*copiesPerWorkItem*sizeof(int), NULL); // local_size * copiesPerWorkItem ints of local memory
+  OCL_SET_ARG(2, sizeof(int), &copiesPerWorkItem); // copies per work item
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = local_size;
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(0);
+
+ // Check results
+ int *dst = (int*)buf_data[0];
+ for (uint32_t i = 0; i < n * copiesPerWorkItem; i++)
+ OCL_ASSERT(dst[i] == copiesPerWorkItem);
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_load_bool_imm);
diff --git a/utests/compiler_long_asr.cpp b/utests/compiler_long_asr.cpp
new file mode 100644
index 0000000..0a70a23
--- /dev/null
+++ b/utests/compiler_long_asr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_asr(void)
+{
+ const size_t n = 64;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_asr");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = (int64_t)1 << 63;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ int64_t *dest = ((int64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == src[i] >> i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_asr);
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
new file mode 100644
index 0000000..3775556
--- /dev/null
+++ b/utests/compiler_long_cmp.cpp
@@ -0,0 +1,117 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_cmp(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+ src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+ src1[2] = -1ll, src2[2] = 0;
+ src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+ src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+ src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+ src1[6] = 0, src2[6] = -1ll;
+ src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+ for(size_t i=8; i<n; i++) {
+ src1[i] = i;
+ src2[i] = i;
+ }
+
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_l");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_le");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] <= src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_g");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] > src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_ge");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] >= src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_eq");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] == src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_neq");
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ int64_t *dest = (int64_t *)buf_data[2];
+ int64_t x = (src1[i] != src2[i]) ? 3 : 4;
+ OCL_ASSERT(x == dest[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_cmp);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
new file mode 100644
index 0000000..18e13ee
--- /dev/null
+++ b/utests/compiler_long_convert.cpp
@@ -0,0 +1,67 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_convert(void)
+{
+ const size_t n = 16;
+ char src1[n];
+ short src2[n];
+ int src3[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_convert");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[4], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[5], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = -i;
+ src2[i] = -i;
+ src3[i] = -i;
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ memcpy(buf_data[2], src3, sizeof(src3));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+ int64_t *dst1 = ((int64_t *)buf_data[3]);
+ int64_t *dst2 = ((int64_t *)buf_data[4]);
+ int64_t *dst3 = ((int64_t *)buf_data[5]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx %lx %lx\n", dst1[i], dst2[i], dst3[i]);
+ OCL_ASSERT(dst1[i] == -(int64_t)i);
+ OCL_ASSERT(dst2[i] == -(int64_t)i);
+ OCL_ASSERT(dst3[i] == -(int64_t)i);
+ }
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert);
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_mult");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = 0x77665544FFEEDDCCLL;
+ src2[i] = ((int64_t)rand() << 32) + rand();
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+ if (i < 3)
+ OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+ else
+ OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
diff --git a/utests/compiler_long_shl.cpp b/utests/compiler_long_shl.cpp
new file mode 100644
index 0000000..c8e4624
--- /dev/null
+++ b/utests/compiler_long_shl.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shl(void)
+{
+ const size_t n = 64;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_shl");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = 1;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ int64_t *dest = ((int64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == ((int64_t)1) << i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shl);
diff --git a/utests/compiler_long_shr.cpp b/utests/compiler_long_shr.cpp
new file mode 100644
index 0000000..e9fea6a
--- /dev/null
+++ b/utests/compiler_long_shr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shr(void)
+{
+ const size_t n = 64;
+ uint64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_shr");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ src[i] = (uint64_t)1 << 63;
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], src, sizeof(src));
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ uint64_t *dest = ((uint64_t *)buf_data[1]);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ if (i > 7)
+ OCL_ASSERT(dest[i] == src[i] >> i);
+ else
+ OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shr);
diff --git a/utests/compiler_smoothstep.cpp b/utests/compiler_smoothstep.cpp
index 760063b..363ea7e 100644
--- a/utests/compiler_smoothstep.cpp
+++ b/utests/compiler_smoothstep.cpp
@@ -34,7 +34,7 @@ void compiler_smoothstep(void)
OCL_MAP_BUFFER(2);
for (int i = 0; i < n; ++i) {
float a = 0.1f * (rand() & 15) - 0.75f;
- float b = a + 0.1f * (rand() & 15);
+ float b = a + 0.1f * (rand() & 15) + 0.1f;
float c = 0.1f * (rand() & 15) - 0.75f;
src1[i] = ((float*)buf_data[0])[i] = a;
src2[i] = ((float*)buf_data[1])[i] = b;
diff --git a/utests/compiler_upsample_long.cpp b/utests/compiler_upsample_long.cpp
new file mode 100644
index 0000000..b125ff4
--- /dev/null
+++ b/utests/compiler_upsample_long.cpp
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include "utest_helper.hpp"
+
+void compiler_upsample_long(void)
+{
+ const int n = 32;
+ int src1[n];
+ unsigned int src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_upsample_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(unsigned int), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ src1[i] = ((int*)buf_data[0])[i] = rand();
+ src2[i] = ((unsigned int*)buf_data[1])[i] = rand();
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ for (int i = 0; i < n; ++i)
+ OCL_ASSERT(((int64_t*)buf_data[2])[i] == (((int64_t)(src1[i]) << 32) | src2[i]));
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_long);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 7deb7cb..f8a3dcb 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -59,5 +59,5 @@ test_all_vector(int32_t, int)
test_all_vector(uint32_t, uint)
test_all_vector(float, float)
test_all_vector(double, double)
-//test_all_vector(int64_t, long)
-//test_all_vector(uint64_t, ulong)
+test_all_vector(int64_t, long)
+test_all_vector(uint64_t, ulong)
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
new file mode 100644
index 0000000..1ec8692
--- /dev/null
+++ b/utests/runtime_event.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE 32*1024
+void runtime_event(void)
+{
+ const size_t n = BUFFERSIZE;
+ cl_int cpu_src[BUFFERSIZE];
+ cl_event ev[3];
+ cl_int status = 0;
+ cl_int value = 34;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_event");
+ OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+
+ for(cl_uint i=0; i<BUFFERSIZE; i++)
+ cpu_src[i] = 3;
+
+ OCL_CREATE_USER_EVENT(ev[0]);
+
+ clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(int), &value);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 32;
+ clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status >= CL_SUBMITTED);
+ }
+
+ OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+ clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status == CL_COMPLETE);
+
+ OCL_FINISH();
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+ OCL_ASSERT(status <= CL_COMPLETE);
+ }
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+
+ for (uint32_t i = 0; i < n; ++i) {
+ OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+ clReleaseEvent(ev[i]);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_event);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 0365040..e7f43fc 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -88,7 +88,13 @@ extern EGLSurface eglSurface;
} while (0)
#define OCL_CREATE_BUFFER(BUFFER, FLAGS, SIZE, DATA) \
- OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+ OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+
+#define OCL_CREATE_USER_EVENT(EVENT) \
+ OCL_CALL2(clCreateUserEvent, EVENT, ctx)
+
+#define OCL_SET_USER_EVENT_STATUS(EVENT, STATUS) \
+ OCL_CALL(clSetUserEventStatus, EVENT, STATUS)
#define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git