[Pkg-opencl-devel] [beignet] 49/66: Imported Upstream version 0.2+git20130807+c4d1f40
Andreas Beckmann
anbe at moszumanska.debian.org
Fri Oct 31 07:27:07 UTC 2014
This is an automated email from the git hooks/post-receive script.
anbe pushed a commit to branch master
in repository beignet.
commit 56724b85bacbd1b966cfa5b9ed9392b444c49f72
Author: Simon Richter <sjr at debian.org>
Date: Wed Aug 7 16:14:06 2013 +0200
Imported Upstream version 0.2+git20130807+c4d1f40
---
backend/src/backend/context.cpp | 18 ++
backend/src/backend/context.hpp | 3 +
backend/src/backend/gen/gen_mesa_disasm.c | 44 ++-
backend/src/backend/gen_context.cpp | 299 +++++++++++++++++++-
backend/src/backend/gen_context.hpp | 21 +-
backend/src/backend/gen_defs.hpp | 28 ++
backend/src/backend/gen_encoder.cpp | 307 ++++++++++++---------
backend/src/backend/gen_encoder.hpp | 17 +-
.../src/backend/gen_insn_gen7_schedule_info.hxx | 6 +-
backend/src/backend/gen_insn_scheduling.cpp | 39 ++-
backend/src/backend/gen_insn_selection.cpp | 201 +++++++++++---
backend/src/backend/gen_insn_selection.hpp | 6 +
backend/src/backend/gen_insn_selection.hxx | 13 +-
backend/src/backend/gen_reg_allocation.cpp | 66 ++++-
backend/src/backend/gen_register.hpp | 76 +++++
backend/src/backend/program.cpp | 8 +
backend/src/backend/program.h | 4 +
backend/src/backend/program.hpp | 3 +
backend/src/ir/instruction.cpp | 2 +
backend/src/llvm/llvm_gen_backend.cpp | 2 -
backend/src/ocl_stdlib.tmpl.h | 241 +++++++++++-----
kernels/builtin_shuffle2.cl | 13 +
kernels/compiler_double_4.cl | 5 +
kernels/compiler_long.cl | 7 +
kernels/compiler_long_2.cl | 20 ++
kernels/compiler_vector_load_store.cl | 6 +-
src/cl_command_queue_gen7.c | 10 +-
src/cl_driver.h | 4 +
src/cl_driver_defs.c | 1 +
src/intel/intel_gpgpu.c | 52 +++-
utests/CMakeLists.txt | 4 +
utests/builtin_shuffle2.cpp | 45 +++
utests/compiler_double_4.cpp | 40 +++
utests/compiler_long.cpp | 58 ++++
utests/compiler_long_2.cpp | 51 ++++
utests/compiler_vector_load_store.cpp | 12 +-
36 files changed, 1432 insertions(+), 300 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 48160de..5484869 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -268,6 +268,15 @@ namespace gbe
}
}
+ static int
+ alignScratchSize(int size){
+ int i = 0;
+
+ for(; i < size; i+=1024)
+ ;
+
+ return i;
+ }
///////////////////////////////////////////////////////////////////////////
// Generic Context (shared by the simulator and the HW context)
///////////////////////////////////////////////////////////////////////////
@@ -284,6 +293,7 @@ namespace gbe
this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
else
this->simdWidth = fn.getSimdWidth();
+ this->scratchOffset = 0;
}
Context::~Context(void) {
@@ -306,6 +316,8 @@ namespace gbe
this->kernel = NULL;
}
if(this->kernel != NULL)
+ this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+ if(this->kernel != NULL)
this->kernel->ctx = this;
return this->kernel;
}
@@ -337,6 +349,12 @@ namespace gbe
return offset + GEN_REG_SIZE;
}
+ uint32_t Context::allocateScratchMem(uint32_t size) {
+ uint32_t offset = scratchOffset;
+ scratchOffset += size;
+ return offset;
+ }
+
void Context::buildStack(void) {
const auto &stackUse = dag->getUse(ir::ocl::stackptr);
if (stackUse.size() == 0) // no stack is used if stackptr is unused
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index c205388..50c0e70 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -91,6 +91,8 @@ namespace gbe
/* allocate a new entry for a specific image's information */
/*! Get (search or allocate if fail to find one) image info curbeOffset.*/
uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+ /*! allocate size scratch memory and return start address */
+ uint32_t allocateScratchMem(uint32_t size);
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
@@ -126,6 +128,7 @@ namespace gbe
set<ir::LabelIndex> usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
uint32_t simdWidth; //!< Number of lanes per HW threads
+ uint32_t scratchOffset; //!< scratch slot for next scratch memory request
GBE_CLASS(Context); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index f65cc30..bfb865a 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -83,6 +83,7 @@ static const struct {
[GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
@@ -372,6 +373,28 @@ static const char *data_port_data_cache_category[] = {
"scratch",
};
+static const char *data_port_scratch_block_size[] = {
+ "1 register",
+ "2 registers",
+ "Reserve",
+ "4 registers",
+};
+
+static const char *data_port_scratch_invalidate[] = {
+ "no invalidate",
+ "invalidate cache line",
+};
+
+static const char *data_port_scratch_channel_mode[] = {
+ "Oword",
+ "Dword",
+};
+
+static const char *data_port_scratch_msg_type[] = {
+ "Scratch Read",
+ "Scratch Write",
+};
+
static const char *data_port_data_cache_msg_type[] = {
[0] = "OWord Block Read",
[1] = "Unaligned OWord Block Read",
@@ -1154,12 +1177,21 @@ int gen_disasm (FILE *file, const void *opaque_insn)
inst->bits3.sampler_gen7.simd_mode);
break;
case GEN_SFID_DATAPORT_DATA_CACHE:
- format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
- inst->bits3.gen7_untyped_rw.bti,
- inst->bits3.gen7_untyped_rw.rgba,
- data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
- data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
- data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+ if(inst->bits3.gen7_untyped_rw.category == 0) {
+ format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ inst->bits3.gen7_untyped_rw.bti,
+ inst->bits3.gen7_untyped_rw.rgba,
+ data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+ data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+ data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+ } else {
+ format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+ inst->bits3.gen7_scratch_rw.offset,
+ data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
+ data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
+ data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
+ data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+ }
break;
case GEN_SFID_MESSAGE_GATEWAY:
format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index e33d8da..621e7be 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -118,7 +118,7 @@ namespace gbe
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
- p->SHR(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(10));
+ p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
p->curr.execWidth = this->simdWidth;
p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
p->curr.execWidth = 1;
@@ -146,6 +146,7 @@ namespace gbe
case SEL_OP_RNDU: p->RNDU(dst, src); break;
case SEL_OP_RNDE: p->RNDE(dst, src); break;
case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
+ case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
default: NOT_IMPLEMENTED;
}
}
@@ -158,9 +159,85 @@ namespace gbe
case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
+ case SEL_OP_SEL_INT64:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+ p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+ xdst = GenRegister::suboffset(xdst, 4);
+ xsrc0 = GenRegister::suboffset(xsrc0, 4);
+ xsrc1 = GenRegister::suboffset(xsrc1, 4);
+ }
+ p->pop();
+ }
+ break;
case SEL_OP_AND: p->AND(dst, src0, src1); break;
case SEL_OP_OR: p->OR (dst, src0, src1); break;
case SEL_OP_XOR: p->XOR(dst, src0, src1); break;
+ case SEL_OP_I64AND:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+ p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+ xdst = GenRegister::suboffset(xdst, 4),
+ xsrc0 = GenRegister::suboffset(xsrc0, 4),
+ xsrc1 = GenRegister::suboffset(xsrc1, 4);
+ }
+ p->pop();
+ }
+ break;
+ case SEL_OP_I64OR:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+ p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+ xdst = GenRegister::suboffset(xdst, 4),
+ xsrc0 = GenRegister::suboffset(xsrc0, 4),
+ xsrc1 = GenRegister::suboffset(xsrc1, 4);
+ }
+ p->pop();
+ }
+ break;
+ case SEL_OP_I64XOR:
+ {
+ GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+ xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+ xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+ p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+ xdst = GenRegister::suboffset(xdst, 4),
+ xsrc0 = GenRegister::suboffset(xsrc0, 4),
+ xsrc1 = GenRegister::suboffset(xsrc1, 4);
+ }
+ p->pop();
+ }
+ break;
case SEL_OP_SHR: p->SHR(dst, src0, src1); break;
case SEL_OP_SHL: p->SHL(dst, src0, src1); break;
case SEL_OP_RSR: p->RSR(dst, src0, src1); break;
@@ -175,12 +252,142 @@ namespace gbe
}
}
+ void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
+ int execWidth = p->curr.execWidth;
+ src = src.top_half();
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 8;
+ p->MOV(dest, src);
+ p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+ if (execWidth == 16) {
+ p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+ p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+ }
+ p->pop();
+ }
+
+ void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
+ int execWidth = p->curr.execWidth;
+ dest = dest.top_half();
+ p->push();
+ p->curr.execWidth = 8;
+ p->MOV(dest, src);
+ p->curr.nibControl = 1;
+ p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+ if (execWidth == 16) {
+ p->curr.quarterControl = 1;
+ p->curr.nibControl = 0;
+ p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+ p->curr.nibControl = 1;
+ p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+ }
+ p->pop();
+ }
+
+ void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
+ int execWidth = p->curr.execWidth;
+ src = src.bottom_half();
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 8;
+ p->MOV(dest, src);
+ p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+ if (execWidth == 16) {
+ p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+ p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+ }
+ p->pop();
+ }
+
+ void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
+ int execWidth = p->curr.execWidth;
+ dest = dest.bottom_half();
+ p->push();
+ p->curr.execWidth = 8;
+ p->MOV(dest, src);
+ p->curr.nibControl = 1;
+ p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+ if (execWidth == 16) {
+ p->curr.quarterControl = 1;
+ p->curr.nibControl = 0;
+ p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+ p->curr.nibControl = 1;
+ p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+ }
+ p->pop();
+ }
+
+ void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
+ int execWidth = p->curr.execWidth;
+ GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 8;
+ p->ADDC(dest, src0, src1);
+ p->MOV(src1, acc0);
+ if (execWidth == 16) {
+ p->ADDC(GenRegister::suboffset(dest, 8),
+ GenRegister::suboffset(src0, 8),
+ GenRegister::suboffset(src1, 8));
+ p->MOV(GenRegister::suboffset(src1, 8), acc0);
+ }
+ p->pop();
+ }
+
+ void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
+ int execWidth = p->curr.execWidth;
+ GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.execWidth = 8;
+ p->SUBB(dest, src0, src1);
+ p->MOV(src1, acc0);
+ if (execWidth == 16) {
+ p->SUBB(GenRegister::suboffset(dest, 8),
+ GenRegister::suboffset(src0, 8),
+ GenRegister::suboffset(src1, 8));
+ p->MOV(GenRegister::suboffset(src1, 8), acc0);
+ }
+ p->pop();
+ }
+
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
const GenRegister src2 = ra->genReg(insn.src(2));
switch (insn.opcode) {
+ case SEL_OP_I64ADD:
+ {
+ GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+ y = GenRegister::suboffset(x, p->curr.execWidth);
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ addWithCarry(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ p->ADD(x, x, y);
+ loadTopHalf(y, src1);
+ p->ADD(x, x, y);
+ storeTopHalf(dst, x);
+ }
+ break;
+ case SEL_OP_I64SUB:
+ {
+ GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+ y = GenRegister::suboffset(x, p->curr.execWidth);
+ loadBottomHalf(x, src0);
+ loadBottomHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeBottomHalf(dst, x);
+ loadTopHalf(x, src0);
+ subWithBorrow(x, x, y);
+ loadTopHalf(y, src1);
+ subWithBorrow(x, x, y);
+ storeTopHalf(dst, x);
+ }
+ break;
case SEL_OP_MUL_HI:
{
int w = p->curr.execWidth;
@@ -354,12 +561,52 @@ namespace gbe
p->pop();
}
- void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(0));
+ void GenContext::emitSpillRegInstruction(const SelectionInstruction &insn) {
+ uint32_t simdWidth = p->curr.execWidth;
+ uint32_t scratchOffset = insn.extra.scratchOffset;
+ const uint32_t header = insn.extra.scratchMsgHeader;
+ p->push();
+
+ const GenRegister msg = GenRegister::ud8grf(header, 0);
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.extra.function;
+ GenRegister payload = src;
+ payload.nr = header + 1;
+ payload.subnr = 0;
+
+ p->MOV(payload, src);
+ uint32_t regType = insn.src(0).type;
+ uint32_t size = typeSize(regType);
+ assert(size <= 4);
+ uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1;
+ this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ p->pop();
+ }
+
+ void GenContext::emitUnSpillRegInstruction(const SelectionInstruction &insn) {
+ uint32_t scratchOffset = insn.extra.scratchOffset;
+ const GenRegister dst = insn.dst(0);
+ uint32_t regType = dst.type;
+ uint32_t simdWidth = p->curr.execWidth;
+ const uint32_t header = insn.extra.scratchMsgHeader;
+ uint32_t size = typeSize(regType);
+ assert(size <= 4);
+ uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1;
+ const GenRegister msg = GenRegister::ud8grf(header, 0);
+ this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+ }
+
+ // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
+ // then follow the real destination registers.
+ // For SIMD16, we allocate elemNum temporary registers from dst(0).
+ void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
const uint32_t elemNum = insn.extra.elem;
- p->READ_FLOAT64(dst, src, bti, elemNum);
+ const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+ const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
+ const GenRegister tmp = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister tempAddr = ra->genReg(insn.src(1));
+ const uint32_t bti = insn.extra.function;
+ p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
}
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
@@ -370,11 +617,16 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
}
- void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+ // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
+ // then follow the real destination registers.
+ // For SIMD16, we allocate elemNum temporary registers from dst(0).
+ void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.extra.function;
const uint32_t elemNum = insn.extra.elem;
- p->WRITE_FLOAT64(src, bti, elemNum);
+ const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+ const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+ const uint32_t bti = insn.extra.function;
+ p->WRITE64(src, data, bti, elemNum);
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
@@ -421,6 +673,37 @@ namespace gbe
p->pop();
}
+ void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+ p->push();
+ uint32_t simdWidth = p->curr.execWidth;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+
+ p->curr.execWidth = 8;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+ p->pop();
+
+ int size = typeSize(reg_type)*simdWidth;
+ p->push();
+ p->SCRATCH_WRITE(header, offset/32, size, reg_num, channel_mode);
+ p->pop();
+ }
+
+ void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+ p->push();
+ uint32_t simdWidth = p->curr.execWidth;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 8;
+ p->MOV(header, GenRegister::ud8grf(0,0));
+ p->pop();
+
+ int size = typeSize(reg_type)*simdWidth;
+ p->push();
+ p->SCRATCH_READ(dst, header, offset/32, size, reg_num, channel_mode);
+ p->pop();
+ }
+
void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 5dfaef9..694ae98 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -27,6 +27,7 @@
#include "backend/context.hpp"
#include "backend/program.h"
+#include "backend/gen_register.hpp"
#include "ir/function.hpp"
#include "ir/liveness.hpp"
#include "sys/map.hpp"
@@ -40,6 +41,7 @@ namespace gbe
class Selection; // Performs instruction selection
class SelectionInstruction; // Pre-RA Gen instruction
class SelectionReg; // Pre-RA Gen register
+ class GenRegister;
/*! Context is the helper structure to build the Gen ISA or simulation code
* from GenIR
@@ -73,6 +75,16 @@ namespace gbe
INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
return this->liveness->getLiveOut(bb);
}
+
+ void loadTopHalf(GenRegister dest, GenRegister src);
+ void storeTopHalf(GenRegister dest, GenRegister src);
+
+ void loadBottomHalf(GenRegister dest, GenRegister src);
+ void storeBottomHalf(GenRegister dest, GenRegister src);
+
+ void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
+ void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+
/*! Final Gen ISA emission helper functions */
void emitLabelInstruction(const SelectionInstruction &insn);
void emitUnaryInstruction(const SelectionInstruction &insn);
@@ -87,8 +99,8 @@ namespace gbe
void emitBarrierInstruction(const SelectionInstruction &insn);
void emitFenceInstruction(const SelectionInstruction &insn);
void emitMathInstruction(const SelectionInstruction &insn);
- void emitReadFloat64Instruction(const SelectionInstruction &insn);
- void emitWriteFloat64Instruction(const SelectionInstruction &insn);
+ void emitRead64Instruction(const SelectionInstruction &insn);
+ void emitWrite64Instruction(const SelectionInstruction &insn);
void emitUntypedReadInstruction(const SelectionInstruction &insn);
void emitUntypedWriteInstruction(const SelectionInstruction &insn);
void emitAtomicInstruction(const SelectionInstruction &insn);
@@ -96,8 +108,11 @@ namespace gbe
void emitByteScatterInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
+ void emitSpillRegInstruction(const SelectionInstruction &insn);
+ void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
-
+ void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+ void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 5a9bb2d..e3959ff 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -157,6 +157,7 @@ enum opcode {
GEN_OPCODE_FBH = 75,
GEN_OPCODE_FBL = 76,
GEN_OPCODE_ADDC = 78,
+ GEN_OPCODE_SUBB = 79,
GEN_OPCODE_SAD2 = 80,
GEN_OPCODE_SADA2 = 81,
GEN_OPCODE_DP4 = 84,
@@ -242,6 +243,8 @@ enum GenMessageTarget {
#define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
#define GEN_TYPE_DF 6
#define GEN_TYPE_F 7
+#define GEN_TYPE_UL 8
+#define GEN_TYPE_L 9
#define GEN_ARF_NULL 0x00
#define GEN_ARF_ADDRESS 0x10
@@ -316,6 +319,15 @@ enum GenMessageTarget {
#define GEN_BYTE_SCATTER 12//1100: Byte Scattered Write
#define GEN_UNTYPED_WRITE 13//1101: Untyped Surface Write
+/* Data port data cache scratch messages*/
+#define GEN_SCRATCH_READ 0
+#define GEN_SCRATCH_WRITE 1
+#define GEN_SCRATCH_CHANNEL_MODE_OWORD 0
+#define GEN_SCRATCH_CHANNEL_MODE_DWORD 1
+#define GEN_SCRATCH_BLOCK_SIZE_1 0
+#define GEN_SCRATCH_BLOCK_SIZE_2 1
+#define GEN_SCRATCH_BLOCK_SIZE_4 3
+
/* Data port render cache Message Type*/
#define GEN_MBLOCK_READ 4 //0100: Media Block Read
#define GEN_TYPED_READ 5 //0101: Typed Surface Read
@@ -762,6 +774,22 @@ struct GenInstruction
uint32_t end_of_thread:1;
} gen7_byte_rw;
+ /*! Data port Scratch Read/ write */
+ struct {
+ uint32_t offset:12;
+ uint32_t block_size:2;
+ uint32_t ignored0:1;
+ uint32_t invalidate_after_read:1;
+ uint32_t channel_mode:1;
+ uint32_t msg_type:1;
+ uint32_t category:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_scratch_rw;
+
/*! Data port OBlock read / write */
struct {
uint32_t bti:8;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index f84c6dd..4d6aa34 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -356,103 +356,69 @@ namespace gbe
0
};
- static int dst_type(int exec_width) {
- if (exec_width == 8)
- return GEN_TYPE_UD;
- if (exec_width == 16)
- return GEN_TYPE_UW;
- NOT_IMPLEMENTED;
- return 0;
- }
-
- void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
- int w = curr.execWidth;
- dst = GenRegister::h2(dst);
- dst.type = GEN_TYPE_UD;
- src.type = GEN_TYPE_UD;
- GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
- GenRegister imm4 = GenRegister::immud(4);
- GenInstruction *insn;
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::uw16grf(r.nr, 0));
- setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
- push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(dst, r);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
- pop();
- ADD(src, src, imm4);
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::uw16grf(r.nr, 0));
- setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+ void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
+ src = GenRegister::retype(src, GEN_TYPE_UD);
+ addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
+ uint32_t originSimdWidth = curr.execWidth;
+ uint32_t originPredicate = curr.predicate;
+ uint32_t originMask = curr.noMask;
push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(GenRegister::suboffset(dst, 1), r);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+ for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
+ channels < originSimdWidth; channels += 8, currQuarter++) {
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ curr.execWidth = 8;
+ /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
+ which is what we want here. */
+ MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
+ ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
+ MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
+ ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
+ // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
+ curr.execWidth = 16;
+ this->UNTYPED_READ(tmp, addr, bti, elemNum);
+ if (originSimdWidth == 16)
+ curr.quarterControl = currQuarter;
+ curr.predicate = originPredicate;
+ curr.noMask = originMask;
+ // Back to simd8 for correct predication flag.
+ curr.execWidth = 8;
+ MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
+ }
pop();
}
- void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
- int w = curr.execWidth;
- GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
- r.type = GEN_TYPE_UD;
- GenRegister hdr = GenRegister::h2(r);
- GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
- src.hstride = GEN_HORIZONTAL_STRIDE_2;
- GenRegister data = GenRegister::offset(r, w / 8);
- GenRegister imm4 = GenRegister::immud(4);
- MOV(r, GenRegister::ud8grf(msg.nr, 0));
- push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(data, src);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
- pop();
- GenInstruction *insn;
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
- setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
-
- ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
+ void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+ GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+ msg = GenRegister::retype(msg, GEN_TYPE_UD);
+ int originSimdWidth = curr.execWidth;
+ int originPredicate = curr.predicate;
+ int originMask = curr.noMask;
push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(data, GenRegister::suboffset(src, 1));
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+ for (uint32_t half = 0; half < 2; half++) {
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ curr.execWidth = 8;
+ MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+ if (originSimdWidth == 16) {
+ MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+ curr.execWidth = 16;
+ }
+ if (half == 1)
+ ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
+ curr.predicate = originPredicate;
+ curr.noMask = originMask;
+ this->UNTYPED_WRITE(msg, bti, elemNum);
+ }
+ /* Let's restore the original message(addr) register. */
+ /* XXX could be optimized if we don't allocate the address to the header
+ position of the message. */
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4));
pop();
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
- setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
}
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
@@ -470,7 +436,7 @@ namespace gbe
NOT_IMPLEMENTED;
this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(this,
@@ -601,24 +567,64 @@ namespace gbe
return &this->store.back();
}
- INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
- if (dst.isdf() && src.isdf()) {
+ INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 = GenRegister::null()) {
int w = p->curr.execWidth;
p->push();
- p->curr.quarterControl = 0;
p->curr.nibControl = 0;
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
- p->setSrc0(insn, src);
+ p->setSrc0(insn, src0);
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, src1);
if (w == 8)
p->curr.nibControl = 1; // second 1/8 mask
- else // w == 16
- p->curr.quarterControl = 1; // second 1/4 mask
insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, GenRegister::suboffset(dst, w / 2));
- p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+ p->pop();
+ }
+
+ // Double register accessing is a little special;
+ // per the Gen spec, the only supported mode is SIMD8, and it only
+ // handles four doubles each time.
+ // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8
+ // to two SIMD1x4.
+ INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 = GenRegister::null()) {
+ if (p->curr.execWidth == 8)
+ _handleDouble(p, opcode, dst, src0, src1);
+ else if (p->curr.execWidth == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ _handleDouble(p, opcode, dst, src0, src1);
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ if (!GenRegister::isNull(src1))
+ src1 = GenRegister::offset(src1, 2);
+ _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+ p->pop();
+ }
+ }
+
+ INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
+ if (dst.isdf() && src.isdf()) {
+ handleDouble(p, opcode, dst, src);
+ } else if (dst.isint64() && src.isint64()) { // handle int64
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for (int nib = 0; nib < execWidth / 4; nib ++) {
+ p->curr.chooseNib(nib);
+ p->MOV(dst.bottom_half(), src.bottom_half());
+ p->MOV(dst.top_half(), src.top_half());
+ dst = GenRegister::suboffset(dst, 4);
+ src = GenRegister::suboffset(src, 4);
+ }
p->pop();
} else if (needToSplitAlu1(p, dst, src) == false) {
GenInstruction *insn = p->next(opcode);
@@ -653,25 +659,7 @@ namespace gbe
GenRegister src1)
{
if (dst.isdf() && src0.isdf() && src1.isdf()) {
- int w = p->curr.execWidth;
- p->push();
- p->curr.quarterControl = 0;
- p->curr.nibControl = 0;
- GenInstruction *insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, dst);
- p->setSrc0(insn, src0);
- p->setSrc1(insn, src1);
- if (w == 8)
- p->curr.nibControl = 1; // second 1/8 mask
- else // w == 16
- p->curr.quarterControl = 1; // second 1/4 mask
- insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, GenRegister::suboffset(dst, w / 2));
- p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
- p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
- p->pop();
+ handleDouble(p, opcode, dst, src0, src1);
} else if (needToSplitAlu2(p, dst, src0, src1) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
@@ -808,7 +796,16 @@ namespace gbe
r.width = GEN_WIDTH_1;
r.hstride = GEN_HORIZONTAL_STRIDE_0;
push();
+ uint32_t width = curr.execWidth;
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.quarterControl = GEN_COMPRESSION_Q1;
MOV(dest, r);
+ if (width == 16) {
+ curr.quarterControl = GEN_COMPRESSION_Q2;
+ MOV(GenRegister::offset(dest, 2), r);
+ }
pop();
}
@@ -836,17 +833,25 @@ namespace gbe
MOV(dest, src0);
}
+ void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
+ GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
+ int execWidth = curr.execWidth;
+ push();
+ curr.execWidth = 8;
+ for(int nib = 0; nib < execWidth/4; nib ++) {
+ curr.chooseNib(nib);
+ MOV(dest.top_half(), u1);
+ MOV(dest.bottom_half(), u0);
+ dest = GenRegister::suboffset(dest, 4);
+ }
+ pop();
+ }
+
void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
int w = curr.execWidth;
if (src0.isdf()) {
- push();
- curr.execWidth = 16;
- MOV(dest, src0);
- if (w == 16) {
- curr.quarterControl = 1;
- MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
- }
- pop();
+ GBE_ASSERT(0); // MOV DF is called from convert instruction,
+ // We should never convert a df to a df.
} else {
GenRegister r0 = GenRegister::h2(r);
push();
@@ -903,6 +908,13 @@ namespace gbe
ALU2(MACH)
ALU3(MAD)
+ void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
+ push();
+ curr.accWrEnable = 1;
+ alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
+ pop();
+ }
+
void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
push();
curr.accWrEnable = 1;
@@ -1150,6 +1162,49 @@ namespace gbe
this->setSrc0(insn, msg);
setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
}
+ static void setScratchMessage(GenEncoder *p,
+ GenInstruction *insn,
+ uint32_t offset,
+ uint32_t block_size,
+ uint32_t channel_mode,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+ setMessageDescriptor(p, insn, sfid, msg_length, response_length, true);
+ insn->bits3.gen7_scratch_rw.block_size = block_size;
+ insn->bits3.gen7_scratch_rw.msg_type = msg_type;
+ insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
+ insn->bits3.gen7_scratch_rw.offset = offset;
+ insn->bits3.gen7_scratch_rw.category = 1;
+ }
+
+ void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)
+ {
+ assert(src_num == 1 || src_num ==2);
+ uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+ GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
+ // here src_num means the number of registers that will be written out, in terms of 32-byte registers
+ setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
+ }
+
+ void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)
+ {
+ assert(dst_num == 1 || dst_num ==2);
+ uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+ GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0));
+ // here dst_num is the number of registers that will be written back, in terms of 32-byte registers
+ setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
+ }
void GenEncoder::EOT(uint32_t msg) {
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d3a7165..bbf240c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -92,6 +92,7 @@ namespace gbe
ALU1(MOV)
ALU1(FBH)
ALU1(FBL)
+ ALU2(SUBB)
ALU2(UPSAMPLE_SHORT)
ALU2(UPSAMPLE_INT)
ALU1(RNDZ)
@@ -118,11 +119,13 @@ namespace gbe
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
- ALU2(MOV_DF);
+ //ALU2(MOV_DF);
#undef ALU1
#undef ALU2
#undef ALU3
+ void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+ void LOAD_INT64_IMM(GenRegister dest, int64_t value);
/*! Barrier message (to synchronize threads of a workgroup) */
void BARRIER(GenRegister src);
/*! Memory fence message (to order loads and stores between threads) */
@@ -141,10 +144,10 @@ namespace gbe
void WAIT(void);
/*! Atomic instructions */
void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
- /*! Read 64-bits float arrays */
- void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
- /*! Write 64-bits float arrays */
- void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Read 64-bits float/int arrays */
+ void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Write 64-bits float/int arrays */
+ void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum);
/*! Untyped read (upto 4 channels) */
void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
@@ -153,6 +156,10 @@ namespace gbe
void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+ /*! for scratch memory read */
+ void SCRATCH_READ(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
+ /*! for scratch memory write */
+ void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
/*! Send instruction for the sampler */
void SAMPLE(GenRegister dest,
GenRegister msg,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index f3f4a25..da8f2a2 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,13 +12,15 @@ DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
DECL_GEN7_SCHEDULE(Math, 20, 4, 2)
DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
-DECL_GEN7_SCHEDULE(ReadFloat64, 80, 1, 1)
-DECL_GEN7_SCHEDULE(WriteFloat64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Read64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(Write64, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteScatter, 80, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 80, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
+DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1)
+DECL_GEN7_SCHEDULE(UnSpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index cb990be..0b720b7 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -283,19 +283,24 @@ namespace gbe
uint32_t DependencyTracker::getIndex(GenRegister reg) const {
// Non GRF physical register
if (reg.physical) {
- GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
- const uint32_t file = reg.nr & 0xf0;
- const uint32_t nr = reg.nr & 0x0f;
- if (file == GEN_ARF_FLAG) {
- const uint32_t subnr = reg.subnr / sizeof(uint16_t);
- GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
- return grfNum + 2*nr + subnr;
- } else if (file == GEN_ARF_ACCUMULATOR) {
- GBE_ASSERT(nr < MAX_ACC_REGISTER);
- return grfNum + MAX_FLAG_REGISTER + nr;
+ //GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
+ if(reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+ const uint32_t file = reg.nr & 0xf0;
+ const uint32_t nr = reg.nr & 0x0f;
+ if (file == GEN_ARF_FLAG) {
+ const uint32_t subnr = reg.subnr / sizeof(uint16_t);
+ GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
+ return grfNum + 2*nr + subnr;
+ } else if (file == GEN_ARF_ACCUMULATOR) {
+ GBE_ASSERT(nr < MAX_ACC_REGISTER);
+ return grfNum + MAX_FLAG_REGISTER + nr;
+ } else {
+ NOT_SUPPORTED;
+ return 0;
+ }
} else {
- NOT_SUPPORTED;
- return 0;
+ const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+ return simdWidth == 8 ? reg.nr : reg.nr / 2;
}
}
// We directly manipulate physical GRFs here
@@ -344,6 +349,10 @@ namespace gbe
this->nodes[index] = node;
}
+ if(insn.opcode == SEL_OP_SPILL_REG) {
+ const uint32_t index = this->getIndex(0xff);
+ this->nodes[index] = node;
+ }
// Consider barriers and wait write to memory
if (insn.opcode == SEL_OP_BARRIER ||
insn.opcode == SEL_OP_FENCE ||
@@ -424,6 +433,11 @@ namespace gbe
const uint32_t index = tracker.getIndex(insn.extra.function);
tracker.addDependency(node, index);
}
+ //read-after-write of scratch memory
+ if (insn.opcode == SEL_OP_UNSPILL_REG) {
+ const uint32_t index = tracker.getIndex(0xff);
+ tracker.addDependency(node, index);
+ }
// Consider barriers and wait are reading memory (local and global)
if (insn.opcode == SEL_OP_BARRIER ||
@@ -453,6 +467,7 @@ namespace gbe
tracker.addDependency(node, index);
}
+
// Consider barriers and wait are writing memory (local and global)
if (insn.opcode == SEL_OP_BARRIER ||
insn.opcode == SEL_OP_FENCE ||
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d4be8bf..1a3af68 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -128,6 +128,8 @@ namespace gbe
case TYPE_U16: return GEN_TYPE_UW;
case TYPE_S32: return GEN_TYPE_D;
case TYPE_U32: return GEN_TYPE_UD;
+ case TYPE_S64: return GEN_TYPE_L;
+ case TYPE_U64: return GEN_TYPE_UL;
case TYPE_FLOAT: return GEN_TYPE_F;
case TYPE_DOUBLE: return GEN_TYPE_DF;
default: NOT_SUPPORTED; return GEN_TYPE_F;
@@ -167,14 +169,14 @@ namespace gbe
bool SelectionInstruction::isRead(void) const {
return this->opcode == SEL_OP_UNTYPED_READ ||
- this->opcode == SEL_OP_READ_FLOAT64 ||
+ this->opcode == SEL_OP_READ64 ||
this->opcode == SEL_OP_ATOMIC ||
this->opcode == SEL_OP_BYTE_GATHER;
}
bool SelectionInstruction::isWrite(void) const {
return this->opcode == SEL_OP_UNTYPED_WRITE ||
- this->opcode == SEL_OP_WRITE_FLOAT64 ||
+ this->opcode == SEL_OP_WRITE64 ||
this->opcode == SEL_OP_ATOMIC ||
this->opcode == SEL_OP_BYTE_SCATTER;
}
@@ -313,6 +315,8 @@ namespace gbe
INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
/*! Implement public class */
INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+ /*! spill a register (insert spill/unspill instructions) */
+ INLINE void spillReg(ir::Register reg, uint32_t registerPool);
/*! Implement public class */
INLINE uint32_t getRegNum(void) const { return file.regNum(); }
/*! Implements public interface */
@@ -413,19 +417,26 @@ namespace gbe
ALU1(MOV)
ALU2(MOV_DF)
ALU2(LOAD_DF_IMM)
+ ALU1(LOAD_INT64_IMM)
ALU1(RNDZ)
ALU1(RNDE)
ALU2(SEL)
+ ALU2(SEL_INT64)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
+ ALU2(I64AND)
+ ALU2(I64OR)
+ ALU2(I64XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU2(ADD)
+ ALU3(I64ADD)
+ ALU3(I64SUB)
ALU2(MUL)
ALU1(FRC)
ALU1(RNDD)
@@ -465,10 +476,10 @@ namespace gbe
void WAIT(void);
/*! Atomic instruction */
void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
- /*! Read 64 bits float array */
- void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
- /*! Write 64 bits float array */
- void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+ /*! Read 64 bits float/int array */
+ void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+ /*! Write 64 bits float/int array */
+ void WRITE64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
@@ -609,6 +620,57 @@ namespace gbe
return vector;
}
+ void Selection::Opaque::spillReg(ir::Register spilledReg, uint32_t registerPool) {
+ assert(registerPool != 0);
+ const uint32_t simdWidth = ctx.getSimdWidth();
+ const uint32_t dstStart = registerPool + 1;
+ const uint32_t srcStart = registerPool + 1;
+ uint32_t ptr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)*simdWidth);
+
+ for (auto &block : blockList)
+ for (auto &insn : block.insnList) {
+ const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+
+ for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+ const GenRegister selReg = insn.src(srcID);
+ const ir::Register reg = selReg.reg();
+ if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+ GBE_ASSERT(srcID < 5);
+ SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
+ unspill->state = GenInstructionState(simdWidth);
+ unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, srcStart+srcID, 0,
+ selReg.type, selReg.vstride, selReg.width, selReg.hstride);
+ GenRegister src = insn.src(srcID);
+ // change nr/subnr, keep other register settings
+ src.nr = srcStart+srcID; src.subnr=0; src.physical=1;
+ insn.src(srcID) = src;
+ unspill->extra.scratchOffset = ptr;
+ unspill->extra.scratchMsgHeader = registerPool;
+ insn.prepend(*unspill);
+ }
+ }
+
+ for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+ const GenRegister selReg = insn.dst(dstID);
+ const ir::Register reg = selReg.reg();
+ if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+ GBE_ASSERT(dstID < 5);
+ SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
+ spill->state = GenInstructionState(simdWidth);
+ spill->src(0) =GenRegister(GEN_GENERAL_REGISTER_FILE, dstStart + dstID, 0,
+ selReg.type, selReg.vstride, selReg.width, selReg.hstride);
+ GenRegister dst = insn.dst(dstID);
+ // change nr/subnr, keep other register settings
+ dst.physical =1; dst.nr = dstStart+dstID; dst.subnr = 0;
+ insn.dst(dstID)= dst;
+ spill->extra.scratchOffset = ptr;
+ spill->extra.scratchMsgHeader = registerPool;
+ insn.append(*spill);
+ }
+ }
+ }
+ }
+
ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
SelectionBlock *block = insn->parent;
const uint32_t simdWidth = ctx.getSimdWidth();
@@ -760,12 +822,16 @@ namespace gbe
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
- void Selection::Opaque::READ_FLOAT64(Reg addr,
+ /* elemNum counts both the temporary registers and the
+ real destination registers. */
+ void Selection::Opaque::READ64(Reg addr,
+ Reg tempAddr,
const GenRegister *dst,
uint32_t elemNum,
+ uint32_t valueNum,
uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
@@ -773,11 +839,12 @@ namespace gbe
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
insn->src(0) = addr;
+ insn->src(1) = tempAddr;
insn->extra.function = bti;
- insn->extra.elem = elemNum;
+ insn->extra.elem = valueNum;
- // Sends require contiguous allocation
- dstVector->regNum = elemNum;
+ // Only the temporary registers need contiguous allocation
+ dstVector->regNum = elemNum - valueNum;
dstVector->isSrc = 0;
dstVector->reg = &insn->dst(0);
@@ -807,19 +874,21 @@ namespace gbe
dstVector->regNum = elemNum;
dstVector->isSrc = 0;
dstVector->reg = &insn->dst(0);
-
// Source cannot be scalar (yet)
srcVector->regNum = 1;
srcVector->isSrc = 1;
srcVector->reg = &insn->src(0);
}
- void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+ /* elemNum counts both the temporary registers and the
+ real data registers. */
+ void Selection::Opaque::WRITE64(Reg addr,
const GenRegister *src,
uint32_t elemNum,
+ uint32_t valueNum,
uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, elemNum+1);
SelectionVector *vector = this->appendVector();
// Regular instruction to encode
@@ -827,10 +896,10 @@ namespace gbe
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->src(elemID+1) = src[elemID];
insn->extra.function = bti;
- insn->extra.elem = elemNum;
+ insn->extra.elem = valueNum;
- // Sends require contiguous allocation for the sources
- vector->regNum = elemNum+1;
+ // Only the addr + temporary registers need to be contiguous.
+ vector->regNum = (elemNum - valueNum) + 1;
vector->reg = &insn->src(0);
vector->isSrc = 1;
}
@@ -1172,6 +1241,9 @@ namespace gbe
ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID) {
return this->opaque->replaceDst(insn, regID);
}
+ void Selection::spillReg(ir::Register reg, uint32_t registerPool) {
+ this->opaque->spillReg(reg, registerPool);
+ }
SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
return this->opaque->create(opcode, dstNum, srcNum);
@@ -1185,7 +1257,7 @@ namespace gbe
using namespace ir;
const auto &childInsn = cast<LoadImmInstruction>(insn);
const auto &imm = childInsn.getImmediate();
- if(imm.type != TYPE_DOUBLE)
+ if(imm.type != TYPE_DOUBLE && imm.type != TYPE_S64 && imm.type != TYPE_U64)
return true;
return false;
}
@@ -1408,17 +1480,44 @@ namespace gbe
// Output the binary instruction
switch (opcode) {
- case OP_ADD: sel.ADD(dst, src0, src1); break;
+ case OP_ADD:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+ sel.I64ADD(dst, src0, src1, t);
+ } else
+ sel.ADD(dst, src0, src1);
+ break;
case OP_ADDSAT:
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
sel.ADD(dst, src0, src1);
sel.pop();
break;
- case OP_XOR: sel.XOR(dst, src0, src1); break;
- case OP_OR: sel.OR(dst, src0, src1); break;
- case OP_AND: sel.AND(dst, src0, src1); break;
- case OP_SUB: sel.ADD(dst, src0, GenRegister::negate(src1)); break;
+ case OP_XOR:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64XOR(dst, src0, src1);
+ else
+ sel.XOR(dst, src0, src1);
+ break;
+ case OP_OR:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64OR(dst, src0, src1);
+ else
+ sel.OR(dst, src0, src1);
+ break;
+ case OP_AND:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+ sel.I64AND(dst, src0, src1);
+ else
+ sel.AND(dst, src0, src1);
+ break;
+ case OP_SUB:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+ sel.I64SUB(dst, src0, src1, t);
+ } else
+ sel.ADD(dst, src0, GenRegister::negate(src1));
+ break;
case OP_SUBSAT:
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
@@ -1782,6 +1881,8 @@ namespace gbe
case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+ case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.s64)); break;
+ case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.u64)); break;
default: NOT_SUPPORTED;
}
sel.pop();
@@ -1832,6 +1933,8 @@ namespace gbe
using namespace ir;
switch (type) {
case TYPE_DOUBLE:
+ case TYPE_S64:
+ case TYPE_U64:
return GEN_BYTE_SCATTER_QWORD;
case TYPE_FLOAT:
case TYPE_U32:
@@ -1864,20 +1967,25 @@ namespace gbe
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
- void emitReadFloat64(Selection::Opaque &sel,
+ void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- vector<GenRegister> dst(valueNum);
- for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
- dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- if (sel.ctx.getSimdWidth() == 16)
- dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+ uint32_t dstID;
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
+
+ // The first 16 DWORD register space is for temporary usage at encode stage.
+ uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+ GenRegister dst[valueNum + tmpRegNum];
+ for (dstID = 0; dstID < tmpRegNum ; ++dstID)
+ dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
+ for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
+ dst[dstID] = sel.selReg(insn.getValue(valueID));
+ sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti);
}
void emitByteGather(Selection::Opaque &sel,
@@ -1932,7 +2040,7 @@ namespace gbe
if (insn.getAddressSpace() == MEM_CONSTANT)
this->emitIndirectMove(sel, insn, address);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
+ this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
@@ -1963,7 +2071,7 @@ namespace gbe
sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
}
- void emitWriteFloat64(Selection::Opaque &sel,
+ void emitWrite64(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
uint32_t bti) const
{
@@ -1971,15 +2079,19 @@ namespace gbe
const uint32_t valueNum = insn.getValueNum();
const uint32_t addrID = ir::StoreInstruction::addressIndex;
GenRegister addr;
- vector<GenRegister> value(valueNum);
-
+ uint32_t srcID;
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
- value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- if (sel.ctx.getSimdWidth() == 16)
- value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+ // The first 16 DWORD register space is for temporary usage at encode stage.
+ uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+ GenRegister src[valueNum + tmpRegNum];
+ for (srcID = 0; srcID < tmpRegNum; ++srcID)
+ src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+
+ for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
+ src[srcID] = sel.selReg(insn.getValue(valueID));
+ sel.WRITE64(addr, src, valueNum + tmpRegNum, valueNum, bti);
}
void emitByteScatter(Selection::Opaque &sel,
@@ -2012,7 +2124,7 @@ namespace gbe
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitWriteFloat64(sel, insn, bti);
+ this->emitWrite64(sel, insn, bti);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, bti);
else {
@@ -2191,7 +2303,10 @@ namespace gbe
sel.curr.physicalFlag = 0;
sel.curr.flagIndex = uint16_t(pred);
sel.curr.noMask = 0;
- sel.SEL(tmp, src0, src1);
+ if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
+ sel.SEL_INT64(tmp, src0, src1);
+ else
+ sel.SEL(tmp, src0, src1);
sel.pop();
// Update the destination register properly now
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 5ae6e42..79b73e2 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -107,6 +107,10 @@ namespace gbe
/*! offset (0 to 7) */
uint16_t offset:5;
};
+ struct {
+ uint16_t scratchOffset;
+ uint16_t scratchMsgHeader;
+ };
} extra;
/*! Gen opcode */
uint8_t opcode;
@@ -197,6 +201,8 @@ namespace gbe
ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
/*! Replace a destination to the returned temporary register */
ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+ /*! spill a register (insert spill/unspill instructions) */
+ void spillReg(ir::Register reg, uint32_t registerPool);
/*! Create a new selection instruction */
SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
/*! List of emitted blocks */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 33c3937..eeca9af 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -2,6 +2,7 @@ DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -10,15 +11,21 @@ DECL_SELECTION_IR(RNDD, UnaryInstruction)
DECL_SELECTION_IR(RNDU, UnaryInstruction)
DECL_SELECTION_IR(FRC, UnaryInstruction)
DECL_SELECTION_IR(SEL, BinaryInstruction)
+DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
DECL_SELECTION_IR(AND, BinaryInstruction)
DECL_SELECTION_IR(OR, BinaryInstruction)
DECL_SELECTION_IR(XOR, BinaryInstruction)
+DECL_SELECTION_IR(I64AND, BinaryInstruction)
+DECL_SELECTION_IR(I64OR, BinaryInstruction)
+DECL_SELECTION_IR(I64XOR, BinaryInstruction)
DECL_SELECTION_IR(SHR, BinaryInstruction)
DECL_SELECTION_IR(SHL, BinaryInstruction)
DECL_SELECTION_IR(RSR, BinaryInstruction)
DECL_SELECTION_IR(RSL, BinaryInstruction)
DECL_SELECTION_IR(ASR, BinaryInstruction)
DECL_SELECTION_IR(ADD, BinaryInstruction)
+DECL_SELECTION_IR(I64ADD, TernaryInstruction)
+DECL_SELECTION_IR(I64SUB, TernaryInstruction)
DECL_SELECTION_IR(MUL, BinaryInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
@@ -35,13 +42,15 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
DECL_SELECTION_IR(FENCE, FenceInstruction)
DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
-DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
-DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
+DECL_SELECTION_IR(READ64, Read64Instruction)
+DECL_SELECTION_IR(WRITE64, Write64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
+DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
DECL_SELECTION_IR(MUL_HI, TernaryInstruction)
DECL_SELECTION_IR(FBH, UnaryInstruction)
DECL_SELECTION_IR(FBL, UnaryInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index e7c96ac..ccbc0da 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -31,6 +31,8 @@
#include <algorithm>
#include <climits>
+#define RESERVED_REG_NUM_FOR_SPILL 6
+
namespace gbe
{
/////////////////////////////////////////////////////////////////////////////
@@ -94,6 +96,10 @@ namespace gbe
vector<GenRegInterval*> starting;
/*! Intervals sorting based on ending point positions */
vector<GenRegInterval*> ending;
+ /*! registers that are spilled */
+ set<ir::Register> spilled;
+ /* reserved registers for register spill/reload */
+ uint32_t reservedReg;
/*! Current vector to expire */
uint32_t expiringID;
/*! Use custom allocator */
@@ -259,6 +265,11 @@ namespace gbe
continue;
}
+ // ignore registers that have already been spilled
+ if(spilled.contains(reg)) {
+ this->expiringID++;
+ continue;
+ }
// Ignore booleans that were allocated with flags
// if (ctx.getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) {
@@ -473,33 +484,67 @@ namespace gbe
auto it = vectorMap.find(reg);
if (it != vectorMap.end()) {
const SelectionVector *vector = it->second.first;
+ // all the reg in the SelectionVector are spilled
+ if(spilled.contains(vector->reg[0].reg()))
+ continue;
const uint32_t simdWidth = ctx.getSimdWidth();
- const uint32_t alignment = simdWidth * sizeof(uint32_t);
+
+ const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
+ const ir::RegisterFamily family = regData.family;
+ const uint32_t typeSize = familyVectorSize[family];
+ const uint32_t alignment = simdWidth*typeSize;
+
const uint32_t size = vector->regNum * alignment;
+
uint32_t grfOffset;
while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
const bool success = this->expireGRF(interval);
- if (success == false) return false;
+ if (success == false) {
+ // If there is no spill support, just return false; otherwise simply spill the register
+ if(reservedReg == 0) return false;
+ break;
+ }
+ }
+ if(grfOffset == 0) {
+ // Spill all the registers in the SelectionVector.
+ // The tricky part here is that reservedReg+1 must be used as the scratch
+ // write payload, so the first register has to be written to scratch memory
+ // first. spillReg() just appends a scratch write insn after the def, so to
+ // spill the first register, spillReg() must be called last for vector->reg[0]
+ GBE_ASSERT(vector->regNum < RESERVED_REG_NUM_FOR_SPILL);
+ for(int i = vector->regNum-1; i >= 0; i--) {
+ spilled.insert(vector->reg[i].reg());
+ selection.spillReg(vector->reg[i].reg(), reservedReg);
+ }
+ continue;
}
for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
const ir::Register reg = vector->reg[regID].reg();
- GBE_ASSERT(RA.contains(reg) == false);
+ GBE_ASSERT(RA.contains(reg) == false
+ && ctx.sel->getRegisterData(reg).family == family);
RA.insert(std::make_pair(reg, grfOffset));
}
}
// Case 2: This is a regular scalar register, allocate it alone
- else if (this->createGenReg(interval) == false)
- return false;
+ else if (this->createGenReg(interval) == false) {
+ if(reservedReg == 0) return false;
+ spilled.insert(reg);
+ selection.spillReg(reg, reservedReg);
+ }
}
return true;
}
-
INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
using namespace ir;
const Kernel *kernel = ctx.getKernel();
const Function &fn = ctx.getFunction();
GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
-
+ if (ctx.getSimdWidth() == 8) {
+ reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
+ reservedReg /= GEN_REG_SIZE;
+ } else {
+ reservedReg = 0;
+ }
// Allocate all the vectors first since they need to be contiguous
this->allocateVector(selection);
// schedulePreRegAllocation(ctx, selection);
@@ -684,6 +729,10 @@ namespace gbe
int subreg = offst % 8;
std::cout << "%" << vReg << " g" << reg << "." << subreg << "D" << std::endl;
}
+ std::set<ir::Register>::iterator is;
+ std::cout << "## spilled registers:" << std::endl;
+ for(is = spilled.begin(); is != spilled.end(); is++)
+ std::cout << (int)*is << std::endl;
std::cout << std::endl;
}
@@ -698,6 +747,9 @@ namespace gbe
INLINE GenRegister GenRegAllocator::Opaque::genReg(const GenRegister ®) {
if (reg.file == GEN_GENERAL_REGISTER_FILE) {
+ if(reg.physical == 1) {
+ return reg;
+ }
GBE_ASSERT(RA.contains(reg.reg()) != false);
const uint32_t grfOffset = RA.find(reg.reg())->second;
const GenRegister dst = setGenReg(reg, grfOffset);
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index fedb743..2cad4c0 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -70,6 +70,8 @@ namespace gbe
INLINE int typeSize(uint32_t type) {
switch(type) {
case GEN_TYPE_DF:
+ case GEN_TYPE_UL:
+ case GEN_TYPE_L:
return 8;
case GEN_TYPE_UD:
case GEN_TYPE_D:
@@ -134,6 +136,28 @@ namespace gbe
uint32_t predicate:4;
uint32_t inversePredicate:1;
uint32_t saturate:1;
+ void chooseNib(int nib) {
+ switch (nib) {
+ case 0:
+ quarterControl = 0;
+ nibControl = 0;
+ break;
+ case 1:
+ quarterControl = 0;
+ nibControl = 1;
+ break;
+ case 2:
+ quarterControl = 1;
+ nibControl = 0;
+ break;
+ case 3:
+ quarterControl = 1;
+ nibControl = 1;
+ break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ }
};
/*! This is a book-keeping structure used to encode both virtual and physical
@@ -200,6 +224,7 @@ namespace gbe
int32_t d;
uint32_t ud;
uint16_t reg;
+ int64_t i64;
} value;
uint32_t nr:8; //!< Just for some physical registers (acc, null)
@@ -222,12 +247,32 @@ namespace gbe
return r;
}
+ INLINE bool isint64(void) const {
+ if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
INLINE bool isimmdf(void) const {
if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
return true;
return false;
}
+ INLINE GenRegister top_half(void) const {
+ GenRegister r = bottom_half();
+ r.subnr += 4;
+ return r;
+ }
+
+ INLINE GenRegister bottom_half(void) const {
+ GBE_ASSERT(isint64());
+ GenRegister r = *this;
+ r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
+ r.hstride = GEN_HORIZONTAL_STRIDE_2;
+ return r;
+ }
+
INLINE bool isdf(void) const {
if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
return true;
@@ -412,6 +457,12 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
+ static INLINE GenRegister immint64(int64_t i) {
+ GenRegister immediate = imm(GEN_TYPE_L);
+ immediate.value.i64 = i;
+ return immediate;
+ }
+
static INLINE GenRegister immdf(double df) {
GenRegister immediate = imm(GEN_TYPE_DF);
immediate.value.df = df;
@@ -553,6 +604,11 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_1);
}
+ static INLINE bool isNull(GenRegister reg) {
+ return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+ && reg.nr == GEN_ARF_NULL);
+ }
+
static INLINE GenRegister acc(void) {
return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
GEN_ARF_ACCUMULATOR,
@@ -832,6 +888,26 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_2);
}
+ static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
static INLINE GenRegister mask(uint32_t subnr) {
return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
}
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 26c22f3..35d3a7c 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -374,6 +374,12 @@ namespace gbe {
return kernel->getStackSize();
}
+ static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
+ if (genKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getScratchSize();
+ }
+
static int32_t kernelUseSLM(gbe_kernel genKernel) {
if (genKernel == NULL) return 0;
const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -443,6 +449,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
@@ -476,6 +483,7 @@ namespace gbe
gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+ gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
gbe_kernel_use_slm = gbe::kernelUseSLM;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f36bfbf..d20e7af 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -198,6 +198,10 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
+/*! Get the scratch size (zero if no scratch is required) */
+typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
+extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
+
/*! Get the curbe offset where to put the data. Returns -1 if not required */
typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 2d67310..83aaab8 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -96,6 +96,8 @@ namespace gbe {
INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
/*! Return the size of the stack (zero if none) */
INLINE uint32_t getStackSize(void) const { return this->stackSize; }
+ /*! Return the size of the scratch memory needed (zero if none) */
+ INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
/*! Get the SIMD width for the kernel */
INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
/*! Says if SLM is needed for it */
@@ -135,6 +137,7 @@ namespace gbe {
uint32_t curbeSize; //!< Size of the data to push
uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
uint32_t stackSize; //!< Stack size (may be 0 if unused)
+ uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
bool useSLM; //!< SLM requires a special HW config
Context *ctx; //!< Save context after compiler to alloc constant buffer curbe
ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 2a77454..45095db 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -664,6 +664,7 @@ namespace ir {
static const Type allButBool[] = {TYPE_S8, TYPE_U8,
TYPE_S16, TYPE_U16,
TYPE_S32, TYPE_U32,
+ TYPE_S64, TYPE_U64,
TYPE_FLOAT, TYPE_DOUBLE};
static const uint32_t allButBoolNum = ARRAY_ELEM_NUM(allButBool);
@@ -671,6 +672,7 @@ namespace ir {
static const Type logicalType[] = {TYPE_S8, TYPE_U8,
TYPE_S16, TYPE_U16,
TYPE_S32, TYPE_U32,
+ TYPE_S64, TYPE_U64,
TYPE_BOOL};
static const uint32_t logicalTypeNum = ARRAY_ELEM_NUM(logicalType);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index c8c5484..18448cf 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2371,8 +2371,6 @@ namespace gbe
// Scalar is easy. We neednot build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
- if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
- OCL_SIMD_WIDTH = 8;
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index c972a3e..84f15ca 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -543,13 +543,13 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_p
INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
return __gen_ocl_pow(x, 0.3333333333f);
}
-INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
- *cosval = native_cos(x);
+#define BODY \
+ *cosval = native_cos(x); \
return native_sin(x);
-}
-INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
-INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
-INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
+INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
+#undef BODY
INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
@@ -562,7 +562,16 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
return (1 - y) / (1 + y);
}
INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
- return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
+ float sum = x, c = x, m = 1.0;
+ int n = 1;
+ do
+ {
+ c *= (2 * n - 1) * x * x;
+ m *= (2 * n);
+ sum += ( c / m / (2 * n + 1));
+ n++;
+ }while( n < 30);
+ return sum;
}
INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
return __gen_ocl_internal_asin(x) / M_PI_F;
@@ -726,23 +735,22 @@ DECL_MIN_MAX_CLAMP(unsigned short)
DECL_MIN_MAX_CLAMP(unsigned char)
#undef DECL_MIN_MAX_CLAMP
-INLINE_OVERLOADABLE float __gen_ocl_frexp(float x, int *exp) {
- uint u = as_uint(x);
- if ((u & 0x7FFFFFFFu) == 0) {
- *exp = 0;
- return x;
- }
- int e = (u >> 23) & 255;
- if (e == 255)
- return x;
- *exp = e - 126;
- u = (u & (0x807FFFFFu)) | 0x3F000000;
+#define BODY \
+ uint u = as_uint(x); \
+ if ((u & 0x7FFFFFFFu) == 0) { \
+ *exp = 0; \
+ return x; \
+ } \
+ int e = (u >> 23) & 255; \
+ if (e == 255) \
+ return x; \
+ *exp = e - 126; \
+ u = (u & (0x807FFFFFu)) | 0x3F000000; \
return as_float(u);
-}
-
-INLINE_OVERLOADABLE float frexp(float x, global int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
-INLINE_OVERLOADABLE float frexp(float x, local int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
-INLINE_OVERLOADABLE float frexp(float x, private int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
+INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
INLINE_OVERLOADABLE float nextafter(float x, float y) {
uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
@@ -760,24 +768,22 @@ INLINE_OVERLOADABLE float nextafter(float x, float y) {
return as_float(hx);
}
-INLINE_OVERLOADABLE float __gen_ocl_modf(float x, float *i) {
- uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
- if (ix > 0x7F800000) {
- *i = nan(0u);
- return nan(0u);
- }
- if (ix == 0x7F800000) {
- *i = x;
- return as_float(hx & 0x80000000u);
- }
- *i = __gen_ocl_rndz(x);
+#define BODY \
+ uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+ if (ix > 0x7F800000) { \
+ *i = nan(0u); \
+ return nan(0u); \
+ } \
+ if (ix == 0x7F800000) { \
+ *i = x; \
+ return as_float(hx & 0x80000000u); \
+ } \
+ *i = __gen_ocl_rndz(x); \
return x - *i;
-}
-
-INLINE_OVERLOADABLE float modf(float x, global float *i) { return __gen_ocl_modf(x, (float *)i); }
-INLINE_OVERLOADABLE float modf(float x, local float *i) { return __gen_ocl_modf(x, (float *)i); }
-INLINE_OVERLOADABLE float modf(float x, private float *i) { return __gen_ocl_modf(x, (float *)i); }
-
+INLINE_OVERLOADABLE float modf(float x, global float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, local float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
@@ -810,32 +816,30 @@ INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
return __gen_ocl_internal_fmax(x, y) - y;
}
-INLINE_OVERLOADABLE float __gen_ocl_fract(float x, float *p) {
- *p = __gen_ocl_internal_floor(x);
+#define BODY \
+ *p = __gen_ocl_internal_floor(x); \
return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-}
-INLINE_OVERLOADABLE float fract(float x, global float *p) { return __gen_ocl_fract(x, (float *)p); }
-INLINE_OVERLOADABLE float fract(float x, local float *p) { return __gen_ocl_fract(x, (float *)p); }
-INLINE_OVERLOADABLE float fract(float x, private float *p) { return __gen_ocl_fract(x, (float *)p); }
-
-INLINE_OVERLOADABLE float __gen_ocl_remquo(float x, float y, int *quo) {
- uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF;
- if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0)
- return nan(0u);
- float k = x / y;
- int q = __gen_ocl_rnde(k);
- *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u);
- float r = x - q * y;
- uint hr = as_uint(r), ir = hr & 0x7FFFFFFF;
- if (ir == 0)
- hr = ir | (hx & 0x80000000u);
+INLINE_OVERLOADABLE float fract(float x, global float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, local float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+#define BODY \
+ uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF; \
+ if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0) \
+ return nan(0u); \
+ float k = x / y; \
+ int q = __gen_ocl_rnde(k); \
+ *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u); \
+ float r = x - q * y; \
+ uint hr = as_uint(r), ir = hr & 0x7FFFFFFF; \
+ if (ir == 0) \
+ hr = ir | (hx & 0x80000000u); \
return as_float(hr);
-}
-
-INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
+#undef BODY
INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
INLINE_OVERLOADABLE float ldexp(float x, int n) {
return __gen_ocl_pow(2, n) * x;
@@ -955,6 +959,7 @@ DECL_UNTYPED_RW_ALL(uint)
DECL_UNTYPED_RW_ALL(long)
DECL_UNTYPED_RW_ALL(ulong)
DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
#undef DECL_UNTYPED_RW_ALL
#undef DECL_UNTYPED_RW_ALL_SPACE
@@ -1055,6 +1060,114 @@ DEF(float)
#undef DEC8
#undef DEC16
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE) \
+ INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, uint2 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC2X(TYPE) \
+ INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, uint2 mask) { \
+ TYPE##2 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ return z; \
+ }
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE) \
+ INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, uint4 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC4X(TYPE) \
+ INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, uint4 mask) { \
+ TYPE##4 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ return z; \
+ }
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE) \
+ INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, uint8 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC8X(TYPE) \
+ INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, uint8 mask) { \
+ TYPE##8 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+ return z; \
+ }
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE) \
+ INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, uint16 mask) { \
+ return shuffle((TEMPTYPE)(x, y), mask); \
+ }
+
+#define DEC16X(TYPE) \
+ INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, uint16 mask) { \
+ TYPE##16 z; \
+ z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+ z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+ z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+ z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+ z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+ z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+ z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+ z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+ z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
+ z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
+ z.sa = mask.sa < 16 ? ((TYPE *)&x)[mask.sa] : ((TYPE *)&y)[mask.sa & 15]; \
+ z.sb = mask.sb < 16 ? ((TYPE *)&x)[mask.sb] : ((TYPE *)&y)[mask.sb & 15]; \
+ z.sc = mask.sc < 16 ? ((TYPE *)&x)[mask.sc] : ((TYPE *)&y)[mask.sc & 15]; \
+ z.sd = mask.sd < 16 ? ((TYPE *)&x)[mask.sd] : ((TYPE *)&y)[mask.sd & 15]; \
+ z.se = mask.se < 16 ? ((TYPE *)&x)[mask.se] : ((TYPE *)&y)[mask.se & 15]; \
+ z.sf = mask.sf < 16 ? ((TYPE *)&x)[mask.sf] : ((TYPE *)&y)[mask.sf & 15]; \
+ return z; \
+ }
+
+#define DEF(TYPE) \
+ DEC2(TYPE, TYPE##2, TYPE##4) \
+ DEC2(TYPE, TYPE##4, TYPE##8) \
+ DEC2(TYPE, TYPE##8, TYPE##16) \
+ DEC2X(TYPE) \
+ DEC4(TYPE, TYPE##2, TYPE##4) \
+ DEC4(TYPE, TYPE##4, TYPE##8) \
+ DEC4(TYPE, TYPE##8, TYPE##16) \
+ DEC4X(TYPE) \
+ DEC8(TYPE, TYPE##2, TYPE##4) \
+ DEC8(TYPE, TYPE##4, TYPE##8) \
+ DEC8(TYPE, TYPE##8, TYPE##16) \
+ DEC8X(TYPE) \
+ DEC16(TYPE, TYPE##2, TYPE##4) \
+ DEC16(TYPE, TYPE##4, TYPE##8) \
+ DEC16(TYPE, TYPE##8, TYPE##16) \
+ DEC16X(TYPE)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef DEF
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
/////////////////////////////////////////////////////////////////////////////
// Synchronization functions
/////////////////////////////////////////////////////////////////////////////
diff --git a/kernels/builtin_shuffle2.cl b/kernels/builtin_shuffle2.cl
new file mode 100644
index 0000000..1a122d4
--- /dev/null
+++ b/kernels/builtin_shuffle2.cl
@@ -0,0 +1,13 @@
+kernel void builtin_shuffle2(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+ int i = get_global_id(0);
+ float2 x = (float2)(src1[i], src2[i]);
+ float2 y = (float2)(1234, 5678);
+ uint4 mask = (uint4)(1, 0, 0, 0);
+ float4 v1 = shuffle2(x, y, mask);
+ float16 x2 = 0;
+ float16 y2 = (float16)(src1[i], src2[i], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ uint16 mask2 = (uint16)(17, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ float16 v2 = shuffle2(x2, y2, mask2);
+ dst1[i] = v1.s0 + v2.s0;
+ dst2[i] = v1.s1 + v2.s1;
+}
diff --git a/kernels/compiler_double_4.cl b/kernels/compiler_double_4.cl
new file mode 100644
index 0000000..e5e46f9
--- /dev/null
+++ b/kernels/compiler_double_4.cl
@@ -0,0 +1,5 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_4(global double *src1, global double *src2, global double *dst) {
+ int i = get_global_id(0);
+ dst[i] = src1[i] + src2[i];
+}
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
new file mode 100644
index 0000000..3087292
--- /dev/null
+++ b/kernels/compiler_long.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ if(i < 5)
+ dst[i] = src1[i] + src2[i];
+ if(i > 5)
+ dst[i] = src1[i] - src2[i];
+}
diff --git a/kernels/compiler_long_2.cl b/kernels/compiler_long_2.cl
new file mode 100644
index 0000000..92be93a
--- /dev/null
+++ b/kernels/compiler_long_2.cl
@@ -0,0 +1,20 @@
+kernel void compiler_long_2(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ switch(i) {
+ case 0:
+ dst[i] = 0xFEDCBA9876543210UL;
+ break;
+ case 1:
+ dst[i] = src1[i] & src2[i];
+ break;
+ case 2:
+ dst[i] = src1[i] | src2[i];
+ break;
+ case 3:
+ dst[i] = src1[i] ^ src2[i];
+ break;
+ case 4:
+ dst[i] = src1[i] ? 0x1122334455667788L : 0x8877665544332211UL;
+ break;
+ }
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 30f0e1e..320194e 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -1,5 +1,5 @@
/* test OpenCL 1.1 Vector Data Load/Store Functions (section 6.11.7) */
-
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define OFFSET2(type) (type ##2) {(type)1, (type)2}
#define OFFSET3(type) (type ##3) {(type)1, (type)2, (type)3}
#define OFFSET4(type) (type ##4) {(type)1, (type)2, (type)3, (type)4}
@@ -24,10 +24,10 @@ __kernel void test_##type ##n(__global type *pin, \
TEST_TYPE(ushort,n)\
TEST_TYPE(int,n) \
TEST_TYPE(uint,n) \
- TEST_TYPE(float,n)
+ TEST_TYPE(float,n) \
+ TEST_TYPE(double,n)
#if 0
- TEST_TYPE(double,n)
TEST_TYPE(long,n)
TEST_TYPE(ulong,n)
TEST_TYPE(half,n)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 048595c..e58433f 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -180,10 +180,17 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
assert(offset >= 0);
stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
- stack_sz *= device->max_thread_per_unit;
cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
}
+static void
+cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
+{
+ int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
+
+ cl_gpgpu_set_scratch(gpgpu, scratch_sz);
+}
+
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_kernel ker,
@@ -232,6 +239,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Bind all samplers */
cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
+ cl_setup_scratch(gpgpu, ker);
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
cl_gpgpu_states_setup(gpgpu, &kernel);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 212beb3..673985d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -135,6 +135,10 @@ extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
+/* Setup scratch */
+typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
+
/* Configure internal state */
typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 4952288..9aa926e 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -50,6 +50,7 @@ LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
+LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 2791fbe..e553a55 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -89,7 +89,9 @@ struct intel_gpgpu
struct { drm_intel_bo *bo; } curbe_b;
struct { drm_intel_bo *bo; } sampler_state_b;
struct { drm_intel_bo *bo; } perf_b;
+ struct { drm_intel_bo *bo; } scratch_b;
+ uint32_t per_thread_scratch;
struct {
uint32_t num_cs_entries;
uint32_t size_cs_entry; /* size of one entry in 512bit elements */
@@ -127,6 +129,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
drm_intel_bo_unreference(gpgpu->perf_b.bo);
if (gpgpu->stack_b.bo)
drm_intel_bo_unreference(gpgpu->stack_b.bo);
+ if (gpgpu->scratch_b.bo)
+ drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+
intel_batchbuffer_delete(gpgpu->batch);
cl_free(gpgpu);
}
@@ -199,18 +204,23 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
BEGIN_BATCH(gpgpu->batch, 8);
OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
- gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
- intel_batchbuffer_alloc_space(gpgpu->batch,0);
-
- memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
- vfe->vfe1.gpgpu_mode = 1;
- vfe->vfe1.bypass_gateway_ctl = 1;
- vfe->vfe1.reset_gateway_timer = 1;
- vfe->vfe1.max_threads = gpgpu->max_threads - 1;
- vfe->vfe1.urb_entries = 64;
- vfe->vfe3.curbe_size = 480;
- vfe->vfe4.scoreboard_mask = 0;
- intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t));
+ if(gpgpu->per_thread_scratch > 0) {
+ OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ gpgpu->per_thread_scratch/1024 - 1);
+ }
+ else {
+ OUT_BATCH(gpgpu->batch, 0);
+ }
+ /* max_threads | urb_entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
+ OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4);
+ OUT_BATCH(gpgpu->batch, 0);
+ /* curbe_size */
+ OUT_BATCH(gpgpu->batch, 480);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
+ OUT_BATCH(gpgpu->batch, 0);
ADVANCE_BATCH(gpgpu->batch);
}
@@ -537,6 +547,23 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u
}
static void
+intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
+{
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ drm_intel_bo* old = gpgpu->scratch_b.bo;
+ uint32_t total = per_thread_size * gpgpu->max_threads;
+
+ gpgpu->per_thread_scratch = per_thread_size;
+
+ if(old && old->size < total) {
+ drm_intel_bo_unreference(old);
+ old = NULL;
+ }
+
+ if(!old)
+ gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+}
+static void
intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
{
drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
@@ -823,5 +850,6 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+ cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 5cd20c3..b205c67 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -107,6 +107,7 @@ set (utests_sources
builtin_nextafter.cpp
builtin_remquo.cpp
builtin_shuffle.cpp
+ builtin_shuffle2.cpp
builtin_sign.cpp
buildin_work_dim.cpp
builtin_global_size.cpp
@@ -119,6 +120,9 @@ set (utests_sources
compiler_double.cpp
compiler_double_2.cpp
compiler_double_3.cpp
+ compiler_double_4.cpp
+ compiler_long.cpp
+ compiler_long_2.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
diff --git a/utests/builtin_shuffle2.cpp b/utests/builtin_shuffle2.cpp
new file mode 100644
index 0000000..7a9ebd1
--- /dev/null
+++ b/utests/builtin_shuffle2.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void builtin_shuffle2(void)
+{
+ const int n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_shuffle2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; i ++) {
+ ((float *)(buf_data[0]))[i] = (rand() & 15) * 0.1f;
+ ((float *)(buf_data[1]))[i] = (rand() & 15) * 0.1f;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (int i = 0; i < n; i ++) {
+ OCL_ASSERT(2 * ((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+ OCL_ASSERT(2 * ((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle2);
diff --git a/utests/compiler_double_4.cpp b/utests/compiler_double_4.cpp
new file mode 100644
index 0000000..cb25bd4
--- /dev/null
+++ b/utests/compiler_double_4.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void compiler_double_4(void)
+{
+ const size_t n = 16;
+ double cpu_src1[n], cpu_src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_double_4");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ cpu_src1[i] = ((double*)buf_data[0])[i] = rand() * 1e-2;
+ cpu_src2[i] = ((double*)buf_data[1])[i] = rand() * 1e-2;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src1[i] - cpu_src2[i]) < 1e-4);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_4);
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
new file mode 100644
index 0000000..fad2744
--- /dev/null
+++ b/utests/compiler_long.cpp
@@ -0,0 +1,58 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ src1[0] = -1L, src2[0] = -1L;
+ src1[1] = 0x8000000000000000UL, src2[1] = 0x8000000000000000UL;
+ src1[2] = 0x7FFFFFFFFFFFFFFFL, src2[2] = 1L;
+ src1[3] = 0xFFFFFFFEL, src2[3] = 1L;
+ src1[4] = 0x7FFFFFFFL, src2[4] = 0x80000000L;
+ src1[5] = 0, src2[5] = 0;
+ src1[6] = 0, src2[6] = 1;
+ src1[7] = -2L, src2[7] = -1L;
+ src1[8] = 0, src2[8] = 0x8000000000000000UL;
+ for (int32_t i = 9; i < (int32_t) n; ++i) {
+ src1[i] = ((long)rand() << 32) + rand();
+ src2[i] = ((long)rand() << 32) + rand();
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+ if (i < 5)
+ OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+ if (i > 5)
+ OCL_ASSERT(src1[i] - src2[i] == ((int64_t *)buf_data[2])[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
new file mode 100644
index 0000000..e3c6640
--- /dev/null
+++ b/utests/compiler_long_2.cpp
@@ -0,0 +1,51 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_2(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = ((long)rand() << 32) + rand();
+ src2[i] = ((long)rand() << 32) + rand();
+ }
+ src1[4] = 1;
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ int64_t *dest = ((int64_t *)buf_data[2]);
+ //for (int32_t i = 0; i < (int32_t) n; ++i)
+ // printf("%lx\n", dest[i]);
+ OCL_ASSERT(0xFEDCBA9876543210UL == (uint64_t)dest[0]);
+ OCL_ASSERT((src1[1] & src2[1]) == dest[1]);
+ OCL_ASSERT((src1[2] | src2[2]) == dest[2]);
+ OCL_ASSERT((src1[3] ^ src2[3]) == dest[3]);
+ OCL_ASSERT(0x1122334455667788L == dest[4]);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_2);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 79f284f..7deb7cb 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -1,4 +1,5 @@
#include "utest_helper.hpp"
+#include <string.h>
template<typename T>
static void compiler_vector_load_store(int elemNum, const char *kernelName)
{
@@ -9,8 +10,8 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
buf_data[0] = (T*) malloc(sizeof(T) * n);
for (uint32_t i = 0; i < n; ++i)
((T*)buf_data[0])[i] = i;
- OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
- OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -27,7 +28,10 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
for (uint32_t i = 0; i < n; ++i)
{
int shift = ((i % elemNum) + 1);
- OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+ if (strstr(kernelName, "double") == NULL)
+ OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+ else
+ OCL_ASSERT((((T*)buf_data[1])[i] - ((T)((T*)buf_data[0])[i] + shift)) < 1e-5);
}
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
@@ -54,6 +58,6 @@ test_all_vector(uint16_t, ushort)
test_all_vector(int32_t, int)
test_all_vector(uint32_t, uint)
test_all_vector(float, float)
-//test_all_vector(double, double)
+test_all_vector(double, double)
//test_all_vector(int64_t, long)
//test_all_vector(uint64_t, ulong)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-devel
mailing list