[Pkg-opencl-devel] [beignet] 25/66: Imported Upstream version 0.1+git20130521+a7ea35c
Andreas Beckmann
anbe at moszumanska.debian.org
Fri Oct 31 07:27:04 UTC 2014
This is an automated email from the git hooks/post-receive script.
anbe pushed a commit to branch master
in repository beignet.
commit e85d9784fbd36caa1b1d5031cf56c0862db101e4
Author: Simon Richter <sjr at debian.org>
Date: Tue May 21 09:17:45 2013 +0200
Imported Upstream version 0.1+git20130521+a7ea35c
---
.gitignore | 5 +-
backend/src/.gitignore | 3 +
backend/src/CMakeLists.txt | 5 +
backend/src/backend/context.cpp | 24 +-
backend/src/backend/context.hpp | 3 +
backend/src/backend/gen/gen_mesa_disasm.c | 3 +-
backend/src/backend/gen_context.cpp | 78 +-
backend/src/backend/gen_context.hpp | 3 +-
backend/src/backend/gen_defs.hpp | 2 -
backend/src/backend/gen_encoder.cpp | 68 +-
backend/src/backend/gen_encoder.hpp | 10 +-
.../src/backend/gen_insn_gen7_schedule_info.hxx | 3 +-
backend/src/backend/gen_insn_scheduling.cpp | 2 +-
backend/src/backend/gen_insn_selection.cpp | 98 ++-
backend/src/backend/gen_insn_selection.hxx | 3 +-
backend/src/backend/program.cpp | 51 +-
backend/src/backend/program.h | 37 +-
backend/src/backend/program.hpp | 26 +-
backend/src/ir/function.cpp | 2 +
backend/src/ir/function.hpp | 26 +
backend/src/ir/image.cpp | 108 +++
backend/src/ir/image.hpp | 71 ++
backend/src/ir/instruction.cpp | 45 +-
backend/src/ir/instruction.hpp | 45 +-
backend/src/ir/instruction.hxx | 2 +-
backend/src/ir/sampler.cpp | 78 ++
backend/src/ir/sampler.hpp | 78 ++
backend/src/ir/unit.hpp | 22 +-
backend/src/llvm/llvm_gen_backend.cpp | 551 ++++++--------
backend/src/llvm/llvm_gen_backend.hpp | 30 +-
backend/src/llvm/llvm_gen_ocl_function.hxx | 4 +
backend/src/llvm/llvm_scalarize.cpp | 836 +++++++++++++++++++++
backend/src/llvm/llvm_to_gen.cpp | 1 +
backend/src/ocl_common_defines.h | 9 +-
backend/src/ocl_stdlib.h | 400 +++++++++-
backend/src/sys/alloc.hpp | 1 +
include/CL/cl.hpp | 104 +--
include/CMakeLists.txt | 2 +
kernels/compiler_julia.cl | 2 -
kernels/compiler_julia_no_break.cl | 2 -
kernels/compiler_math.cl | 46 +-
kernels/compiler_math_2op.cl | 19 +
kernels/compiler_math_3op.cl | 9 +
kernels/compiler_menger_sponge.cl | 2 -
kernels/compiler_menger_sponge_no_shadow.cl | 2 -
kernels/compiler_nautilus.cl | 4 +-
kernels/test_copy_image1.cl | 33 +
kernels/test_fill_image0.cl | 2 +-
kernels/test_get_image_size.cl | 9 +
src/.gitignore | 2 +
src/CMakeLists.txt | 10 +-
src/cl_api.c | 270 ++++++-
src/cl_command_queue.c | 41 +-
src/cl_command_queue.h | 3 +
src/cl_command_queue_gen7.c | 4 +
src/cl_device_id.c | 1 +
src/cl_device_id.h | 6 +-
src/cl_driver.h | 29 +-
src/cl_driver_defs.c | 3 +-
src/cl_gt_device.h | 1 +
src/cl_kernel.c | 52 +-
src/cl_kernel.h | 4 +
src/cl_mem.c | 25 +-
src/cl_mem.h | 9 +-
src/cl_mem_gl.c | 4 +-
src/cl_sampler.c | 47 ++
src/cl_sampler.h | 5 +
src/cl_utils.h | 9 +
src/intel/intel_driver.c | 43 +-
src/intel/intel_gpgpu.c | 73 +-
utests/.gitignore | 13 +
utests/CMakeLists.txt | 2 +
...ovforphi_undef.cpp => compiler_copy_image1.cpp} | 50 +-
utests/compiler_fill_image0.cpp | 7 +-
...fill_image0.cpp => compiler_get_image_size.cpp} | 24 +-
utests/compiler_math.cpp | 88 ++-
utests/compiler_math_2op.cpp | 80 ++
utests/compiler_math_3op.cpp | 64 ++
utests/compiler_movforphi_undef.cpp | 4 +-
utests/utest_helper.hpp | 11 +
80 files changed, 3261 insertions(+), 692 deletions(-)
diff --git a/.gitignore b/.gitignore
index bae14b5..90fd161 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
*.o
-*.so*
+CMakeCache.txt
+CMakeFiles/
+Makefile
+cmake_install.cmake
diff --git a/backend/src/.gitignore b/backend/src/.gitignore
index afcb283..39239f3 100644
--- a/backend/src/.gitignore
+++ b/backend/src/.gitignore
@@ -1 +1,4 @@
+GBEConfig.h
+libgbe.so
+ocl_common_defines_str.cpp
ocl_stdlib_str.cpp
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index ac7e1da..183517a 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -61,6 +61,10 @@ else (GBE_USE_BLOB)
ir/unit.hpp
ir/constant.cpp
ir/constant.hpp
+ ir/sampler.cpp
+ ir/sampler.hpp
+ ir/image.cpp
+ ir/image.hpp
ir/instruction.cpp
ir/instruction.hpp
ir/liveness.cpp
@@ -79,6 +83,7 @@ else (GBE_USE_BLOB)
backend/program.h
llvm/llvm_gen_backend.cpp
llvm/llvm_passes.cpp
+ llvm/llvm_scalarize.cpp
llvm/llvm_to_gen.cpp
llvm/llvm_gen_backend.hpp
llvm/llvm_gen_ocl_function.hxx
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index c636b48..474c36a 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -29,6 +29,8 @@
#include "ir/profile.hpp"
#include "ir/liveness.hpp"
#include "ir/value.hpp"
+#include "ir/image.hpp"
+#include "ir/sampler.hpp"
#include "sys/cvar.hpp"
#include <algorithm>
@@ -120,7 +122,12 @@ namespace gbe
continue;
}
} else {
- aligned = ALIGN(list->offset+list->size-size-(alignment-1), alignment); //alloc from block's tail
+ int16_t unaligned = list->offset + list->size - size - (alignment-1);
+ if(unaligned < 0) {
+ list = list->prev;
+ continue;
+ }
+ aligned = ALIGN(unaligned, alignment); //alloc from block's tail
spaceOnLeft = aligned - list->offset;
spaceOnRight = list->size - size - spaceOnLeft;
@@ -196,7 +203,6 @@ namespace gbe
// Track the allocation to retrieve the size later
allocatedBlocks.insert(std::make_pair(aligned, size));
-
// We have a valid offset now
return aligned;
}
@@ -352,6 +358,20 @@ namespace gbe
kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
}
+ uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
+ {
+ int32_t offset = fn.getImageSet()->getInfoOffset(key);
+ if (offset >= 0)
+ return offset;
+ newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
+ std::sort(kernel->patches.begin(), kernel->patches.end());
+
+ offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
+ GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
+ fn.getImageSet()->appendInfo(key, offset);
+ return offset;
+ }
+
void Context::buildPatchList(void) {
const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
kernel->curbeSize = 0u;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 245ad01..c205388 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -88,6 +88,9 @@ namespace gbe
void deallocate(int16_t offset);
/* allocate curbe for constant ptr argument */
int32_t allocConstBuf(uint32_t argID);
+ /* allocate a new entry for a specific image's information */
+ /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
+ uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index fa52f8c..420cd62 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -329,8 +329,7 @@ static const char *math_function[16] = {
[GEN_MATH_FUNCTION_RSQ] = "rsq",
[GEN_MATH_FUNCTION_SIN] = "sin",
[GEN_MATH_FUNCTION_COS] = "cos",
- [GEN_MATH_FUNCTION_SINCOS] = "sincos",
- [GEN_MATH_FUNCTION_TAN] = "tan",
+ [GEN_MATH_FUNCTION_FDIV] = "fdiv",
[GEN_MATH_FUNCTION_POW] = "pow",
[GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
[GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4a16cae..18f6c11 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -212,7 +212,7 @@ namespace gbe
}
}
- void GenContext::emitCBMoveInstruction(const SelectionInstruction &insn) {
+ void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
const GenRegister src = GenRegister::unpacked_uw(ra->genReg(insn.src(0)).nr, 0);
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister a0 = GenRegister::addr8(0);
@@ -287,47 +287,35 @@ namespace gbe
void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
- const GenRegister bti = ra->genReg(insn.src(4));
- const GenRegister sampler = ra->genReg(insn.src(5));
- const GenRegister ucoord = ra->genReg(insn.src(6));
- const GenRegister vcoord = ra->genReg(insn.src(7));
- const GenRegister wcoord = ra->genReg(insn.src(8));
- const GenRegister temp = GenRegister::ud1grf(msgPayload.nr, msgPayload.subnr/sizeof(float) + 4);
- const GenRegister a0_0 = GenRegister::ud1arf(GEN_ARF_ADDRESS, 0);
+ const unsigned char bti = insn.extra.function;
+ const unsigned char sampler = insn.extra.elem;
+ const GenRegister ucoord = ra->genReg(insn.src(4));
+ const GenRegister vcoord = ra->genReg(insn.src(5));
+ const GenRegister wcoord = ra->genReg(insn.src(6));
uint32_t simdWidth = p->curr.execWidth;
p->push();
const uint32_t nr = msgPayload.nr;
// prepare mesg desc and move to a0.0.
// desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
- p->curr.execWidth = 1;
- p->MOV(a0_0, GenRegister::immud((GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE << 12) | (2 << 17)
- | ((4 * (simdWidth/8)) << 20)
- | ((2 * (simdWidth/8)) << 25)));
- p->SHL(temp, GenRegister::ud1grf(sampler.nr, sampler.subnr/sizeof(float)), GenRegister::immud(8));
- p->OR(a0_0, a0_0, temp);
- p->OR(a0_0, a0_0, GenRegister::ud1grf(bti.nr, bti.subnr/sizeof(float)));
- p->curr.execWidth = simdWidth;
/* Prepare message payload. */
p->MOV(GenRegister::f8grf(nr , 0), ucoord);
p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
if (insn.src(8).reg() != 0)
p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
- p->SAMPLE(dst, msgPayload, a0_0, -1, 0);
-
+ p->SAMPLE(dst, msgPayload, false, bti, sampler, simdWidth, -1, 0);
p->pop();
}
void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- const GenRegister bti = ra->genReg(insn.src(0 + insn.extra.elem));
- const GenRegister ucoord = ra->genReg(insn.src(1 + insn.extra.elem));
- const GenRegister vcoord = ra->genReg(insn.src(2 + insn.extra.elem));
- const GenRegister wcoord = ra->genReg(insn.src(3 + insn.extra.elem));
- const GenRegister R = ra->genReg(insn.src(4 + insn.extra.elem));
- const GenRegister G = ra->genReg(insn.src(5 + insn.extra.elem));
- const GenRegister B = ra->genReg(insn.src(6 + insn.extra.elem));
- const GenRegister A = ra->genReg(insn.src(7 + insn.extra.elem));
- const GenRegister a0_0 = GenRegister::ud1arf(GEN_ARF_ADDRESS, 0);
+ const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
+ const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.elem));
+ const GenRegister wcoord = ra->genReg(insn.src(2 + insn.extra.elem));
+ const GenRegister R = ra->genReg(insn.src(3 + insn.extra.elem));
+ const GenRegister G = ra->genReg(insn.src(4 + insn.extra.elem));
+ const GenRegister B = ra->genReg(insn.src(5 + insn.extra.elem));
+ const GenRegister A = ra->genReg(insn.src(6 + insn.extra.elem));
+ const unsigned char bti = insn.extra.function;
p->push();
uint32_t simdWidth = p->curr.execWidth;
@@ -339,16 +327,18 @@ namespace gbe
// prepare mesg desc and move to a0.0.
// desc = bti | (msg_type << 14) | (header_present << 19))
- p->MOV(a0_0, GenRegister::immud((GEN_TYPED_WRITE << 14) | (1 << 19) | (9 << 25)));
- p->OR(a0_0, a0_0, GenRegister::ud1grf(bti.nr, bti.subnr/sizeof(float)));
// prepare header, we need to enable all the 8 planes.
- p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xff));
- // Typed write only support SIMD8.
+ p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
p->curr.execWidth = 8;
+ // Typed write only support SIMD8.
// Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
- // XXX currently only support U32 surface type with RGBA.
+ // Currently, we don't support non-zero lod, so we clear all lod to
+ // zero for both quarters thus save one instruction here.
+ // Thus we must put this instruction in noMask and no predication state.
p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
-
+ p->pop();
+ p->push();
+ p->curr.execWidth = 8;
// TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
// need to call it twice.
uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;
@@ -359,6 +349,8 @@ namespace gbe
GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
+ if (quarter == 1)
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
QUARTER_MOV0(nr + 1, ucoord);
QUARTER_MOV0(nr + 2, vcoord);
if (insn.src(3 + insn.extra.elem).reg() != 0)
@@ -368,12 +360,28 @@ namespace gbe
QUARTER_MOV1(nr + 7, B);
QUARTER_MOV1(nr + 8, A);
#undef QUARTER_MOV
- p->TYPED_WRITE(header, a0_0);
+ p->TYPED_WRITE(header, true, bti);
}
-
p->pop();
}
+ void GenContext::emitGetImageInfoInstruction(const SelectionInstruction &insn) {
+ const unsigned char bti = insn.extra.function;
+ const unsigned char type = insn.extra.elem;
+ const uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(type);
+ ir::ImageInfoKey key;
+ key.index = bti;
+ key.type = type;
+
+ uint32_t offset = this->getImageInfoCurbeOffset(key, dstNum * 4) + GEN_REG_SIZE;
+ for(uint32_t i = 0; i < dstNum; i++) {
+ const uint32_t nr = offset / GEN_REG_SIZE;
+ const uint32_t subnr = (offset % GEN_REG_SIZE) / sizeof(uint32_t);
+ p->MOV(ra->genReg(insn.dst(i)), GenRegister::ud1grf(nr, subnr));
+ offset += 32;
+ }
+ }
+
BVAR(OCL_OUTPUT_REG_ALLOC, false);
BVAR(OCL_OUTPUT_ASM, false);
bool GenContext::emitCode(void) {
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 33258f8..7c28bdf 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -80,7 +80,7 @@ namespace gbe
void emitTernaryInstruction(const SelectionInstruction &insn);
void emitCompareInstruction(const SelectionInstruction &insn);
void emitJumpInstruction(const SelectionInstruction &insn);
- void emitCBMoveInstruction(const SelectionInstruction &insn);
+ void emitIndirectMoveInstruction(const SelectionInstruction &insn);
void emitEotInstruction(const SelectionInstruction &insn);
void emitNoOpInstruction(const SelectionInstruction &insn);
void emitWaitInstruction(const SelectionInstruction &insn);
@@ -92,6 +92,7 @@ namespace gbe
void emitByteScatterInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
+ void emitGetImageInfoInstruction(const SelectionInstruction &insn);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 187224a..c7a1581 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -350,8 +350,6 @@ enum GenMessageTarget {
#define GEN_MATH_FUNCTION_RSQ 5
#define GEN_MATH_FUNCTION_SIN 6 /* was 7 */
#define GEN_MATH_FUNCTION_COS 7 /* was 8 */
-#define GEN_MATH_FUNCTION_SINCOS 8 /* was 6 */
-#define GEN_MATH_FUNCTION_TAN 9 /* gen4 */
#define GEN_MATH_FUNCTION_FDIV 9 /* gen6+ */
#define GEN_MATH_FUNCTION_POW 10
#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index ed7c256..d6c34fb 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -166,6 +166,39 @@ namespace gbe
}
#endif
+ static void setSamplerMessage(GenEncoder *p,
+ GenInstruction *insn,
+ unsigned char bti,
+ unsigned char sampler,
+ uint32_t msg_type,
+ uint32_t response_length,
+ uint32_t msg_length,
+ bool header_present,
+ uint32_t simd_mode,
+ uint32_t return_format)
+ {
+ const GenMessageTarget sfid = GEN_SFID_SAMPLER;
+ setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+ insn->bits3.sampler_gen7.bti = bti;
+ insn->bits3.sampler_gen7.sampler = sampler;
+ insn->bits3.sampler_gen7.msg_type = msg_type;
+ insn->bits3.sampler_gen7.simd_mode = simd_mode;
+ }
+
+
+ static void setTypedWriteMessage(GenEncoder *p,
+ GenInstruction *insn,
+ unsigned char bti,
+ unsigned char msg_type,
+ uint32_t msg_length,
+ bool header_present)
+ {
+ const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+ setMessageDescriptor(p, insn, sfid, msg_length, 0, header_present);
+ insn->bits3.gen7_typed_rw.bti = bti;
+ insn->bits3.gen7_typed_rw.msg_type = msg_type;
+ }
+
//////////////////////////////////////////////////////////////////////////
// Gen Emitter encoding class
//////////////////////////////////////////////////////////////////////////
@@ -800,31 +833,42 @@ namespace gbe
}
void GenEncoder::SAMPLE(GenRegister dest,
- GenRegister src0,
- GenRegister src1,
+ GenRegister msg,
+ bool header_present,
+ unsigned char bti,
+ unsigned char sampler,
+ uint32_t simdWidth,
uint32_t writemask,
uint32_t return_format)
{
if (writemask == 0) return;
-
+ uint32_t msg_type = (simdWidth == 16) ?
+ GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+ uint32_t response_length = (4 * (simdWidth / 8));
+ uint32_t msg_length = (2 * (simdWidth / 8));
+ if (header_present)
+ msg_length++;
+ uint32_t simd_mode = (simdWidth == 16) ?
+ GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
- insn->header.predicate_control = 0; /* XXX */
this->setHeader(insn);
this->setDst(insn, dest);
- this->setSrc0(insn, src0);
- this->setSrc1(insn, src1);
- insn->header.destreg_or_condmod = GEN_SFID_SAMPLER;
+ this->setSrc0(insn, msg);
+ setSamplerMessage(this, insn, bti, sampler, msg_type,
+ response_length, msg_length,
+ header_present,
+ simd_mode, return_format);
}
- void GenEncoder::TYPED_WRITE(GenRegister header, GenRegister desc)
+ void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
{
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
- insn->header.predicate_control = 0; /* XXX */
+ uint32_t msg_type = GEN_TYPED_WRITE;
+ uint32_t msg_length = header_present ? 9 : 8;
this->setHeader(insn);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
- this->setSrc0(insn, header);
- this->setSrc1(insn, desc);
- insn->header.destreg_or_condmod = GEN6_SFID_DATAPORT_RENDER_CACHE;
+ this->setSrc0(insn, msg);
+ setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
}
void GenEncoder::EOT(uint32_t msg) {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 7e26f0a..83d83d2 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -140,14 +140,18 @@ namespace gbe
void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
/*! Send instruction for the sampler */
void SAMPLE(GenRegister dest,
- GenRegister src0,
- GenRegister src1,
+ GenRegister msg,
+ bool header_present,
+ unsigned char bti,
+ unsigned char sampler,
+ unsigned int simdWidth,
uint32_t writemask,
uint32_t return_format);
/*! TypedWrite instruction for texture */
void TYPED_WRITE(GenRegister header,
- GenRegister desc);
+ bool header_present,
+ unsigned char bti);
/*! Extended math function (2 sources) */
void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
/*! Extended math function (1 source) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index ce8769f..a2c0fba 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -5,7 +5,7 @@ DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
-DECL_GEN7_SCHEDULE(CBMove, 20, 2, 2)
+DECL_GEN7_SCHEDULE(IndirectMove, 20, 2, 2)
DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
DECL_GEN7_SCHEDULE(NoOp, 20, 2, 2)
DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
@@ -17,4 +17,5 @@ DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteScatter, 80, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 80, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
+DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 01c525e..95eedfe 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -109,7 +109,7 @@ namespace gbe
insn(insn), refNum(0), retiredCycle(0) {}
bool dependsOn(ScheduleDAGNode *node) const {
GBE_ASSERT(node != NULL);
- for (auto child : children)
+ for (auto child : node->children)
if (child.node == this)
return true;
return false;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2bd9aca..08bc6af 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -440,7 +440,7 @@ namespace gbe
/*! Select instruction with embedded comparison */
void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
/* Constant buffer move instruction */
- void CB_MOVE(Reg dst, Reg src);
+ void INDIRECT_MOVE(Reg dst, Reg src);
/*! EOT is used to finish GPGPU threads */
void EOT(void);
/*! No-op */
@@ -466,9 +466,11 @@ namespace gbe
/*! Encode ternary instructions */
void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
/*! Encode sample instructions */
- void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum);
+ void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler);
/*! Encode typed write instructions */
- void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum);
+ void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
+ /*! Get image information */
+ void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
@@ -700,8 +702,8 @@ namespace gbe
insn->src(1) = src1;
insn->extra.function = conditional;
}
- void Selection::Opaque::CB_MOVE(Reg dst, Reg src) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_CB_MOVE, 1, 1);
+ void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
insn->dst(0) = dst;
insn->src(0) = src;
}
@@ -964,8 +966,11 @@ namespace gbe
this->matchBasicBlock(insnNum);
});
}
- /* XXX always 4 return values? */
- void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum) {
+
+ void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
+ GenRegister *src, uint32_t srcNum,
+ GenRegister *msgPayloads, uint32_t msgNum,
+ uint32_t bti, uint32_t sampler) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
SelectionVector *dstVector = this->appendVector();
SelectionVector *msgVector = this->appendVector();
@@ -987,6 +992,9 @@ namespace gbe
msgVector->regNum = msgNum;
msgVector->isSrc = 1;
msgVector->reg = &insn->src(0);
+
+ insn->extra.function = bti;
+ insn->extra.elem = sampler;
}
///////////////////////////////////////////////////////////////////////////
@@ -999,7 +1007,8 @@ namespace gbe
}
void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
- GenRegister *msgs, uint32_t msgNum) {
+ GenRegister *msgs, uint32_t msgNum,
+ uint32_t bti) {
uint32_t elemID = 0;
uint32_t i;
SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
@@ -1010,6 +1019,7 @@ namespace gbe
for (i = 0; i < srcNum; ++i, ++elemID)
insn->src(elemID) = src[i];
+ insn->extra.function = bti;
insn->extra.elem = msgNum;
// Sends require contiguous allocation
msgVector->regNum = msgNum;
@@ -1017,6 +1027,17 @@ namespace gbe
msgVector->reg = &insn->src(0);
}
+ void Selection::Opaque::GET_IMAGE_INFO(uint32_t infoType, GenRegister *dst,
+ uint32_t dstNum, uint32_t bti) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_GET_IMAGE_INFO, dstNum, 0);
+
+ for(uint32_t i = 0; i < dstNum; ++i)
+ insn->dst(i) = dst[i];
+
+ insn->extra.function = bti;
+ insn->extra.elem = infoType;
+ }
+
Selection::~Selection(void) { GBE_DELETE(this->opaque); }
void Selection::select(void) {
@@ -1546,6 +1567,13 @@ namespace gbe
const Immediate imm = insn.getImmediate();
const GenRegister dst = sel.selReg(insn.getDst(0), type);
+ sel.push();
+ if (sel.isScalarOrBool(insn.getDst(0)) == true) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
switch (type) {
case TYPE_U32:
case TYPE_S32:
@@ -1559,6 +1587,7 @@ namespace gbe
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
default: NOT_SUPPORTED;
}
+ sel.pop();
return true;
}
@@ -1661,7 +1690,7 @@ namespace gbe
sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
}
- void emitCBMove(Selection::Opaque &sel,
+ void emitIndirectMove(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister address) const
{
@@ -1670,7 +1699,7 @@ namespace gbe
const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
const GenRegister src = address;
- sel.CB_MOVE(dst, src);
+ sel.INDIRECT_MOVE(dst, src);
}
INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
@@ -1683,7 +1712,7 @@ namespace gbe
insn.getAddressSpace() == MEM_LOCAL);
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
if (insn.getAddressSpace() == MEM_CONSTANT)
- this->emitCBMove(sel, insn, address);
+ this->emitIndirectMove(sel, insn, address);
else if (insn.isAligned() == true)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
@@ -1957,7 +1986,7 @@ namespace gbe
{
using namespace ir;
GenRegister msgPayloads[4];
- GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
+ GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
for( int i = 0; i < 4; ++i)
msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -1965,10 +1994,15 @@ namespace gbe
for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
- for (uint32_t valueID = 0; valueID < insn.getSrcNum(); ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+ src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
- sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum(), msgPayloads, 4);
+ uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+ (insn.getSrc(SampleInstruction::SURFACE_BTI));
+ uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
+ (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+
+ sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
return true;
}
DECL_CTOR(SampleInstruction, 1, 1);
@@ -1990,22 +2024,40 @@ namespace gbe
for(uint32_t i = 0; i < msgNum; i++)
msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- // bti always uses TYPE_U32.
- src[valueID] = sel.selReg(insn.getSrc(valueID), TYPE_U32);
- valueID++;
// u, v, w coords should use coord type.
for (; valueID < 1 + coordNum; ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
+ src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
- for (; valueID < insn.getSrcNum(); ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
+ src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getSrcType());
- sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum);
+ uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+ (insn.getSrc(TypedWriteInstruction::SURFACE_BTI));
+ sel.TYPED_WRITE(src, insn.getSrcNum() - 1, msgs, msgNum, bti);
return true;
}
DECL_CTOR(TypedWriteInstruction, 1, 1);
};
+ /*! get image info instruction pattern. */
+ DECL_PATTERN(GetImageInfoInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
+ {
+ using namespace ir;
+ const uint32_t infoType = insn.getInfoType();
+ GenRegister dst[4];
+ uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(infoType);
+ for (uint32_t valueID = 0; valueID < dstNum; ++valueID)
+ dst[valueID] = sel.selReg(insn.getDst(valueID), TYPE_U32);
+ uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+ (insn.getSrc(0));
+ sel.GET_IMAGE_INFO(infoType, dst, dstNum, bti);
+ return true;
+ }
+ DECL_CTOR(GetImageInfoInstruction, 1, 1);
+ };
+
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
@@ -2159,7 +2211,6 @@ namespace gbe
SelectionLibrary::SelectionLibrary(void) {
this->insert<UnaryInstructionPattern>();
this->insert<BinaryInstructionPattern>();
- this->insert<SampleInstructionPattern>();
this->insert<TypedWriteInstructionPattern>();
this->insert<SyncInstructionPattern>();
this->insert<LoadImmInstructionPattern>();
@@ -2175,6 +2226,7 @@ namespace gbe
this->insert<MulAddInstructionPattern>();
this->insert<SelectModifierInstructionPattern>();
this->insert<SampleInstructionPattern>();
+ this->insert<GetImageInfoInstructionPattern>();
// Sort all the patterns with the number of instructions they output
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index f89ad4c..455bb92 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -24,7 +24,7 @@ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
DECL_SELECTION_IR(MAD, TernaryInstruction)
DECL_SELECTION_IR(JMPI, JumpInstruction)
DECL_SELECTION_IR(EOT, EotInstruction)
-DECL_SELECTION_IR(CB_MOVE, CBMoveInstruction)
+DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
DECL_SELECTION_IR(NOP, NoOpInstruction)
DECL_SELECTION_IR(WAIT, WaitInstruction)
DECL_SELECTION_IR(MATH, MathInstruction)
@@ -35,3 +35,4 @@ DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 38cc236..c46c681 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -49,10 +49,12 @@
namespace gbe {
Kernel::Kernel(const std::string &name) :
- name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL)
+ name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL), samplerSet(NULL), imageSet(NULL)
{}
Kernel::~Kernel(void) {
if(ctx) GBE_DELETE(ctx);
+ if(samplerSet) GBE_DELETE(samplerSet);
+ if(imageSet) GBE_DELETE(imageSet);
GBE_SAFE_DELETE_ARRAY(args);
}
int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -90,6 +92,8 @@ namespace gbe {
for (const auto &pair : set) {
const std::string &name = pair.first;
Kernel *kernel = this->compileKernel(unit, name);
+ kernel->setSamplerSet(pair.second->getSamplerSet());
+ kernel->setImageSet(pair.second->getImageSet());
kernels.insert(std::make_pair(name, kernel));
}
return true;
@@ -250,6 +254,39 @@ namespace gbe {
return kernel->setConstBufSize(argID, sz);
}
+ static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getSamplerSize();
+ }
+
+ static void kernelGetSamplerData(gbe_kernel gbeKernel, uint32_t *samplers) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getSamplerData(samplers);
+ }
+
+ static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
+ if (gbeKernel == NULL) return 0;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ return kernel->getImageSize();
+ }
+
+ static void kernelGetImageData(gbe_kernel gbeKernel, ImageInfo *images) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getImageData(images);
+ }
+
+ static uint32_t gbeImageBaseIndex = 0;
+ static void setImageBaseIndex(uint32_t baseIdx) {
+ gbeImageBaseIndex = baseIdx;
+ }
+
+ static uint32_t getImageBaseIndex() {
+ return gbeImageBaseIndex;
+ }
+
static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
return 0u;
}
@@ -277,6 +314,12 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL
GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
+GBE_EXPORT_SYMBOL gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
namespace gbe
{
@@ -304,6 +347,12 @@ namespace gbe
gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
gbe_kernel_use_slm = gbe::kernelUseSLM;
+ gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+ gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+ gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+ gbe_kernel_get_image_data = gbe::kernelGetImageData;
+ gbe_get_image_base_index = gbe::getImageBaseIndex;
+ gbe_set_image_base_index = gbe::setImageBaseIndex;
genSetupCallBacks();
}
};
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 575196a..f178f8b 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -71,9 +71,7 @@ enum gbe_curbe_type {
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
GBE_CURBE_GLOBAL_CONSTANT_DATA,
- GBE_CURBE_IMAGE_WIDTH,
- GBE_CURBE_IMAGE_HEIGHT,
- GBE_CURBE_IMAGE_DEPTH,
+ GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
GBE_CURBE_EXTRA_ARGUMENT,
@@ -87,6 +85,31 @@ enum gbe_extra_argument {
GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
};
+typedef struct ImageInfo {
+ int32_t arg_idx;
+ int32_t idx;
+ int32_t wSlot;
+ int32_t hSlot;
+ int32_t depthSlot;
+ int32_t dataTypeSlot;
+ int32_t channelOrderSlot;
+ int32_t dimOrderSlot;
+} ImageInfo;
+
+typedef void (gbe_set_image_base_index_cb)(uint32_t base_idx);
+extern gbe_set_image_base_index_cb *gbe_set_image_base_index;
+
+typedef uint32_t (gbe_get_image_base_index_cb)();
+extern gbe_get_image_base_index_cb *gbe_get_image_base_index;
+
+/*! Get the size of defined images */
+typedef size_t (gbe_kernel_get_image_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
+
+/*! Get the content of defined images */
+typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
+extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
+
/*! Create a new program from the given source code (zero terminated string) */
typedef gbe_program (gbe_program_new_from_source_cb)(const char *source,
size_t stringSize,
@@ -114,6 +137,14 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+/*! Get the size of defined samplers */
+typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
+extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
+
/*! Destroy and deallocate the given program */
typedef void (gbe_program_delete_cb)(gbe_program);
extern gbe_program_delete_cb *gbe_program_delete;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e754899..2d67310 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -29,6 +29,8 @@
#include "backend/context.hpp"
#include "ir/constant.hpp"
#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/sampler.hpp"
#include "sys/hash_map.hpp"
#include "sys/vector.hpp"
#include <string>
@@ -53,9 +55,9 @@ namespace gbe {
INLINE PatchInfo(gbe_curbe_type type, uint32_t subType = 0u, uint32_t offset = 0u) :
type(uint32_t(type)), subType(subType), offset(offset) {}
INLINE PatchInfo(void) {}
- uint32_t type : 8; //!< Type of the patch (see program.h for the list)
- uint32_t subType : 8; //!< Optional sub-type of the patch (see program.h)
- uint32_t offset : 16; //!< Optional offset to encode
+ uint64_t type : 16; //!< Type of the patch (see program.h for the list)
+ uint64_t subType : 32; //!< Optional sub-type of the patch (see program.h)
+ uint64_t offset : 16; //!< Optional offset to encode
};
/*! We will sort PatchInfo to make binary search */
@@ -108,6 +110,22 @@ namespace gbe {
}
return -1;
}
+ /*! Set sampler set. */
+ void setSamplerSet(ir::SamplerSet *from) {
+ samplerSet = from;
+ }
+ /*! Get defined sampler size */
+ size_t getSamplerSize(void) const { return samplerSet->getDataSize(); }
+ /*! Get defined sampler value array */
+ void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
+ /*! Set image set. */
+ void setImageSet(ir::ImageSet * from) {
+ imageSet = from;
+ }
+ /*! Get defined image size */
+ size_t getImageSize(void) const { return imageSet->getDataSize(); }
+ /*! Get defined image value array */
+ void getImageData(ImageInfo *images) const { imageSet->getData(images); }
protected:
friend class Context; //!< Owns the kernels
const std::string name; //!< Kernel name
@@ -119,6 +137,8 @@ namespace gbe {
uint32_t stackSize; //!< Stack size (may be 0 if unused)
bool useSLM; //!< SLM requires a special HW config
Context *ctx; //!< Save context after compiler to alloc constant buffer curbe
+ ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
+ ir::ImageSet *imageSet; //!< Copy from the corresponding function.
GBE_CLASS(Kernel); //!< Use custom allocators
};
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 004afcd..88aae08 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -46,6 +46,8 @@ namespace ir {
name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false)
{
initProfile(*this);
+ samplerSet = GBE_NEW(SamplerSet);
+ imageSet = GBE_NEW(ImageSet);
}
Function::~Function(void) {
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 43aa826..6e712cd 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -28,6 +28,8 @@
#include "ir/register.hpp"
#include "ir/instruction.hpp"
#include "ir/profile.hpp"
+#include "ir/sampler.hpp"
+#include "ir/image.hpp"
#include "sys/vector.hpp"
#include "sys/set.hpp"
#include "sys/map.hpp"
@@ -196,6 +198,18 @@ namespace ir {
GBE_ASSERT(args[ID] != NULL);
return *args[ID];
}
+
+ /*! Get arg ID. */
+ INLINE int32_t getArgID(FunctionArgument *requestArg) {
+ for (uint32_t ID = 0; ID < args.size(); ID++)
+ {
+ if ( args[ID] == requestArg )
+ return ID;
+ }
+ GBE_ASSERTM(0, "Failed to get a valid argument ID.");
+ return -1;
+ }
+
/*! Get the number of pushed registers */
INLINE uint32_t pushedNum(void) const { return pushMap.size(); }
/*! Get the pushed data location for the given register */
@@ -217,6 +231,12 @@ namespace ir {
for (auto arg : args) if (arg->reg == reg) return arg;
return NULL;
}
+
+ INLINE FunctionArgument *getArg(const Register ®) {
+ for (auto arg : args) if (arg->reg == reg) return arg;
+ return NULL;
+ }
+
/*! Get output register */
INLINE Register getOutput(uint32_t ID) const { return outputs[ID]; }
/*! Get the argument location for the pushed register */
@@ -281,6 +301,10 @@ namespace ir {
INLINE bool getUseSLM(void) const { return this->useSLM; }
/*! Change the SLM config for the function */
INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
+ /*! Get sampler set in this function */
+ SamplerSet* getSamplerSet(void) const {return samplerSet; }
+ /*! Get image set in this function */
+ ImageSet* getImageSet(void) const {return imageSet; }
private:
friend class Context; //!< Can freely modify a function
std::string name; //!< Function name
@@ -296,6 +320,8 @@ namespace ir {
LocationMap locationMap; //!< Pushed function arguments (loc->reg)
uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise
bool useSLM; //!< Is SLM required?
+ SamplerSet *samplerSet; //!< samplers used in this function.
+ ImageSet* imageSet; //!< Image set in this function's arguments.
GBE_CLASS(Function); //!< Use custom allocator
};
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
new file mode 100644
index 0000000..9398e22
--- /dev/null
+++ b/backend/src/ir/image.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.cpp
+ *
+ */
+#include "image.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+#include "backend/program.h"
+
+namespace gbe {
+namespace ir {
+
+ static uint32_t getInfoOffset4Type(struct ImageInfo *imageInfo, int type)
+ {
+ switch (type) {
+ case GetImageInfoInstruction::WIDTH: return imageInfo->wSlot;
+ case GetImageInfoInstruction::HEIGHT: return imageInfo->hSlot;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ return 0;
+ }
+
+ static uint32_t setInfoOffset4Type(struct ImageInfo *imageInfo, int type, uint32_t offset)
+ {
+ switch (type) {
+ case GetImageInfoInstruction::WIDTH: imageInfo->wSlot = offset; break;
+ case GetImageInfoInstruction::HEIGHT: imageInfo->hSlot = offset; break;
+ default:
+ NOT_IMPLEMENTED;
+ }
+ return 0;
+ }
+
+ void ImageSet::appendInfo(ImageInfoKey key, uint32_t offset)
+ {
+ auto it = indexMap.find(key.index);
+ assert(it != indexMap.end());
+ struct ImageInfo *imageInfo = it->second;
+ setInfoOffset4Type(imageInfo, key.type, offset);
+ }
+
+ void ImageSet::append(Register imageReg, Context *ctx)
+ {
+ ir::FunctionArgument *arg = ctx->getFunction().getArg(imageReg);
+ GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
+ GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
+
+ int32_t id = ctx->getFunction().getArgID(arg);
+ struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
+ imageInfo->arg_idx = id;
+ imageInfo->idx = regMap.size() + gbe_get_image_base_index();
+ imageInfo->wSlot = -1;
+ imageInfo->hSlot = -1;
+ imageInfo->depthSlot = -1;
+ imageInfo->dataTypeSlot = -1;
+ imageInfo->channelOrderSlot = -1;
+ imageInfo->dimOrderSlot = -1;
+ regMap.insert(std::make_pair(imageReg, imageInfo));
+ indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
+ }
+
+ const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
+ {
+ auto it = indexMap.find(key.index);
+ if (it == indexMap.end())
+ return -1;
+ struct ImageInfo *imageInfo = it->second;
+ return getInfoOffset4Type(imageInfo, key.type);
+ }
+
+ const uint32_t ImageSet::getIdx(const Register imageReg) const
+ {
+ auto it = regMap.find(imageReg);
+ GBE_ASSERT(it != regMap.end());
+ return it->second->idx;
+ }
+
+ void ImageSet::getData(struct ImageInfo *imageInfos) const {
+ for(auto &it : regMap)
+ imageInfos[it.second->idx - gbe_get_image_base_index()] = *it.second;
+ }
+
+ ImageSet::~ImageSet() {
+ for(auto &it : regMap)
+ GBE_DELETE(it.second);
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
new file mode 100644
index 0000000..04e78e6
--- /dev/null
+++ b/backend/src/ir/image.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.hpp
+ *
+ */
+#ifndef __GBE_IR_IMAGE_HPP__
+#define __GBE_IR_IMAGE_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp" // for ImageInfoKey
+#include "sys/map.hpp"
+
+extern "C" {
+ struct ImageInfo;
+}
+
+namespace gbe {
+namespace ir {
+
+ class Context;
+ /*! An image set is a set of images which are defined in kernel args.
+ * We use this set to gather the images here and allocate a unique index
+ * for each individual image. And that individual image could be used
+ * at the backend to identify this image's location.
+ */
+ class ImageSet
+ {
+ public:
+ /*! Append an image argument. */
+ void append(Register imageReg, Context *ctx);
+ /*! Append an image info slot. */
+ void appendInfo(ImageInfoKey key, uint32_t offset);
+ /*! Get the image's index (actual location). */
+ const uint32_t getIdx(const Register imageReg) const;
+ size_t getDataSize(void) { return regMap.size(); }
+ size_t getDataSize(void) const { return regMap.size(); }
+
+ const int32_t getInfoOffset(ImageInfoKey key) const;
+ void getData(struct ImageInfo *imageInfos) const;
+ void operator = (const ImageSet& other) {
+ regMap.insert(other.regMap.begin(), other.regMap.end());
+ }
+ ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
+ ImageSet() {}
+ ~ImageSet();
+ private:
+ map<Register, struct ImageInfo *> regMap;
+ map<uint32_t, struct ImageInfo *> indexMap;
+ GBE_CLASS(ImageSet);
+ };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMAGE_HPP__ */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 8980abf..a57c204 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -470,6 +470,39 @@ namespace ir {
Register dst[0]; //!< No dest register
};
+ class ALIGNED_INSTRUCTION GetImageInfoInstruction :
+ public BasePolicy,
+ public NSrcPolicy<GetImageInfoInstruction, 1>,
+ public TupleDstPolicy<GetImageInfoInstruction>
+ {
+ public:
+ GetImageInfoInstruction( int type,
+ Tuple dst,
+ Register src)
+ {
+ this->opcode = OP_GET_IMAGE_INFO;
+ this->infoType = type;
+ this->dst = dst;
+ this->src[0] = src;
+ }
+
+ INLINE uint32_t getInfoType(void) const { return infoType; }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << "." << this->getInfoType()
+ << " surface id %" << this->getSrc(fn, 0)
+ << " %" << this->getDst(fn, 0);
+ }
+
+ uint8_t infoType; //!< Type of the requested information.
+ Register src[1]; //!< Surface to get info
+ Tuple dst; //!< dest register to put the information.
+ static const uint32_t dstNum = 4; //! The maximum dst number. Not the actual number
+ // of destination tuple. We use the infoType to determine
+ // the actual num.
+ };
+
class ALIGNED_INSTRUCTION LoadImmInstruction :
public BasePolicy,
public NSrcPolicy<LoadImmInstruction, 0>,
@@ -758,6 +791,8 @@ namespace ir {
{ return true; }
INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+ INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
// Ensure that types and register family match
INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
@@ -915,7 +950,6 @@ namespace ir {
case MEM_CONSTANT: return out << "constant";
case MEM_PRIVATE: return out << "private";
case IMAGE: return out << "image";
- case SAMPLER: return out << "sampler";
case MEM_INVALID: return out << "invalid";
};
return out;
@@ -991,6 +1025,10 @@ START_INTROSPECTION(TypedWriteInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(TypedWriteInstruction)
+START_INTROSPECTION(GetImageInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetImageInfoInstruction)
+
START_INTROSPECTION(LoadImmInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(LoadImmInstruction)
@@ -1176,6 +1214,7 @@ DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
+DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
#undef DECL_MEM_FN
@@ -1317,6 +1356,10 @@ DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
return internal::TypedWriteInstruction(src, srcType, coordType).convert();
}
+ Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src) {
+ return internal::GetImageInfoInstruction(infoType, dst, src).convert();
+ }
+
std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
const Function &fn = insn.getFunction();
switch (insn.getOpcode()) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index ec5d250..c948d2c 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -50,7 +50,6 @@ namespace ir {
MEM_CONSTANT, //!< Immutable global memory
MEM_PRIVATE, //!< Per thread private memory
IMAGE, //!< For texture image.
- SAMPLER, //!< For sampler.
MEM_INVALID
};
@@ -298,6 +297,9 @@ namespace ir {
/*! Store data in an texture */
class TypedWriteInstruction : public Instruction {
public:
+ enum {
+ SURFACE_BTI = 0
+ };
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
Type getSrcType(void) const;
@@ -307,12 +309,51 @@ namespace ir {
/*! Load texels from a texture */
class SampleInstruction : public Instruction {
public:
+ enum {
+ SURFACE_BTI = 0,
+ SAMPLER_BTI = 1
+ };
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
Type getSrcType(void) const;
Type getDstType(void) const;
};
+ typedef union {
+ struct {
+ uint16_t index; /*! the allocated image index */
+ uint16_t type; /*! the information type */
+ };
+ uint32_t data;
+ } ImageInfoKey;
+ /*! Get image information */
+ class GetImageInfoInstruction : public Instruction {
+ public:
+ enum {
+ SURFACE_BTI = 0
+ };
+ enum {
+ WIDTH = 0,
+ HEIGHT = 1,
+ };
+
+ static INLINE uint32_t getDstNum4Type(int infoType) {
+ switch (infoType) {
+ case GetImageInfoInstruction::WIDTH:
+ case GetImageInfoInstruction::HEIGHT:
+ return 1;
+ break;
+ default:
+ GBE_ASSERT(0);
+ }
+ return 0;
+ }
+
+ uint32_t getInfoType() const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
/*! Branch instruction is the unified way to branch (with or without
* predicate)
*/
@@ -522,6 +563,8 @@ namespace ir {
Instruction TYPED_WRITE(Tuple src, Type srcType, Type coordType);
/*! sample textures */
Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
+ /*! get image information , such as width/height/depth/... */
+ Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
/*! label labelIndex */
Instruction LABEL(LabelIndex labelIndex);
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b16c22f..5cf37d2 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -70,4 +70,4 @@ DECL_INSN(TYPED_WRITE, TypedWriteInstruction)
DECL_INSN(SAMPLE, SampleInstruction)
DECL_INSN(SYNC, SyncInstruction)
DECL_INSN(LABEL, LabelInstruction)
-
+DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
new file mode 100644
index 0000000..62bdc16
--- /dev/null
+++ b/backend/src/ir/sampler.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.cpp
+ *
+ */
+#include "sampler.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+
+namespace gbe {
+namespace ir {
+
+ const uint32_t SamplerSet::getIdx(const Register reg) const
+ {
+ auto it = regMap.find(reg);
+ GBE_ASSERT(it != regMap.end());
+ return it->second.slot;
+ }
+
+ void SamplerSet::appendReg(const Register reg, uint32_t key, Context *ctx) {
+ struct SamplerRegSlot samplerSlot;
+ samplerSlot.reg = reg;
+ samplerSlot.slot = samplerMap.size();
+ samplerMap.insert(std::make_pair(key, samplerSlot));
+ regMap.insert(std::make_pair(samplerSlot.reg, samplerSlot));
+ }
+
+ Register SamplerSet::append(uint32_t samplerValue, Context *ctx)
+ {
+ auto it = samplerMap.find(samplerValue);
+ if (it != samplerMap.end())
+ return it->second.reg;
+ // This register is just used as a key.
+ Register reg = ctx->reg(FAMILY_DWORD);
+ appendReg(reg, samplerValue, ctx);
+ return reg;
+ }
+
+#define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
+ void SamplerSet::append(Register samplerReg, Context *ctx)
+ {
+ ir::FunctionArgument *arg = ctx->getFunction().getArg(samplerReg);
+ GBE_ASSERT(arg != NULL);
+
+ // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
+ // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
+ // work around.
+ arg->type = ir::FunctionArgument::SAMPLER;
+ int32_t id = ctx->getFunction().getArgID(arg);
+ GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
+
+ auto it = samplerMap.find(SAMPLER_ID(id));
+ if (it != samplerMap.end()) {
+ GBE_ASSERT(it->second.reg == samplerReg);
+ return;
+ }
+ appendReg(samplerReg, SAMPLER_ID(id), ctx);
+ }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
new file mode 100644
index 0000000..f968299
--- /dev/null
+++ b/backend/src/ir/sampler.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_SAMPLER_HPP__
+#define __GBE_IR_SAMPLER_HPP__
+
+#include "ir/register.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+ /*! A sampler set is a set of global samplers which are defined as constant global
+ * sampler or defined in the outermost kernel scope variables. According to the spec
+ * all the variables should have an initialized integer value and can't be modified.
+ */
+ class Context;
+
+ struct SamplerRegSlot {
+ Register reg;
+ uint32_t slot;
+ };
+
+ class SamplerSet
+ {
+ public:
+ /*! Append the specified sampler and return the allocated offset.
+ * If the specified sampler already exists, only return the previous offset and
+ * don't append it again. Return -1 if it failed. */
+ Register append(uint32_t clkSamplerValue, Context *ctx);
+ /*! Append a sampler defined in kernel args. */
+ void append(Register samplerArg, Context *ctx);
+ /*! Get the sampler idx (actual location) */
+ const uint32_t getIdx(const Register reg) const;
+ size_t getDataSize(void) { return samplerMap.size(); }
+ size_t getDataSize(void) const { return samplerMap.size(); }
+ void getData(uint32_t *samplers) const {
+ for(auto &it : samplerMap)
+ samplers[it.second.slot] = it.first;
+ }
+
+ void operator = (const SamplerSet& other) {
+ regMap.insert(other.regMap.begin(), other.regMap.end());
+ samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
+ }
+
+ SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
+ SamplerSet() {}
+ private:
+ void appendReg(const Register reg, uint32_t key, Context *ctx);
+ map<uint32_t, SamplerRegSlot> samplerMap;
+ map<Register, SamplerRegSlot> regMap;
+ GBE_CLASS(SamplerSet);
+ };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_SAMPLER_HPP__ */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index ae78638..3b293f5 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -24,9 +24,12 @@
#ifndef __GBE_IR_UNIT_HPP__
#define __GBE_IR_UNIT_HPP__
+#include "llvm/Value.h"
+
#include "ir/constant.hpp"
#include "ir/register.hpp"
#include "sys/hash_map.hpp"
+#include "sys/map.hpp"
namespace gbe {
namespace ir {
@@ -41,6 +44,7 @@ namespace ir {
{
public:
typedef hash_map<std::string, Function*> FunctionSet;
+ typedef std::pair<llvm::Value*, uint32_t> ValueIndex;
/*! Create an empty unit */
Unit(PointerSize pointerSize = POINTER_32_BITS);
/*! Release everything (*including* the function pointers) */
@@ -71,11 +75,27 @@ namespace ir {
ConstantSet& getConstantSet(void) { return constantSet; }
/*! Return the constant set */
const ConstantSet& getConstantSet(void) const { return constantSet; }
+
+ /*! Some values will not be allocated. For example a vector extract and
+ * a vector insertion when scalarize the vector load/store
+ */
+ void newValueProxy(llvm::Value *real,
+ llvm::Value *fake,
+ uint32_t realIndex = 0u,
+ uint32_t fakeIndex = 0u) {
+ const ValueIndex key(fake, fakeIndex);
+ const ValueIndex value(real, realIndex);
+ GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+ valueMap[key] = value;
+ }
+ /*! Return the value map */
+ const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
private:
friend class ContextInterface; //!< Can free modify the unit
hash_map<std::string, Function*> functions; //!< All the defined functions
ConstantSet constantSet; //!< All the constants defined in the unit
PointerSize pointerSize; //!< Size shared by all pointers
+ map<ValueIndex, ValueIndex> valueMap; //!< fake to real value map for vector load/store
GBE_CLASS(Unit);
};
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ad465e2..deda687 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -60,7 +60,7 @@
* dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
* 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
* kind of similar, or at least they are similar enough to share the same front
- * end.
+ * end.
*
* Problems
* ========
@@ -126,10 +126,8 @@
#include "ir/context.hpp"
#include "ir/unit.hpp"
#include "ir/liveness.hpp"
-#include "sys/map.hpp"
#include "sys/set.hpp"
#include "sys/cvar.hpp"
-#include <algorithm>
/* Not defined for LLVM 3.0 */
#if !defined(LLVM_VERSION_MAJOR)
@@ -207,7 +205,7 @@ namespace gbe
/*! Type to register family translation */
static ir::RegisterFamily getFamily(const ir::Context &ctx, const Type *type)
{
- GBE_ASSERT(isScalarType(type) == true);
+ GBE_ASSERT(isScalarType(type) == true);
if (type == Type::getInt1Ty(type->getContext()))
return ir::FAMILY_BOOL;
if (type == Type::getInt8Ty(type->getContext()))
@@ -256,7 +254,6 @@ namespace gbe
case 2: return ir::MEM_CONSTANT;
case 3: return ir::MEM_LOCAL;
case 4: return ir::IMAGE;
- case 5: return ir::SAMPLER;
}
GBE_ASSERT(false);
return ir::MEM_GLOBAL;
@@ -270,6 +267,8 @@ namespace gbe
class RegisterTranslator
{
public:
+ /*! Indices will be zero for scalar values */
+ typedef std::pair<Value*, uint32_t> ValueIndex;
RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
/*! Empty the maps */
@@ -290,6 +289,11 @@ namespace gbe
GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
valueMap[key] = value;
}
+ /*! After the scalarize pass, there are some value mappings in the unit;
+ * use this function to copy them from the unit's valueMap */
+ void initValueMap(const map<ValueIndex, ValueIndex>& vMap) {
+ valueMap.insert(vMap.begin(), vMap.end());
+ }
/*! Mostly used for the preallocated registers (lids, gids) */
void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
const ValueIndex key(value, index);
@@ -326,10 +330,9 @@ namespace gbe
};
return ir::Register();
}
- /*! Get the register from the given value at given index possibly iterating
- * in the value map to get the final real register
- */
- ir::Register getScalar(Value *value, uint32_t index = 0u) {
+
+ /*! iterating in the value map to get the final real register */
+ void getRealValue(Value* &value, uint32_t& index) {
auto end = valueMap.end();
for (;;) {
auto it = valueMap.find(std::make_pair(value, index));
@@ -340,6 +343,14 @@ namespace gbe
index = it->second.second;
}
}
+ }
+
+ /*! Get the register from the given value at given index possibly iterating
+ * in the value map to get the final real register
+ */
+ ir::Register getScalar(Value *value, uint32_t index = 0u) {
+ getRealValue(value, index);
+
const auto key = std::make_pair(value, index);
GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
return scalarMap[key];
@@ -352,16 +363,8 @@ namespace gbe
}
/*! Says if the value exists. Otherwise, it is undefined */
bool valueExists(Value *value, uint32_t index) {
- auto end = valueMap.end();
- for (;;) {
- auto it = valueMap.find(std::make_pair(value, index));
- if (it == end)
- break;
- else {
- value = it->second.first;
- index = it->second.second;
- }
- }
+ getRealValue(value, index);
+
const auto key = std::make_pair(value, index);
return scalarMap.find(key) != scalarMap.end();
}
@@ -376,8 +379,6 @@ namespace gbe
this->insertRegister(reg, key, index);
return reg;
}
- /*! Indices will be zero for scalar values */
- typedef std::pair<Value*, uint32_t> ValueIndex;
/*! Map value to ir::Register */
map<ValueIndex, ir::Register> scalarMap;
/*! Map values to values when this is only a translation (eq bitcast) */
@@ -385,28 +386,6 @@ namespace gbe
/*! Actually allocates the registers */
ir::Context &ctx;
};
- /*! All intrinsic Gen functions */
- enum OCLInstrinsic {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
- };
-
- /*! Build the hash map for OCL functions on Gen */
- struct OCLIntrinsicMap {
- /*! Build the intrinsic hash map */
- OCLIntrinsicMap(void) {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
- map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
- }
- /*! Sort intrinsics with their names */
- hash_map<std::string, OCLInstrinsic> map;
- };
-
- /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
- static const OCLIntrinsicMap instrinsicMap;
/*! Translate LLVM IR code to Gen IR code */
class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
@@ -424,7 +403,7 @@ namespace gbe
*/
set<const Value*> conditionSet;
/*! We visit each function twice. Once to allocate the registers and once to
- * emit the Gen IR instructions
+ * emit the Gen IR instructions
*/
enum Pass {
PASS_EMIT_REGISTERS = 0,
@@ -664,7 +643,7 @@ namespace gbe
if (dyn_cast<ConstantAggregateZero>(CPV)) {
return doIt(uint32_t(0)); // XXX Handle type
} else {
- if (dyn_cast<ConstantVector>(CPV))
+ if (dyn_cast<ConstantVector>(CPV))
CPV = extractConstantElem(CPV, index);
GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
@@ -757,6 +736,9 @@ namespace gbe
}
ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+    // The real value may be a constant, so resolve the real value before the constant check
+ regTranslator.getRealValue(value, elemID);
+
if (dyn_cast<ConstantExpr>(value)) {
ConstantExpr *ce = dyn_cast<ConstantExpr>(value);
if(ce->isCast()) {
@@ -831,32 +813,29 @@ namespace gbe
PHINode *PN = cast<PHINode>(I);
Value *IV = PN->getIncomingValueForBlock(curr);
if (!isa<UndefValue>(IV)) {
- uint32_t elemNum;
Type *llvmType = PN->getType();
GBE_ASSERTM(llvmType != Type::getInt1Ty(llvmType->getContext()),
"TODO Boolean values cannot escape their definition basic block");
- const ir::Type type = getVectorInfo(ctx, llvmType, PN, elemNum);
+ const ir::Type type = getType(ctx, llvmType);
// Emit the MOV required by the PHI function. We do it simple and do not
// try to optimize them. A next data flow analysis pass on the Gen IR
// will remove them
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- Value *PHICopy = this->getPHICopy(PN);
- const ir::Register dst = this->getRegister(PHICopy, elemID);
- Constant *CP = dyn_cast<Constant>(IV);
- if (CP) {
- GBE_ASSERT(isa<GlobalValue>(CP) == false);
- ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
- if (CPV && dyn_cast<ConstantVector>(CPV) &&
- isa<UndefValue>(extractConstantElem(CPV, elemID)))
- continue;
- const ir::ImmediateIndex immIndex = this->newImmediate(CP, elemID);
- const ir::Immediate imm = ctx.getImmediate(immIndex);
- ctx.LOADI(imm.type, dst, immIndex);
- } else if (regTranslator.valueExists(IV,elemID) || dyn_cast<Constant>(IV)) {
- const ir::Register src = this->getRegister(IV, elemID);
- ctx.MOV(type, dst, src);
- }
+ Value *PHICopy = this->getPHICopy(PN);
+ const ir::Register dst = this->getRegister(PHICopy);
+ Constant *CP = dyn_cast<Constant>(IV);
+ if (CP) {
+ GBE_ASSERT(isa<GlobalValue>(CP) == false);
+ ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
+ if (CPV && dyn_cast<ConstantVector>(CPV) &&
+ isa<UndefValue>(extractConstantElem(CPV, 0)))
+ continue;
+ const ir::ImmediateIndex immIndex = this->newImmediate(CP);
+ const ir::Immediate imm = ctx.getImmediate(immIndex);
+ ctx.LOADI(imm.type, dst, immIndex);
+ } else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
+ const ir::Register src = this->getRegister(IV);
+ ctx.MOV(type, dst, src);
}
}
}
@@ -868,6 +847,7 @@ namespace gbe
"Returned value for kernel functions is forbidden");
// Loop over the arguments and output registers for them
if (!F.arg_empty()) {
+ uint32_t argID = 0;
Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
// Insert a new register for each function argument
@@ -876,10 +856,33 @@ namespace gbe
uint32_t argID = 1; // Start at one actually
for (; I != E; ++I, ++argID) {
#else
- for (; I != E; ++I) {
+ for (; I != E; ++I, ++argID) {
#endif /* LLVM_VERSION_MINOR <= 1 */
const std::string &argName = I->getName().str();
Type *type = I->getType();
+
+ //add support for vector argument
+ if(type->isVectorTy()) {
+ VectorType *vectorType = cast<VectorType>(type);
+
+ this->newRegister(I);
+ ir::Register reg = getRegister(I, 0);
+
+ Type *elemType = vectorType->getElementType();
+ const uint32_t elemSize = getTypeByteSize(unit, elemType);
+ const uint32_t elemNum = vectorType->getNumElements();
+        // A vector's element type is always a scalar type
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize);
+
+ ir::Function& fn = ctx.getFunction();
+ for(uint32_t i=1; i < elemNum; i++) {
+ ir::PushLocation argLocation(fn, argID, elemSize*i);
+ reg = getRegister(I, i);
+ ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc
+ }
+ continue;
+ }
+
GBE_ASSERTM(isScalarType(type) == true,
"vector type in the function argument is not supported yet");
const ir::Register reg = regTranslator.newScalar(I);
@@ -915,9 +918,7 @@ namespace gbe
break;
case ir::IMAGE:
ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize);
- break;
- case ir::SAMPLER:
- ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, ptrSize);
+ ctx.getFunction().getImageSet()->append(reg, &ctx);
break;
default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
}
@@ -1143,6 +1144,7 @@ namespace gbe
ctx.startFunction(F.getName());
this->regTranslator.clear();
+ this->regTranslator.initValueMap(unit.getValueMap());
this->labelMap.clear();
this->emitFunctionPrototype(F);
@@ -1232,36 +1234,33 @@ namespace gbe
#endif /* GBE_DEBUG */
// Get the element type for a vector
- uint32_t elemNum;
- const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
+ const ir::Type type = getType(ctx, I.getType());
// Emit the instructions in a row
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
- const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
- switch (I.getOpcode()) {
- case Instruction::Add:
- case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
- case Instruction::Sub:
- case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
- case Instruction::Mul:
- case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
- case Instruction::And: ctx.AND(type, dst, src0, src1); break;
- case Instruction::Or: ctx.OR(type, dst, src0, src1); break;
- case Instruction::Xor: ctx.XOR(type, dst, src0, src1); break;
- case Instruction::Shl: ctx.SHL(type, dst, src0, src1); break;
- case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
- case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
- default: NOT_SUPPORTED;
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+ switch (I.getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
+ case Instruction::Sub:
+ case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
+ case Instruction::Mul:
+ case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
+ case Instruction::And: ctx.AND(type, dst, src0, src1); break;
+ case Instruction::Or: ctx.OR(type, dst, src0, src1); break;
+ case Instruction::Xor: ctx.XOR(type, dst, src0, src1); break;
+ case Instruction::Shl: ctx.SHL(type, dst, src0, src1); break;
+ case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+ case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
}
}
@@ -1289,49 +1288,46 @@ namespace gbe
GBE_ASSERT(I.getOperand(0)->getType() != Type::getInt1Ty(I.getContext()));
// Get the element type and the number of elements
- uint32_t elemNum;
Type *operandType = I.getOperand(0)->getType();
- const ir::Type type = getVectorInfo(ctx, operandType, &I, elemNum);
+ const ir::Type type = getType(ctx, operandType);
const ir::Type signedType = makeTypeSigned(type);
const ir::Type unsignedType = makeTypeUnsigned(type);
// Emit the instructions in a row
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
- const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
- // We must invert the condition to simplify the branch code
- if (conditionSet.find(&I) != conditionSet.end()) {
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_EQ: ctx.NE(type, dst, src0, src1); break;
- case ICmpInst::ICMP_NE: ctx.EQ(type, dst, src0, src1); break;
- case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
- case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
- default: NOT_SUPPORTED;
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+ // We must invert the condition to simplify the branch code
+ if (conditionSet.find(&I) != conditionSet.end()) {
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_NE: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
+ case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
}
- // Nothing special to do
- else {
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_EQ: ctx.EQ(type, dst, src0, src1); break;
- case ICmpInst::ICMP_NE: ctx.NE(type, dst, src0, src1); break;
- case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
- case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
- case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
- case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
- default: NOT_SUPPORTED;
- }
+ }
+ // Nothing special to do
+ else {
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_NE: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
+ case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
+ case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
}
}
}
@@ -1343,31 +1339,28 @@ namespace gbe
void GenWriter::emitFCmpInst(FCmpInst &I) {
// Get the element type and the number of elements
- uint32_t elemNum;
Type *operandType = I.getOperand(0)->getType();
- const ir::Type type = getVectorInfo(ctx, operandType, &I, elemNum);
+ const ir::Type type = getType(ctx, operandType);
// Emit the instructions in a row
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
- const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
- switch (I.getPredicate()) {
- case ICmpInst::FCMP_OEQ:
- case ICmpInst::FCMP_UEQ: ctx.EQ(type, dst, src0, src1); break;
- case ICmpInst::FCMP_ONE:
- case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
- case ICmpInst::FCMP_OLE:
- case ICmpInst::FCMP_ULE: ctx.LE(type, dst, src0, src1); break;
- case ICmpInst::FCMP_OGE:
- case ICmpInst::FCMP_UGE: ctx.GE(type, dst, src0, src1); break;
- case ICmpInst::FCMP_OLT:
- case ICmpInst::FCMP_ULT: ctx.LT(type, dst, src0, src1); break;
- case ICmpInst::FCMP_OGT:
- case ICmpInst::FCMP_UGT: ctx.GT(type, dst, src0, src1); break;
- default: NOT_SUPPORTED;
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+ switch (I.getPredicate()) {
+ case ICmpInst::FCMP_OEQ:
+ case ICmpInst::FCMP_UEQ: ctx.EQ(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_ONE:
+ case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OLE:
+ case ICmpInst::FCMP_ULE: ctx.LE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OGE:
+ case ICmpInst::FCMP_UGE: ctx.GE(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OLT:
+ case ICmpInst::FCMP_ULT: ctx.LT(type, dst, src0, src1); break;
+ case ICmpInst::FCMP_OGT:
+ case ICmpInst::FCMP_UGT: ctx.GT(type, dst, src0, src1); break;
+ default: NOT_SUPPORTED;
}
}
@@ -1397,10 +1390,7 @@ namespace gbe
// Bitcast just forward registers
case Instruction::BitCast:
{
- uint32_t elemNum;
- getVectorInfo(ctx, I.getType(), &I, elemNum);
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- regTranslator.newValueProxy(srcValue, dstValue, elemID, elemID);
+ regTranslator.newValueProxy(srcValue, dstValue);
}
break;
// Various conversion operations -> just allocate registers for them
@@ -1448,15 +1438,14 @@ namespace gbe
case Instruction::Trunc:
{
// Get the element type for a vector
- uint32_t elemNum;
Type *llvmDstType = I.getType();
Type *llvmSrcType = I.getOperand(0)->getType();
- const ir::Type dstType = getVectorInfo(ctx, llvmDstType, &I, elemNum);
+ const ir::Type dstType = getType(ctx, llvmDstType);
ir::Type srcType;
if (I.getOpcode() == Instruction::ZExt) {
- srcType = getVectorInfo(ctx, llvmSrcType, &I, elemNum, true);
+ srcType = getUnsignedType(ctx, llvmSrcType);
} else {
- srcType = getVectorInfo(ctx, llvmSrcType, &I, elemNum);
+ srcType = getType(ctx, llvmSrcType);
}
// We use a select (0,1) not a convert when the destination is a boolean
@@ -1468,19 +1457,15 @@ namespace gbe
const ir::Register oneReg = ctx.reg(family);
ctx.LOADI(dstType, zeroReg, zero);
ctx.LOADI(dstType, oneReg, one);
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src = this->getRegister(I.getOperand(0), elemID);
- ctx.SEL(dstType, dst, src, oneReg, zeroReg);
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ ctx.SEL(dstType, dst, src, oneReg, zeroReg);
}
// Use a convert for the other cases
else {
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src = this->getRegister(I.getOperand(0), elemID);
- ctx.CVT(dstType, srcType, dst, src);
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ ctx.CVT(dstType, srcType, dst, src);
}
}
break;
@@ -1497,141 +1482,15 @@ namespace gbe
ir::Context &ctx;
};
- void GenWriter::regAllocateInsertElement(InsertElementInst &I) {
- Value *modified = I.getOperand(0);
- Value *toInsert = I.getOperand(1);
- Value *index = I.getOperand(2);
-
- // Get the index for the insertion
- Constant *CPV = dyn_cast<Constant>(index);
- GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
- auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
- GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
- "Invalid index type for InsertElement");
-
- // Crash on overrun
- VectorType *vectorType = cast<VectorType>(modified->getType());
- const uint32_t elemNum = vectorType->getNumElements();
- const uint32_t modifiedID = x.data.u32;
- GBE_ASSERTM(modifiedID < elemNum, "Out-of-bound index for InsertElement");
-
- // The source vector is not constant
- if (!isa<Constant>(modified) || isa<UndefValue>(modified)) {
- // Non modified values are just proxies
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- if (elemID != modifiedID)
- regTranslator.newValueProxy(modified, &I, elemID, elemID);
- }
- // The source vector is constant
- else {
- // Non modified values will use LOADI
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- if (elemID != modifiedID) {
- const ir::Type type = getType(ctx, toInsert->getType());
- const ir::Register reg = ctx.reg(getFamily(type));
- regTranslator.insertRegister(reg, &I, elemID);
- }
- }
-
- // If the element to insert is an immediate we will generate a LOADI.
- // Otherwise, the value is just a proxy of the inserted value
- if (dyn_cast<Constant>(toInsert) != NULL) {
- const ir::Type type = getType(ctx, toInsert->getType());
- const ir::Register reg = ctx.reg(getFamily(type));
- regTranslator.insertRegister(reg, &I, modifiedID);
- } else
- regTranslator.newValueProxy(toInsert, &I, 0, modifiedID);
- }
-
- void GenWriter::emitInsertElement(InsertElementInst &I) {
- // Note that we check everything in regAllocateInsertElement
- Value *modified = I.getOperand(0);
- Value *toInsert = I.getOperand(1);
- Value *index = I.getOperand(2);
-
- // Get the index of the value to insert
- Constant *indexCPV = dyn_cast<Constant>(index);
- auto x = processConstant<ir::Immediate>(indexCPV, InsertExtractFunctor(ctx));
- const uint32_t modifiedID = x.data.u32;
-
- // The source vector is constant. We need to insert LOADI for the unmodified
- // values
- if (isa<Constant>(modified) && !isa<UndefValue>(modified)) {
- VectorType *vectorType = cast<VectorType>(modified->getType());
- const uint32_t elemNum = vectorType->getNumElements();
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
- if (elemID != modifiedID) {
- Constant *sourceCPV = dyn_cast<Constant>(modified);
- if (isa<UndefValue>(extractConstantElem(sourceCPV, elemID)) == false) {
- const ir::ImmediateIndex immIndex = this->newImmediate(sourceCPV, elemID);
- const ir::Immediate imm = ctx.getImmediate(immIndex);
- const ir::Register reg = regTranslator.getScalar(&I, elemID);
- ctx.LOADI(imm.type, reg, immIndex);
- }
- }
- }
-
- // If the inserted value is not a constant, we just use a proxy
- if (dyn_cast<Constant>(toInsert) == NULL)
- return;
+  /*! Fake insert/extract instructions are still generated for
+   *  load/store, so keep these functions empty here */
+ void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+ void GenWriter::emitInsertElement(InsertElementInst &I) {}
- // We need a LOADI if we insert an immediate
- Constant *toInsertCPV = dyn_cast<Constant>(toInsert);
- const ir::ImmediateIndex immIndex = this->newImmediate(toInsertCPV);
- const ir::Immediate imm = ctx.getImmediate(immIndex);
- const ir::Register reg = regTranslator.getScalar(&I, modifiedID);
- ctx.LOADI(imm.type, reg, immIndex);
- }
-
- void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
- Value *extracted = I.getOperand(0);
- Value *index = I.getOperand(1);
- GBE_ASSERTM(isa<Constant>(extracted) == false,
- "TODO support constant vector for extract");
- Constant *CPV = dyn_cast<Constant>(index);
- GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
- auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
- GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
- "Invalid index type for InsertElement");
-
- // Crash on overrun
- const uint32_t extractedID = x.data.u32;
-#if GBE_DEBUG
- VectorType *vectorType = cast<VectorType>(extracted->getType());
- const uint32_t elemNum = vectorType->getNumElements();
- GBE_ASSERTM(extractedID < elemNum, "Out-of-bound index for InsertElement");
-#endif /* GBE_DEBUG */
-
- // Easy when the vector is not immediate
- regTranslator.newValueProxy(extracted, &I, extractedID, 0);
- }
-
- void GenWriter::emitExtractElement(ExtractElementInst &I) {
- // TODO -> insert LOADI when the extracted vector is constant
- }
-
- void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {
- Value *first = I.getOperand(0);
- Value *second = I.getOperand(1);
- GBE_ASSERTM(!isa<Constant>(first) || isa<UndefValue>(first),
- "TODO support constant vector for shuffle");
- GBE_ASSERTM(!isa<Constant>(second) || isa<UndefValue>(second),
- "TODO support constant vector for shuffle");
- VectorType *dstType = cast<VectorType>(I.getType());
- VectorType *srcType = cast<VectorType>(first->getType());
- const uint32_t dstElemNum = dstType->getNumElements();
- const uint32_t srcElemNum = srcType->getNumElements();
- for (uint32_t elemID = 0; elemID < dstElemNum; ++elemID) {
- uint32_t srcID = I.getMaskValue(elemID);
- Value *src = first;
- if (srcID >= srcElemNum) {
- srcID -= srcElemNum;
- src = second;
- }
- regTranslator.newValueProxy(src, &I, srcID, elemID);
- }
- }
+ void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
+ void GenWriter::emitExtractElement(ExtractElementInst &I) {}
+ void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
void GenWriter::regAllocateSelectInst(SelectInst &I) {
@@ -1640,22 +1499,14 @@ namespace gbe
void GenWriter::emitSelectInst(SelectInst &I) {
// Get the element type for a vector
- uint32_t elemNum;
- const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
-
- // Condition can be either a vector or a scalar
- Type *condType = I.getOperand(0)->getType();
- const bool isVectorCond = isa<VectorType>(condType);
+ const ir::Type type = getType(ctx, I.getType());
// Emit the instructions in a row
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const uint32_t condID = isVectorCond ? elemID : 0;
- const ir::Register cond = this->getRegister(I.getOperand(0), condID);
- const ir::Register src0 = this->getRegister(I.getOperand(1), elemID);
- const ir::Register src1 = this->getRegister(I.getOperand(2), elemID);
- ctx.SEL(type, dst, cond, src0, src1);
- }
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register cond = this->getRegister(I.getOperand(0));
+ const ir::Register src0 = this->getRegister(I.getOperand(1));
+ const ir::Register src1 = this->getRegister(I.getOperand(2));
+ ctx.SEL(type, dst, cond, src0, src1);
}
void GenWriter::regAllocatePHINode(PHINode &I) {
@@ -1668,15 +1519,11 @@ namespace gbe
void GenWriter::emitPHINode(PHINode &I) {
Value *copy = this->getPHICopy(&I);
- uint32_t elemNum;
- const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
-
- // Emit the MOVs to avoid the lost copy issue
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register dst = this->getRegister(&I, elemID);
- const ir::Register src = this->getRegister(copy, elemID);
- ctx.MOV(type, dst, src);
- }
+ const ir::Type type = getType(ctx, I.getType());
+
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(copy);
+ ctx.MOV(type, dst, src);
}
void GenWriter::regAllocateBranchInst(BranchInst &I) {}
@@ -1813,6 +1660,8 @@ namespace gbe
case GEN_OCL_RNDE:
case GEN_OCL_RNDU:
case GEN_OCL_RNDD:
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
// No structure can be returned
this->newRegister(&I);
break;
@@ -1848,7 +1697,7 @@ namespace gbe
case GEN_OCL_READ_IMAGE14:
case GEN_OCL_READ_IMAGE15:
{
- // dst is a 4 elements vector. We allocate all 4 registers here.
+        // dst is a 4-element vector. We allocate all 4 registers here.
uint32_t elemNum;
(void)getVectorInfo(ctx, I.getType(), &I, elemNum);
GBE_ASSERT(elemNum == 4);
@@ -1976,6 +1825,29 @@ namespace gbe
case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
+ {
+ GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
+ uint32_t elemNum;
+ (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+ vector<ir::Register> dstTupleData;
+ ir::Register lastReg;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+ const ir::Register reg = this->getRegister(&I, elemID);
+ dstTupleData.push_back(reg);
+ lastReg = reg;
+ }
+      // A workaround for the gen IR limitation.
+ for (uint32_t elemID = elemNum; elemID < 4; ++ elemID) {
+ dstTupleData.push_back(lastReg);
+ }
+ const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], 4);
+ int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
+
+ ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
+ break;
+ }
case GEN_OCL_READ_IMAGE0:
case GEN_OCL_READ_IMAGE1:
case GEN_OCL_READ_IMAGE2:
@@ -1990,7 +1862,22 @@ namespace gbe
case GEN_OCL_READ_IMAGE15:
{
GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
- GBE_ASSERT(AI != AE); const ir::Register sampler = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE);
+ Constant *CPV = dyn_cast<Constant>(*AI);
+ ir::Register sampler;
+ if (CPV != NULL)
+ {
+      // This is not a kernel-argument sampler; we need to append it to the sampler set
+      // and allocate a sampler slot for it.
+ auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+ GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+ sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+ } else {
+ sampler = this->getRegister(*AI);
+ ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+ }
+ ++AI;
+
GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
ir::Register wcoord;
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index c270924..2ad879e 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -28,6 +28,9 @@
#include "llvm/Pass.h"
#include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <algorithm>
// LLVM Type
namespace llvm { class Type; }
@@ -37,6 +40,29 @@ namespace gbe
// Final target of the Gen backend
namespace ir { class Unit; }
+ /*! All intrinsic Gen functions */
+ enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+ };
+
+ /*! Build the hash map for OCL functions on Gen */
+ struct OCLIntrinsicMap {
+ /*! Build the intrinsic hash map */
+ OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+ map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+ }
+ /*! Sort intrinsics with their names */
+ hash_map<std::string, OCLInstrinsic> map;
+ };
+
+  /*! Sort the OCL Gen intrinsic functions (built on pre-main) */
+ static const OCLIntrinsicMap instrinsicMap;
+
/*! Pad the offset */
uint32_t getPadding(uint32_t offset, uint32_t align);
@@ -55,6 +81,8 @@ namespace gbe
/*! Remove the GEP instructions */
llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
+ llvm::FunctionPass* createScalarizePass(ir::Unit &unit);
+
} /* namespace gbe */
#endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 055afa6..2fb33c0 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -70,6 +70,10 @@ DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE13, _Z23__gen_ocl_write_imageuijfffDv4_j)
DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE14, _Z22__gen_ocl_write_imagefjiiiDv4_f)
DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE15, _Z22__gen_ocl_write_imagefjfffDv4_f)
+// To get image info function
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_HEIGHT, __gen_ocl_get_image_height)
+
// saturation related functions.
DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..f71401f
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,836 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_scalarize.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ *
+ * This file is derived from:
+ * https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=605
+ */
+
+//===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===//
+//
+// LunarGLASS: An Open Modular Shader Compiler Architecture
+// Copyright (C) 2010-2011 LunarG, Inc.
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; version 2 of the
+// License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+//===----------------------------------------------------------------------===//
+//
+// Author: Michael Ilseman, LunarG
+//
+//===----------------------------------------------------------------------===//
+//
+// Scalarize the IR.
+// * Loads of uniforms become multiple loadComponent calls
+//
+// * Reads/writes become read/writeComponent calls
+//
+// * Component-wise operations become multiple ops over each component
+//
+// * Texture calls become recomposed texture calls
+//
+// * Vector ops disappear, with their users referring to the scalarized
+// * components
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+
+ /*! Per-vector bookkeeping: holds the scalar Value* produced for each
+ * component of a scalarized vector (up to 16 components, the widest
+ * OpenCL vector). Components are filled lazily by the scalarizer. */
+ struct VectorValues {
+ VectorValues() : vals()
+ { }
+
+ /*! Record the scalar value computed for component c */
+ void setComponent(int c, llvm::Value* val)
+ {
+ assert(c >= 0 && c < 16 && "Out of bounds component");
+ vals[c] = val;
+ }
+ /*! Retrieve the scalar value for component c; must have been set */
+ llvm::Value* getComponent(int c)
+ {
+ assert(c >= 0 && c < 16 && "Out of bounds component");
+ assert(vals[c] && "Requesting non-existing component");
+ return vals[c];
+ }
+
+ // {Value* x, Value* y, Value* z, Value* w}
+ llvm::Value* vals[16];
+ };
+
+ /*! Function pass that rewrites vector operations into per-component
+ * scalar operations, tracking each vector's components in vectorVals.
+ * Derived from the LunarGLASS Scalarize pass (see file header). */
+ class Scalarize : public FunctionPass {
+
+ public:
+ // Standard pass stuff
+ static char ID;
+
+ Scalarize(ir::Unit& unit) : FunctionPass(ID), unit(unit)
+ {
+ initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+ initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool runOnFunction(Function&);
+ void print(raw_ostream&, const Module* = 0) const;
+ virtual void getAnalysisUsage(AnalysisUsage&) const;
+
+ protected:
+ // An instruction is valid post-scalarization iff it is fully scalar or it
+ // is a gla_loadn
+ bool isValid(const Instruction*);
+
+ // Take an instruction that produces a vector, and scalarize it
+ bool scalarize(Instruction*);
+ bool scalarizePerComponent(Instruction*);
+ bool scalarizeFuncCall(CallInst *);
+ bool scalarizeLoad(LoadInst*);
+ bool scalarizeStore(StoreInst*);
+ //bool scalarizeIntrinsic(IntrinsicInst*);
+ bool scalarizeExtract(ExtractElementInst*);
+ bool scalarizeInsert(InsertElementInst*);
+ bool scalarizeShuffleVector(ShuffleVectorInst*);
+ bool scalarizePHI(PHINode*);
+ void scalarizeArgs(Function& F);
+ // ...
+
+ // Helpers to make the actual multiple scalar calls, one per
+ // component. Updates the given VectorValues's components with the new
+ // Values.
+ void makeScalarizedCalls(Function*, ArrayRef<Value*>, int numComponents, VectorValues&);
+
+ void makePerComponentScalarizedCalls(Instruction*, ArrayRef<Value*>);
+
+ // Makes a scalar form of the given instruction: replaces the operands
+ // and chooses a correct return type
+ Instruction* createScalarInstruction(Instruction* inst, ArrayRef<Value*>);
+
+ // Gather the specified components in the given values. Returns the
+ // component if the given value is a vector, or the scalar itself.
+ void gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs);
+
+ // Get the assigned component for that value. If the value is a scalar,
+ // returns the scalar. If it's a constant, returns that component. If
+ // it's an instruction, returns the vectorValues of that instruction for
+ // that component
+ Value* getComponent(int component, Value*);
+
+ // Used for assertion purposes. Whether we can get the component out with
+ // a getComponent call
+ bool canGetComponent(Value*);
+
+ // Used for assertion purposes. Whether for every operand we can get
+ // components with a getComponent call
+ bool canGetComponentArgs(User*);
+
+ // Delete the instruction in the deadList
+ void dce();
+
+
+ int GetConstantInt(const Value* value);
+ bool IsPerComponentOp(const Instruction* inst);
+ bool IsPerComponentOp(const Value* value);
+
+ // These functions add extract/insert element instructions when
+ // scalarizing loads, stores, image reads/writes, etc.
+ // ("Frome" in the name below is a historical typo for "From".)
+ void extractFromeVector(Value* insn);
+ Value* InsertToVector(Value* insn, Value* vecValue);
+
+ // Strip vector/array wrappers down to the element type of a value
+ Type* GetBasicType(Value* value) {
+ return GetBasicType(value->getType());
+ }
+
+ // Strip vector/array wrappers down to the contained element type
+ Type* GetBasicType(Type* type) {
+ switch(type->getTypeID()) {
+ case Type::VectorTyID:
+ case Type::ArrayTyID:
+ return GetBasicType(type->getContainedType(0));
+ default:
+ break;
+ }
+ return type;
+ }
+
+ // Number of components: vector width for vectors, 1 for scalars
+ int GetComponentCount(const Type* type) {
+ if (type->getTypeID() == Type::VectorTyID)
+ return llvm::dyn_cast<VectorType>(type)->getNumElements();
+ else
+ return 1;
+ }
+
+ int GetComponentCount(const Value* value) {
+ return GetComponentCount(value->getType());
+ }
+
+ // Map from each vector Value to its scalarized components
+ DenseMap<Value*, VectorValues> vectorVals;
+ Module* module;
+ IRBuilder<>* builder;
+
+ Type* intTy;
+ Type* floatTy;
+ ir::Unit &unit;
+
+ // Instructions made redundant by scalarization, erased in dce()
+ std::vector<Instruction*> deadList;
+
+ // List of vector phis that were not completely scalarized because some
+ // of their operands hadn't before been visited (i.e. loop variant
+ // variables)
+ SmallVector<PHINode*, 16> incompletePhis;
+ };
+
+ // Return the scalar Value* for one component of v: constants are
+ // decomposed directly, scalars are returned as-is, and anything else
+ // must already have an entry in vectorVals (guarded by canGetComponent).
+ Value* Scalarize::getComponent(int component, Value* v)
+ {
+ assert(canGetComponent(v) && "getComponent called on unhandled vector");
+
+ if (v->getType()->isVectorTy()) {
+ if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
+ return c->getElementAsConstant(component);
+ } else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
+ return c->getOperand(component);
+ } else if (isa<ConstantAggregateZero>(v)) {
+ return Constant::getNullValue(GetBasicType(v));
+ } else if (isa<UndefValue>(v)) {
+ return UndefValue::get(GetBasicType(v));
+ } else {
+ return vectorVals[v].getComponent(component);
+ }
+ } else {
+ return v;
+ }
+ }
+
+ // Whether this value is an instruction that operates per-component.
+ // Non-instructions (arguments, constants) are conservatively false.
+ // FIX(review): the original definition lacked the Scalarize:: qualifier,
+ // so it declared a *free* function instead of defining the member from
+ // the class declaration; the recursive-looking call then resolved back
+ // to the free function itself (Instruction* converts to Value*), while
+ // the member stayed undefined. Qualify it so it defines the member and
+ // dispatches to the Instruction overload.
+ bool Scalarize::IsPerComponentOp(const llvm::Value* value)
+ {
+ const llvm::Instruction* inst = llvm::dyn_cast<const llvm::Instruction>(value);
+ return inst && IsPerComponentOp(inst);
+ }
+
+ // Decide whether an instruction can be scalarized by simply running one
+ // scalar copy per component. Terminators, vector shuffles/accesses,
+ // memory ops, phis and calls are handled specially and return false;
+ // casts qualify only when they keep the same component count.
+ bool Scalarize::IsPerComponentOp(const Instruction* inst)
+ {
+ //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+ // return IsPerComponentOp(intr);
+
+ if (inst->isTerminator())
+ return false;
+
+ switch (inst->getOpcode()) {
+
+ // Cast ops are only per-component if they cast back to the same vector
+ // width
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst);
+
+ // Vector ops
+ case Instruction::InsertElement:
+ case Instruction::ExtractElement:
+ case Instruction::ShuffleVector:
+
+ // Ways of accessing/loading/storing vectors
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+
+ // Memory ops
+ case Instruction::Alloca:
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::GetElementPtr:
+ // Phis are a little special. We consider them not to be per-component
+ // because the mechanism of choice is a single value (what path we took to
+ // get here), and doesn't choose per-component (as select would). The caller
+ // should know to handle phis specially
+ case Instruction::PHI:
+ // Call insts, conservatively are no per-component
+ case Instruction::Call:
+ // Misc
+ case Instruction::LandingPad: //--- 3.0
+ case Instruction::VAArg:
+ return false;
+ } // end of switch (inst->getOpcode())
+
+ // Everything else (arithmetic, logic, compares, select...) is
+ // naturally element-wise.
+ return true;
+ }
+ // Extract the signed integer from a ConstantInt operand (used for
+ // extract/insertelement indices); bails out on non-numeric constants
+ // such as unfolded constant expressions.
+ int Scalarize::GetConstantInt(const Value* value)
+ {
+ const ConstantInt *constantInt = dyn_cast<ConstantInt>(value);
+
+ // this might still be a constant expression, rather than a numeric constant,
+ // e.g., expression with undef's in it, so it was not folded
+ if (! constantInt)
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant");
+
+ return constantInt->getValue().getSExtValue();
+ }
+ // Assertion helper: true when getComponent(c, v) would succeed —
+ // i.e. v is a scalar, a decomposable vector constant, or a vector
+ // whose components were already recorded in vectorVals.
+ bool Scalarize::canGetComponent(Value* v)
+ {
+ if (v->getType()->isVectorTy()) {
+ if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
+ return true;
+ } else {
+ assert((isa<Instruction>(v) || isa<Argument>(v)) && "Non-constant non-instuction?");
+ return vectorVals.count(v);
+ }
+ } else {
+ return true;
+ }
+ }
+
+ // Assertion helper: whether every operand of u (incoming values for a
+ // phi) is component-accessible via getComponent.
+ bool Scalarize::canGetComponentArgs(User* u)
+ {
+ if (PHINode* phi = dyn_cast<PHINode>(u)) {
+ for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i)
+ if (!canGetComponent(phi->getIncomingValue(i)))
+ return false;
+ } else {
+ for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i)
+ if (!canGetComponent(*i))
+ return false;
+ }
+ return true;
+ }
+
+ // Fill componentArgs with the given component of each arg (the scalar
+ // itself for non-vector args); clears any previous contents.
+ void Scalarize::gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs)
+ {
+ componentArgs.clear();
+ for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end(); i != e; ++i)
+ componentArgs.push_back(getComponent(component, *i));
+ }
+
+ // Build one scalar clone of inst with the given (already scalar) args,
+ // choosing the element type as the result type. Does NOT insert the
+ // new instruction; the caller does.
+ Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef<Value*> args)
+ {
+ // TODO: Refine the below into one large switch
+
+ unsigned op = inst->getOpcode();
+ if (inst->isCast()) {
+ assert(args.size() == 1 && "incorrect number of arguments for cast op");
+ return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst));
+ }
+
+ if (inst->isBinaryOp()) {
+ assert(args.size() == 2 && "incorrect number of arguments for binary op");
+ return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]);
+ }
+
+ if (PHINode* phi = dyn_cast<PHINode>(inst)) {
+ PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
+ // FIX(review): LLVM phi operands are the incoming *values* only (the
+ // blocks live in a separate list), so the former
+ // "args.size() % 2 == 0" assert was stale LunarGLASS logic and fired
+ // for any phi with an odd predecessor count. args may also be empty
+ // for an incomplete phi (filled in later by runOnFunction).
+ assert((args.empty() || args.size() == phi->getNumIncomingValues())
+ && "PHI argument count mismatch");
+
+ // Loop over incoming values; the matching block comes from the
+ // original phi at the same index.
+ for (unsigned int i = 0; i < args.size(); i++) {
+ BasicBlock* bb = phi->getIncomingBlock(i);
+ res->addIncoming(args[i], bb);
+ }
+
+ return res;
+ }
+
+ if (CmpInst* cmpInst = dyn_cast<CmpInst>(inst)) {
+ assert(args.size() == 2 && "incorrect number of arguments for comparison");
+ return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]);
+ }
+
+ if (isa<SelectInst>(inst)) {
+ assert(args.size() == 3 && "incorrect number of arguments for select");
+ return SelectInst::Create(args[0], args[1], args[2]);
+ }
+
+ if (IntrinsicInst* intr = dyn_cast<IntrinsicInst>(inst)) {
+ if (! IsPerComponentOp(inst))
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic");
+
+ // TODO: Assumption is that all per-component intrinsics have all their
+ // arguments be overloadable. Need to find some way to assert on this
+ // assumption. This is due to how getDeclaration operates; it only takes
+ // a list of types that fit overloadable slots.
+ SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+ // Call instructions have the decl as a last argument, so skip it
+ for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
+ tys.push_back(GetBasicType((*i)->getType()));
+ }
+
+ Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
+ return CallInst::Create(f, args);
+ }
+
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
+ // inst->getOpcodeName());
+ return 0;
+
+ }
+
+
+ // Emit `count` calls to f — one per component — appending the component
+ // index as an extra i32 argument, and record each result in vVals.
+ void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
+ {
+ assert(count > 0 && count <= 16 && "invalid number of vector components");
+ for (int i = 0; i < count; ++i) {
+ Value* res;
+ SmallVector<Value*, 8> callArgs(args.begin(), args.end());
+ callArgs.push_back(ConstantInt::get(intTy, i));
+
+ res = builder->CreateCall(f, callArgs);
+ vVals.setComponent(i, res);
+ }
+ }
+
+ // For each component of inst, gather that component of every operand,
+ // build one scalar clone of inst, insert it at the builder's current
+ // point, and record it in vectorVals[inst].
+ void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
+ {
+ int count = GetComponentCount(inst);
+ assert(count > 0 && count <= 16 && "invalid number of vector components");
+ // PHIs may legitimately pass an empty args list (incomplete phi).
+ assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
+ && "not enough arguments passed for instruction");
+
+ VectorValues& vVals = vectorVals[inst];
+
+ for (int i = 0; i < count; ++i) {
+ // Set this component of each arg
+ SmallVector<Value*, 8> callArgs(args.size(), 0);
+ gatherComponents(i, args, callArgs);
+
+ Instruction* res = createScalarInstruction(inst, callArgs);
+
+ vVals.setComponent(i, res);
+ builder->Insert(res);
+ }
+ }
+
+ // An instruction needs no scalarization when neither its result nor
+ // any of its operands is a vector.
+ bool Scalarize::isValid(const Instruction* inst)
+ {
+ // The result
+ if (inst->getType()->isVectorTy())
+ return false;
+
+ // The arguments
+ for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) {
+ const Value* v = (*i);
+ assert(v);
+ if (v->getType()->isVectorTy())
+ return false;
+ }
+
+ return true;
+ }
+
+ // Dispatch scalarization by instruction kind. Returns true when the
+ // original instruction became redundant and should go on the deadList
+ // (per-component ops, shuffles, phis, constant extract/insert); returns
+ // false when the instruction is kept (loads, stores, calls).
+ bool Scalarize::scalarize(Instruction* inst)
+ {
+ if (isValid(inst))
+ return false;
+
+ assert(! vectorVals.count(inst) && "We've already scalarized this somehow?");
+ assert((canGetComponentArgs(inst) || isa<PHINode>(inst)) &&
+ "Scalarizing an op whose arguments haven't been scalarized ");
+ // New scalar instructions are inserted right before the original.
+ builder->SetInsertPoint(inst);
+
+ if (IsPerComponentOp(inst))
+ return scalarizePerComponent(inst);
+
+ if (LoadInst* ld = dyn_cast<LoadInst>(inst))
+ return scalarizeLoad(ld);
+
+ if (CallInst* call = dyn_cast<CallInst>(inst))
+ return scalarizeFuncCall(call);
+
+ if (ExtractElementInst* extr = dyn_cast<ExtractElementInst>(inst))
+ return scalarizeExtract(extr);
+
+ if (InsertElementInst* ins = dyn_cast<InsertElementInst>(inst))
+ return scalarizeInsert(ins);
+
+ if (ShuffleVectorInst* sv = dyn_cast<ShuffleVectorInst>(inst))
+ return scalarizeShuffleVector(sv);
+
+ if (PHINode* phi = dyn_cast<PHINode>(inst))
+ return scalarizePHI(phi);
+
+ if (isa<ExtractValueInst>(inst) || isa<InsertValueInst>(inst))
+ // TODO: need to come up with a struct/array model for scalarization
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops");
+
+ if (StoreInst* st = dyn_cast<StoreInst>(inst))
+ return scalarizeStore(st);
+
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName());
+ return false;
+ }
+
+ // Shuffles emit no code: each destination component is just an alias of
+ // the selected source component (undef for a negative/undef mask entry).
+ bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv)
+ {
+ // %res = shuffleVector <n x ty> %foo, <n x ty> bar, <n x i32> <...>
+ // ==> nothing (just make a new VectorValues with the new components)
+ VectorValues& vVals = vectorVals[sv];
+
+ int size = GetComponentCount(sv);
+ int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+ for (int i = 0; i < size; ++i) {
+ int select = sv->getMaskValue(i);
+
+ if (select < 0) {
+ vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+ continue;
+ }
+
+ // Otherwise look up the corresponding component from the correct
+ // source.
+ Value* selectee;
+ if (select < srcSize) {
+ selectee = sv->getOperand(0);
+ } else {
+ // Choose from the second operand
+ select -= srcSize;
+ selectee = sv->getOperand(1);
+ }
+
+ vVals.setComponent(i, getComponent(select, selectee));
+ }
+
+ return true;
+ }
+
+ // Element-wise ops: one scalar clone per component, operands taken
+ // component-wise. The original becomes dead (caller queues it).
+ bool Scalarize::scalarizePerComponent(Instruction* inst)
+ {
+ // dst = op <n x ty> %foo, <n x ty> %bar
+ // ==> dstx = op ty %foox, ty %barx
+ // dsty = op ty %fooy, ty %bary
+ // ...
+
+ SmallVector<Value*, 16> args(inst->op_begin(), inst->op_end());
+
+ makePerComponentScalarizedCalls(inst, args);
+
+ return true;
+ }
+
+ // Vector phi -> one scalar phi per component. Loop-variant phis whose
+ // incoming values are not yet scalarized are created empty and their
+ // incoming edges are filled in at the end of runOnFunction.
+ bool Scalarize::scalarizePHI(PHINode* phi)
+ {
+ // dst = phi <n x ty> [ %foo, %bb1 ], [ %bar, %bb2], ...
+ // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ...
+ // dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ...
+
+ // If the scalar values are all known up-front, then just make the full
+ // phinode now. If they are not yet known (phinode for a loop variant
+ // variable), then deferr the arguments until later
+
+ if (canGetComponentArgs(phi)) {
+ SmallVector<Value*, 8> args(phi->op_begin(), phi->op_end());
+ makePerComponentScalarizedCalls(phi, args);
+ } else {
+ makePerComponentScalarizedCalls(phi, ArrayRef<Value*>());
+ incompletePhis.push_back(phi);
+ }
+
+ return true;
+ }
+
+ // Materialize extractelement instructions for each component of a
+ // vector-producing value (load, image read, vector argument) and record
+ // them in vectorVals, registering each with the gbe unit as a proxy.
+ // NOTE(review): the extracts are emitted at the builder's current
+ // insertion point, which for loads is *before* the load itself while
+ // using its value — presumably legal only because the gbe backend
+ // consumes these through newValueProxy rather than executing the IR
+ // as-is; confirm against llvm_gen_backend.cpp.
+ void Scalarize::extractFromeVector(Value* insn) {
+ VectorValues& vVals = vectorVals[insn];
+
+ for (int i = 0; i < GetComponentCount(insn); ++i) {
+ Value *cv = ConstantInt::get(intTy, i);
+ Value *EI = builder->CreateExtractElement(insn, cv);
+ vVals.setComponent(i, EI);
+ //unit.fakeInsnMap[EI] = insn;
+ unit.newValueProxy(insn, EI, i, 0);
+ }
+ }
+
+ // Rebuild a real vector from the scalar components of vecValue via a
+ // chain of insertelement instructions (starting from undef), so that a
+ // consumer that needs a genuine vector operand (store, image write) has
+ // one. Each component is registered with the unit as a value proxy.
+ // Returns the final insertelement of the chain. The first parameter is
+ // currently unused (kept for the commented-out bookkeeping below).
+ Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
+ //VectorValues& vVals = vectorVals[writeValue];
+ //unit.vecValuesMap[call] = vectorVals[writeValue];
+
+ //add fake insert instructions to avoid removed
+ Value *II = NULL;
+ for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+ Value *vec = II ? II : UndefValue::get(vecValue->getType());
+ Value *cv = ConstantInt::get(intTy, i);
+ II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
+ //unit.vecValuesMap[insn].setComponent(i, getComponent(i, writeValue));
+ //unit.newValueProxy(getComponent(i, vecValue), vecValue, 0, i);
+ //unit.fakeInsnMap[II] = insn;
+ }
+
+ for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+ unit.newValueProxy(getComponent(i, vecValue), II, 0, i);
+ }
+ return II;
+ }
+
+ // Scalarize calls to the recognized __gen_ocl_* image/sampler builtins:
+ // vector-returning reads get their result decomposed with
+ // extractFromeVector; writes get their vector data argument rebuilt
+ // with InsertToVector. Other calls are left untouched (returns false:
+ // the call itself is always kept).
+ bool Scalarize::scalarizeFuncCall(CallInst* call) {
+ if (Function *F = call->getCalledFunction()) {
+ if (F->getIntrinsicID() != 0) { //Intrinsic functions
+ NOT_IMPLEMENTED;
+ } else {
+ Value *Callee = call->getCalledValue();
+ const std::string fnName = Callee->getName();
+ auto it = instrinsicMap.map.find(fnName);
+ GBE_ASSERT(it != instrinsicMap.map.end());
+
+ // Get the function arguments
+ CallSite CS(call);
+ // The vector data operand of a write_image sits after the surface id
+ // and coordinates; arg 3 for integer-coordinate variants.
+ CallSite::arg_iterator CI = CS.arg_begin() + 3;
+
+ switch (it->second) {
+ default: break;
+ case GEN_OCL_READ_IMAGE0:
+ case GEN_OCL_READ_IMAGE1:
+ case GEN_OCL_READ_IMAGE2:
+ case GEN_OCL_READ_IMAGE3:
+ case GEN_OCL_READ_IMAGE4:
+ case GEN_OCL_READ_IMAGE5:
+ case GEN_OCL_READ_IMAGE10:
+ case GEN_OCL_READ_IMAGE11:
+ case GEN_OCL_READ_IMAGE12:
+ case GEN_OCL_READ_IMAGE13:
+ case GEN_OCL_READ_IMAGE14:
+ case GEN_OCL_READ_IMAGE15:
+ case GEN_OCL_GET_IMAGE_WIDTH:
+ case GEN_OCL_GET_IMAGE_HEIGHT:
+ {
+ extractFromeVector(call);
+ break;
+ }
+ case GEN_OCL_WRITE_IMAGE10:
+ case GEN_OCL_WRITE_IMAGE11:
+ case GEN_OCL_WRITE_IMAGE12:
+ case GEN_OCL_WRITE_IMAGE13:
+ case GEN_OCL_WRITE_IMAGE14:
+ CI++;
+ // fall through: WRITE_IMAGE1x variants carry one extra coordinate,
+ // so the data operand is one slot further than for WRITE_IMAGE0x.
+ case GEN_OCL_WRITE_IMAGE0:
+ case GEN_OCL_WRITE_IMAGE1:
+ case GEN_OCL_WRITE_IMAGE2:
+ case GEN_OCL_WRITE_IMAGE3:
+ case GEN_OCL_WRITE_IMAGE4:
+ case GEN_OCL_WRITE_IMAGE5:
+ {
+ *CI = InsertToVector(call, *CI);
+ break;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ // Vector load: keep the load, expose its components via extractelement
+ // proxies (see the note on extractFromeVector about insertion point).
+ bool Scalarize::scalarizeLoad(LoadInst* ld)
+ {
+ extractFromeVector(ld);
+ return false;
+ }
+
+ // Vector store: rebuild the stored value from its scalar components and
+ // make the store use the rebuilt vector; the store itself is kept.
+ bool Scalarize::scalarizeStore(StoreInst* st) {
+ st->setOperand(0, InsertToVector(st, st->getValueOperand()));
+ return false;
+ }
+
+ // extractelement with a constant index: replace all uses with the
+ // already-known scalar component. Variable indices are unsupported.
+ // Returns false (keep the instruction) when the component *is* this
+ // very extract (it was created by extractFromeVector).
+ bool Scalarize::scalarizeExtract(ExtractElementInst* extr)
+ {
+ // %res = extractelement <n X ty> %foo, %i
+ // ==> nothing (just use %foo's %ith component instead of %res)
+
+ if (! isa<Constant>(extr->getOperand(1))) {
+ // TODO: Variably referenced components. Probably handle/emulate through
+ // a series of selects.
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ }
+ //if (isa<Argument>(extr->getOperand(0)))
+ // return false;
+ int component = GetConstantInt(extr->getOperand(1));
+ Value* v = getComponent(component, extr->getOperand(0));
+ if(extr == v)
+ return false;
+ extr->replaceAllUsesWith(v);
+
+ return true;
+ }
+
+ // insertelement with a constant index: no code emitted — the result
+ // vector's components are the source vector's, with the inserted scalar
+ // at the given slot. Variable indices are unsupported.
+ bool Scalarize::scalarizeInsert(InsertElementInst* ins)
+ {
+ // %res = insertValue <n x ty> %foo, %i
+ // ==> nothing (just make a new VectorValues with the new component)
+
+ if (! isa<Constant>(ins->getOperand(2))) {
+ // TODO: Variably referenced components. Probably handle/emulate through
+ // a series of selects.
+ NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+ }
+
+ int component = GetConstantInt(ins->getOperand(2));
+
+ VectorValues& vVals = vectorVals[ins];
+ for (int i = 0; i < GetComponentCount(ins); ++i) {
+ vVals.setComponent(i, i == component ? ins->getOperand(1)
+ : getComponent(i, ins->getOperand(0)));
+ }
+
+ return true;
+ }
+
+ // Decompose every vector kernel argument into components at the start
+ // of the entry block, so later instructions can reference them.
+ void Scalarize::scalarizeArgs(Function& F) {
+ if (F.arg_empty())
+ return;
+ // First block of the reverse-post-order traversal is the entry block.
+ ReversePostOrderTraversal<Function*> rpot(&F);
+ BasicBlock::iterator instI = (*rpot.begin())->begin();
+ builder->SetInsertPoint(instI);
+
+ Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+#if LLVM_VERSION_MINOR <= 1
+ const AttrListPtr &PAL = F.getAttributes();
+ uint32_t argID = 1; // Start at one actually
+ for (; I != E; ++I, ++argID) {
+#else
+ for (; I != E; ++I) {
+#endif /* LLVM_VERSION_MINOR <= 1 */
+ Type *type = I->getType();
+
+ if(type->isVectorTy())
+ extractFromeVector(I);
+ }
+ return;
+ }
+
+ // Pass driver: only PTX_Kernel functions are processed. Walks the CFG
+ // in reverse post-order (so operands are usually scalarized before
+ // their users), queues replaced instructions for deletion, patches up
+ // deferred loop-variant phis, then erases the dead vector code.
+ bool Scalarize::runOnFunction(Function& F)
+ {
+ switch (F.getCallingConv()) {
+ case CallingConv::PTX_Device:
+ return false;
+ case CallingConv::PTX_Kernel:
+ break;
+ default: GBE_ASSERTM(false, "Unsupported calling convention");
+ }
+
+ bool changed = false;
+ module = F.getParent();
+ intTy = IntegerType::get(module->getContext(), 32);
+ floatTy = Type::getFloatTy(module->getContext());
+ builder = new IRBuilder<>(module->getContext());
+
+ scalarizeArgs(F);
+
+ typedef ReversePostOrderTraversal<Function*> RPOTType;
+ RPOTType rpot(&F);
+ for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
+ for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
+ bool scalarized = scalarize(instI);
+ if (scalarized) {
+ changed = true;
+ // Deletion is deferred to dce(): the instruction may still be
+ // referenced as an operand of not-yet-visited instructions.
+ deadList.push_back(instI);
+ }
+ }
+ }
+
+ // Fill in the incomplete phis
+ for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
+ phiI != phiE; ++phiI) {
+ assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
+
+ // Fill in each component of this phi
+ VectorValues& vVals = vectorVals[*phiI];
+ for (int c = 0; c < GetComponentCount(*phiI); ++c) {
+ PHINode* compPhi = dyn_cast<PHINode>(vVals.getComponent(c));
+ assert(compPhi && "Vector phi got scalarized to non-phis?");
+
+ // Loop over pairs of operands: [Value*, BasicBlock*]
+ for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) {
+ BasicBlock* bb = (*phiI)->getIncomingBlock(i);
+ assert(bb && "Non-basic block incoming block?");
+ compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb);
+ }
+ }
+ }
+
+ dce();
+
+ delete builder;
+ builder = 0;
+
+ return changed;
+ }
+
+ // Erase the queued dead instructions. Two passes are needed because
+ // phi nodes can form reference cycles: the first pass drops all operand
+ // references (breaking the cycles) and erases what is already unused;
+ // the second erases whatever remains attached to a block.
+ void Scalarize::dce()
+ {
+ //two passes delete for some phinode
+ for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+ (*i)->dropAllReferences();
+ if((*i)->use_empty())
+ (*i)->eraseFromParent();
+ }
+ for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+ if((*i)->getParent())
+ (*i)->eraseFromParent();
+ }
+ deadList.clear();
+ }
+
+ // No analyses are required or preserved-declared by this pass.
+ void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+ {
+ }
+
+ // Pass printing hook; intentionally a no-op.
+ void Scalarize::print(raw_ostream&, const Module*) const
+ {
+ return;
+ }
+ /*! Factory used by llvm_to_gen.cpp to add the scalarize pass; ownership
+ * of the returned pass transfers to the LLVM PassManager. */
+ FunctionPass* createScalarizePass(ir::Unit &unit)
+ {
+ return new Scalarize(unit);
+ }
+ char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index ea3d9eb..559cde0 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -69,6 +69,7 @@ namespace gbe
// Print the code before further optimizations
if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
passes.add(createPrintModulePass(&*o));
+ passes.add(createScalarizePass(unit)); // Expand all vector ops
passes.add(createScalarReplAggregatesPass()); // Break up allocas
passes.add(createRemoveGEPPass(unit));
passes.add(createConstantPropagationPass());
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index d4b1b6c..1ea150b 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -111,9 +111,12 @@ typedef enum clk_sampler_type {
__CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
__CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
- __CLK_ANISOTROPIC_RATIO_BITS = 5,
- __CLK_ANISOTROPIC_RATIO_MASK = (int) 0x80000000 >>
- (__CLK_ANISOTROPIC_RATIO_BITS-1)
+ __CLK_SAMPLER_ARG_BASE = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+ __CLK_SAMPLER_ARG_BITS = 8,
+ __CLK_SAMPLER_ARG_MASK = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+ __CLK_SAMPLER_ARG_KEY_BIT = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
+ __CLK_SAMPLER_ARG_KEY_BITS = 1,
+
} clk_sampler_type;
// Memory synchronization
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 4c0d39c..92f9ba9 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -46,7 +46,6 @@ typedef unsigned int uintptr_t;
#define __constant __attribute__((address_space(2)))
#define __local __attribute__((address_space(3)))
#define __texture __attribute__((address_space(4)))
-#define __sampler __attribute__((address_space(5)))
#define global __global
//#define local __local
#define constant __constant
@@ -77,7 +76,7 @@ struct _image2d_t;
typedef __texture struct _image2d_t* image2d_t;
struct _image3d_t;
typedef __texture struct _image3d_t* image3d_t;
-typedef __sampler uint* sampler_t;
+typedef uint sampler_t;
typedef size_t event_t;
/////////////////////////////////////////////////////////////////////////////
// OpenCL conversions & type casting
@@ -394,8 +393,15 @@ PURE CONST float __gen_ocl_rndz(float x);
PURE CONST float __gen_ocl_rnde(float x);
PURE CONST float __gen_ocl_rndu(float x);
PURE CONST float __gen_ocl_rndd(float x);
+// NOTE(review): naive hypot — sqrt(x*x + y*y) can overflow/underflow in the
+// intermediate for large/small inputs; acceptable only at native precision,
+// verify against OpenCL conformance requirements.
+INLINE OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+// cospi(x) = cos(pi*x), computed via the native cos unit
+INLINE OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+ return __gen_ocl_cos(x * M_PI_F);
+}
INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+// sinpi(x) = sin(pi*x), computed via the native sin unit
+INLINE OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+ return __gen_ocl_sin(x * M_PI_F);
+}
INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
@@ -405,14 +411,150 @@ INLINE OVERLOADABLE float native_log(float x) {
INLINE OVERLOADABLE float native_log10(float x) {
return native_log2(x) * 0.3010299956f;
}
+INLINE OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
+INLINE OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE OVERLOADABLE int2 ilogb(float2 x) {
+ return (int2)(ilogb(x.s0), ilogb(x.s1));
+}
+INLINE OVERLOADABLE int4 ilogb(float4 x) {
+ return (int4)(ilogb(x.s01), ilogb(x.s23));
+}
+INLINE OVERLOADABLE int8 ilogb(float8 x) {
+ return (int8)(ilogb(x.s0123), ilogb(x.s4567));
+}
+INLINE OVERLOADABLE int16 ilogb(float16 x) {
+ return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
+}
+INLINE OVERLOADABLE float nan(uint code) {
+ return NAN;
+}
+INLINE OVERLOADABLE float2 nan(uint2 code) {
+ return (float2)(nan(code.s0), nan(code.s1));
+}
+INLINE OVERLOADABLE float4 nan(uint4 code) {
+ return (float4)(nan(code.s01), nan(code.s23));
+}
+INLINE OVERLOADABLE float8 nan(uint8 code) {
+ return (float8)(nan(code.s0123), nan(code.s4567));
+}
+INLINE OVERLOADABLE float16 nan(uint16 code) {
+ return (float16)(nan(code.s01234567), nan(code.s89abcdef));
+}
INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
INLINE OVERLOADABLE float native_tan(float x) {
return native_sin(x) / native_cos(x);
}
-#define E 2.71828182845904523536f
-INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); }
-#undef E
+INLINE OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+ return native_tan(x * M_PI_F);
+}
+INLINE OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
+INLINE OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+ return __gen_ocl_pow(x, 0.3333333333f);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
+ *cosval = native_cos(x);
+ return native_sin(x);
+}
+INLINE OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
+ return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+ __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
+}
+INLINE OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
+ return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+ __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
+}
+INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
+ return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+ __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
+}
+INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
+ return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+ __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s8, 8 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.s9, 9 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.sa, 10 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.sb, 11 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.sc, 12 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.sd, 13 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
+ __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
+ return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+ return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+ float y = native_exp(-2 * x);
+ return (1 - y) / (1 + y);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+ return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+ return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+ return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+ return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+ float a = 0, c = 1;
+ if (x <= -1) {
+ a = - M_PI_2_F;
+ x = 1 / x;
+ c = -1;
+ }
+ if (x >= 1) {
+ a = M_PI_2_F;
+ x = 1 / x;
+ c = -1;
+ }
+ return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+ return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
+ return native_log(x + native_sqrt(x * x + 1));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+ return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+ return 0.5f * native_log((1 + x) / (1 - x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+ return (y < 0) ? (x < 0 ? x : -x) : (x < 0 ? -x : x);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+ return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+ return 1 - __gen_ocl_internal_erf(x);
+}
// XXX work-around PTX profile
#define sqrt native_sqrt
@@ -428,11 +570,36 @@ INLINE OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log1
INLINE OVERLOADABLE float __gen_ocl_internal_exp(float x) { return native_exp(x); }
INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-
+INLINE OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
+INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+ return __gen_ocl_rnde(x);
+}
// TODO use llvm intrinsics definitions
#define cos native_cos
+#define cospi __gen_ocl_internal_cospi
+#define cosh __gen_ocl_internal_cosh
+#define acos __gen_ocl_internal_acos
+#define acospi __gen_ocl_internal_acospi
+#define acosh __gen_ocl_internal_acosh
#define sin native_sin
+#define sinpi __gen_ocl_internal_sinpi
+#define sinh __gen_ocl_internal_sinh
+#define sincos __gen_ocl_internal_sincos
+#define asin __gen_ocl_internal_asin
+#define asinpi __gen_ocl_internal_asinpi
+#define asinh __gen_ocl_internal_asinh
+#define tan native_tan
+#define tanpi __gen_ocl_internal_tanpi
+#define tanh __gen_ocl_internal_tanh
+#define atan __gen_ocl_internal_atan
+#define atanpi __gen_ocl_internal_atanpi
+#define atanh __gen_ocl_internal_atanh
#define pow powr
+#define cbrt __gen_ocl_internal_cbrt
+#define rint __gen_ocl_internal_rint
+#define copysign __gen_ocl_internal_copysign
+#define erf __gen_ocl_internal_erf
+#define erfc __gen_ocl_internal_erfc
INLINE OVERLOADABLE float mad(float a, float b, float c) {
return a*b+c;
@@ -484,25 +651,93 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
/////////////////////////////////////////////////////////////////////////////
// Common Functions (see 6.11.4 of OCL 1.1 spec)
/////////////////////////////////////////////////////////////////////////////
-#define DECL_MIN_MAX(TYPE) \
+#define DECL_MIN_MAX_CLAMP(TYPE) \
INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
return a > b ? a : b; \
} \
INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
return a < b ? a : b; \
+} \
+INLINE OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+ return max(min(v, u), l); \
}
-DECL_MIN_MAX(float)
-DECL_MIN_MAX(int)
-DECL_MIN_MAX(short)
-DECL_MIN_MAX(char)
-DECL_MIN_MAX(uint)
-DECL_MIN_MAX(unsigned short)
-DECL_MIN_MAX(unsigned char)
-#undef DECL_MIN_MAX
+DECL_MIN_MAX_CLAMP(float)
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+#undef DECL_MIN_MAX_CLAMP
INLINE OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
INLINE OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a > b ? x : b > a ? y : max(x, y);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+ float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+ return a < b ? x : b < a ? y : min(x, y);
+}
INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+ return __gen_ocl_internal_fmax(x, y) - y;
+}
+INLINE OVERLOADABLE float fract(float x, float *p) {
+ *p = __gen_ocl_internal_floor(x);
+ return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+}
+INLINE OVERLOADABLE float2 fract(float2 x, float2 *p) {
+ return (float2)(fract(x.s0, (float *)p),
+ fract(x.s1, 1 + (float *)p));
+}
+INLINE OVERLOADABLE float4 fract(float4 x, float4 *p) {
+ return (float4)(fract(x.s0, (float *)p),
+ fract(x.s1, 1 + (float *)p),
+ fract(x.s2, 2 + (float *)p),
+ fract(x.s3, 3 + (float *)p));
+}
+INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
+ return (float8)(fract(x.s0, (float *)p),
+ fract(x.s1, 1 + (float *)p),
+ fract(x.s2, 2 + (float *)p),
+ fract(x.s3, 3 + (float *)p),
+ fract(x.s4, 4 + (float *)p),
+ fract(x.s5, 5 + (float *)p),
+ fract(x.s6, 6 + (float *)p),
+ fract(x.s7, 7 + (float *)p));
+}
+INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
+ return (float16)(fract(x.s0, (float *)p),
+ fract(x.s1, 1 + (float *)p),
+ fract(x.s2, 2 + (float *)p),
+ fract(x.s3, 3 + (float *)p),
+ fract(x.s4, 4 + (float *)p),
+ fract(x.s5, 5 + (float *)p),
+ fract(x.s6, 6 + (float *)p),
+ fract(x.s7, 7 + (float *)p),
+ fract(x.s8, 8 + (float *)p),
+ fract(x.s9, 9 + (float *)p),
+ fract(x.sa, 10 + (float *)p),
+ fract(x.sb, 11 + (float *)p),
+ fract(x.sc, 12 + (float *)p),
+ fract(x.sd, 13 + (float *)p),
+ fract(x.se, 14 + (float *)p),
+ fract(x.sf, 15 + (float *)p));
+}
+INLINE OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE OVERLOADABLE float ldexp(float x, int n) {
+ return __gen_ocl_pow(2, n) * x;
+}
+INLINE OVERLOADABLE float pown(float x, int n) {
+ if (x == 0 && n == 0)
+ return 1;
+ return x < 0 ? ((n & 1) ? -powr(-x, n) : powr(-x, n)) : powr(x, n);
+}
+INLINE OVERLOADABLE float rootn(float x, int n) {
+ return (x < 0 && (n & 1)) ? -powr(-x, 1.f / n) : powr(x, 1.f / n);
+}
/////////////////////////////////////////////////////////////////////////////
// Geometric functions (see 6.11.5 of OCL 1.1 spec)
@@ -641,12 +876,33 @@ DECL_UNTYPED_RW_ALL(float)
return dst;\
}
DECL_VECTOR_1OP(native_cos, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cospi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cosh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acos, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acospi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acosh, float);
DECL_VECTOR_1OP(native_sin, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_sinpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_sinh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asin, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asinpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asinh, float);
DECL_VECTOR_1OP(native_tan, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_tanpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_tanh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atan, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atanpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atanh, float);
DECL_VECTOR_1OP(native_sqrt, float);
DECL_VECTOR_1OP(native_rsqrt, float);
DECL_VECTOR_1OP(native_log2, float);
+DECL_VECTOR_1OP(log1p, float);
+DECL_VECTOR_1OP(logb, float);
DECL_VECTOR_1OP(native_recip, float);
+DECL_VECTOR_1OP(native_exp2, float);
+DECL_VECTOR_1OP(native_exp10, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_expm1, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cbrt, float);
DECL_VECTOR_1OP(__gen_ocl_internal_fabs, float);
DECL_VECTOR_1OP(__gen_ocl_internal_trunc, float);
DECL_VECTOR_1OP(__gen_ocl_internal_round, float);
@@ -655,6 +911,9 @@ DECL_VECTOR_1OP(__gen_ocl_internal_ceil, float);
DECL_VECTOR_1OP(__gen_ocl_internal_log, float);
DECL_VECTOR_1OP(__gen_ocl_internal_log2, float);
DECL_VECTOR_1OP(__gen_ocl_internal_log10, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_rint, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_erf, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
#undef DECL_VECTOR_1OP
/////////////////////////////////////////////////////////////////////////////
// Arithmetic functions
@@ -682,12 +941,46 @@ DECL_VECTOR_1OP(__gen_ocl_internal_log10, float);
dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
return dst;\
}
+DECL_VECTOR_2OP(hypot, float);
DECL_VECTOR_2OP(min, float);
DECL_VECTOR_2OP(max, float);
DECL_VECTOR_2OP(__gen_ocl_internal_fmin, float);
DECL_VECTOR_2OP(__gen_ocl_internal_fmax, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_fdim, float);
DECL_VECTOR_2OP(fmod, float);
+DECL_VECTOR_2OP(remainder, float);
DECL_VECTOR_2OP(powr, float);
+DECL_VECTOR_2OP(native_divide, float);
+DECL_VECTOR_2OP(copysign, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_maxmag, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
+#undef DECL_VECTOR_2OP
+
+#define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
+ INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
+ return (TYPE##2)(NAME(v0.x, v1.x), NAME(v0.y, v1.y)); \
+ }\
+ INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
+ return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
+ }\
+ INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
+ return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
+ }\
+ INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
+ TYPE##8 dst;\
+ dst.s0123 = NAME(v0.s0123, v1.s0123);\
+ dst.s4567 = NAME(v0.s4567, v1.s4567);\
+ return dst;\
+ }\
+ INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
+ TYPE##16 dst;\
+ dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
+ dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
+ return dst;\
+ }
+DECL_VECTOR_2OP(ldexp, float, int);
+DECL_VECTOR_2OP(pown, float, int);
+DECL_VECTOR_2OP(rootn, float, int);
#undef DECL_VECTOR_2OP
#define DECL_VECTOR_3OP(NAME, TYPE) \
@@ -733,8 +1026,15 @@ INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,
#define log2 __gen_ocl_internal_log2
#define log10 __gen_ocl_internal_log10
#define exp __gen_ocl_internal_exp
+#define exp2 native_exp2
+#define exp10 native_exp10
+#define expm1 __gen_ocl_internal_expm1
#define fmin __gen_ocl_internal_fmin
#define fmax __gen_ocl_internal_fmax
+#define fma mad
+#define fdim __gen_ocl_internal_fdim
+#define maxmag __gen_ocl_internal_maxmag
+#define minmag __gen_ocl_internal_minmag
/////////////////////////////////////////////////////////////////////////////
// Synchronization functions
@@ -796,6 +1096,9 @@ OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w,
OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+//OVERLOADABLE int __gen_ocl_get_image_depth(image3d_t image);
#define GET_IMAGE(cl_image, surface_id) \
uint surface_id = (uint)cl_image
@@ -804,7 +1107,14 @@ OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, floa
INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
{\
GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ##suffix(surface_id, (uint)sampler, coord.s0, coord.s1);\
+ return __gen_ocl_read_image ##suffix(surface_id, sampler, coord.s0, coord.s1);\
+ }
+
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
+ {\
+ GET_IMAGE(cl_image, surface_id);\
+ return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
}
#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
@@ -817,6 +1127,7 @@ OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, floa
#define DECL_IMAGE(type, suffix) \
DECL_READ_IMAGE(type, suffix, int2) \
DECL_READ_IMAGE(type, suffix, float2) \
+ DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
DECL_WRITE_IMAGE(type, suffix, int2) \
DECL_WRITE_IMAGE(type, suffix, float2)
@@ -826,8 +1137,56 @@ DECL_IMAGE(float4, f)
#undef DECL_IMAGE
#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
#undef DECL_WRITE_IMAGE
+#define DECL_IMAGE_INFO(image_type) \
+ INLINE_OVERLOADABLE int get_image_width(image_type image) \
+ { \
+ GET_IMAGE(image, surface_id);\
+ return __gen_ocl_get_image_width(surface_id);\
+ } \
+ INLINE_OVERLOADABLE int get_image_height(image_type image)\
+ { \
+ GET_IMAGE(image, surface_id);\
+ return __gen_ocl_get_image_height(surface_id); \
+ }
+#if 0
+ INLINE_OVERLOADABLE int get_image_channel_data_type(image_type image)\
+ { NOT_IMPLEMENTED; }\
+ INLINE_OVERLOADABLE int get_image_channel_order(image_type image)\
+ { NOT_IMPLEMENTED; }
+#endif
+
+
+DECL_IMAGE_INFO(image2d_t)
+DECL_IMAGE_INFO(image3d_t)
+#if 0
+/* The following functions are not implemented yet. */
+DECL_IMAGE_INFO(image1d_t)
+DECL_IMAGE_INFO(image1d_buffer_t)
+DECL_IMAGE_INFO(image1d_array_t)
+DECL_IMAGE_INFO(image2d_array_t)
+
+INLINE_OVERLOADABLE int get_image_depth(image3d_t image)
+ { return __gen_ocl_get_image_depth(image); }
+
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_t image)
+ { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_array_t image)
+ { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE int4 get_image_dim(image3d_t image)
+ { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
+ { return __gen_ocl_get_image_array_size(image); }
+
+INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+ { return __gen_ocl_get_image_array_size(image); }
+#endif
+
#define DECL_READ_IMAGE(type, suffix, coord_type) \
INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
{\
@@ -835,6 +1194,13 @@ DECL_IMAGE(float4, f)
return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
}
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
+ {\
+ GET_IMAGE(cl_image, surface_id);\
+ return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
+ }
+
#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
{\
@@ -845,6 +1211,7 @@ DECL_IMAGE(float4, f)
#define DECL_IMAGE(type, suffix) \
DECL_READ_IMAGE(type, suffix, int4) \
DECL_READ_IMAGE(type, suffix, float4) \
+ DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
DECL_WRITE_IMAGE(type, suffix, int4) \
DECL_WRITE_IMAGE(type, suffix, float4)
@@ -854,6 +1221,7 @@ DECL_IMAGE(float4, f)
#undef DECL_IMAGE
#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
#undef DECL_WRITE_IMAGE
#undef GET_IMAGE
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
index 52a37a2..8fcb3a7 100644
--- a/backend/src/sys/alloc.hpp
+++ b/backend/src/sys/alloc.hpp
@@ -27,6 +27,7 @@
#include "sys/platform.hpp"
#include "sys/assert.hpp"
#include <algorithm>
+#include <limits>
namespace gbe
{
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
index 806e736..06448e2 100644
--- a/include/CL/cl.hpp
+++ b/include/CL/cl.hpp
@@ -29,11 +29,10 @@
*
* Additions and fixes from:
* Brian Cole, March 3rd 2010 and April 2012
- * Lee Howes, October 2011, March 2012
* Matt Gruenke, April 2012.
*
- * \version 1.2.1
- * \date September 2012
+ * \version 1.2.4
+ * \date January 2013
*
* Optional extension support
*
@@ -168,12 +167,6 @@
#include <CL/cl_ext.h>
#endif
-// TODO: Remove once declaration is moved elsewhere
-//#if !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-//#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-//#endif // #if !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenGL/OpenGL.h>
#include <OpenCL/opencl.h>
@@ -393,7 +386,7 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
#if defined(CL_VERSION_1_2)
#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
-#define __SET_PRINTF_CALLBACK_ERR __ERR_STR(clSetPrintfCallback)
+
#endif // #if defined(CL_VERSION_1_2)
#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
@@ -433,7 +426,7 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
/**
- * CL 1.1 version that uses device fission.
+ * CL 1.2 version that uses device fission.
*/
#if defined(CL_VERSION_1_2)
#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
@@ -458,6 +451,13 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
//! \endcond
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
typedef std::string STRING_CLASS;
@@ -623,10 +623,8 @@ public:
//! \brief Destructor - frees memory used to hold the current value.
~string()
{
- if (str_ != NULL) {
- delete[] str_;
- str_ = NULL;
- }
+ delete[] str_;
+ str_ = NULL;
}
//! \brief Queries the length of the string, excluding any added '\0's.
@@ -2327,9 +2325,16 @@ public:
cl_int* err = NULL)
{
cl_int error;
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
object_ = ::clCreateContext(
- properties, (cl_uint) devices.size(),
- (cl_device_id*) &devices.front(),
+ properties, (cl_uint) numDevices,
+ deviceIDs,
notifyFptr, data, &error);
detail::errHandler(error, __CREATE_CONTEXT_ERR);
@@ -2350,9 +2355,12 @@ public:
cl_int* err = NULL)
{
cl_int error;
+
+ cl_device_id deviceID = device();
+
object_ = ::clCreateContext(
properties, 1,
- (cl_device_id*) &device,
+ &deviceID,
notifyFptr, data, &error);
detail::errHandler(error, __CREATE_CONTEXT_ERR);
@@ -2554,25 +2562,6 @@ public:
formats->assign(&value[0], &value[numEntries]);
return CL_SUCCESS;
}
-
-
-#if defined(CL_VERSION_1_2)
- cl_int setPrintfCallback(
- void (CL_CALLBACK * pfn_notify)(
- cl_context /* program */,
- cl_uint /*printf_data_len */,
- char * /* printf_data_ptr */,
- void * /* user_data */),
- void * user_data )
- {
- return detail::errHandler(
- ::clSetPrintfCallback(
- object_,
- pfn_notify,
- user_data),
- __SET_PRINTF_CALLBACK_ERR);
- }
-#endif // #if defined(CL_VERSION_1_2)
};
inline Device Device::getDefault(cl_int * err)
@@ -3016,7 +3005,7 @@ public:
Buffer(
IteratorType startIterator,
IteratorType endIterator,
- bool readOnly = false,
+ bool readOnly,
bool useHostPtr = false,
cl_int* err = NULL)
{
@@ -4121,14 +4110,14 @@ public:
ImageGL& operator = (const ImageGL& rhs)
{
if (this != &rhs) {
- ImageGL::operator=(rhs);
+ Image::operator=(rhs);
}
return *this;
}
ImageGL& operator = (const cl_mem& rhs)
{
- ImageGL::operator=(rhs);
+ Image::operator=(rhs);
return *this;
}
};
@@ -4443,7 +4432,7 @@ public:
{
typename detail::param_traits<
detail::cl_kernel_arg_info, name>::param_type param;
- cl_int result = getArgInfo(name, argIndex, &param);
+ cl_int result = getArgInfo(argIndex, name, &param);
if (err != NULL) {
*err = result;
}
@@ -4650,9 +4639,15 @@ public:
lengths[i] = binaries[(int)i].second;
}
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
object_ = ::clCreateProgramWithBinary(
context(), (cl_uint) devices.size(),
- (cl_device_id*)&devices.front(),
+ deviceIDs,
lengths, images, binaryStatus != NULL
? (cl_int*) &binaryStatus->front()
: NULL, &error);
@@ -4676,11 +4671,18 @@ public:
cl_int* err = NULL)
{
cl_int error;
+
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
object_ = ::clCreateProgramWithBuiltInKernels(
context(),
(cl_uint) devices.size(),
- (cl_device_id*)&devices.front(),
+ deviceIDs,
kernelNames.c_str(),
&error);
@@ -4717,12 +4719,18 @@ public:
void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
void* data = NULL) const
{
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
return detail::errHandler(
::clBuildProgram(
object_,
(cl_uint)
devices.size(),
- (cl_device_id*)&devices.front(),
+ deviceIDs,
options,
notifyFptr,
data),
@@ -5668,7 +5676,7 @@ public:
(events != NULL) ? (cl_uint) events->size() : 0,
(events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
(event != NULL) ? &tmp : NULL),
- __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ __ENQUEUE_MARKER_WAIT_LIST_ERR);
if (event != NULL && err == CL_SUCCESS)
*event = tmp;
@@ -5687,18 +5695,18 @@ public:
* all events either in the event_wait_list or all previously enqueued commands, queued
* before this command to command_queue, have completed.
*/
- cl_int clEnqueueBarrierWithWaitList(
+ cl_int enqueueBarrierWithWaitList(
const VECTOR_CLASS<Event> *events = 0,
Event *event = 0)
{
cl_event tmp;
cl_int err = detail::errHandler(
- ::clEnqueueMarkerWithWaitList(
+ ::clEnqueueBarrierWithWaitList(
object_,
(events != NULL) ? (cl_uint) events->size() : 0,
(events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
(event != NULL) ? &tmp : NULL),
- __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ __ENQUEUE_BARRIER_WAIT_LIST_ERR);
if (event != NULL && err == CL_SUCCESS)
*event = tmp;
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 1f0437f..4d0bed7 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -1,3 +1,5 @@
FILE(GLOB HEADER_FILES "CL/*.h")
+FILE(GLOB HPP_FILES "CL/*.hpp")
install (FILES ${HEADER_FILES} DESTINATION include/CL)
+install (FILES ${HPP_FILES} DESTINATION include/CL)
diff --git a/kernels/compiler_julia.cl b/kernels/compiler_julia.cl
index 996c0b7..98c5799 100644
--- a/kernels/compiler_julia.cl
+++ b/kernels/compiler_julia.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
return I - 2.0f * dot(N, I) * N;
}
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
inline uint pack_fp4(float4 u4) {
uint u;
u = (((uint) u4.x)) |
diff --git a/kernels/compiler_julia_no_break.cl b/kernels/compiler_julia_no_break.cl
index c0bd3b1..1a9be64 100644
--- a/kernels/compiler_julia_no_break.cl
+++ b/kernels/compiler_julia_no_break.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
return I - 2.0f * dot(N, I) * N;
}
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
inline uint pack_fp4(float4 u4) {
uint u;
u = (((uint) u4.x)) |
diff --git a/kernels/compiler_math.cl b/kernels/compiler_math.cl
index 0659840..695fc2c 100644
--- a/kernels/compiler_math.cl
+++ b/kernels/compiler_math.cl
@@ -1,14 +1,40 @@
__kernel void compiler_math(__global float *dst, __global float *src) {
- const float x = src[get_global_id(0)];
- switch (get_global_id(0)) {
- case 0: dst[get_global_id(0)] = native_cos(x); break;
- case 1: dst[get_global_id(0)] = native_sin(x); break;
- case 2: dst[get_global_id(0)] = native_log2(x); break;
- case 3: dst[get_global_id(0)] = native_sqrt(x); break;
- case 4: dst[get_global_id(0)] = native_rsqrt(x); break;
- case 5: dst[get_global_id(0)] = native_recip(x); break;
- case 6: dst[get_global_id(0)] = native_tan(x); break;
- default: dst[get_global_id(0)] = 1.f; break;
+ int i = get_global_id(0);
+ const float x = src[i];
+ switch (i) {
+ case 0: dst[i] = cos(x); break;
+ case 1: dst[i] = sin(x); break;
+ case 2: dst[i] = log2(x); break;
+ case 3: dst[i] = sqrt(x); break;
+ case 4: dst[i] = rsqrt(x); break;
+ case 5: dst[i] = native_recip(x); break;
+ case 6: dst[i] = tan(x); break;
+ case 7: dst[i] = cbrt(x); break;
+ case 8: dst[i] = ceil(x); break;
+ case 9: dst[i] = cospi(x); break;
+ case 10: dst[i] = exp2(x); break;
+ case 11: dst[i] = exp10(x); break;
+ case 12: dst[i] = expm1(x); break;
+ case 13: dst[i] = log1p(x); break;
+ case 14: dst[i] = logb(x); break;
+ case 15: dst[i] = sinpi(x); break;
+ case 16: dst[i] = tanpi(x); break;
+ case 17: dst[i] = rint(x); break;
+ case 18: dst[i] = sinh(x); break;
+ case 19: dst[i] = cosh(x); break;
+ case 20: dst[i] = tanh(x); break;
+ case 21: dst[i] = asinh(x); break;
+ case 22: dst[i] = acosh(x); break;
+ case 23: dst[i] = atanh(x); break;
+ case 24: dst[i] = asin(x); break;
+ case 25: dst[i] = acos(x); break;
+ case 26: dst[i] = atan(x); break;
+ case 27: dst[i] = asinpi(x); break;
+ case 28: dst[i] = acospi(x); break;
+ case 29: dst[i] = atanpi(x); break;
+ case 30: dst[i] = erf(x); break;
+ case 31: dst[i] = nan((uint)x); break;
+ default: dst[i] = 1.f; break;
};
}
diff --git a/kernels/compiler_math_2op.cl b/kernels/compiler_math_2op.cl
new file mode 100644
index 0000000..6e970b8
--- /dev/null
+++ b/kernels/compiler_math_2op.cl
@@ -0,0 +1,19 @@
+kernel void compiler_math_2op(global float *dst, global float *src1, global float *src2) {
+ int i = get_global_id(0);
+ const float x = src1[i], y = src2[i];
+ float z;
+ switch (i) {
+ case 0: dst[i] = native_divide(x, y); break;
+ case 1: dst[i] = fdim(x, y); break;
+ case 2: dst[i] = fract(x, &z); break;
+ case 3: dst[i] = hypot(x, y); break;
+ case 4: dst[i] = ldexp(x, y); break;
+ case 5: dst[i] = pown(x, (int)y); break;
+ case 6: dst[i] = remainder(x, y); break;
+ case 7: dst[i] = rootn(x, (int)(y+1)); break;
+ case 8: dst[i] = copysign(x, y); break;
+ case 9: dst[i] = maxmag(x, y); break;
+ case 10: dst[i] = minmag(x, y); break;
+ default: dst[i] = 1.f; break;
+ };
+}
diff --git a/kernels/compiler_math_3op.cl b/kernels/compiler_math_3op.cl
new file mode 100644
index 0000000..95b0398
--- /dev/null
+++ b/kernels/compiler_math_3op.cl
@@ -0,0 +1,9 @@
+kernel void compiler_math_3op(global float *dst, global float *src1, global float *src2, global float *src3) {
+ int i = get_global_id(0);
+ const float x = src1[i], y = src2[i], z = src3[i];
+ switch (i) {
+ case 0: dst[i] = mad(x, y, z); break;
+ case 1: dst[i] = fma(x, y, z); break;
+ default: dst[i] = 1.f; break;
+ };
+}
diff --git a/kernels/compiler_menger_sponge.cl b/kernels/compiler_menger_sponge.cl
index b59c5e3..58af12a 100644
--- a/kernels/compiler_menger_sponge.cl
+++ b/kernels/compiler_menger_sponge.cl
@@ -25,8 +25,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
return I - 2.0f * dot(N, I) * N;
}
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
inline uint pack_fp4(float4 u4) {
uint u;
u = (((uint) u4.x)) |
diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
index 4f1093f..95469c5 100644
--- a/kernels/compiler_menger_sponge_no_shadow.cl
+++ b/kernels/compiler_menger_sponge_no_shadow.cl
@@ -25,8 +25,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
return I - 2.0f * dot(N, I) * N;
}
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
inline uint pack_fp4(float4 u4) {
uint u;
u = (((uint) u4.x)) |
diff --git a/kernels/compiler_nautilus.cl b/kernels/compiler_nautilus.cl
index b53771c..aa7251a 100644
--- a/kernels/compiler_nautilus.cl
+++ b/kernels/compiler_nautilus.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
return I - 2.0f * dot(N, I) * N;
}
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
inline uint pack_fp4(float4 u4) {
uint u;
u = (((uint) u4.x)) |
@@ -59,7 +57,7 @@ __kernel void compiler_nautilus(__global uint *dst, float resx, float resy, int
for(int q=0;q<100;q++)
{
float l = e(o+0.5f*(vec3)(cos(1.1f*(float)(q)),cos(1.6f*(float)(q)),cos(1.4f*(float)(q))))-m;
- a+=clamp(4.0f*l,0.0f,1.0f);
+ a+=floor(clamp(4.0f*l,0.0f,1.0f));
}
v*=a/100.0f;
vec4 gl_FragColor=(vec4)(v,1.0f);
diff --git a/kernels/test_copy_image1.cl b/kernels/test_copy_image1.cl
new file mode 100644
index 0000000..28e7a7d
--- /dev/null
+++ b/kernels/test_copy_image1.cl
@@ -0,0 +1,33 @@
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+
+#define COPY_IMAGE(_dst, _sampler, scoord, dcoord) \
+ color = read_imagei(src, _sampler, scoord);\
+ write_imagei(_dst, dcoord, color)
+
+__kernel void
+test_copy_image1(__read_only image2d_t src,
+ __write_only image2d_t dst0,
+ sampler_t sampler0,
+ __write_only image2d_t dst1,
+ __write_only image2d_t dst2,
+ __write_only image2d_t dst3,
+ __write_only image2d_t dst4,
+ float w_inv, float h_inv)
+{
+ const sampler_t sampler1 = S(FALSE, REPEAT, NEAREST);
+ const sampler_t sampler2 = S(FALSE, CLAMP, NEAREST);
+ const sampler_t sampler3 = S(FALSE, MIRRORED_REPEAT, NEAREST);
+ const sampler_t sampler4 = S(TRUE, REPEAT, NEAREST);
+ int2 coord;
+ float2 fcoord;
+ int4 color;
+ coord.x = (int)get_global_id(0);
+ coord.y = (int)get_global_id(1);
+ fcoord.x = coord.x * w_inv;
+ fcoord.y = coord.y * h_inv;
+ COPY_IMAGE(dst0, sampler0, coord, coord);
+ COPY_IMAGE(dst1, sampler1, coord, coord);
+ COPY_IMAGE(dst2, sampler2, coord, coord);
+ COPY_IMAGE(dst3, sampler3, coord, coord);
+ COPY_IMAGE(dst4, sampler4, fcoord, coord);
+}
diff --git a/kernels/test_fill_image0.cl b/kernels/test_fill_image0.cl
index ad1339f..9428092 100644
--- a/kernels/test_fill_image0.cl
+++ b/kernels/test_fill_image0.cl
@@ -2,8 +2,8 @@ __kernel void
test_fill_image0(__write_only image2d_t dst)
{
int2 coord;
- int4 color4 = {0x12, 0x34, 0x56, 0x78};
coord.x = (int)get_global_id(0);
coord.y = (int)get_global_id(1);
+ int4 color4 = {coord.y & 0xFF, (coord.y & 0xFF00) >> 8, coord.x & 0xFF, (coord.x & 0xFF00) >> 8};
write_imagei(dst, coord, color4);
}
diff --git a/kernels/test_get_image_size.cl b/kernels/test_get_image_size.cl
new file mode 100644
index 0000000..aeb7d66
--- /dev/null
+++ b/kernels/test_get_image_size.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_get_image_size(__write_only image2d_t src, __global int *info)
+{
+ int id = (int)get_global_id(0);
+ int w, h;
+ w = get_image_width(src);
+ h = get_image_height(src);
+ info[id] = (w << 16 | h);
+}
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..fc1479e
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,2 @@
+OCLConfig.h
+libcl.so
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2f590c6..cea78c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,14 +28,16 @@ set(OPENCL_SRC
x11/dricommon.c
x11/va_dri2.c)
-if (EGL_FOUND)
+if (EGL_FOUND AND GBM_FOUND)
set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/gbm_dri2_x11_platform.c)
SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
-else(EGL_FOUND)
+SET(OPTIONAL_GBM_LIBRARY "${GBM_LIBRARY}")
+else(EGL_FOUND AND GBM_FOUND)
SET(OPTIONAL_EGL_LIBRARY "")
-endif (EGL_FOUND)
+SET(OPTIONAL_GBM_LIBRARY "")
+endif (EGL_FOUND AND GBM_FOUND)
if (OCLIcd_FOUND)
set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -57,5 +59,5 @@ target_link_libraries(
${DRM_LIBRARY}
${OPENGL_LIBRARIES}
${OPTIONAL_EGL_LIBRARY}
- ${GBM_LIBRARY})
+ ${OPTIONAL_GBM_LIBRARY})
install (TARGETS cl LIBRARY DESTINATION lib)
diff --git a/src/cl_api.c b/src/cl_api.c
index cfbb44f..a4e534a 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -87,6 +87,31 @@ clGetDeviceInfo(cl_device_id device,
param_value_size_ret);
}
+cl_int
+clCreateSubDevices(cl_device_id in_device,
+ const cl_device_partition_property * properties,
+ cl_uint num_devices,
+ cl_device_id * out_devices,
+ cl_uint * num_devices_ret)
+{
+ NOT_IMPLEMENTED;
+ return 0;
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+ // XXX stub for C++ Bindings
+ return CL_SUCCESS;
+}
+
cl_context
clCreateContext(const cl_context_properties * properties,
cl_uint num_devices,
@@ -292,7 +317,7 @@ clCreateImage(cl_context context,
image_format,
image_desc,
host_ptr,
- errcode_ret);
+ &err);
error:
if (errcode_ret)
*errcode_ret = err;
@@ -945,8 +970,86 @@ clEnqueueReadImage(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ void* src_ptr;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(image);
+ if (command_queue->ctx != image->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (blocking_read != CL_TRUE)
+ NOT_IMPLEMENTED;
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!row_pitch)
+ row_pitch = image->bpp*region[0];
+ else if (row_pitch < image->bpp*region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch)
+ slice_pitch = row_pitch*region[1];
+ else if (slice_pitch < row_pitch*region[1]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ }
+ else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!(src_ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ src_ptr = (char*)src_ptr + offset;
+
+ if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
+ {
+ memcpy(ptr, src_ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = src_ptr;
+ char* dst = ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, image->bpp*region[0]);
+ src += image->row_pitch;
+ dst += row_pitch;
+ }
+ src_ptr = (char*)src_ptr + image->slice_pitch;
+ ptr = (char*)ptr + slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(image);
+
+error:
+ return err;
}
cl_int
@@ -955,15 +1058,93 @@ clEnqueueWriteImage(cl_command_queue command_queue,
cl_bool blocking_write,
const size_t * origin,
const size_t * region,
- size_t input_row_pitch,
- size_t input_slice_pitch,
+ size_t row_pitch,
+ size_t slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ void* dst_ptr;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(image);
+ if (command_queue->ctx != image->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (blocking_write != CL_TRUE)
+ NOT_IMPLEMENTED;
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!row_pitch)
+ row_pitch = image->bpp*region[0];
+ else if (row_pitch < image->bpp*region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->slice_pitch) {
+ if (!slice_pitch)
+ slice_pitch = row_pitch*region[1];
+ else if (slice_pitch < row_pitch*region[1]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+ }
+ else if (slice_pitch) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!ptr) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!(dst_ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ dst_ptr = (char*)dst_ptr + offset;
+
+ if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
+ {
+ memcpy(dst_ptr, ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+ }
+ else {
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src = ptr;
+ char* dst = dst_ptr;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst, src, image->bpp*region[0]);
+ src += row_pitch;
+ dst += image->row_pitch;
+ }
+ ptr = (char*)ptr + slice_pitch;
+ dst_ptr = (char*)dst_ptr + image->slice_pitch;
+ }
+ }
+
+ err = cl_mem_unmap_auto(image);
+
+error:
+ return err;
}
cl_int
@@ -1023,13 +1204,30 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
cl_event * event,
cl_int * errcode_ret)
{
- void *p;
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
if (blocking_map != CL_TRUE)
NOT_IMPLEMENTED;
if (offset != 0)
NOT_IMPLEMENTED;
- p = clMapBufferIntel(buffer, errcode_ret);
- return p;
+
+ if (!(ptr = cl_mem_map_auto(buffer))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return ptr;
}
void *
@@ -1046,8 +1244,54 @@ clEnqueueMapImage(cl_command_queue command_queue,
cl_event * event,
cl_int * errcode_ret)
{
- NOT_IMPLEMENTED;
- return NULL;
+ void *ptr = NULL;
+ cl_int err = CL_SUCCESS;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(image);
+ if (command_queue->ctx != image->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (blocking_map != CL_TRUE)
+ NOT_IMPLEMENTED;
+
+ if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ *image_row_pitch = image->row_pitch;
+ if (image_slice_pitch)
+ *image_slice_pitch = image->slice_pitch;
+
+ if ((map_flags & CL_MAP_READ &&
+ image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+ image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+ {
+ err = CL_INVALID_OPERATION;
+ goto error;
+ }
+
+ if (!(ptr = cl_mem_map_auto(image))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ ptr = (char*)ptr + offset;
+
+error:
+ if (errcode_ret)
+ *errcode_ret = err;
+ return ptr;
}
cl_int
@@ -1058,7 +1302,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- return clUnmapBufferIntel(memobj);
+ return cl_mem_unmap_auto(memobj);
}
cl_int
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 7d604c3..6ce9016 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -98,6 +98,31 @@ cl_command_queue_add_ref(cl_command_queue queue)
atomic_inc(&queue->ref_n);
}
+static void
+set_image_info(char *curbe, struct ImageInfo * image_info, cl_mem image)
+{
+ if (image_info->wSlot >= 0)
+ *(uint32_t*)(curbe + image_info->wSlot) = image->w;
+ if (image_info->hSlot >= 0)
+ *(uint32_t*)(curbe + image_info->hSlot) = image->h;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+{
+ uint32_t i;
+ for (i = 0; i < k->image_sz; i++) {
+ int id = k->images[i].arg_idx;
+ assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
+ set_image_info(k->curbe, &k->images[i], k->args[id].mem);
+ cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, k->args[id].mem->bo,
+ k->args[id].mem->intel_fmt, k->args[id].mem->type,
+ k->args[id].mem->w, k->args[id].mem->h,
+ k->args[id].mem->row_pitch, k->args[id].mem->tiling);
+ }
+ return CL_SUCCESS;
+}
+
LOCAL cl_int
cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
{
@@ -107,22 +132,10 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
for (i = 0; i < k->arg_n; ++i) {
uint32_t offset; // location of the address in the curbe
arg_type = gbe_kernel_get_arg_type(k->opaque, i);
- if (arg_type != GBE_ARG_GLOBAL_PTR &&
- arg_type != GBE_ARG_IMAGE &&
- arg_type != GBE_ARG_SAMPLER)
+ if (arg_type != GBE_ARG_GLOBAL_PTR)
continue;
offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
- if (arg_type == GBE_ARG_IMAGE) {
- uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
- cl_gpgpu_bind_image(queue->gpgpu, curbe_index, k->args[i].mem->bo,
- k->args[i].mem->intel_fmt, k->args[i].mem->type,
- k->args[i].mem->w, k->args[i].mem->h,
- k->args[i].mem->pitch, k->args[i].mem->tiling);
- } else if (arg_type == GBE_ARG_SAMPLER) {
- uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
- cl_gpgpu_insert_sampler(queue->gpgpu, curbe_index, k->args[i].sampler);
- } else
- cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
+ cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
}
return CL_SUCCESS;
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index dcfc8c4..f0c00f4 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -70,6 +70,9 @@ extern cl_int cl_command_queue_finish(cl_command_queue);
/* Bind all the surfaces in the GPGPU state */
extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+/* Bind all the image surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+
/*update constant buffer to final curbe */
extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
#endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 108684f..770af4a 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -224,6 +224,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Bind user buffers */
cl_command_queue_bind_surface(queue, ker);
+ /* Bind user images */
+ cl_command_queue_bind_image(queue, ker);
+ /* Bind all samplers */
+ cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 6300b41..136f3b1 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -230,6 +230,7 @@ cl_get_device_info(cl_device_id device,
DECL_STRING_FIELD(PROFILE, profile)
DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
DECL_STRING_FIELD(EXTENSIONS, extensions);
+ DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
case CL_DRIVER_VERSION:
if (param_value_size_ret) {
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index d199ecb..1beff92 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -29,7 +29,7 @@ struct _cl_device_id {
cl_uint max_thread_per_unit;
cl_uint max_work_item_dimensions;
size_t max_work_item_sizes[3];
- cl_uint max_work_group_size;
+ size_t max_work_group_size;
cl_uint preferred_vector_width_char;
cl_uint preferred_vector_width_short;
cl_uint preferred_vector_width_int;
@@ -56,7 +56,7 @@ struct _cl_device_id {
size_t image3d_max_height;
size_t image3d_max_depth;
cl_uint max_samplers;
- cl_uint max_parameter_size;
+ size_t max_parameter_size;
cl_uint mem_base_addr_align;
cl_uint min_data_type_align_size;
cl_device_fp_config single_fp_config;
@@ -84,6 +84,7 @@ struct _cl_device_id {
const char *opencl_c_version;
const char *extensions;
const char *driver_version;
+ const char *built_in_kernels;
size_t name_sz;
size_t vendor_sz;
size_t version_sz;
@@ -91,6 +92,7 @@ struct _cl_device_id {
size_t opencl_c_version_sz;
size_t extensions_sz;
size_t driver_version_sz;
+ size_t built_in_kernels_sz;
/* Kernel specific info that we're assigning statically */
size_t wg_sz;
size_t compile_wg_sz[3];
diff --git a/src/cl_driver.h b/src/cl_driver.h
index f1e1454..e8ebad1 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -110,20 +110,21 @@ extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
-/* Insert a sampler */
-typedef void (cl_gpgpu_insert_sampler_cb)(cl_gpgpu, uint32_t *curbe_index, cl_sampler sampler);
-extern cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler;
+/* bind samplers defined in both kernel and kernel args. */
+typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
+extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
/* Set a 2d texture */
typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
- uint32_t *curbe_index,
- cl_buffer obj_bo,
- uint32_t format,
- uint32_t type,
- int32_t w,
- int32_t h,
- int pitch,
- cl_gpgpu_tiling tiling);
+ uint32_t id,
+ cl_buffer obj_bo,
+ uint32_t format,
+ uint32_t type,
+ int32_t w,
+ int32_t h,
+ int pitch,
+ cl_gpgpu_tiling tiling);
+
extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
/* Setup a stack */
@@ -183,9 +184,13 @@ extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
* Buffer
**************************************************************************/
/* Allocate a buffer */
-typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, unsigned long, unsigned long);
+typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
extern cl_buffer_alloc_cb *cl_buffer_alloc;
+/* Set a buffer's tiling mode */
+typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
+
#include "cl_context.h"
typedef struct _cl_context *cl_context;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 5acc6a5..2c77a22 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -29,6 +29,7 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
/* Buffer */
LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
LOCAL cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage = NULL;
LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
@@ -59,5 +60,5 @@ LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
-LOCAL cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler = NULL;
+LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f26cd8a..a535452 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -72,6 +72,7 @@ DECL_INFO_STRING(version, OCL_VERSION_STRING)
DECL_INFO_STRING(profile, "FULL_PROFILE")
DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
DECL_INFO_STRING(extensions, "")
+DECL_INFO_STRING(built_in_kernels, "")
DECL_INFO_STRING(driver_version, LIBCL_VERSION_STRING)
#undef DECL_INFO_STRING
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index ec0e2e8..d8671c6 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -56,6 +56,8 @@ cl_kernel_delete(cl_kernel k)
cl_mem_delete(k->args[i].mem);
cl_free(k->args);
}
+ if (k->image_sz)
+ cl_free(k->images);
k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(k);
}
@@ -110,6 +112,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
if (arg_type == GBE_ARG_VALUE) {
if (UNLIKELY(value == NULL))
return CL_INVALID_KERNEL_ARGS;
+
offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
assert(offset + sz <= k->curbe_sz);
memcpy(k->curbe + offset, value, sz);
@@ -129,20 +132,18 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
return CL_SUCCESS;
}
- /* For a sampler*/
+ /* Is it a sampler*/
if (arg_type == GBE_ARG_SAMPLER) {
- cl_sampler sampler;
- if (UNLIKELY(value == NULL))
+ cl_sampler sampler;
+ memcpy(&sampler, value, sz);
+ if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
return CL_INVALID_KERNEL_ARGS;
- sampler = *(cl_sampler*)value;
-
- if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
- return CL_INVALID_ARG_VALUE;
- k->args[index].local_sz = 0;
- k->args[index].is_set = 1;
- k->args[index].mem = NULL;
- k->args[index].sampler = sampler;
- return CL_SUCCESS;
+ k->args[index].local_sz = 0;
+ k->args[index].is_set = 1;
+ k->args[index].mem = NULL;
+ k->args[index].sampler = sampler;
+ cl_set_sampler_arg_slot(k, index, sampler);
+ return CL_SUCCESS;
}
/* Otherwise, we just need to check that this is a buffer */
@@ -203,6 +204,24 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
/* Create the curbe */
k->curbe_sz = gbe_kernel_get_curbe_size(k->opaque);
+
+ /* Get sampler data & size */
+ k->sampler_sz = gbe_kernel_get_sampler_size(k->opaque);
+ assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
+ if (k->sampler_sz > 0)
+ gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+ /* Get image data & size */
+ k->image_sz = gbe_kernel_get_image_size(k->opaque);
+ assert(k->sampler_sz <= GEN_MAX_SURFACES);
+ if (k->image_sz > 0) {
+ TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
+ gbe_kernel_get_image_data(k->opaque, k->images);
+ } else
+ k->images = NULL;
+ return;
+error:
+ cl_buffer_unreference(k->bo);
+ k->bo = NULL;
}
LOCAL cl_kernel
@@ -221,6 +240,15 @@ cl_kernel_dup(cl_kernel from)
to->program = from->program;
to->arg_n = from->arg_n;
to->curbe_sz = from->curbe_sz;
+ to->sampler_sz = from->sampler_sz;
+ to->image_sz = from->image_sz;
+ if (to->sampler_sz)
+ memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
+ if (to->image_sz) {
+ TRY_ALLOC_NO_ERR(to->images, cl_calloc(to->image_sz, sizeof(to->images[0])));
+ memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
+ } else
+ to->images = NULL;
TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index dd98fb3..e191058 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -52,6 +52,10 @@ struct _cl_kernel {
gbe_kernel opaque; /* (Opaque) compiler structure for the OCL kernel */
char *curbe; /* One curbe per kernel */
size_t curbe_sz; /* Size of it */
+ uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
+ size_t sampler_sz; /* sampler size defined in kernel & kernel args. */
+ struct ImageInfo *images; /* images defined in kernel args */
+ size_t image_sz; /* image count in kernel args */
cl_argument *args; /* To track argument setting */
uint32_t arg_n:31; /* Number of arguments */
uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 6950590..354fe34 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -410,14 +410,18 @@ _cl_mem_new_image(cl_context ctx,
mem->w = w;
mem->h = h;
+ mem->depth = depth;
mem->fmt = *fmt;
mem->intel_fmt = intel_fmt;
mem->bpp = bpp;
mem->is_image = 1;
- mem->pitch = aligned_pitch;
+ mem->row_pitch = aligned_pitch;
+ mem->slice_pitch = image_type == CL_MEM_OBJECT_IMAGE1D || image_type == CL_MEM_OBJECT_IMAGE2D ? 0 : aligned_pitch*aligned_h;
mem->tiling = tiling;
mem->type = image_type;
+ cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
exit:
if (errcode_ret)
*errcode_ret = err;
@@ -518,6 +522,25 @@ cl_mem_unmap_gtt(cl_mem mem)
return CL_SUCCESS;
}
+LOCAL void*
+cl_mem_map_auto(cl_mem mem)
+{
+ if (mem->is_image && mem->tiling != CL_NO_TILE)
+ return cl_mem_map_gtt(mem);
+ else
+ return cl_mem_map(mem);
+}
+
+LOCAL cl_int
+cl_mem_unmap_auto(cl_mem mem)
+{
+ if (mem->is_image && mem->tiling != CL_NO_TILE)
+ cl_buffer_unmap_gtt(mem->bo);
+ else
+ cl_buffer_unmap(mem->bo);
+ return CL_SUCCESS;
+}
+
LOCAL cl_int
cl_mem_pin(cl_mem mem)
{
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 99f3835..3ca6381 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -43,7 +43,8 @@ struct _cl_mem {
uint32_t is_image; /* Indicate if this is an image or not */
cl_image_format fmt; /* only for images */
cl_mem_object_type type; /* only for images 1D/2D...*/
- size_t w,h,depth,pitch; /* only for images (depth is only for 3d images) */
+ size_t w,h,depth; /* only for images (depth is only for 3D images) */
+ size_t row_pitch,slice_pitch;
uint32_t intel_fmt; /* format to provide in the surface state */
uint32_t bpp; /* number of bytes per pixel */
cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
@@ -82,6 +83,12 @@ extern void *cl_mem_map_gtt(cl_mem);
/* Unmap a memory object in GTT mode */
extern cl_int cl_mem_unmap_gtt(cl_mem);
+/* Directly map a memory object - tiled images are mapped in GTT mode */
+extern void *cl_mem_map_auto(cl_mem);
+
+/* Unmap a memory object - tiled images are unmapped in GTT mode */
+extern cl_int cl_mem_unmap_auto(cl_mem);
+
/* Pin/unpin the buffer in memory (you must be root) */
extern cl_int cl_mem_pin(cl_mem);
extern cl_int cl_mem_unpin(cl_mem);
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index 3dfac00..930107f 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -198,11 +198,13 @@ LOCAL cl_mem cl_mem_new_gl_texture(cl_context ctx,
mem->type = get_mem_type_from_target(texture_target);
mem->w = w;
mem->h = h;
+ mem->depth = 1;
mem->fmt = cl_format;
mem->intel_fmt = intel_fmt;
mem->bpp = bpp;
mem->is_image = 1;
- mem->pitch = pitch;
+ mem->row_pitch = pitch;
+ mem->slice_pitch = 0;
mem->tiling = tiling;
mem->ref_n = 1;
mem->magic = CL_MAGIC_MEM_HEADER;
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index d3e61da..7e0b7b0 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -22,9 +22,54 @@
#include "cl_utils.h"
#include "cl_alloc.h"
#include "cl_khr_icd.h"
+#include "cl_kernel.h"
#include <assert.h>
+uint32_t cl_to_clk(cl_bool normalized_coords,
+ cl_addressing_mode address,
+ cl_filter_mode filter)
+{
+ int clk_address;
+ int clk_filter;
+ switch (address) {
+ case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
+ case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
+ case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
+ case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
+ case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+ default:
+ assert(0);
+ }
+ switch(filter) {
+ case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
+ case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+ default:
+ assert(0);
+ }
+ return (clk_address << __CLK_ADDRESS_BASE)
+ | (normalized_coords << __CLK_NORMALIZED_BASE)
+ | (clk_filter << __CLK_FILTER_BASE);
+}
+
+#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
+#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
+{
+ int slot_id;
+ for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
+ {
+ if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
+ if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
+ k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
+ | sampler->clkSamplerValue;
+ return slot_id;
+ }
+ }
+ }
+ assert(0);
+}
+
LOCAL cl_sampler
cl_sampler_new(cl_context ctx,
cl_bool normalized_coords,
@@ -54,6 +99,8 @@ cl_sampler_new(cl_context ctx,
sampler->ctx = ctx;
cl_context_add_ref(ctx);
+ sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+
exit:
if (errcode_ret)
*errcode_ret = err;
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index da9a488..4785928 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -21,6 +21,7 @@
#define __CL_SAMPLER_H__
#include "CL/cl.h"
+#include "../backend/src/ocl_common_defines.h"
#include <stdint.h>
/* How to access images */
@@ -33,6 +34,7 @@ struct _cl_sampler {
cl_bool normalized_coords; /* Are coordinates normalized? */
cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
cl_filter_mode filter; /* LINEAR / NEAREST mostly */
+ uint32_t clkSamplerValue;
};
/* Create a new sampler object */
@@ -48,5 +50,8 @@ extern void cl_sampler_delete(cl_sampler);
/* Add one more reference to this object */
extern void cl_sampler_add_ref(cl_sampler);
+/* set a sampler kernel argument */
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
+
#endif /* __CL_SAMPLER_H__ */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 4493858..dfb1369 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -130,6 +130,15 @@ do { \
} \
} while (0)
+#define CHECK_IMAGE(IMAGE) \
+CHECK_MEM(image); \
+do { \
+ if (UNLIKELY(!IMAGE->is_image)) { \
+ err = CL_INVALID_MEM_OBJECT; \
+ goto error; \
+ } \
+} while (0)
+
#define CHECK_SAMPLER(SAMPLER) \
do { \
if (UNLIKELY(SAMPLER == NULL)) { \
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 842163b..ebc4961 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -45,12 +45,12 @@
* Zou Nan hai <nanhai.zou at intel.com>
*
*/
-#define GL_GLEXT_PROTOTYPES
#include "intel_driver.h"
#include "intel_gpgpu.h"
#include "intel_batchbuffer.h"
#include "intel_bufmgr.h"
#include "x11/dricommon.h"
+#include "cl_mem.h"
#include <assert.h>
#include <unistd.h>
@@ -369,14 +369,17 @@ cl_intel_driver_delete(intel_driver_t *driver)
intel_driver_terminate(driver);
intel_driver_delete(driver);
}
-
+#include "program.h"
static intel_driver_t*
cl_intel_driver_new(cl_context_prop props)
{
intel_driver_t *driver = NULL;
TRY_ALLOC_NO_ERR (driver, intel_driver_new());
intel_driver_open(driver, props);
-
+ /* We use the first 2 slots(0,1) for all the bufs.
+ * Notify the gbe this base index, thus gbe can avoid conflicts
+ * when it allocates slots for images*/
+ gbe_set_image_base_index(2);
exit:
return driver;
error:
@@ -405,7 +408,6 @@ static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
#include "GL/gl.h"
#include "EGL/egl.h"
#include "EGL/eglext.h"
-#include "cl_mem.h"
static int get_cl_tiling(uint32_t drm_tiling)
{
switch(drm_tiling) {
@@ -465,6 +467,38 @@ cl_buffer intel_alloc_buffer_from_eglimage(cl_context ctx,
}
#endif
+static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
+{
+ switch (tiling) {
+ case CL_NO_TILE:
+ *intel_tiling = I915_TILING_NONE;
+ break;
+ case CL_TILE_X:
+ *intel_tiling = I915_TILING_X;
+ break;
+ case CL_TILE_Y:
+ *intel_tiling = I915_TILING_Y;
+ break;
+ default:
+ assert(0);
+ return -1;
+ }
+ return 0;
+}
+
+static int intel_buffer_set_tiling(cl_buffer bo,
+ cl_image_tiling_t tiling, size_t stride)
+{
+ uint32_t intel_tiling, required_tiling;
+ int ret;
+ if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+ return -1;
+ required_tiling = intel_tiling;
+ ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+ assert(intel_tiling == required_tiling);
+ return ret;
+}
+
LOCAL void
intel_setup_callbacks(void)
{
@@ -474,6 +508,7 @@ intel_setup_callbacks(void)
cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+ cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
#ifdef HAS_EGL
cl_buffer_alloc_from_eglimage = (cl_buffer_alloc_from_eglimage_cb *) intel_alloc_buffer_from_eglimage;
#endif
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d90368c..b0f556d 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -451,35 +451,6 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
}
-static inline unsigned long
-__fls(unsigned long x)
-{
- asm("bsf %1,%0"
- : "=r" (x)
- : "rm" (x));
- return x;
-}
-
-static int
-intel_gpgpu_get_free_img_index(intel_gpgpu_t *gpgpu)
-{
- int slot;
- assert(~gpgpu->img_bitmap != 0);
- slot = __fls(~gpgpu->img_bitmap);
- gpgpu->img_bitmap |= (1 << slot);
- return slot + gpgpu->img_index_base;
-}
-
-static int
-intel_gpgpu_get_free_sampler_index(intel_gpgpu_t *gpgpu)
-{
- int slot;
- assert(~gpgpu->sampler_bitmap != 0);
- slot = __fls(~gpgpu->sampler_bitmap);
- gpgpu->sampler_bitmap |= (1 << slot);
- return slot;
-}
-
static int
intel_get_surface_type(cl_mem_object_type type)
{
@@ -500,7 +471,7 @@ intel_get_surface_type(cl_mem_object_type type)
static void
intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
- uint32_t *curbe_index,
+ uint32_t index,
dri_bo* obj_bo,
uint32_t format,
cl_mem_object_type type,
@@ -509,7 +480,6 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
int32_t pitch,
int32_t tiling)
{
- int32_t index = intel_gpgpu_get_free_img_index(gpgpu);
surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
@@ -531,7 +501,6 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
}
ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo);
- *curbe_index = index;
gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
}
@@ -554,7 +523,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint
static void
intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
- uint32_t *index,
+ uint32_t index,
cl_buffer *obj_bo,
uint32_t format,
cl_mem_object_type type,
@@ -564,7 +533,7 @@ intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
cl_gpgpu_tiling tiling)
{
intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, pitch, tiling);
- assert(*index < GEN_MAX_SURFACES);
+ assert(index < GEN_MAX_SURFACES);
}
static void
@@ -662,10 +631,10 @@ intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
{
switch( cl_address_mode ) {
- case CL_ADDRESS_NONE:
- case CL_ADDRESS_REPEAT:
+ case CLK_ADDRESS_NONE:
+ case CLK_ADDRESS_REPEAT:
return GEN_TEXCOORDMODE_WRAP;
- case CL_ADDRESS_CLAMP:
+ case CLK_ADDRESS_CLAMP:
/* GL_CLAMP is the weird mode where coordinates are clamped to
* [0.0, 1.0], so linear filtering of coordinates outside of
* [0.0, 1.0] give you half edge texel value and half border
@@ -679,9 +648,9 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
return GEN_TEXCOORDMODE_CLAMP;
else
return GEN_TEXCOORDMODE_CLAMP_BORDER;
- case CL_ADDRESS_CLAMP_TO_EDGE:
+ case CLK_ADDRESS_CLAMP_TO_EDGE:
return GEN_TEXCOORDMODE_CLAMP;
- case CL_ADDRESS_MIRRORED_REPEAT:
+ case CLK_ADDRESS_MIRRORED_REPEAT:
return GEN_TEXCOORDMODE_MIRROR;
default:
return GEN_TEXCOORDMODE_WRAP;
@@ -689,35 +658,33 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
}
static void
-intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampler cl_sampler)
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
{
- int index;
int using_nearest = 0;
uint32_t wrap_mode;
gen7_sampler_state_t *sampler;
- index = intel_gpgpu_get_free_sampler_index(gpgpu);
sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index;
- if (!cl_sampler->normalized_coords)
+ if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
sampler->ss3.non_normalized_coord = 1;
else
sampler->ss3.non_normalized_coord = 0;
- switch (cl_sampler->filter) {
- case CL_FILTER_NEAREST:
+ switch (clk_sampler & __CLK_FILTER_MASK) {
+ case CLK_FILTER_NEAREST:
sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
using_nearest = 1;
break;
- case CL_FILTER_LINEAR:
+ case CLK_FILTER_LINEAR:
sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
break;
}
- wrap_mode = translate_wrap_mode(cl_sampler->address, using_nearest);
+ wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
sampler->ss3.r_wrap_mode = wrap_mode;
sampler->ss3.s_wrap_mode = wrap_mode;
sampler->ss3.t_wrap_mode = wrap_mode;
@@ -738,7 +705,15 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampl
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
- *curbe_index = index;
+}
+
+static void
+intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+ int index;
+ assert(sampler_sz <= GEN_MAX_SAMPLERS);
+ for(index = 0; index < sampler_sz; index++)
+ intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
}
static void
@@ -815,6 +790,6 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
- cl_gpgpu_insert_sampler = (cl_gpgpu_insert_sampler_cb *) intel_gpgpu_insert_sampler;
+ cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
}
diff --git a/utests/.gitignore b/utests/.gitignore
new file mode 100644
index 0000000..9a374dc
--- /dev/null
+++ b/utests/.gitignore
@@ -0,0 +1,13 @@
+compiler_box_blur.bmp
+compiler_box_blur_float.bmp
+compiler_clod.bmp
+compiler_julia.bmp
+compiler_julia_no_break.bmp
+compiler_mandelbrot.bmp
+compiler_mandelbrot_alternate.bmp
+compiler_menger_sponge_no_shadow.bmp
+compiler_nautilus.bmp
+compiler_ribbon.bmp
+flat_address_space
+libutests.so
+utest_run
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 20d5456..2ba01c4 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -74,6 +74,8 @@ set (utests_sources
compiler_local_memory_barrier_wg64.cpp
compiler_movforphi_undef.cpp
compiler_volatile.cpp
+ compiler_copy_image1.cpp
+ compiler_get_image_size.cpp
runtime_createcontext.cpp
utest_assert.cpp
utest.cpp
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_copy_image1.cpp
similarity index 52%
copy from utests/compiler_movforphi_undef.cpp
copy to utests/compiler_copy_image1.cpp
index 30e53bb..a9ef3f4 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -1,15 +1,15 @@
#include "utest_helper.hpp"
-static void compiler_movforphi_undef(void)
+static void compiler_copy_image1(void)
{
- const size_t w = 16;
- const size_t h = 16;
- cl_sampler sampler;
+ const size_t w = 512;
+ const size_t h = 512;
cl_image_format format;
cl_image_desc desc;
+ cl_sampler sampler;
// Setup kernel and images
- OCL_CREATE_KERNEL("test_movforphi_undef");
+ OCL_CREATE_KERNEL("test_copy_image1");
buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
@@ -22,10 +22,14 @@ static void compiler_movforphi_undef(void)
desc.image_height = h;
desc.image_row_pitch = w * sizeof(uint32_t);
OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+ OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
desc.image_row_pitch = 0;
OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
- OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+ OCL_CREATE_IMAGE(buf[2], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[3], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[4], 0, &format, &desc, NULL);
+ OCL_CREATE_IMAGE(buf[5], 0, &format, &desc, NULL);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -33,6 +37,15 @@ static void compiler_movforphi_undef(void)
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(6, sizeof(cl_mem), &buf[5]);
+ float w_inv = 1.0/w;
+ float h_inv = 1.0/h;
+ OCL_SET_ARG(7, sizeof(float), &w_inv);
+ OCL_SET_ARG(8, sizeof(float), &h_inv);
+
globals[0] = w;
globals[1] = h;
locals[0] = 16;
@@ -42,16 +55,23 @@ static void compiler_movforphi_undef(void)
// Check result
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
- // Just compare the initial 2 data is enough for this case, as the initial 2 data must in the first
- // tile box and we can just get the correct coords.
- for (uint32_t j = 0; j < 1; ++j)
- for (uint32_t i = 0; i < 3; i++)
- {
- if (i < w - 1)
- OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
- }
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+
+ for(uint32_t k = 0; k < 5; k++)
+ {
+ for (uint32_t j = 0; j < h; ++j)
+ for (uint32_t i = 0; i < w; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1 + k])[j * w + i]);
+ }
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_movforphi_undef);
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
index cf76be3..7c8f40e 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_fill_image0.cpp
@@ -14,7 +14,6 @@ static void compiler_fill_image0(void)
desc.image_height = h;
desc.image_row_pitch = 0;
-
// Setup kernel and images
OCL_CREATE_KERNEL("test_fill_image0");
@@ -29,11 +28,11 @@ static void compiler_fill_image0(void)
OCL_NDRANGE(2);
// Check result
- OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER_GTT(0);
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
- OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
- OCL_UNMAP_BUFFER(0);
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == (i << 16 | j));
+ OCL_UNMAP_BUFFER_GTT(0);
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_get_image_size.cpp
similarity index 55%
copy from utests/compiler_fill_image0.cpp
copy to utests/compiler_get_image_size.cpp
index cf76be3..49c08ad 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_get_image_size.cpp
@@ -1,8 +1,8 @@
#include "utest_helper.hpp"
-static void compiler_fill_image0(void)
+static void compiler_get_image_size(void)
{
- const size_t w = 512;
+ const size_t w = 256;
const size_t h = 512;
cl_image_format format;
cl_image_desc desc;
@@ -14,26 +14,24 @@ static void compiler_fill_image0(void)
desc.image_height = h;
desc.image_row_pitch = 0;
-
// Setup kernel and images
- OCL_CREATE_KERNEL("test_fill_image0");
+ OCL_CREATE_KERNEL("test_get_image_size");
OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
// Run the kernel
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
- globals[0] = w;
- globals[1] = h;
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 32;
locals[0] = 16;
- locals[1] = 16;
- OCL_NDRANGE(2);
+ OCL_NDRANGE(1);
// Check result
- OCL_MAP_BUFFER(0);
- for (uint32_t j = 0; j < h; ++j)
- for (uint32_t i = 0; i < w; i++)
- OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; i++)
+ OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 16) | (h)));
OCL_UNMAP_BUFFER(0);
}
-MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_size);
diff --git a/utests/compiler_math.cpp b/utests/compiler_math.cpp
index 7303dd5..e0c4487 100644
--- a/utests/compiler_math.cpp
+++ b/utests/compiler_math.cpp
@@ -2,18 +2,44 @@
#include <cmath>
#include <algorithm>
-static void cpu_compiler_math(float *dst, float *src, int get_global_id0)
+static void cpu_compiler_math(float *dst, float *src, int i)
{
- const float x = src[get_global_id0];
- switch (get_global_id0) {
- case 0: dst[get_global_id0] = cosf(x); break;
- case 1: dst[get_global_id0] = sinf(x); break;
- case 2: dst[get_global_id0] = log2f(x); break;
- case 3: dst[get_global_id0] = sqrtf(x); break;
- case 4: dst[get_global_id0] = 1.f/ sqrtf(x); break;
- case 5: dst[get_global_id0] = 1.f / x; break;
- case 6: dst[get_global_id0] = tanf(x); break;
- default: dst[get_global_id0] = 1.f; break;
+ const float x = src[i];
+ const float PI = 3.141592653589793f;
+ switch (i) {
+ case 0: dst[i] = cosf(x); break;
+ case 1: dst[i] = sinf(x); break;
+ case 2: dst[i] = log2f(x); break;
+ case 3: dst[i] = sqrtf(x); break;
+ case 4: dst[i] = 1.f/ sqrtf(x); break;
+ case 5: dst[i] = 1.f / x; break;
+ case 6: dst[i] = tanf(x); break;
+ case 7: dst[i] = powf(x, 0.3333333333333333333f); break;
+ case 8: dst[i] = ceilf(x); break;
+ case 9: dst[i] = cosf(PI * x); break;
+ case 10: dst[i] = powf(2, x); break;
+ case 11: dst[i] = powf(10, x); break;
+ case 12: dst[i] = expf(x) - 1; break;
+ case 13: dst[i] = logf(x + 1); break;
+ case 14: dst[i] = floorf(log2f(x)); break;
+ case 15: dst[i] = sinf(PI * x); break;
+ case 16: dst[i] = tanf(PI * x); break;
+ case 17: dst[i] = 2 * roundf(x / 2); break;
+ case 18: dst[i] = sinhf(x); break;
+ case 19: dst[i] = coshf(x); break;
+ case 20: dst[i] = tanhf(x); break;
+ case 21: dst[i] = asinhf(x); break;
+ case 22: dst[i] = acoshf(x); break;
+ case 23: dst[i] = atanhf(x); break;
+ case 24: dst[i] = asinf(x); break;
+ case 25: dst[i] = acosf(x); break;
+ case 26: dst[i] = atanf(x); break;
+ case 27: dst[i] = asinf(x) / PI; break;
+ case 28: dst[i] = acosf(x) / PI; break;
+ case 29: dst[i] = atanf(x) / PI; break;
+ case 30: dst[i] = erff(x); break;
+ case 31: dst[i] = nanf(""); break;
+ default: dst[i] = 1.f; break;
};
}
@@ -31,23 +57,31 @@ static void compiler_math(void)
globals[0] = 16;
locals[0] = 16;
- OCL_MAP_BUFFER(1);
- for (uint32_t i = 0; i < 32; ++i)
- cpu_src[i] = ((float*)buf_data[1])[i] = float(i);
- OCL_UNMAP_BUFFER(1);
- OCL_NDRANGE(1);
-
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
- for (int i = 0; i < 16; ++i)
- cpu_compiler_math(cpu_dst, cpu_src, i);
- for (int i = 0; i < 16; ++i) {
- const float cpu = cpu_dst[i];
- const float gpu = ((float*)buf_data[0])[i];
- OCL_ASSERT(fabs(gpu-cpu)/std::max(fabs(cpu), fabs(gpu)) < 1e-4f);
+ int j;
+ for(j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < 32; ++i)
+ cpu_src[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ OCL_UNMAP_BUFFER(1);
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src, i);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
}
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
}
MAKE_UTEST_FROM_FUNCTION(compiler_math)
diff --git a/utests/compiler_math_2op.cpp b/utests/compiler_math_2op.cpp
new file mode 100644
index 0000000..454967d
--- /dev/null
+++ b/utests/compiler_math_2op.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static float rnde(float v) {
+ if(v - floorf(v) > 0.5f)
+ return floorf(v) + 1;
+ if(v - floorf(v) < 0.5f)
+ return floorf(v);
+ if((int)(floorf(v)) & 1)
+ return floorf(v) + 1;
+ return floorf(v);
+}
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, int i)
+{
+ const float x = src1[i], y = src2[i];
+ switch (i) {
+ case 0: dst[i] = x / y; break;
+ case 1: dst[i] = x > y ? x - y : 0; break;
+ case 2: dst[i] = fminf(x - floorf(x), 0x1.FFFFFep-1F); break;
+ case 3: dst[i] = sqrtf(x*x + y*y); break;
+ case 4: dst[i] = x * powf(2, (int)y); break;
+ case 5: dst[i] = powf(x, (int)y); break;
+ case 6: dst[i] = x - rnde(x/y)*y; break;
+ case 7: dst[i] = powf(x, 1.f/(int)(y+1)); break;
+ case 8: dst[i] = x * y < 0 ? -x : x; break;
+ case 9: dst[i] = fabsf(x) > fabsf(y) ? x : fabsf(y) > fabsf(x) ? y : fmaxf(x, y); break;
+ case 10: dst[i] = fabsf(x) < fabsf(y) ? x : fabsf(y) < fabsf(x) ? y : fminf(x, y); break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
+static void compiler_math_2op(void)
+{
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src1[32], cpu_src2[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_math_2op");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ int j;
+ for(j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ for (uint32_t i = 0; i < 32; ++i) {
+ cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_NDRANGE(1);
+
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, i);
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else {
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_2op)
diff --git a/utests/compiler_math_3op.cpp b/utests/compiler_math_3op.cpp
new file mode 100644
index 0000000..a382b0a
--- /dev/null
+++ b/utests/compiler_math_3op.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, float *src3, int i)
+{
+ const float x = src1[i], y = src2[i], z = src3[i];
+ switch (i) {
+ case 0: dst[i] = x * y + z; break;
+ case 1: dst[i] = x * y + z; break;
+ default: dst[i] = 1.f; break;
+ };
+}
+
+static void compiler_math_3op(void)
+{
+ const size_t n = 32;
+ float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_math_3op");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ for (uint32_t i = 0; i < 32; ++i) {
+ cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+ cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+ cpu_src3[i] = ((float*)buf_data[3])[i] = .1f * (rand() & 15);
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_NDRANGE(1);
+
+ for (int i = 0; i < 16; ++i)
+ cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, cpu_src3, i);
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < 16; ++i) {
+ const float cpu = cpu_dst[i];
+ const float gpu = ((float*)buf_data[0])[i];
+ if (isinf(cpu))
+ OCL_ASSERT(isinf(gpu));
+ else if (isnan(cpu))
+ OCL_ASSERT(isnan(gpu));
+ else
+ OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+ }
+ OCL_UNMAP_BUFFER(0);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op)
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 30e53bb..17bde8c 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -47,8 +47,8 @@ static void compiler_movforphi_undef(void)
for (uint32_t j = 0; j < 1; ++j)
for (uint32_t i = 0; i < 3; i++)
{
- if (i < w - 1)
- OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
+ if (i == 0)
+ OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
}
OCL_UNMAP_BUFFER(0);
OCL_UNMAP_BUFFER(1);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 5258416..d882fc7 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -111,6 +111,17 @@ extern EGLSurface eglSurface;
} \
} while (0)
+#define OCL_MAP_BUFFER_GTT(ID) \
+ OCL_CALL2(clMapBufferGTTIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER_GTT(ID) \
+ do { \
+ if (buf[ID] != NULL) { \
+ OCL_CALL (clUnmapBufferGTTIntel, buf[ID]); \
+ buf_data[ID] = NULL; \
+ } \
+ } while (0)
+
#define OCL_NDRANGE(DIM_N) \
OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, DIM_N, NULL, globals, locals, 0, NULL, NULL)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-devel
mailing list