[Pkg-opencl-devel] [beignet] 53/66: Imported Upstream version 0.2+git20130928+187c17e
Andreas Beckmann
anbe at moszumanska.debian.org
Fri Oct 31 07:27:08 UTC 2014
This is an automated email from the git hooks/post-receive script.
anbe pushed a commit to branch master
in repository beignet.
commit 3756b221f8602b8211a4d3fce985dbd2c8c6b9e4
Author: Simon Richter <sjr at debian.org>
Date: Sat Sep 28 14:19:29 2013 +1000
Imported Upstream version 0.2+git20130928+187c17e
---
CMake/FindEGL.cmake | 18 +
CMake/FindGBM.cmake | 36 -
CMakeLists.txt | 18 +-
backend/CMakeLists.txt | 2 +-
backend/src/CMakeLists.txt | 13 +-
backend/src/backend/context.cpp | 82 +-
backend/src/backend/context.hpp | 5 +-
backend/src/backend/gen/gen_mesa_disasm.c | 5 +
backend/src/backend/gen_context.cpp | 649 ++++++++++-
backend/src/backend/gen_context.hpp | 15 +
backend/src/backend/gen_defs.hpp | 19 +
backend/src/backend/gen_encoder.cpp | 69 +-
backend/src/backend/gen_encoder.hpp | 6 +-
.../src/backend/gen_insn_gen7_schedule_info.hxx | 9 +
backend/src/backend/gen_insn_selection.cpp | 334 +++++-
backend/src/backend/gen_insn_selection.hpp | 6 +-
backend/src/backend/gen_insn_selection.hxx | 11 +
backend/src/backend/gen_program.cpp | 43 +-
backend/src/backend/gen_program.hpp | 10 +-
backend/src/backend/gen_reg_allocation.cpp | 100 +-
backend/src/backend/gen_register.hpp | 14 +
backend/src/backend/program.cpp | 320 +++++-
backend/src/backend/program.h | 3 +-
backend/src/backend/program.hpp | 59 +-
backend/src/builtin_vector_proto.def | 25 +-
backend/src/gbe_bin_generater.cpp | 308 ++++++
backend/src/gen_builtin_vector.py | 5 +-
backend/src/gen_convert.sh | 30 +-
backend/src/ir/constant.cpp | 101 ++
backend/src/ir/constant.hpp | 28 +-
backend/src/ir/context.hpp | 1 +
backend/src/ir/image.cpp | 139 +++
backend/src/ir/image.hpp | 25 +-
backend/src/ir/instruction.cpp | 122 ++-
backend/src/ir/instruction.hpp | 32 +-
backend/src/ir/instruction.hxx | 5 +
backend/src/ir/profile.cpp | 5 +-
backend/src/ir/profile.hpp | 4 +-
backend/src/ir/sampler.cpp | 98 ++
backend/src/ir/sampler.hpp | 25 +-
backend/src/llvm/llvm_gen_backend.cpp | 380 ++++---
backend/src/llvm/llvm_gen_ocl_function.hxx | 38 +-
backend/src/llvm/llvm_scalarize.cpp | 1 -
backend/src/ocl_common_defines.h | 57 +-
backend/src/ocl_convert.h | 411 +++++++
backend/src/ocl_stdlib.tmpl.h | 1136 +++++++++++++++++---
backend/src/sys/platform.hpp | 44 +
kernels/builtin_atan2.cl | 4 +
kernels/builtin_lgamma.cl | 4 +
kernels/builtin_lgamma_r.cl | 4 +
kernels/builtin_sinpi.cl | 4 +
kernels/builtin_tgamma.cl | 4 +
kernels/compiler_abs_diff.cl | 2 +
kernels/compiler_bool_cross_basic_block.cl | 21 +
kernels/compiler_box_blur_image.cl | 2 +-
kernels/compiler_function_constant0.cl | 2 +-
kernels/compiler_global_constant.cl | 59 +-
kernels/compiler_global_constant_2.cl | 13 +-
kernels/compiler_group_size.cl | 17 +
kernels/compiler_long_convert.cl | 12 +
kernels/compiler_upsample_long.cl | 2 +-
kernels/compiler_vector_inc.cl | 13 +
kernels/test_copy_image_3d.cl | 27 +-
kernels/test_fill_image_3d.cl | 2 +-
kernels/test_fill_image_3d_2.cl | 2 +-
src/CMakeLists.txt | 18 +-
src/cl_api.c | 705 ++++++++++--
src/cl_command_queue.c | 108 +-
src/cl_command_queue.h | 16 +-
src/cl_command_queue_gen7.c | 86 +-
src/cl_context.c | 30 +-
src/cl_context.h | 34 +-
src/cl_device_id.c | 2 +-
src/cl_driver.h | 82 +-
src/cl_driver_defs.c | 14 +-
src/cl_driver_type.h | 24 +
src/cl_enqueue.c | 256 +++--
src/cl_enqueue.h | 33 +-
src/cl_event.c | 110 +-
src/cl_event.h | 8 +-
src/cl_extensions.c | 23 +-
src/cl_extensions.h | 25 -
src/cl_gt_device.h | 10 +-
src/cl_image.c | 26 +-
src/cl_kernel.c | 14 +-
src/cl_khr_icd.h | 4 +
src/cl_mem.c | 636 +++++++++--
src/cl_mem.h | 138 ++-
src/cl_mem_gl.c | 194 +---
src/cl_platform_id.c | 2 +-
src/cl_platform_id.h | 5 +-
src/cl_program.c | 37 +-
src/cl_program.h | 2 +
src/cl_sampler.c | 2 +-
src/cl_utils.h | 12 +-
src/intel/intel_dri_resource_sharing.c | 208 ++++
src/intel/intel_dri_resource_sharing.h | 39 +
src/intel/intel_dri_resource_sharing_int.h | 143 +++
src/intel/intel_driver.c | 231 +++-
src/intel/intel_driver.h | 8 +-
src/intel/intel_gpgpu.c | 147 ++-
src/intel/intel_structs.h | 16 +-
src/x11/dricommon.h | 5 -
src/x11/gbm_deps/backend.h | 36 -
src/x11/gbm_deps/common.h | 42 -
src/x11/gbm_deps/common_drm.h | 48 -
src/x11/gbm_deps/gbm.h | 292 -----
src/x11/gbm_deps/gbm_driint.h | 108 --
src/x11/gbm_deps/gbmint.h | 116 --
src/x11/gbm_dri2_x11_platform.c | 126 ---
src/x11/mesa_egl_extension.c | 307 ++++++
src/x11/mesa_egl_extension.h | 20 +
src/x11/mesa_egl_res_share.c | 135 +++
src/x11/mesa_egl_res_share.h | 44 +
utests/CMakeLists.txt | 28 +-
utests/builtin_atan2.cpp | 43 +
utests/builtin_lgamma.cpp | 40 +
utests/builtin_lgamma_r.cpp | 46 +
utests/builtin_sinpi.cpp | 104 ++
utests/builtin_tgamma.cpp | 42 +
utests/compiler_abs_diff.cpp | 35 +-
utests/compiler_bool_cross_basic_block.cpp | 55 +
utests/compiler_copy_image_3d.cpp | 36 +-
utests/compiler_fill_image_3d.cpp | 6 +-
utests/compiler_fill_image_3d_2.cpp | 10 +-
utests/compiler_function_constant0.cpp | 4 +-
utests/compiler_global_constant.cpp | 75 ++
utests/compiler_global_constant_2.cpp | 29 +
utests/compiler_group_size.cpp | 55 +
utests/compiler_long.cpp | 4 +-
utests/compiler_long_2.cpp | 4 +-
utests/compiler_long_convert.cpp | 91 ++
utests/compiler_vector_inc.cpp | 46 +
utests/load_program_from_bin.cpp | 77 ++
utests/runtime_event.cpp | 7 +-
utests/utest.cpp | 28 +-
utests/utest.hpp | 14 +-
utests/utest_helper.cpp | 29 +-
utests/utest_helper.hpp | 5 +-
utests/utest_run.cpp | 94 +-
140 files changed, 8625 insertions(+), 2169 deletions(-)
diff --git a/CMake/FindEGL.cmake b/CMake/FindEGL.cmake
index 69d4852..d84ef95 100644
--- a/CMake/FindEGL.cmake
+++ b/CMake/FindEGL.cmake
@@ -33,4 +33,22 @@ ELSE(EGL_INCLUDE_PATH)
SET(EGL_FOUND 0 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
ENDIF(EGL_INCLUDE_PATH)
+# Find mesa source code.
+FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
+ $ENV{MESA_SOURCE_DIR}
+ ${MAKE_CURRENT_SOURCE_DIR}/../mesa
+ ~/mesa
+ DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
+
+IF(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
+ ${MESA_SOURCE_PREFIX}/include
+ ${MESA_SOURCE_PREFIX}/src/mapi
+ ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
+ ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
+SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ELSE(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ENDIF(MESA_SOURCE_PREFIX)
+
MARK_AS_ADVANCED(EGL_FOUND)
diff --git a/CMake/FindGBM.cmake b/CMake/FindGBM.cmake
deleted file mode 100644
index f20f4b2..0000000
--- a/CMake/FindGBM.cmake
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Try to find gbm library and include path.
-# Once done this will define
-#
-# GBM_FOUND
-# GBM_INCLUDE_PATH
-# GBM_LIBRARY
-#
-
-FIND_PATH(GBM_INCLUDE_PATH gbm.h
- ~/include/
- /usr/include/
- /usr/local/include/
- /sw/include/
- /opt/local/include/
- DOC "The directory where gen/program.h resides")
-FIND_LIBRARY(GBM_LIBRARY
- NAMES GBM gbm
- PATHS
- ~/lib/
- /usr/lib64
- /usr/lib
- /usr/local/lib64
- /usr/local/lib
- /sw/lib
- /opt/local/lib
- DOC "The GBM library")
-
-IF(GBM_INCLUDE_PATH)
- INCLUDE_DIRECTORIES(${GBM_INCLUDE_PATH})
- SET(GBM_FOUND 1 CACHE STRING "Set to 1 if GBM is found, 0 otherwise")
-ELSE(GBM_INCLUDE_PATH)
- SET(GBM_FOUND 0 CACHE STRING "Set to 1 if GBM is found, 0 otherwise")
-ENDIF(GBM_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(GBM_FOUND)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb56567..3d18f50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,9 @@ ELSE (EMULATE_IVB)
ADD_DEFINITIONS(-DEMULATE_GEN=0)
ENDIF (EMULATE_HSW)
+# XXX now hard coded to enable the clamp to border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
IF (USE_FULSIM)
ADD_DEFINITIONS(-DUSE_FULSIM=1)
ELSE (USE_FULSIM)
@@ -116,15 +119,6 @@ ELSE(GBE_FOUND)
MESSAGE(STATUS "Looking for Gen-Backend - not found")
ENDIF(GBE_FOUND)
-Find_Package(GBM)
-IF(GBM_FOUND)
- MESSAGE(STATUS "Looking for GBM - found")
- SET(CMAKE_CXX_FLAGS "-DHAS_GBM ${CMAKE_CXX_FLAGS}")
- SET(CMAKE_C_FLAGS "-DHAS_GBM ${CMAKE_C_FLAGS}")
-ELSE(GBM_FOUND)
- MESSAGE(STATUS "Looking for GBM - not found")
-ENDIF(GBM_FOUND)
-
Find_Package(EGL)
IF(EGL_FOUND)
MESSAGE(STATUS "Looking for EGL - found")
@@ -132,6 +126,12 @@ ELSE(EGL_FOUND)
MESSAGE(STATUS "Looking for EGL - not found")
ENDIF(EGL_FOUND)
+IF(MESA_SOURCE_FOUND)
+ MESSAGE(STATUS "Looking for mesa source code - found")
+ELSE(MESA_SOURCE_FOUND)
+ MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+ENDIF(MESA_SOURCE_FOUND)
+
Find_Package(OCLIcd)
IF(OCLIcd_FOUND)
MESSAGE(STATUS "Looking for OCL ICD header file - found")
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 8622f3e..476c6f2 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -34,7 +34,7 @@ else (GBE_DEBUG_MEMORY)
endif (GBE_DEBUG_MEMORY)
# Hide all symbols and allows the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden")
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")
if (COMPILER STREQUAL "GCC")
set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index b7b47ae..36bf688 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,7 @@ set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
@@ -26,13 +27,13 @@ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h
add_custom_command(
OUTPUT ${ocl_vector_file}
- COMMAND ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
+ COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file})
add_custom_command(
OUTPUT ${ocl_blob_file}
- COMMAND ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
- DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
+ COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
+ DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
set (pch_object ${ocl_blob_file}.pch)
@@ -46,7 +47,7 @@ else (LLVM_VERSION_NODOT VERSION_GREATER 32)
set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -fno-builtin)
+set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
add_custom_command(
OUTPUT ${pch_object}
@@ -154,6 +155,10 @@ target_link_libraries(
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS})
+link_directories (${LLVM_LIBRARY_DIR})
+ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
+TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
+
install (TARGETS gbe LIBRARY DESTINATION lib)
install (FILES ${pch_object} DESTINATION lib)
install (FILES backend/program.h DESTINATION include/gen)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5484869..cbd38f1 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -315,10 +315,10 @@ namespace gbe
GBE_DELETE(this->kernel);
this->kernel = NULL;
}
- if(this->kernel != NULL)
+ if(this->kernel != NULL) {
this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
- if(this->kernel != NULL)
this->kernel->ctx = this;
+ }
return this->kernel;
}
@@ -364,7 +364,7 @@ namespace gbe
this->kernel->stackSize = 1*KB; // XXX compute that in a better way
}
- void Context::newCurbeEntry(gbe_curbe_type value,
+ uint32_t Context::newCurbeEntry(gbe_curbe_type value,
uint32_t subValue,
uint32_t size,
uint32_t alignment)
@@ -374,6 +374,7 @@ namespace gbe
GBE_ASSERT(offset >= GEN_REG_SIZE);
kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
+ return offset;
}
uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
@@ -387,7 +388,12 @@ namespace gbe
offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
fn.getImageSet()->appendInfo(key, offset);
- return offset;
+ return offset + GEN_REG_SIZE;
+ }
+
+
+ void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
+ curbeRegs.insert(std::make_pair(reg, offset));
}
void Context::buildPatchList(void) {
@@ -395,7 +401,7 @@ namespace gbe
kernel->curbeSize = 0u;
// We insert the block IP mask first
- this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t));
+ this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
// Go over the arguments and find the related patch locations
const uint32_t argNum = fn.argNum();
@@ -409,35 +415,55 @@ namespace gbe
arg.type == ir::FunctionArgument::STRUCTURE ||
arg.type == ir::FunctionArgument::IMAGE ||
arg.type == ir::FunctionArgument::SAMPLER)
- this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize);
+ this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
}
// Already inserted registers go here
- set<ir::Register> specialRegs;
-
const size_t localIDSize = sizeof(uint32_t) * this->simdWidth;
- this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize);
- this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize);
- this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize);
- specialRegs.insert(ir::ocl::lid0);
- specialRegs.insert(ir::ocl::lid1);
- specialRegs.insert(ir::ocl::lid2);
+ insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
+ insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
+ insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
+ insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
// Go over all the instructions and find the special register we need
// to push
#define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \
if (reg == ir::ocl::SPECIAL_REG) { \
- if (specialRegs.find(reg) != specialRegs.end()) continue; \
- this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH); \
+ if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+ insertCurbeReg(reg, this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \
} else
bool useStackPtr = false;
- fn.foreachInstruction([&](const ir::Instruction &insn) {
+ fn.foreachInstruction([&](ir::Instruction &insn) {
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register reg = insn.getSrc(srcID);
+ if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+ if (srcID != 0) continue;
+ const unsigned char bti = fn.getImageSet()->getIdx(insn.getSrc(srcID));
+ const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+ ir::ImageInfoKey key;
+ key.index = bti;
+ key.type = type;
+ const ir::Register imageInfo(key.data | 0x8000);
+ ir::Register realImageInfo;
+ if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+ uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+ realImageInfo = insn.getSrc(1);
+ insertCurbeReg(realImageInfo, offset);
+ insertCurbeReg(imageInfo, (uint32_t)realImageInfo);
+ } else
+ realImageInfo = ir::Register(curbeRegs.find(imageInfo)->second);
+ insn.setSrc(srcID, realImageInfo);
+ continue;
+ } else if (insn.getOpcode() == ir::OP_GET_SAMPLER_INFO) {
+ /* change the src to sampler information register. */
+ if (curbeRegs.find(ir::ocl::samplerinfo) == curbeRegs.end())
+ insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
+ continue;
+ }
if (fn.isSpecialReg(reg) == false) continue;
- if (specialRegs.contains(reg) == true) continue;
+ if (curbeRegs.find(reg) != curbeRegs.end()) continue;
if (reg == ir::ocl::stackptr) useStackPtr = true;
INSERT_REG(lsize0, LOCAL_SIZE_X, 1)
INSERT_REG(lsize1, LOCAL_SIZE_Y, 1)
@@ -453,33 +479,22 @@ namespace gbe
INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
INSERT_REG(stackptr, STACK_POINTER, this->simdWidth)
- do {} while (0);
- specialRegs.insert(reg);
+ do {} while(0);
}
});
#undef INSERT_REG
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
- specialRegs.insert(ir::ocl::constoffst);
-
- // Insert serialized global constant arrays if used
- const ir::ConstantSet& constantSet = unit.getConstantSet();
- if (constantSet.getConstantNum()) {
- size_t size = constantSet.getDataSize();
- this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
- }
// Insert the number of threads
- this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
+ insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
// Insert the stack buffer if used
if (useStackPtr)
- this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize);
+ insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
// After this point the vector is immutable. Sorting it will make
// research faster
std::sort(kernel->patches.begin(), kernel->patches.end());
- // Align it on 32 bytes properly
kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
}
@@ -640,8 +655,7 @@ namespace gbe
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
- reg == ir::ocl::workdim ||
- reg == ir::ocl::constoffst)
+ reg == ir::ocl::workdim)
return true;
return false;
}
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 50c0e70..ca2c88d 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -93,6 +93,8 @@ namespace gbe
uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
/*! allocate size scratch memory and return start address */
uint32_t allocateScratchMem(uint32_t size);
+ /*! Preallocated curbe register set including special registers. */
+ map<ir::Register, uint32_t> curbeRegs;
protected:
/*! Build the instruction stream. Return false if failed */
virtual bool emitCode(void) = 0;
@@ -115,7 +117,8 @@ namespace gbe
/*! Insert a new entry with the given size in the Curbe. Return the offset
* of the entry
*/
- void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
+ void insertCurbeReg(ir::Register, uint32_t grfOffset);
+ uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
/*! Provide for each branch and label the label index target */
typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
const ir::Unit &unit; //!< Unit that contains the kernel
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index bfb865a..f911e7c 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -1193,6 +1193,11 @@ int gen_disasm (FILE *file, const void *opaque_insn)
data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
}
break;
+ case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+ format (file, " (bti: %d, %s)",
+ inst->bits3.gen7_dword_rw.bti,
+ data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
+ break;
case GEN_SFID_MESSAGE_GATEWAY:
format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 1a012fe..858105a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -88,6 +88,18 @@ namespace gbe
}
}
+ void GenContext::clearFlagRegister(void) {
+ // when group size not aligned to simdWidth, flag register need clear to
+ // make prediction(any8/16h) work correctly
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->pop();
+ }
+
void GenContext::emitStackPointer(void) {
using namespace ir;
@@ -147,6 +159,21 @@ namespace gbe
case SEL_OP_RNDE: p->RNDE(dst, src); break;
case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
+ case SEL_OP_CONVI64_TO_I:
+ {
+ int execWidth = p->curr.execWidth;
+ GenRegister xsrc = src.bottom_half(), xdst = dst;
+ p->push();
+ p->curr.execWidth = 8;
+ for(int i = 0; i < execWidth/4; i ++) {
+ p->curr.chooseNib(i);
+ p->MOV(xdst, xsrc);
+ xdst = GenRegister::suboffset(xdst, 4);
+ xsrc = GenRegister::suboffset(xsrc, 8);
+ }
+ p->pop();
+ break;
+ }
default: NOT_IMPLEMENTED;
}
}
@@ -417,6 +444,273 @@ namespace gbe
p->pop();
}
+ void GenContext::I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2) {
+ addWithCarry(low1, low1, low2);
+ addWithCarry(high1, high1, high2);
+ p->ADD(high1, high1, low2);
+ }
+
+ void GenContext::I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low) {
+ GenRegister &e = dst1, &f = dst2, &g = dst3, &h = dst4,
+ &a = x_high, &b = x_low, &c = y_high, &d = y_low;
+ I32FullMult(e, h, b, d);
+ I32FullMult(f, g, a, d);
+ addWithCarry(g, g, e);
+ addWithCarry(f, f, e);
+ I32FullMult(e, d, b, c);
+ I64FullAdd(f, g, e, d);
+ I32FullMult(b, d, a, c);
+ I64FullAdd(e, f, b, d);
+ }
+
+ void GenContext::I64Neg(GenRegister high, GenRegister low, GenRegister tmp) {
+ p->NOT(high, high);
+ p->NOT(low, low);
+ p->MOV(tmp, GenRegister::immud(1));
+ addWithCarry(low, low, tmp);
+ p->ADD(high, high, tmp);
+ }
+
+ void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
+ p->SHR(sign, high, GenRegister::immud(31));
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ I64Neg(high, low, tmp);
+ p->pop();
+ }
+
+ void GenContext::emitI64MULHIInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GenRegister flagReg = ra->genReg(insn.dst(10));
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(x.type == GEN_TYPE_UL) {
+ I64FullMult(e, f, g, h, a, b, c, d);
+ } else {
+ I64ABS(e, a, b, i, flagReg);
+ I64ABS(f, c, d, i, flagReg);
+ p->XOR(i, e, f);
+ I64FullMult(e, f, g, h, a, b, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(e, e);
+ p->NOT(f, f);
+ p->NOT(g, g);
+ p->NOT(h, h);
+ p->MOV(i, GenRegister::immud(1));
+ addWithCarry(h, h, i);
+ addWithCarry(g, g, i);
+ addWithCarry(f, f, i);
+ p->ADD(e, e, i);
+ p->pop();
+ }
+ storeTopHalf(dest, e);
+ storeBottomHalf(dest, f);
+ }
+
+ void GenContext::emitI64MADSATInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister z = ra->genReg(insn.src(2));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GenRegister flagReg = ra->genReg(insn.dst(10));
+ GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ if(x.type == GEN_TYPE_UL) {
+ I64FullMult(e, f, g, h, a, b, c, d);
+ loadTopHalf(c, z);
+ loadBottomHalf(d, z);
+ addWithCarry(h, h, d);
+ addWithCarry(g, g, d);
+ addWithCarry(f, f, d);
+ p->ADD(e, e, d);
+ addWithCarry(g, g, c);
+ addWithCarry(f, f, c);
+ p->ADD(e, e, c);
+ p->OR(a, e, f);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immd(-1));
+ p->MOV(h, GenRegister::immd(-1));
+ p->pop();
+ } else {
+ I64ABS(e, a, b, i, flagReg);
+ I64ABS(f, c, d, i, flagReg);
+ p->XOR(i, e, f);
+ I64FullMult(e, f, g, h, a, b, c, d);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NZ, i, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(e, e);
+ p->NOT(f, f);
+ p->NOT(g, g);
+ p->NOT(h, h);
+ p->MOV(i, one);
+ addWithCarry(h, h, i);
+ addWithCarry(g, g, i);
+ addWithCarry(f, f, i);
+ p->ADD(e, e, i);
+ p->pop();
+ loadTopHalf(c, z);
+ loadBottomHalf(d, z);
+ p->ASR(GenRegister::retype(b, GEN_TYPE_D), GenRegister::retype(c, GEN_TYPE_D), GenRegister::immd(31));
+ p->MOV(a, b);
+ addWithCarry(h, h, d);
+ addWithCarry(g, g, d);
+ addWithCarry(f, f, d);
+ p->ADD(e, e, d);
+ addWithCarry(g, g, c);
+ addWithCarry(f, f, c);
+ p->ADD(e, e, c);
+ addWithCarry(f, f, b);
+ p->ADD(e, e, b);
+ p->ADD(e, e, a);
+ p->MOV(b, zero);
+ p->push();
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, e, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, f, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_G, g, GenRegister::immud(0x7FFFFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->SHR(a, e, GenRegister::immud(31));
+ p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, zero);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immud(0x7FFFFFFF));
+ p->MOV(h, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(b, zero);
+ p->CMP(GEN_CONDITIONAL_NEQ, e, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NEQ, f, GenRegister::immud(0xFFFFFFFFu));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_LE, g, GenRegister::immud(0x7FFFFFFF));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_Z, a, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(b, zero);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(g, GenRegister::immud(0x80000000u));
+ p->MOV(h, zero);
+ p->pop();
+ }
+ storeTopHalf(dest, g);
+ storeBottomHalf(dest, h);
+ }
+
+ void GenContext::emitI64HADDInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+ loadBottomHalf(a, x);
+ loadBottomHalf(b, y);
+ loadTopHalf(c, x);
+ loadTopHalf(d, y);
+ addWithCarry(a, a, b);
+ addWithCarry(c, c, b);
+ addWithCarry(c, c, d);
+ p->ADD(b, b, d);
+ p->SHR(a, a, GenRegister::immud(1));
+ p->SHL(d, c, GenRegister::immud(31));
+ p->OR(a, a, d);
+ p->SHR(c, c, GenRegister::immud(1));
+ p->SHL(d, b, GenRegister::immud(31));
+ p->OR(c, c, d);
+ storeBottomHalf(dest, a);
+ storeTopHalf(dest, c);
+ }
+
+ void GenContext::emitI64RHADDInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+ loadBottomHalf(a, x);
+ loadBottomHalf(b, y);
+ addWithCarry(a, a, b);
+ p->MOV(c, GenRegister::immud(1));
+ addWithCarry(a, a, c);
+ p->ADD(b, b, c);
+ loadTopHalf(c, x);
+ loadTopHalf(d, y);
+ addWithCarry(c, c, b);
+ addWithCarry(c, c, d);
+ p->ADD(b, b, d);
+ p->SHR(a, a, GenRegister::immud(1));
+ p->SHL(d, c, GenRegister::immud(31));
+ p->OR(a, a, d);
+ p->SHR(c, c, GenRegister::immud(1));
+ p->SHL(d, b, GenRegister::immud(31));
+ p->OR(c, c, d);
+ storeBottomHalf(dest, a);
+ storeTopHalf(dest, c);
+ }
+
void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
GenRegister dest = ra->genReg(insn.dst(0));
GenRegister x = ra->genReg(insn.src(0));
@@ -428,6 +722,7 @@ namespace gbe
GenRegister e = ra->genReg(insn.dst(5));
GenRegister f = ra->genReg(insn.dst(6));
a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ GenRegister flagReg = ra->genReg(insn.dst(7));
GenRegister zero = GenRegister::immud(0);
switch(insn.opcode) {
case SEL_OP_I64SHL:
@@ -440,16 +735,16 @@ namespace gbe
p->SHL(c, e, a);
p->SHL(d, f, a);
p->OR(e, d, b);
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, e);
p->curr.predicate = GEN_PREDICATE_NONE;
p->AND(a, a, GenRegister::immud(32));
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, c);
p->SEL(c, c, zero);
@@ -467,16 +762,16 @@ namespace gbe
p->SHR(c, f, a);
p->SHR(d, e, a);
p->OR(e, d, b);
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, e);
p->curr.predicate = GEN_PREDICATE_NONE;
p->AND(a, a, GenRegister::immud(32));
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, c);
p->SEL(c, c, zero);
@@ -495,16 +790,16 @@ namespace gbe
p->ASR(c, f, a);
p->SHR(d, e, a);
p->OR(e, d, b);
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, e);
p->curr.predicate = GEN_PREDICATE_NONE;
p->AND(a, a, GenRegister::immud(32));
- p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+ p->MOV(flagReg, GenRegister::immuw(0xFFFF));
p->curr.predicate = GEN_PREDICATE_NORMAL;
- p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
p->CMP(GEN_CONDITIONAL_Z, a, zero);
p->SEL(d, d, c);
p->SEL(c, c, GenRegister::immd(-1));
@@ -524,6 +819,46 @@ namespace gbe
p->pop();
}
+ void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp) { // emits dst = high * 2^32 + low; tmp is overwritten
+ p->MOV(dst, high);
+ p->MUL(dst, dst, GenRegister::immf(65536.f * 65536.f)); // 65536 * 65536 == 2^32: scales the upper 32-bit half
+ tmp.type = GEN_TYPE_F; // retypes only this by-value copy of tmp
+ p->MOV(tmp, low); // presumably a converting move into the float-typed tmp -- TODO confirm
+ p->ADD(dst, dst, tmp);
+ }
+
+ void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) { // emits a 64-bit integer to float conversion of src(0) into dst(0)
+ GenRegister src = ra->genReg(insn.src(0));
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister high = ra->genReg(insn.dst(1)); // dst(1)..dst(3) are scratch registers, dst(4) is the flag register
+ GenRegister low = ra->genReg(insn.dst(2));
+ GenRegister tmp = ra->genReg(insn.dst(3));
+ GenRegister flagReg = ra->genReg(insn.dst(4));
+ loadTopHalf(high, src);
+ loadBottomHalf(low, src);
+ if(!src.is_signed_int()) { // unsigned source: combine the two halves directly
+ UnsignedI64ToFloat(dest, high, low, tmp);
+ } else { // signed source: negate the flagged lanes, convert as unsigned, then set the sign bit
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000)); // presumably flags lanes whose top half has bit 31 set -- confirm the compare is unsigned
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->NOT(high, high); // two's-complement negation: invert both halves ...
+ p->NOT(low, low);
+ p->MOV(tmp, GenRegister::immud(1));
+ addWithCarry(low, low, tmp); // ... add 1 to the low half; addWithCarry moves acc0 (the carry) into tmp
+ p->ADD(high, high, tmp); // ... and propagate that carry into the top half
+ p->pop();
+ UnsignedI64ToFloat(dest, high, low, tmp);
+ p->push();
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ dest.type = GEN_TYPE_UD; // retype the local copy so the OR below works on the raw bits
+ p->OR(dest, dest, GenRegister::immud(0x80000000)); // sets bit 31; NOTE(review): predicate is not set here, it is whatever pop() restored -- confirm only the negated lanes are written
+ p->pop();
+ }
+ }
+
void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
GenRegister src0 = ra->genReg(insn.src(0));
GenRegister src1 = ra->genReg(insn.src(1));
@@ -535,10 +870,11 @@ namespace gbe
int flag = p->curr.flag, subFlag = p->curr.subFlag;
GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
f2 = GenRegister::suboffset(f1, 1),
- f3 = GenRegister::suboffset(f1, 2);
+ f3 = GenRegister::suboffset(f1, 2),
+ f4 = GenRegister::suboffset(f1, 3);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.flag = 0, p->curr.subFlag = 1;
+ saveFlag(f4, flag, subFlag);
loadTopHalf(tmp0, src0);
loadTopHalf(tmp1, src1);
switch(insn.extra.function) {
@@ -554,46 +890,147 @@ namespace gbe
cmpTopHalf = GEN_CONDITIONAL_G;
p->CMP(cmpTopHalf, tmp0, tmp1);
}
- saveFlag(f1, 0, 1);
+ saveFlag(f1, flag, subFlag);
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
- saveFlag(f2, 0, 1);
+ saveFlag(f2, flag, subFlag);
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(insn.extra.function, tmp0, tmp1);
- saveFlag(f3, 0, 1);
+ saveFlag(f3, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
p->AND(f2, f2, f3);
p->OR(f1, f1, f2);
+ p->pop();
break;
case GEN_CONDITIONAL_EQ:
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
- saveFlag(f1, 0, 1);
+ saveFlag(f1, flag, subFlag);
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
- saveFlag(f2, 0, 1);
+ saveFlag(f2, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
p->AND(f1, f1, f2);
+ p->pop();
break;
case GEN_CONDITIONAL_NEQ:
p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
- saveFlag(f1, 0, 1);
+ saveFlag(f1, flag, subFlag);
tmp0.type = tmp1.type = GEN_TYPE_UD;
loadBottomHalf(tmp0, src0);
loadBottomHalf(tmp1, src1);
p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
- saveFlag(f2, 0, 1);
+ saveFlag(f2, flag, subFlag);
+ p->push();
+ p->curr.execWidth = 1;
p->OR(f1, f1, f2);
+ p->pop();
break;
default:
NOT_IMPLEMENTED;
}
- saveFlag(f2, flag, subFlag);
- p->AND(f1, f1, f2);
+ p->curr.execWidth = 1;
+ p->AND(f1, f1, f4);
p->MOV(GenRegister::flag(flag, subFlag), f1);
p->pop();
}
+ void GenContext::emitI64SATADDInstruction(const SelectionInstruction &insn) { // emits a saturating 64-bit add of src(0) and src(1) into dst(0)
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister a = ra->genReg(insn.dst(1)); // dst(1)..dst(5) are scratch registers, dst(6) is the flag register
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister flagReg = ra->genReg(insn.dst(6));
+ loadTopHalf(a, x); // (a,b) <- x
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y); // (c,d) <- y
+ loadBottomHalf(d, y);
+ if(dst.is_signed_int())
+ p->SHR(e, a, GenRegister::immud(31)); // e <- top half of x shifted right by 31 (compared against 0/1 below as the sign of x)
+ addWithCarry(b, b, d); // low halves; addWithCarry moves acc0 (the carry) into its last argument, here d
+ addWithCarry(a, a, d); // add that carry into the top half; d <- new carry
+ addWithCarry(a, a, c); // add the top half of y; c <- carry
+ p->ADD(c, c, d); // c <- sum of the two carries out of the top half
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ if(! dst.is_signed_int()) {
+ p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); // unsigned: a non-zero carry out means overflow
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(a, GenRegister::immud(0xFFFFFFFFu)); // clamp flagged lanes to 0xFFFFFFFFFFFFFFFF
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ } else { // NOTE(review): the signed test uses only the sign of x and of the result, never y -- confirm for operands of opposite sign
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u)); // predicated compare: presumably narrows the flag to lanes with e == 1 whose result top half is below 0x80000000 -- confirm
+ p->MOV(a, GenRegister::immud(0x80000000u)); // clamp flagged lanes to 0x8000000000000000
+ p->MOV(b, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); // same pattern for e == 0 with a result top half at or above 0x80000000
+ p->MOV(a, GenRegister::immud(0x7FFFFFFFu)); // clamp flagged lanes to 0x7FFFFFFFFFFFFFFF
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ }
+ p->pop();
+ storeTopHalf(dst, a);
+ storeBottomHalf(dst, b);
+ }
+
+ void GenContext::emitI64SATSUBInstruction(const SelectionInstruction &insn) { // emits a saturating 64-bit subtraction src(0) - src(1) into dst(0)
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister a = ra->genReg(insn.dst(1)); // dst(1)..dst(5) are scratch registers, dst(6) is the flag register
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister flagReg = ra->genReg(insn.dst(6));
+ loadTopHalf(a, x); // (a,b) <- x
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y); // (c,d) <- y
+ loadBottomHalf(d, y);
+ if(dst.is_signed_int())
+ p->SHR(e, a, GenRegister::immud(31)); // e <- top half of x shifted right by 31 (compared against 0/1 below as the sign of x)
+ subWithBorrow(b, b, d); // low halves; subWithBorrow moves acc0 (the borrow) into its last argument, here d
+ subWithBorrow(a, a, d); // take that borrow from the top half; d <- new borrow
+ subWithBorrow(a, a, c); // subtract the top half of y; c <- borrow
+ p->ADD(c, c, d); // c <- sum of the two borrows out of the top half
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ if(! dst.is_signed_int()) {
+ p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); // unsigned: a non-zero borrow out means the result went below zero
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(a, GenRegister::immud(0)); // clamp flagged lanes to 0
+ p->MOV(b, GenRegister::immud(0));
+ } else { // NOTE(review): this signed branch is identical to the one in emitI64SATADDInstruction and never looks at y -- confirm it is right for subtraction
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u)); // predicated compare: presumably narrows the flag to lanes with e == 1 whose result top half is below 0x80000000 -- confirm
+ p->MOV(a, GenRegister::immud(0x80000000u)); // clamp flagged lanes to 0x8000000000000000
+ p->MOV(b, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); // same pattern for e == 0 with a result top half at or above 0x80000000
+ p->MOV(a, GenRegister::immud(0x7FFFFFFFu)); // clamp flagged lanes to 0x7FFFFFFFFFFFFFFF
+ p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+ }
+ p->pop();
+ storeTopHalf(dst, a);
+ storeBottomHalf(dst, b);
+ }
+
void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
int execWidth = p->curr.execWidth;
src = src.top_half();
@@ -664,11 +1101,11 @@ namespace gbe
int execWidth = p->curr.execWidth;
GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.execWidth = 8;
p->ADDC(dest, src0, src1);
p->MOV(src1, acc0);
if (execWidth == 16) {
+ p->curr.quarterControl = 1;
p->ADDC(GenRegister::suboffset(dest, 8),
GenRegister::suboffset(src0, 8),
GenRegister::suboffset(src1, 8));
@@ -681,11 +1118,11 @@ namespace gbe
int execWidth = p->curr.execWidth;
GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
p->push();
- p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.execWidth = 8;
p->SUBB(dest, src0, src1);
p->MOV(src1, acc0);
if (execWidth == 16) {
+ p->curr.quarterControl = 1;
p->SUBB(GenRegister::suboffset(dest, 8),
GenRegister::suboffset(src0, 8),
GenRegister::suboffset(src1, 8));
@@ -740,6 +1177,145 @@ namespace gbe
storeBottomHalf(dest, a);
}
+ void GenContext::emitI64DIVREMInstruction(const SelectionInstruction &insn) { // emits 64-bit DIV or REM (chosen by insn.opcode) as a 64-iteration shift-and-subtract loop
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1)); // dst(1)..dst(13) are scratch registers, dst(14) is the flag register
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ GenRegister g = ra->genReg(insn.dst(7));
+ GenRegister h = ra->genReg(insn.dst(8));
+ GenRegister i = ra->genReg(insn.dst(9));
+ GenRegister j = ra->genReg(insn.dst(10));
+ GenRegister k = ra->genReg(insn.dst(11));
+ GenRegister l = ra->genReg(insn.dst(12));
+ GenRegister m = ra->genReg(insn.dst(13));
+ GenRegister flagReg = ra->genReg(insn.dst(14));
+ GenRegister zero = GenRegister::immud(0),
+ one = GenRegister::immud(1),
+ imm31 = GenRegister::immud(31);
+ // (a,b) <- x
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ // (c,d) <- y
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ // k <- sign_of_result
+ if(x.is_signed_int()) {
+ GBE_ASSERT(y.is_signed_int());
+ GBE_ASSERT(dest.is_signed_int());
+ I64ABS(k, a, b, e, flagReg); // presumably (a,b) <- |x| with the sign recorded in k -- I64ABS is defined elsewhere, confirm
+ I64ABS(l, c, d, e, flagReg); // likewise for y, sign in l
+ if(insn.opcode == SEL_OP_I64DIV)
+ p->XOR(k, k, l); // DIV only: combine both sign words; for REM k keeps the value derived from x alone
+ }
+ // (e,f) <- 0
+ p->MOV(e, zero);
+ p->MOV(f, zero);
+ // (g,h) <- 2**63
+ p->MOV(g, GenRegister::immud(0x80000000));
+ p->MOV(h, zero);
+ // (i,j) <- 0
+ p->MOV(i, zero);
+ p->MOV(j, zero);
+ // m <- 0
+ p->MOV(m, zero);
+ {
+ uint32_t loop_start = p->n_instruction(); // index of the first loop instruction; (c,d,e,f) enters the loop as y in the upper 64 bits, 0 in the lower
+ // (c,d,e,f) <- (c,d,e,f) / 2
+ p->SHR(f, f, one);
+ p->SHL(l, e, imm31);
+ p->OR(f, f, l);
+ p->SHR(e, e, one);
+ p->SHL(l, d, imm31);
+ p->OR(e, e, l);
+ p->SHR(d, d, one);
+ p->SHL(l, c, imm31);
+ p->OR(d, d, l);
+ p->SHR(c, c, one);
+ // condition <- (c,d)==0 && (a,b)>=(e,f)
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->MOV(l, zero); // l is used as a 0/1 marker for (a,b) >= (e,f)
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_EQ, a, e);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_GE, b, f);
+ p->MOV(l, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_G, a, e);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->MOV(l, one);
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->CMP(GEN_CONDITIONAL_NEQ, l, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->CMP(GEN_CONDITIONAL_EQ, c, zero);
+ p->CMP(GEN_CONDITIONAL_EQ, d, zero);
+ // under condition, (a,b) <- (a,b) - (e,f)
+ p->MOV(l, f); // work on a copy: subWithBorrow moves acc0 (the borrow) into its last argument
+ subWithBorrow(b, b, l);
+ subWithBorrow(a, a, l);
+ p->MOV(l, e);
+ subWithBorrow(a, a, l);
+ // under condition, (i,j) <- (i,j) | (g,h)
+ p->OR(i, i, g);
+ p->OR(j, j, h);
+ p->pop();
+ // (g,h) /= 2
+ p->SHR(h, h, one);
+ p->SHL(l, g, imm31);
+ p->OR(h, h, l);
+ p->SHR(g, g, one);
+ // condition: m < 64
+ p->ADD(m, m, one);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ // under condition, jump back to start point
+ if (simdWidth == 8)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; // presumably "any of the 8 lanes has the flag set" -- confirm against the ISA reference
+ else if (simdWidth == 16)
+ p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+ else
+ NOT_IMPLEMENTED;
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; // backward offset to loop_start; the +1 and *2 presumably account for the JMPI itself and its offset units -- confirm
+ p->JMPI(zero);
+ p->patchJMPI(p->n_instruction()-1, jip); // patch the JMPI that was just emitted
+ p->pop();
+ // end of loop
+ }
+ // adjust sign of result
+ if(x.is_signed_int()) {
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+ p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ if(insn.opcode == SEL_OP_I64DIV)
+ I64Neg(i, j, l); // presumably negates the 64-bit pair (i,j) using l as scratch -- I64Neg is defined elsewhere, confirm
+ else
+ I64Neg(a, b, l);
+ p->pop();
+ }
+ // write dest
+ if(insn.opcode == SEL_OP_I64DIV) {
+ storeTopHalf(dest, i); // quotient is accumulated in (i,j)
+ storeBottomHalf(dest, j);
+ } else {
+ GBE_ASSERT(insn.opcode == SEL_OP_I64REM);
+ storeTopHalf(dest, a); // remainder is what is left in (a,b)
+ storeBottomHalf(dest, b);
+ }
+ }
+
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
@@ -841,11 +1417,11 @@ namespace gbe
}
void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
- p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.execWidth = 8;
p->curr.noMask = 1;
+ p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0)); // now emitted inside push()/pop(), after predicate is set to NONE and noMask to 1
+ p->curr.execWidth = 8; // set after the MOV, so only the EOT below is emitted with execWidth 8
p->EOT(112);
p->pop();
}
@@ -916,7 +1492,7 @@ namespace gbe
const GenRegister data = ra->genReg(insn.src(1));
const uint32_t bti = insn.extra.function;
p->MOV(src, addr);
- p->WRITE64(src, data, bti, elemNum);
+ p->WRITE64(src, data, bti, elemNum, isScalarReg(data.reg()));
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
@@ -941,6 +1517,13 @@ namespace gbe
p->BYTE_SCATTER(src, bti, elemSize);
}
+ void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) { // forwards a selected DWORD_GATHER to the encoder
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.extra.function; // the bti value travels in the instruction's extra.function field
+ p->DWORD_GATHER(dst, src, bti);
+ }
+
void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
@@ -950,6 +1533,7 @@ namespace gbe
const GenRegister vcoord = ra->genReg(insn.src(5));
const GenRegister wcoord = ra->genReg(insn.src(6));
uint32_t simdWidth = p->curr.execWidth;
+ uint32_t coord_cnt = 2;
p->push();
const uint32_t nr = msgPayload.nr;
// prepare mesg desc and move to a0.0.
@@ -957,9 +1541,11 @@ namespace gbe
/* Prepare message payload. */
p->MOV(GenRegister::f8grf(nr , 0), ucoord);
p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
- if (insn.src(8).reg() != 0)
+ if (insn.src(6).reg() != 0) {
p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
- p->SAMPLE(dst, msgPayload, false, bti, sampler, simdWidth, -1, 0);
+ coord_cnt++;
+ }
+ p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
p->pop();
}
@@ -1041,7 +1627,7 @@ namespace gbe
p->curr.quarterControl = GEN_COMPRESSION_Q2;
QUARTER_MOV0(nr + 1, ucoord);
QUARTER_MOV0(nr + 2, vcoord);
- if (insn.src(3 + insn.extra.elem).reg() != 0)
+ if (insn.src(2 + insn.extra.elem).reg() != 0)
QUARTER_MOV0(nr + 3, wcoord);
QUARTER_MOV1(nr + 5, R);
QUARTER_MOV1(nr + 6, G);
@@ -1081,6 +1667,7 @@ namespace gbe
schedulePostRegAllocation(*this, *this->sel);
if (OCL_OUTPUT_REG_ALLOC)
ra->outputAllocation();
+ this->clearFlagRegister();
this->emitStackPointer();
this->emitInstructionStream();
this->patchBranches();
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8b481d0..10e0603 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -61,6 +61,7 @@ namespace gbe
INLINE const ir::Function &getFunction(void) const { return fn; }
/*! Simd width chosen for the current function */
INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
+ void clearFlagRegister(void);
/*! Emit the per-lane stack pointer computation */
void emitStackPointer(void);
/*! Emit the instructions */
@@ -85,8 +86,13 @@ namespace gbe
void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+ void I64Neg(GenRegister high, GenRegister low, GenRegister tmp);
+ void I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg);
+ void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+ void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
void saveFlag(GenRegister dest, int flag, int subFlag);
+ void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp);
/*! Final Gen ISA emission helper functions */
void emitLabelInstruction(const SelectionInstruction &insn);
@@ -95,8 +101,15 @@ namespace gbe
void emitBinaryInstruction(const SelectionInstruction &insn);
void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
void emitTernaryInstruction(const SelectionInstruction &insn);
+ void emitI64MULHIInstruction(const SelectionInstruction &insn);
+ void emitI64MADSATInstruction(const SelectionInstruction &insn);
+ void emitI64HADDInstruction(const SelectionInstruction &insn);
+ void emitI64RHADDInstruction(const SelectionInstruction &insn);
void emitI64ShiftInstruction(const SelectionInstruction &insn);
void emitI64CompareInstruction(const SelectionInstruction &insn);
+ void emitI64SATADDInstruction(const SelectionInstruction &insn);
+ void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+ void emitI64ToFloatInstruction(const SelectionInstruction &insn);
void emitCompareInstruction(const SelectionInstruction &insn);
void emitJumpInstruction(const SelectionInstruction &insn);
void emitIndirectMoveInstruction(const SelectionInstruction &insn);
@@ -113,12 +126,14 @@ namespace gbe
void emitAtomicInstruction(const SelectionInstruction &insn);
void emitByteGatherInstruction(const SelectionInstruction &insn);
void emitByteScatterInstruction(const SelectionInstruction &insn);
+ void emitDWordGatherInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
void emitI64MULInstruction(const SelectionInstruction &insn);
+ void emitI64DIVREMInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index e3959ff..27ce58c 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -343,6 +343,10 @@ enum GenMessageTarget {
#define GEN_BYTE_SCATTER_DWORD 2
#define GEN_BYTE_SCATTER_QWORD 3
+/* dword scattered rw */
+#define GEN_DWORD_SCATTER_8_DWORDS 2
+#define GEN_DWORD_SCATTER_16_DWORDS 3
+
#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define GEN_SAMPLER_RETURN_FORMAT_UINT32 2
#define GEN_SAMPLER_RETURN_FORMAT_SINT32 3
@@ -805,6 +809,21 @@ struct GenInstruction
uint32_t end_of_thread:1;
} gen7_oblock_rw;
+ /*! Data port dword scatter / gather */
+ struct { // the bit-field widths below add up to 32
+ uint32_t bti:8;
+ uint32_t block_size:2; // DWORD_GATHER passes GEN_DWORD_SCATTER_8_DWORDS or GEN_DWORD_SCATTER_16_DWORDS
+ uint32_t ignored0:3;
+ uint32_t invalidate_after_read:1; // setDWordScatterMessgae writes 0 here
+ uint32_t msg_type:4;
+ uint32_t ignored1:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen7_dword_rw;
+
/*! Data port typed read / write messages */
struct {
uint32_t bti:8;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 1a459e1..b0cc931 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -198,7 +198,21 @@ namespace gbe
insn->bits3.gen7_typed_rw.bti = bti;
insn->bits3.gen7_typed_rw.msg_type = msg_type;
}
-
+ static void setDWordScatterMessgae(GenEncoder *p, // fills the gen7_dword_rw descriptor of insn; NOTE(review): "Messgae" is a typo for "Message", a rename must also update the call in GenEncoder::DWORD_GATHER
+ GenInstruction *insn,
+ uint32_t bti,
+ uint32_t block_size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN6_SFID_DATAPORT_CONSTANT_CACHE; // the message target is fixed to the constant cache data port
+ setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+ insn->bits3.gen7_dword_rw.msg_type = msg_type;
+ insn->bits3.gen7_dword_rw.bti = bti;
+ insn->bits3.gen7_dword_rw.block_size = block_size;
+ insn->bits3.gen7_dword_rw.invalidate_after_read = 0;
+ }
//////////////////////////////////////////////////////////////////////////
// Gen Emitter encoding class
//////////////////////////////////////////////////////////////////////////
@@ -315,6 +329,7 @@ namespace gbe
void GenEncoder::setSrc1(GenInstruction *insn, GenRegister reg) {
assert(reg.nr < 128);
+ assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
insn->bits1.da1.src1_reg_file = reg.file;
insn->bits1.da1.src1_reg_type = reg.type;
@@ -390,8 +405,9 @@ namespace gbe
pop();
}
- void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+ void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+ GenRegister unpacked;
msg = GenRegister::retype(msg, GEN_TYPE_UD);
int originSimdWidth = curr.execWidth;
int originPredicate = curr.predicate;
@@ -401,9 +417,19 @@ namespace gbe
curr.predicate = GEN_PREDICATE_NONE;
curr.noMask = GEN_MASK_DISABLE;
curr.execWidth = 8;
- MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+ if (is_scalar) {
+ unpacked = data32;
+ unpacked.subnr += half * 4;
+ } else
+ unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
+ MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
if (originSimdWidth == 16) {
- MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+ if (is_scalar) {
+ unpacked = data32;
+ unpacked.subnr += half * 4;
+ } else
+ unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
+ MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
curr.execWidth = 16;
}
if (half == 1)
@@ -518,6 +544,36 @@ namespace gbe
response_length);
}
+ void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) { // encodes a SEND carrying a GEN_DWORD_GATHER message
+ GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t msg_length = 0;
+ uint32_t response_length = 0;
+ uint32_t block_size = 0;
+ if (this->curr.execWidth == 8) { // execWidth 8: message and response length 1
+ msg_length = 1;
+ response_length = 1;
+ block_size = GEN_DWORD_SCATTER_8_DWORDS;
+ } else if (this->curr.execWidth == 16) { // execWidth 16: message and response length 2
+ msg_length = 2;
+ response_length = 2;
+ block_size = GEN_DWORD_SCATTER_16_DWORDS;
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setHeader(insn);
+ this->setDst(insn, dst);
+ this->setSrc0(insn, src);
+ this->setSrc1(insn, GenRegister::immud(0)); // src1 is an immediate 0; the descriptor fields are filled in by the call below
+ setDWordScatterMessgae(this,
+ insn,
+ bti,
+ block_size,
+ GEN_DWORD_GATHER,
+ msg_length,
+ response_length);
+
+ }
+
void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
uint32_t msg_length = 0;
@@ -1123,15 +1179,16 @@ namespace gbe
bool header_present,
unsigned char bti,
unsigned char sampler,
+ unsigned int coord_cnt,
uint32_t simdWidth,
uint32_t writemask,
uint32_t return_format)
{
if (writemask == 0) return;
- uint32_t msg_type = (simdWidth == 16) ?
+ uint32_t msg_type = (simdWidth == 16) ?
GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
uint32_t response_length = (4 * (simdWidth / 8));
- uint32_t msg_length = (2 * (simdWidth / 8));
+ uint32_t msg_length = (coord_cnt * (simdWidth / 8));
if (header_present)
msg_length++;
uint32_t simd_mode = (simdWidth == 16) ?
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index bbf240c..d518c4a 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -147,7 +147,7 @@ namespace gbe
/*! Read 64-bits float/int arrays */
void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Write 64-bits float/int arrays */
- void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum);
+ void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
/*! Untyped read (upto 4 channels) */
void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
@@ -156,6 +156,8 @@ namespace gbe
void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+ /*! DWord gather (for constant cache read) */
+ void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
/*! for scratch memory read */
void SCRATCH_READ(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
/*! for scratch memory write */
@@ -166,6 +168,7 @@ namespace gbe
bool header_present,
unsigned char bti,
unsigned char sampler,
+ unsigned int coord_cnt,
unsigned int simdWidth,
uint32_t writemask,
uint32_t return_format);
@@ -190,6 +193,7 @@ namespace gbe
void setSrc0(GenInstruction *insn, GenRegister reg);
void setSrc1(GenInstruction *insn, GenRegister reg);
GenInstruction *next(uint32_t opcode);
+ uint32_t n_instruction(void) const { return store.size(); } // current size of store, narrowed to uint32_t; presumably one entry per emitted instruction -- confirm
GBE_CLASS(GenEncoder); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 2204837..b33112c 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -6,8 +6,14 @@ DECL_GEN7_SCHEDULE(Binary, 20, 4, 2)
DECL_GEN7_SCHEDULE(BinaryWithTemp, 20, 4, 2)
DECL_GEN7_SCHEDULE(Ternary, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64Shift, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64HADD, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64RHADD, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64ToFloat, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64MULHI, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64MADSAT, 20, 4, 2)
DECL_GEN7_SCHEDULE(Compare, 20, 4, 2)
DECL_GEN7_SCHEDULE(I64Compare, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64DIVREM, 20, 4, 2)
DECL_GEN7_SCHEDULE(Jump, 14, 1, 1)
DECL_GEN7_SCHEDULE(IndirectMove, 20, 2, 2)
DECL_GEN7_SCHEDULE(Eot, 20, 1, 1)
@@ -22,6 +28,7 @@ DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteScatter, 80, 1, 1)
+DECL_GEN7_SCHEDULE(DWordGather, 80, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 80, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1)
@@ -29,3 +36,5 @@ DECL_GEN7_SCHEDULE(UnSpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
DECL_GEN7_SCHEDULE(I64MUL, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64SATADD, 20, 4, 2)
+DECL_GEN7_SCHEDULE(I64SATSUB, 20, 4, 2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 8e4cd8f..bd52885 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -419,7 +419,7 @@ namespace gbe
#define ALU3(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
#define I64Shift(OP) \
- INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
+ INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
ALU1(MOV)
ALU1WithTemp(MOV_DF)
ALU1WithTemp(LOAD_DF_IMM)
@@ -459,6 +459,7 @@ namespace gbe
ALU2(UPSAMPLE_INT)
ALU2(UPSAMPLE_LONG)
ALU1WithTemp(CONVI_TO_I64)
+ ALU1(CONVI64_TO_I)
I64Shift(I64SHL)
I64Shift(I64SHR)
I64Shift(I64ASR)
@@ -468,10 +469,24 @@ namespace gbe
#undef ALU2WithTemp
#undef ALU3
#undef I64Shift
+ /*! Convert 64-bit integer to 32-bit float */
+ void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]);
+ /*! Saturated 64bit x*y + z */
+ void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]);
+ /*! High 64bit of x*y */
+ void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]);
+ /*! (x+y)>>1 without mod. overflow */
+ void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+ /*! (x+y+1)>>1 without mod. overflow */
+ void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
/*! Shift a 64-bit integer */
- void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+ void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
/*! Compare 64-bit integer */
void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
+ /*! Saturated addition of 64-bit integer */
+ void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+ /*! Saturated subtraction of 64-bit integer */
+ void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! Encode a barrier instruction */
void BARRIER(GenRegister src);
/*! Encode a barrier instruction */
@@ -506,6 +521,8 @@ namespace gbe
void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+ /*! DWord gather (for constant cache read) */
+ void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
/*! Extended math function (2 arguments) */
void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
/*! Extended math function (1 argument) */
@@ -528,6 +545,10 @@ namespace gbe
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
/*! Multiply 64-bit integers */
void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+ /*! 64-bit integer division */
+ void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
+ /*! 64-bit integer remainder of division */
+ void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
@@ -993,6 +1014,14 @@ namespace gbe
vector->reg = &insn->src(0);
}
+ void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) { // appends a SEL_OP_DWORD_GATHER with one destination and one source
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
+
+ insn->src(0) = addr;
+ insn->dst(0) = dst;
+ insn->extra.function = bti; // read back as bti in GenContext::emitDWordGatherInstruction
+ }
+
void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
insn->dst(0) = dst;
@@ -1017,6 +1046,24 @@ namespace gbe
insn->dst(i + 1) = tmp[i];
}
+ /*! Append a SEL_OP_I64DIV instruction: dst(0) is dst, dst(1..14) are the
+  *  fourteen temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
+   const int tmpNum = 14;
+   SelectionInstruction *divInsn = this->appendInsn(SEL_OP_I64DIV, 1 + tmpNum, 2);
+   divInsn->src(0) = src0;
+   divInsn->src(1) = src1;
+   divInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     divInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ /*! Append a SEL_OP_I64REM instruction: dst(0) is dst, dst(1..14) are the
+  *  fourteen temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
+   const int tmpNum = 14;
+   SelectionInstruction *remInsn = this->appendInsn(SEL_OP_I64REM, 1 + tmpNum, 2);
+   remInsn->src(0) = src0;
+   remInsn->src(1) = src1;
+   remInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     remInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
insn->dst(0) = dst;
@@ -1062,12 +1109,75 @@ namespace gbe
insn->extra.function = conditional;
}
- void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
- SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+ /*! Append a SEL_OP_I64SATADD instruction: dst(0) is dst, dst(1..6) are the
+  *  six temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+   const int tmpNum = 6;
+   SelectionInstruction *addInsn = this->appendInsn(SEL_OP_I64SATADD, 1 + tmpNum, 2);
+   addInsn->src(0) = src0;
+   addInsn->src(1) = src1;
+   addInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     addInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ /*! Append a SEL_OP_I64SATSUB instruction: dst(0) is dst, dst(1..6) are the
+  *  six temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+   const int tmpNum = 6;
+   SelectionInstruction *subInsn = this->appendInsn(SEL_OP_I64SATSUB, 1 + tmpNum, 2);
+   subInsn->src(0) = src0;
+   subInsn->src(1) = src1;
+   subInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     subInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ /*! Append a SEL_OP_CONVI64_TO_F instruction: dst(0) is dst, dst(1..4) are the
+  *  four temporaries, src(0) is the single operand. */
+ void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]) {
+   const int tmpNum = 4;
+   SelectionInstruction *cvtInsn = this->appendInsn(SEL_OP_CONVI64_TO_F, 1 + tmpNum, 1);
+   cvtInsn->src(0) = src;
+   cvtInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     cvtInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ /*! Append a SEL_OP_I64MADSAT instruction: dst(0) is dst, dst(1..10) are the
+  *  ten temporaries, src(0..2) are the three operands. */
+ void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]) {
+   const int tmpNum = 10;
+   SelectionInstruction *madInsn = this->appendInsn(SEL_OP_I64MADSAT, 1 + tmpNum, 3);
+   madInsn->src(0) = src0;
+   madInsn->src(1) = src1;
+   madInsn->src(2) = src2;
+   madInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     madInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 11, 2);
insn->dst(0) = dst;
insn->src(0) = src0;
insn->src(1) = src1;
- for(int i = 0; i < 6; i ++)
+ for(int i = 0; i < 10; i ++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
+ /*! Append a SEL_OP_I64HADD instruction: dst(0) is dst, dst(1..4) are the
+  *  four temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+   const int tmpNum = 4;
+   SelectionInstruction *haddInsn = this->appendInsn(SEL_OP_I64HADD, 1 + tmpNum, 2);
+   haddInsn->src(0) = src0;
+   haddInsn->src(1) = src1;
+   haddInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     haddInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ /*! Append a SEL_OP_I64RHADD instruction: dst(0) is dst, dst(1..4) are the
+  *  four temporaries, src(0) and src(1) are the two operands. */
+ void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+   const int tmpNum = 4;
+   SelectionInstruction *rhaddInsn = this->appendInsn(SEL_OP_I64RHADD, 1 + tmpNum, 2);
+   rhaddInsn->src(0) = src0;
+   rhaddInsn->src(1) = src1;
+   rhaddInsn->dst(0) = dst;
+   for (int t = 0; t < tmpNum; ++t) {
+     rhaddInsn->dst(1 + t) = tmp[t];
+   }
+ }
+
+ void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) {
+ SelectionInstruction *insn = this->appendInsn(opcode, 8, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 7; i ++)
insn->dst(i + 1) = tmp[i];
}
@@ -1384,16 +1494,21 @@ namespace gbe
/*! Unary instruction patterns */
DECL_PATTERN(UnaryInstruction)
{
- static ir::Type getType(const ir::Opcode opcode) {
+ static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
+ if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
+ return insnType;
if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
return ir::TYPE_U32;
+ if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
+ return insnType;
return ir::TYPE_FLOAT;
}
INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn) const {
const ir::Opcode opcode = insn.getOpcode();
- const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode));
- const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode));
+ const ir::Type insnType = insn.getType();
+ const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType));
+ const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType));
switch (opcode) {
case ir::OP_ABS:
if (insn.getType() == ir::TYPE_S32) {
@@ -1484,8 +1599,17 @@ namespace gbe
} else if(type == TYPE_FLOAT) {
GBE_ASSERT(op != OP_REM);
sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
- } else {
- NOT_IMPLEMENTED;
+ } else if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[14];
+ for(int i=0; i<13; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ tmp[13] = sel.selReg(sel.reg(FAMILY_BOOL));
+ if(op == OP_DIV)
+ sel.I64DIV(dst, src0, src1, tmp);
+ else
+ sel.I64REM(dst, src0, src1, tmp);
}
markAllChildren(dag);
return true;
@@ -1561,6 +1685,16 @@ namespace gbe
sel.ADD(dst, src0, src1);
break;
case OP_ADDSAT:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister tmp[6];
+ for(int i=0; i<5; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
+ sel.I64SATADD(dst, src0, src1, tmp);
+ break;
+ }
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
sel.ADD(dst, src0, src1);
@@ -1592,6 +1726,16 @@ namespace gbe
sel.ADD(dst, src0, GenRegister::negate(src1));
break;
case OP_SUBSAT:
+ if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+ GenRegister tmp[6];
+ for(int i=0; i<5; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
+ sel.I64SATSUB(dst, src0, src1, tmp);
+ break;
+ }
sel.push();
sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
sel.ADD(dst, src0, GenRegister::negate(src1));
@@ -1599,27 +1743,30 @@ namespace gbe
break;
case OP_SHL:
if (type == TYPE_S64 || type == TYPE_U64) {
- GenRegister tmp[6];
+ GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SHL(dst, src0, src1, tmp);
} else
sel.SHL(dst, src0, src1);
break;
case OP_SHR:
if (type == TYPE_S64 || type == TYPE_U64) {
- GenRegister tmp[6];
+ GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64SHR(dst, src0, src1, tmp);
} else
sel.SHR(dst, src0, src1);
break;
case OP_ASR:
if (type == TYPE_S64 || type == TYPE_U64) {
- GenRegister tmp[6];
+ GenRegister tmp[7];
for(int i = 0; i < 6; i ++)
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
sel.I64ASR(dst, src0, src1, tmp);
} else
sel.ASR(dst, src0, src1);
@@ -1629,6 +1776,17 @@ namespace gbe
sel.MUL_HI(dst, src0, src1, temp);
break;
}
+ case OP_I64_MUL_HI:
+ {
+ GenRegister temp[10];
+ for(int i=0; i<9; i++) {
+ temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ temp[i].type = GEN_TYPE_UD;
+ }
+ temp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
+ sel.I64_MUL_HI(dst, src0, src1, temp);
+ break;
+ }
case OP_MUL:
if (type == TYPE_U32 || type == TYPE_S32) {
sel.pop();
@@ -1651,6 +1809,22 @@ namespace gbe
sel.RHADD(dst, src0, src1, temp);
break;
}
+ case OP_I64HADD:
+ {
+ GenRegister tmp[4];
+ for(int i=0; i<4; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64HADD(dst, src0, src1, tmp);
+ break;
+ }
+ case OP_I64RHADD:
+ {
+ GenRegister tmp[4];
+ for(int i=0; i<4; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64RHADD(dst, src0, src1, tmp);
+ break;
+ }
case OP_UPSAMPLE_SHORT:
sel.UPSAMPLE_SHORT(dst, src0, src1);
break;
@@ -1737,6 +1911,9 @@ namespace gbe
SelectionDAG *cmp = dag.child[0];
const SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
+ if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not support
+ return false;
+
// Not in this block
if (cmp == NULL) return false;
@@ -2085,6 +2262,23 @@ namespace gbe
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
+ /*! Emit a DWORD_GATHER load of a single value (valueNum must be 1): addr is
+  *  shifted right by 2 into a fresh UD register to get a dword based address,
+  *  and the value register is retyped to float for the gather on bti. */
+ void emitDWordGather(Selection::Opaque &sel,
+                      const ir::LoadInstruction &insn,
+                      GenRegister addr,
+                      uint32_t bti) const
+ {
+   using namespace ir;
+   const uint32_t simdWidth = sel.ctx.getSimdWidth();
+   const uint32_t valueNum = insn.getValueNum();
+   GBE_ASSERT(valueNum == 1);
+   const GenRegister value = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+   // get dword based address
+   const GenRegister dwordAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+   const GenRegister srcAddr = GenRegister::retype(addr, GEN_TYPE_UD);
+   sel.SHR(dwordAddr, srcAddr, GenRegister::immud(2));
+   sel.DWORD_GATHER(value, dwordAddr, bti);
+ }
+
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
@@ -2155,8 +2349,19 @@ namespace gbe
GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- if (insn.getAddressSpace() == MEM_CONSTANT)
- this->emitIndirectMove(sel, insn, address);
+ if (insn.getAddressSpace() == MEM_CONSTANT) {
+ // XXX TODO read 64bit constant through constant cache
+ // Per HW Spec, constant cache messages read data in units of at least a DWORD.
+ // So, for byte/short data types, we have to read through the data cache.
+ if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitRead64(sel, insn, address, 0x2);
+ else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitDWordGather(sel, insn, address, 0x2);
+ else {
+ const GenRegister value = sel.selReg(insn.getValue(0));
+ this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+ }
+ }
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
@@ -2334,7 +2539,7 @@ namespace gbe
const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
// We need two instructions to make the conversion
- if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
+ if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
GenRegister unpacked;
if (dstFamily == FAMILY_WORD) {
const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -2345,8 +2550,24 @@ namespace gbe
unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
unpacked = GenRegister::retype(unpacked, type);
}
- sel.MOV(unpacked, src);
+ if(srcFamily == FAMILY_QWORD) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp.type = GEN_TYPE_D;
+ sel.CONVI64_TO_I(tmp, src);
+ sel.MOV(unpacked, tmp);
+ } else
+ sel.MOV(unpacked, src);
sel.MOV(dst, unpacked);
+ } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
+ sel.CONVI64_TO_I(dst, src);
+ } else if (dstType == ir::TYPE_FLOAT && srcFamily == FAMILY_QWORD) {
+ GenRegister tmp[4];
+ for(int i=0; i<3; i++) {
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ tmp[i].type = GEN_TYPE_UD;
+ }
+ tmp[3] = sel.selReg(sel.reg(FAMILY_BOOL));
+ sel.CONVI64_TO_F(dst, src, tmp);
} else if (dst.isdf()) {
ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
sel.MOV_DF(dst, src, sel.selReg(r));
@@ -2450,6 +2671,36 @@ namespace gbe
}
};
+ /*! Ternary instruction pattern (only OP_I64MADSAT is handled) */
+ DECL_PATTERN(TernaryInstruction)
+ {
+   INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn) const {
+     using namespace ir;
+     const Type type = insn.getType();
+     const GenRegister dst = sel.selReg(insn.getDst(0), type);
+     const GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+     const GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+     const GenRegister src2 = sel.selReg(insn.getSrc(2), type);
+     if (insn.getOpcode() != OP_I64MADSAT) {
+       NOT_IMPLEMENTED;
+       return true;
+     }
+     // Nine dword temporaries typed UD, then one bool register in the last slot
+     GenRegister tmp[10];
+     for (int t = 0; t < 9; ++t) {
+       tmp[t] = sel.selReg(sel.reg(FAMILY_DWORD));
+       tmp[t].type = GEN_TYPE_UD;
+     }
+     tmp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
+     sel.I64MADSAT(dst, src0, src1, src2, tmp);
+     return true;
+   }
+
+   DECL_CTOR(TernaryInstruction, 1, 1);
+ };
+
/*! Label instruction pattern */
DECL_PATTERN(LabelInstruction)
{
@@ -2504,6 +2755,13 @@ namespace gbe
using namespace ir;
GenRegister msgPayloads[4];
GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
+ uint32_t srcNum = insn.getSrcNum();
+ uint32_t samplerOffset = 0;
+ if (srcNum == 6) {
+ /* We have the clamp border workaround. */
+ samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
+ srcNum--;
+ }
for( int i = 0; i < 4; ++i)
msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -2511,15 +2769,15 @@ namespace gbe
for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
- for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+ for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
(insn.getSrc(SampleInstruction::SURFACE_BTI));
uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
- (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+ (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
- sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
+ sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
return true;
}
DECL_CTOR(SampleInstruction, 1, 1);
@@ -2542,7 +2800,7 @@ namespace gbe
msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
// u, v, w coords should use coord type.
- for (; valueID < 1 + coordNum; ++valueID)
+ for (; valueID < coordNum; ++valueID)
src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
@@ -2562,19 +2820,32 @@ namespace gbe
INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
{
using namespace ir;
- const uint32_t infoType = insn.getInfoType();
- GenRegister dst[4];
- uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(infoType);
- for (uint32_t valueID = 0; valueID < dstNum; ++valueID)
- dst[valueID] = sel.selReg(insn.getDst(valueID), TYPE_U32);
- uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
- (insn.getSrc(0));
- sel.GET_IMAGE_INFO(infoType, dst, dstNum, bti);
+ GenRegister dst;
+ dst = sel.selReg(insn.getDst(0), TYPE_U32);
+ GenRegister imageInfoReg = GenRegister::ud1grf(insn.getSrc(0));
+ sel.MOV(dst, imageInfoReg);
+
return true;
}
DECL_CTOR(GetImageInfoInstruction, 1, 1);
};
+ /*! Get sampler info instruction pattern: moves a U16 taken from the
+  *  ocl::samplerinfo register into the destination. */
+ DECL_PATTERN(GetSamplerInfoInstruction)
+ {
+   INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
+   {
+     using namespace ir;
+     const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U16);
+     // NOTE(review): idx * 2 is handed to GenRegister::offset as the sub-register
+     // offset -- presumably a byte offset selecting the idx-th 16-bit word; confirm.
+     const auto samplerIdx = sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0));
+     GenRegister src = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, samplerIdx * 2);
+     src.subphysical = 1;
+     sel.MOV(dst, src);
+     return true;
+   }
+   DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
+ };
+
/*! Branch instruction pattern */
DECL_PATTERN(BranchInstruction)
{
@@ -2615,6 +2886,9 @@ namespace gbe
sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
// Branch to the jump target
+ // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
+ // work correctly, as flag register bits mapped to non-active lanes tend
+ // to be zero.
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
@@ -2737,6 +3011,7 @@ namespace gbe
this->insert<CompareInstructionPattern>();
this->insert<ConvertInstructionPattern>();
this->insert<AtomicInstructionPattern>();
+ this->insert<TernaryInstructionPattern>();
this->insert<LabelInstructionPattern>();
this->insert<BranchInstructionPattern>();
this->insert<Int32x32MulInstructionPattern>();
@@ -2745,6 +3020,7 @@ namespace gbe
this->insert<SelectModifierInstructionPattern>();
this->insert<SampleInstructionPattern>();
this->insert<GetImageInfoInstructionPattern>();
+ this->insert<GetSamplerInfoInstructionPattern>();
// Sort all the patterns with the number of instructions they output
for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 79b73e2..2422b2b 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -86,8 +86,8 @@ namespace gbe
const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
/*! No more than 17 sources (used by typed writes on simd8 mode.) */
enum { MAX_SRC_NUM = 17 };
- /*! No more than 4 destinations (used by samples and untyped reads) */
- enum { MAX_DST_NUM = 4 };
+ /*! No more than 15 destinations (used by I64DIV and I64REM: dst plus 14
+  *  temporaries); 15 is also the largest value the 4-bit dstNum can hold */
+ enum { MAX_DST_NUM = 15 };
/*! State of the instruction (extra fields needed for the encoding) */
GenInstructionState state;
union {
@@ -115,7 +115,7 @@ namespace gbe
/*! Gen opcode */
uint8_t opcode;
/*! Number of destinations */
- uint8_t dstNum:3;
+ uint8_t dstNum:4;
/*! Number of sources */
uint8_t srcNum:5;
/*! To store various indices */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 32c7a05..21b0a43 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -28,9 +28,13 @@ DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
DECL_SELECTION_IR(ADD, BinaryInstruction)
DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATADD, I64SATADDInstruction)
DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATSUB, I64SATSUBInstruction)
DECL_SELECTION_IR(MUL, BinaryInstruction)
DECL_SELECTION_IR(I64MUL, I64MULInstruction)
+DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
+DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
DECL_SELECTION_IR(CMP, CompareInstruction)
@@ -51,17 +55,24 @@ DECL_SELECTION_IR(READ64, Read64Instruction)
DECL_SELECTION_IR(WRITE64, Write64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
+DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
DECL_SELECTION_IR(FBH, UnaryInstruction)
DECL_SELECTION_IR(FBL, UnaryInstruction)
DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
+DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
+DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
+DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
+DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 3d7bedd..33f07b2 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -27,12 +27,16 @@
#include "backend/gen_program.hpp"
#include "backend/gen_context.hpp"
#include "backend/gen_defs.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
#include "backend/gen_reg_allocation.hpp"
#include "ir/unit.hpp"
#include "llvm/llvm_to_gen.hpp"
#include <cstring>
+#include <sstream>
#include <memory>
+#include <iostream>
+#include <fstream>
namespace gbe {
@@ -41,8 +45,32 @@ namespace gbe {
{}
GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
const char *GenKernel::getCode(void) const { return (const char*) insns; }
+ /*! Use the size-byte buffer ins as the instruction stream. The pointer is
+  *  stored as is (no copy), and ~GenKernel releases insns with
+  *  GBE_SAFE_DELETE_ARRAY. */
+ const void GenKernel::setCode(const char * ins, size_t size) {
+   insnNum = size / sizeof(GenInstruction);
+   insns = reinterpret_cast<GenInstruction *>(const_cast<char *>(ins));
+ }
size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
+ /*! Print the base kernel status, then the disassembly of every instruction.
+  *  gen_disasm() writes to a FILE*, so its text is captured in a buffer that
+  *  is installed as the stdio buffer of a /dev/null stream and copied to outs. */
+ void GenKernel::printStatus(int indent, std::ostream& outs) {
+   Kernel::printStatus(indent, outs);
+
+   FILE *f = fopen("/dev/null", "w");
+   if (f == NULL)
+     return; // no capture stream available: skip the disassembly
+
+   // stdio does not NUL-terminate what it buffers. Start from a zeroed buffer
+   // and keep one spare byte stdio never touches, so "outs << buf" always
+   // stops inside the allocation.
+   const size_t bufSize = 4096;
+   char *buf = new char[bufSize + 1]();
+   setbuffer(f, buf, bufSize);
+
+   for (uint32_t i = 0; i < insnNum; i++) {
+     gen_disasm(f, insns+i);
+     outs << buf;
+     fflush(f);
+     setbuffer(f, NULL, 0);
+     // Clear the text just printed; otherwise a shorter disassembly would be
+     // followed by the tail of the previous one.
+     memset(buf, 0, bufSize);
+     setbuffer(f, buf, bufSize);
+   }
+
+   // Detach the buffer from the stream before freeing it
+   setbuffer(f, NULL, 0);
+   delete [] buf;
+   fclose(f);
+ }
+
GenProgram::GenProgram(void) {}
GenProgram::~GenProgram(void) {}
@@ -88,8 +116,19 @@ namespace gbe {
}
static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
- NOT_IMPLEMENTED;
- return NULL;
+ using namespace gbe;
+ std::string binary_content;
+ binary_content.assign(binary, size);
+ GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
+ std::istringstream ifs(binary_content, std::ostringstream::binary);
+
+ if (!program->deserializeFromBin(ifs)) {
+ delete program;
+ return NULL;
+ }
+
+ //program->printStatus(0, std::cout);
+ return reinterpret_cast<gbe_program>(program);
}
static gbe_program genProgramNewFromLLVM(const char *fileName,
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 68b0427..f78e324 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -42,8 +42,12 @@ namespace gbe
virtual ~GenKernel(void);
/*! Implements base class */
virtual const char *getCode(void) const;
- /*! Implements base class */
+ /*! Set the instruction stream from a buffer of size bytes */
+ virtual const void setCode(const char *, size_t size);
+ /*! Implements get the code size */
virtual size_t getCodeSize(void) const;
+ /*! Implements printStatus*/
+ virtual void printStatus(int indent, std::ostream& outs);
GenInstruction *insns; //!< Instruction stream
uint32_t insnNum; //!< Number of instructions
GBE_CLASS(GenKernel); //!< Use custom allocators
@@ -59,6 +63,10 @@ namespace gbe
virtual ~GenProgram(void);
/*! Implements base class */
virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name);
+ /*! Allocate an empty GenKernel with the given name (created with GBE_NEW). */
+ virtual Kernel *allocateKernel(const std::string &name) {
+   Kernel *kernel = GBE_NEW(GenKernel, name);
+   return kernel;
+ }
/*! Use custom allocators */
GBE_CLASS(GenProgram);
};
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index a765917..ab8b7ee 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -65,8 +65,10 @@ namespace gbe
void allocateFlags(Selection &selection);
/*! Allocate the GRF registers */
bool allocateGRFs(Selection &selection);
+ /*! Create gen registers for all preallocated curbe registers. */
+ void allocatePayloadRegs(void);
/*! Create a Gen register from a register set in the payload */
- void allocatePayloadReg(gbe_curbe_type, ir::Register, uint32_t subValue = 0, uint32_t subOffset = 0);
+ void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
/*! Create the intervals for each register */
/*! Allocate the vectors detected in the instruction selection pass */
void allocateVector(Selection &selection);
@@ -124,19 +126,38 @@ namespace gbe
GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
GenRegAllocator::Opaque::~Opaque(void) {}
- void GenRegAllocator::Opaque::allocatePayloadReg(gbe_curbe_type value,
- ir::Register reg,
- uint32_t subValue,
+ void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
+ uint32_t offset,
uint32_t subOffset)
{
using namespace ir;
- const Kernel *kernel = ctx.getKernel();
- const int32_t curbeOffset = kernel->getCurbeOffset(value, subValue);
- if (curbeOffset >= 0) {
- const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset;
- RA.insert(std::make_pair(reg, offset));
- this->intervals[reg].minID = 0;
- this->intervals[reg].maxID = 0;
+ assert(offset >= GEN_REG_SIZE);
+ offset += subOffset;
+ RA.insert(std::make_pair(reg, offset));
+ GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+ this->intervals[reg].minID = 0;
+ this->intervals[reg].maxID = 0;
+ }
+
+ /*! Create gen registers for the preallocated curbe registers: first every
+  *  entry of ctx.curbeRegs whose register value is below 0x8000, then every
+  *  pushed register, placed subOffset past its argument's curbe offset. */
+ INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+   using namespace ir;
+   // NOTE(review): entries at or above 0x8000 are skipped -- presumably they
+   // are not ordinary IR registers; confirm where curbeRegs is filled.
+   for(auto &it : this->ctx.curbeRegs)
+     if (it.first.value() < 0x8000)
+       allocatePayloadReg(it.first, it.second);
+
+   // Allocate all pushed registers (i.e. structure kernel arguments)
+   const Function &fn = ctx.getFunction();
+   GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
+   const Function::PushMap &pushMap = fn.getPushMap();
+   for (const auto &pushed : pushMap) {
+     const uint32_t argID = pushed.second.argID;
+     // Only arg.reg is read, so bind by reference instead of copying
+     const FunctionArgument &arg = fn.getArg(argID);
+
+     const uint32_t subOffset = pushed.second.offset;
+     const Register reg = pushed.second.getRegister();
+     auto it = this->ctx.curbeRegs.find(arg.reg);
+     // Same assertion macro as allocatePayloadReg
+     GBE_ASSERT(it != ctx.curbeRegs.end());
+     allocatePayloadReg(reg, it->second, subOffset);
}
}
@@ -535,11 +556,9 @@ namespace gbe
}
return true;
}
+
INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
using namespace ir;
- const Kernel *kernel = ctx.getKernel();
- const Function &fn = ctx.getFunction();
- GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
if (ctx.getSimdWidth() == 8) {
reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
reservedReg /= GEN_REG_SIZE;
@@ -555,25 +574,7 @@ namespace gbe
this->intervals.push_back(ir::Register(regID));
// Allocate the special registers (only those which are actually used)
- allocatePayloadReg(GBE_CURBE_LOCAL_ID_X, ocl::lid0);
- allocatePayloadReg(GBE_CURBE_LOCAL_ID_Y, ocl::lid1);
- allocatePayloadReg(GBE_CURBE_LOCAL_ID_Z, ocl::lid2);
- allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_X, ocl::lsize0);
- allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Y, ocl::lsize1);
- allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Z, ocl::lsize2);
- allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_X, ocl::gsize0);
- allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Y, ocl::gsize1);
- allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Z, ocl::gsize2);
- allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
- allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
- allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
- allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
- allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
- allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
- allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
- allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
- allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
- allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
+ this->allocatePayloadRegs();
// Group and barrier IDs are always allocated by the hardware in r0
RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
@@ -582,33 +583,6 @@ namespace gbe
RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
// block IP used to handle the mask in SW is always allocated
- const int32_t blockIPOffset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_BLOCK_IP,0);
- GBE_ASSERT(blockIPOffset >= 0 && blockIPOffset % GEN_REG_SIZE == 0);
- RA.insert(std::make_pair(ocl::blockip, blockIPOffset));
- this->intervals[ocl::blockip].minID = 0;
-
- // Allocate all (non-structure) argument parameters
- const uint32_t argNum = fn.argNum();
- for (uint32_t argID = 0; argID < argNum; ++argID) {
- const FunctionArgument &arg = fn.getArg(argID);
- GBE_ASSERT(arg.type == FunctionArgument::GLOBAL_POINTER ||
- arg.type == FunctionArgument::CONSTANT_POINTER ||
- arg.type == FunctionArgument::LOCAL_POINTER ||
- arg.type == FunctionArgument::VALUE ||
- arg.type == FunctionArgument::STRUCTURE ||
- arg.type == FunctionArgument::IMAGE ||
- arg.type == FunctionArgument::SAMPLER);
- allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, arg.reg, argID);
- }
-
- // Allocate all pushed registers (i.e. structure kernel arguments)
- const Function::PushMap &pushMap = fn.getPushMap();
- for (const auto &pushed : pushMap) {
- const uint32_t argID = pushed.second.argID;
- const uint32_t subOffset = pushed.second.offset;
- const Register reg = pushed.second.getRegister();
- allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, reg, argID, subOffset);
- }
// Compute the intervals
int32_t insnID = 0;
@@ -645,7 +619,8 @@ namespace gbe
// Flag registers can only go to src[0]
const SelectionOpcode opcode = SelectionOpcode(insn.opcode);
- if (opcode == SEL_OP_AND || opcode == SEL_OP_OR) {
+ if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR
+ || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) {
if (insn.src(1).physical == 0) {
const ir::Register reg = insn.src(1).reg();
if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL)
@@ -753,7 +728,8 @@ namespace gbe
}
GBE_ASSERT(RA.contains(reg.reg()) != false);
const uint32_t grfOffset = RA.find(reg.reg())->second;
- const GenRegister dst = setGenReg(reg, grfOffset);
+ const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+ const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
if (reg.quarter != 0)
return GenRegister::Qn(dst, reg.quarter);
else
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index ea1bc06..538f16c 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -158,6 +158,11 @@ namespace gbe
NOT_IMPLEMENTED;
}
}
+ /*! Record flag register nr.subnr and mark the flag as physical */
+ void useFlag(int nr, int subnr) {
+   physicalFlag = 1;
+   subFlag = subnr;
+   flag = nr;
+ }
};
/*! This is a book-keeping structure used to encode both virtual and physical
@@ -230,6 +235,7 @@ namespace gbe
uint32_t nr:8; //!< Just for some physical registers (acc, null)
uint32_t subnr:8; //!< Idem
uint32_t physical:1; //!< 1 if physical, 0 otherwise
+ uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
uint32_t type:4; //!< Gen type
uint32_t file:2; //!< Register file
uint32_t negation:1; //!< For source
@@ -286,6 +292,14 @@ namespace gbe
return false;
}
+ /*! Low four bits of nr */
+ INLINE int flag_nr(void) const {
+   const int lowBits = nr & 0xf;
+   return lowBits;
+ }
+
+ /*! subnr divided by the size of the register type */
+ INLINE int flag_subnr(void) const {
+   const int index = subnr / typeSize(type);
+   return index;
+ }
+
static INLINE GenRegister h2(GenRegister reg) {
GenRegister r = reg;
r.hstride = GEN_HORIZONTAL_STRIDE_2;
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 35d3a7c..ffd31d9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -37,6 +37,7 @@
#include <fstream>
#include <dlfcn.h>
#include <sstream>
+#include <iostream>
#include <unistd.h>
/* Not defined for LLVM 3.0 */
@@ -124,6 +125,321 @@ namespace gbe {
return true;
}
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ size_t Program::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+ size_t ker_num = kernels.size();
+ int has_constset = 0;
+ /*! Emits magic_begin, the constant-set flag (plus the set when present), the kernel count, every kernel, magic_end and ret_size. Returns ret_size, or 0 as soon as a nested serializeToBin() returns 0. OUT_UPDATE_SZ expands to SERIALIZE_OUT(elt, outs, ret_size), which is defined outside this hunk. */
+ OUT_UPDATE_SZ(magic_begin);
+
+ if (constantSet) {
+ has_constset = 1;
+ OUT_UPDATE_SZ(has_constset);
+ size_t sz = constantSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_constset);
+ }
+ /* Kernel count, followed by each entry of the kernels map. */
+ OUT_UPDATE_SZ(ker_num);
+ for (auto ker : kernels) {
+ size_t sz = ker.second->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ /* Trailing size field; Program::deserializeFromBin() compares it with what it has read. */
+ OUT_UPDATE_SZ(ret_size);
+ return ret_size;
+ }
+
+ size_t Program::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ int has_constset = 0;
+ size_t ker_num;
+ uint32_t magic;
+ /*! Reads back what Program::serializeToBin() wrote. Returns total_size, or 0 when a magic value, a nested deserializer or the final size check fails. IN_UPDATE_SZ expands to DESERIALIZE_IN(elt, ins, total_size), which is defined outside this hunk. */
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(has_constset);
+ if(has_constset) {
+ constantSet = new ir::ConstantSet;
+ size_t sz = constantSet->deserializeFromBin(ins);
+
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+
+ IN_UPDATE_SZ(ker_num);
+ /* Rebuild every kernel and register it under the name it read from the stream. */
+ for (size_t i = 0; i < ker_num; i++) {
+ size_t ker_serial_sz;
+ std::string ker_name; // Just an empty name here; Kernel::deserializeFromBin() assigns the stored name.
+ Kernel* ker = allocateKernel(ker_name);
+ /* NOTE(review): on failure ker is never inserted into kernels and is not released here - confirm who owns it. */
+ if(!(ker_serial_sz = ker->deserializeFromBin(ins)))
+ return 0;
+
+ kernels.insert(std::make_pair(ker->getName(), ker));
+ total_size += ker_serial_sz;
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+ /* The stored size must equal the running total_size once the size field itself is discounted. */
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+ size_t Kernel::serializeToBin(std::ostream& outs) {
+ unsigned int i;
+ size_t ret_size = 0;
+ int has_samplerset = 0;
+ int has_imageset = 0;
+ /*! Emits magic_begin, the name, the arguments, the patches, curbeSize/simdWidth/stackSize/useSLM, the sampler and image sets (each behind a flag), the code, magic_end and ret_size. Returns ret_size, or 0 when a nested serializeToBin() returns 0. */
+ OUT_UPDATE_SZ(magic_begin);
+ /* Name: length first, then the raw characters (no terminating NUL is written). */
+ OUT_UPDATE_SZ(name.size());
+ outs.write(name.c_str(), name.size());
+ ret_size += sizeof(char)*name.size();
+
+ OUT_UPDATE_SZ(argNum);
+ for (i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ OUT_UPDATE_SZ(arg.type);
+ OUT_UPDATE_SZ(arg.size);
+ OUT_UPDATE_SZ(arg.bufSize);
+ }
+ /* Each patch is written as three unsigned int values: type, subType, offset. */
+ OUT_UPDATE_SZ(patches.size());
+ for (auto patch : patches) {
+ unsigned int tmp;
+ tmp = patch.type;
+ OUT_UPDATE_SZ(tmp);
+ tmp = patch.subType;
+ OUT_UPDATE_SZ(tmp);
+ tmp = patch.offset;
+ OUT_UPDATE_SZ(tmp);
+ }
+
+ OUT_UPDATE_SZ(curbeSize);
+ OUT_UPDATE_SZ(simdWidth);
+ OUT_UPDATE_SZ(stackSize);
+ OUT_UPDATE_SZ(useSLM);
+
+ /* samplers. */
+ if (samplerSet) {
+ has_samplerset = 1;
+ OUT_UPDATE_SZ(has_samplerset);
+ size_t sz = samplerSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_samplerset);
+ }
+
+ /* images. */
+ if (imageSet) {
+ has_imageset = 1;
+ OUT_UPDATE_SZ(has_imageset);
+ size_t sz = imageSet->serializeToBin(outs);
+ if (!sz)
+ return 0;
+
+ ret_size += sz;
+ } else {
+ OUT_UPDATE_SZ(has_imageset);
+ }
+
+ /* Code. */
+ const char * code = getCode();
+ OUT_UPDATE_SZ(getCodeSize());
+ outs.write(code, getCodeSize()*sizeof(char));
+ ret_size += getCodeSize()*sizeof(char);
+
+ OUT_UPDATE_SZ(magic_end);
+ /* Trailing size field; Kernel::deserializeFromBin() compares it with what it has read. */
+ OUT_UPDATE_SZ(ret_size);
+ return ret_size;
+ }
+
+ size_t Kernel::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ int has_samplerset = 0;
+ int has_imageset = 0;
+ size_t code_size = 0;
+ uint32_t magic = 0;
+ size_t patch_num = 0;
+ /*! Reads back what Kernel::serializeToBin() wrote, filling name, args, patches, the scalar fields, the optional sampler/image sets and the code. Returns total_size, or 0 when a magic value, a nested deserializer or the final size check fails. */
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+ /* NOTE(review): name_len comes straight from the stream and sizes the allocation below without any bound - confirm inputs are trusted. */
+ size_t name_len;
+ IN_UPDATE_SZ(name_len);
+ char* c_name = new char[name_len+1];
+ ins.read(c_name, name_len*sizeof(char));
+ total_size += sizeof(char)*name_len;
+ c_name[name_len] = 0;
+ name = c_name;
+ delete[] c_name;
+
+ IN_UPDATE_SZ(argNum);
+ args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
+ for (uint32_t i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ IN_UPDATE_SZ(arg.type);
+ IN_UPDATE_SZ(arg.size);
+ IN_UPDATE_SZ(arg.bufSize);
+ }
+ /* Each patch was stored as three unsigned int values: type, subType, offset. */
+ IN_UPDATE_SZ(patch_num);
+ for (uint32_t i = 0; i < patch_num; i++) {
+ unsigned int tmp;
+ PatchInfo patch;
+ IN_UPDATE_SZ(tmp);
+ patch.type = tmp;
+ IN_UPDATE_SZ(tmp);
+ patch.subType = tmp;
+ IN_UPDATE_SZ(tmp);
+ patch.offset = tmp;
+
+ patches.push_back(patch);
+ }
+
+ IN_UPDATE_SZ(curbeSize);
+ IN_UPDATE_SZ(simdWidth);
+ IN_UPDATE_SZ(stackSize);
+ IN_UPDATE_SZ(useSLM);
+ /* Optional sampler set, present only when the stored flag is non-zero. */
+ IN_UPDATE_SZ(has_samplerset);
+ if (has_samplerset) {
+ samplerSet = GBE_NEW(ir::SamplerSet);
+ size_t sz = samplerSet->deserializeFromBin(ins);
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+ /* Optional image set, present only when the stored flag is non-zero. */
+ IN_UPDATE_SZ(has_imageset);
+ if (has_imageset) {
+ imageSet = GBE_NEW(ir::ImageSet);
+ size_t sz = imageSet->deserializeFromBin(ins);
+ if (sz == 0) {
+ return 0;
+ }
+
+ total_size += sz;
+ }
+ /* Code bytes are handed to setCode(); nothing is allocated when code_size is 0. */
+ IN_UPDATE_SZ(code_size);
+ if (code_size) {
+ char* code = GBE_NEW_ARRAY_NO_ARG(char, code_size);
+ ins.read(code, code_size*sizeof(char));
+ total_size += sizeof(char)*code_size;
+ setCode(code, code_size);
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+ /* The stored size must equal the running total_size once the size field itself is discounted. */
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+#undef OUT_UPDATE_SZ
+#undef IN_UPDATE_SZ
+
+ void Program::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ /*! Prints a begin banner, the constant set (when present) and every kernel with indent + 4, then an end banner. */
+ outs << spaces << "=============== Begin Program ===============" << "\n";
+
+ if (constantSet) {
+ constantSet->printStatus(indent + 4, outs);
+ }
+
+ for (auto ker : kernels) {
+ ker.second->printStatus(indent + 4, outs);
+ }
+
+ outs << spaces << "================ End Program ================" << "\n";
+ }
+
+ void Kernel::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ string spaces_nl = indent_to_str(indent + 4);
+ int num;
+ /*! Prints the kernel name, its scalar fields, every argument and patch, then the sampler and image sets when present. */
+ outs << spaces << "+++++++++++ Begin Kernel +++++++++++" << "\n";
+ outs << spaces_nl << "Kernel Name: " << name << "\n";
+ outs << spaces_nl << " curbeSize: " << curbeSize << "\n";
+ outs << spaces_nl << " simdWidth: " << simdWidth << "\n";
+ outs << spaces_nl << " stackSize: " << stackSize << "\n";
+ outs << spaces_nl << " useSLM: " << useSLM << "\n";
+
+ outs << spaces_nl << " Argument Number is " << argNum << "\n";
+ for (uint32_t i = 0; i < argNum; i++) {
+ KernelArgument& arg = args[i];
+ outs << spaces_nl << " Arg " << i << ":\n";
+ outs << spaces_nl << " type value: "<< arg.type << "\n";
+ outs << spaces_nl << " size: "<< arg.size << "\n";
+ outs << spaces_nl << " bufSize: "<< arg.bufSize << "\n";
+ }
+ /* Patches are numbered from 1 in the output (num is incremented before it is printed). */
+ outs << spaces_nl << " Patches Number is " << patches.size() << "\n";
+ num = 0;
+ for (auto patch : patches) {
+ num++;
+ outs << spaces_nl << " patch " << num << ":\n";
+ outs << spaces_nl << " type value: "<< patch.type << "\n";
+ outs << spaces_nl << " subtype value: "<< patch.subType << "\n";
+ outs << spaces_nl << " offset: "<< patch.offset << "\n";
+ }
+
+ if (samplerSet) {
+ samplerSet->printStatus(indent + 4, outs);
+ }
+
+ if (imageSet) {
+ imageSet->printStatus(indent + 4, outs);
+ }
+
+ outs << spaces << "++++++++++++ End Kernel ++++++++++++" << "\n";
+ }
+
+ /*********************** End of Program class member function *************************/
+
static void programDelete(gbe_program gbeProgram) {
gbe::Program *program = (gbe::Program*)(gbeProgram);
GBE_SAFE_DELETE(program);
@@ -152,7 +468,9 @@ namespace gbe {
useless.push_back(str);
args.push_back(str.c_str());
}
-
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
args.push_back("-emit-llvm");
// XXX we haven't implement those builtin functions,
// so disable it currently.
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index d20e7af..8774344 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -70,8 +70,7 @@ enum gbe_curbe_type {
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
GBE_CURBE_WORK_DIM,
- GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
- GBE_CURBE_GLOBAL_CONSTANT_DATA,
+ GBE_CURBE_SAMPLER_INFO,
GBE_CURBE_IMAGE_INFO,
GBE_CURBE_STACK_POINTER,
GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 83aaab8..28a792d 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -67,7 +67,7 @@ namespace gbe {
}
/*! Describe a compiled kernel */
- class Kernel : public NonCopyable
+ class Kernel : public NonCopyable, public Serializable
{
public:
/*! Create an empty kernel with the given name */
@@ -76,6 +76,8 @@ namespace gbe {
virtual ~Kernel(void);
/*! Return the instruction stream (to be implemented) */
virtual const char *getCode(void) const = 0;
+ /*! Set the instruction stream.*/
+ virtual const void setCode(const char *, size_t size) = 0;
/*! Return the instruction stream size (to be implemented) */
virtual size_t getCodeSize(void) const = 0;
/*! Get the kernel name */
@@ -128,9 +130,37 @@ namespace gbe {
size_t getImageSize(void) const { return imageSet->getDataSize(); }
/*! Get defined image value array */
void getImageData(ImageInfo *images) const { imageSet->getData(images); }
+
+ static const uint32_t magic_begin = TO_MAGIC('K', 'E', 'R', 'N');
+ static const uint32_t magic_end = TO_MAGIC('N', 'R', 'E', 'K');
+
+ /* format:
+ magic_begin |
+ name_size |
+ name |
+ arg_num |
+ args |
+ PatchInfo_num |
+ PatchInfo |
+ curbeSize |
+ simdWidth |
+ stackSize |
+ useSLM |
+ samplers |
+ images |
+ code_size |
+ code |
+ magic_end
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
protected:
friend class Context; //!< Owns the kernels
- const std::string name; //!< Kernel name
+ std::string name; //!< Kernel name
KernelArgument *args; //!< Each argument
vector<PatchInfo> patches; //!< Indicates how to build the curbe
uint32_t argNum; //!< Number of function arguments
@@ -146,7 +176,7 @@ namespace gbe {
};
/*! Describe a compiled program */
- class Program : public NonCopyable
+ class Program : public NonCopyable, public Serializable
{
public:
/*! Create an empty program */
@@ -186,9 +216,32 @@ namespace gbe {
size_t getGlobalConstantSize(void) const { return constantSet->getDataSize(); }
/*! Get the content of global constant arrays */
void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+
+ static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
+ static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
+
+ /* format:
+ magic_begin |
+ constantSet_flag |
+ constSet_data |
+ kernel_num |
+ kernel_1 |
+ ........ |
+ kernel_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
protected:
/*! Compile a kernel */
virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name) = 0;
+ /*! Allocate an empty kernel. */
+ virtual Kernel *allocateKernel(const std::string &name) = 0;
/*! Kernels sorted by their name */
hash_map<std::string, Kernel*> kernels;
/*! Global (constants) outside any kernel */
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
index 440b455..2b8f913 100644
--- a/backend/src/builtin_vector_proto.def
+++ b/backend/src/builtin_vector_proto.def
@@ -6,11 +6,10 @@ gentype asin (gentype)
gentype asinh (gentype)
gentype asinpi (gentype x)
gentype atan (gentype y_over_x)
-# XXX atan2 is a builtin function
-#gentype atan2 (gentype y, gentype x)
+gentype atan2 (gentype y, gentype x)
gentype atanh (gentype)
gentype atanpi (gentype x)
-#gentype atan2pi (gentype y, gentype x)
+gentype atan2pi (gentype y, gentype x)
gentype cbrt (gentype)
gentype ceil (gentype)
gentype copysign (gentype x, gentype y)
@@ -61,13 +60,13 @@ float ldexp (float x, int k)
doublen ldexp (doublen x, intn k)
doublen ldexp (doublen x, int k)
double ldexp (double x, int k)
-#gentype lgamma (gentype x)
-#floatn lgamma_r (floatn x, __global intn *signp)
-#floatn lgamma_r (floatn x, __local intn *signp)
-#floatn lgamma_r (floatn x, __private intn *signp)
-#float lgamma_r (float x, __global int *signp)
-#float lgamma_r (float x, __local int *signp)
-#float lgamma_r (float x, __private int *signp)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __global intn *signp)
+floatn lgamma_r (floatn x, __local intn *signp)
+floatn lgamma_r (floatn x, __private intn *signp)
+float lgamma_r (float x, __global int *signp)
+float lgamma_r (float x, __local int *signp)
+float lgamma_r (float x, __private int *signp)
#doublen lgamma_r (doublen x, __global intn *signp)
#doublen lgamma_r (doublen x, __local intn *signp)
#doublen lgamma_r (doublen x, __private intn *signp)
@@ -127,7 +126,7 @@ gentype sqrt (gentype)
gentype tan (gentype)
gentype tanh (gentype)
gentype tanpi (gentype x)
-#gentype tgamma (gentype)
+gentype tgamma (gentype)
gentype trunc (gentype)
##half_native_math
@@ -253,8 +252,8 @@ int any (igentype x)
int all (igentype x)
# XXX need to revisit select latter
#gentype bitselect (gentype a, gentype b, gentype c)
-#gentype select (gentype a, gentype b, igentype c)
-#gentype select (gentype a, gentype b, ugentype c)
+gentype select (gentype a, gentype b, igentype c)
+gentype select (gentype a, gentype b, ugentype c)
##misc
#gentypen shuffle (gentypem x, ugentypen mask)
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
new file mode 100644
index 0000000..afe86f2
--- /dev/null
+++ b/backend/src/gbe_bin_generater.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*******************************************************************************
+ This file is used to generate the gbe kernel binaries. These binaries may be
+ used in the CL API, such as enqueue memory. We generate the binaries at build
+ time to improve performance.
+ *******************************************************************************/
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <deque>
+#include <vector>
+#include <algorithm>
+#include "backend/program.h"
+#include "backend/program.hpp"
+
+using namespace std;
+
+#define FILE_NOT_FIND_ERR 1
+#define FILE_MAP_ERR 2
+#define FILE_BUILD_FAILED 3
+#define FILE_SERIALIZATION_FAILED 4
+
+class program_build_instance {
+/* One kernel source file to build: holds its path, build options, the mapped source text and the resulting gbe::Program. */
+protected:
+ string prog_path;
+ string build_opt;
+ static string bin_path;
+ int fd;
+ int file_len;
+ const char* code;
+ gbe::Program* gbe_prog;
+
+public:
+ program_build_instance (void) : fd(-1), file_len(0), code(NULL), gbe_prog(NULL) { }
+ explicit program_build_instance (const char* file_path, const char* option = NULL)
+ : prog_path(file_path), build_opt(option ? option : ""), fd(-1), file_len(0), // a NULL option means "no build options"; std::string must not be built from NULL
+ code(NULL), gbe_prog(NULL) { }
+ /* Releases whatever was acquired: the mapping, the file descriptor and the built program. */
+ ~program_build_instance () {
+ if (code) {
+ munmap((void *)(code), file_len);
+ code = NULL;
+ }
+
+ if (fd >= 0)
+ close(fd);
+
+ if (gbe_prog)
+ gbe_program_delete(reinterpret_cast<gbe_program>(gbe_prog));
+ }
+ /* NOTE(review): the defaulted move copies fd/code/gbe_prog without resetting the source, so both objects would release them; main() only moves instances before file_map_open() runs. */
+ program_build_instance(program_build_instance&& other) = default;
+#if 0
+ {
+#define SWAP(ELT) \
+ do { \
+ auto elt = this->ELT; \
+ this->ELT = other.ELT; \
+ other.ELT = elt; \
+ } while(0)
+
+ SWAP(fd);
+ SWAP(code);
+ SWAP(file_len);
+ SWAP(prog_path);
+ SWAP(build_opt);
+#undef SWAP
+ }
+#endif
+
+ explicit program_build_instance(const program_build_instance& other) = delete;
+ program_build_instance& operator= (const program_build_instance& other) {
+ /* we do not want to be Lvalue copied, but operator is needed to instance the
+ template of vector<program_build_instance>. */
+ assert(1); // NOTE(review): assert(1) can never fire; assert(0) may have been intended - confirm
+ return *this;
+ }
+
+ /* Opens and maps prog_path; defined out of line below. */
+ const char* file_map_open (void) throw (int);
+
+ const char* get_code (void) {
+ return code;
+ }
+
+ const string& get_program_path (void) {
+ return prog_path;
+ }
+
+ int get_size (void) {
+ return file_len;
+ }
+ /* NOTE(review): streams code as a C string; it is NULL until file_map_open() succeeds. */
+ void print_file (void) {
+ cout << code << endl;
+ }
+
+ void dump (void) {
+ cout << "program path: " << prog_path << endl;
+ cout << "Build option: " << build_opt << endl;
+ print_file();
+ }
+ /* Sets the shared output path once; returns 1 when stored, 0 when a path was already set. */
+ static int set_bin_path (const char* path) {
+ if (bin_path.size())
+ return 0;
+
+ bin_path = path;
+ return 1;
+ }
+
+ void build_program(void) throw(int);
+ void serialize_program(void) throw(int);
+};
+
+string program_build_instance::bin_path;
+
+void program_build_instance::serialize_program(void) throw(int)
+{ /* Appends the serialized program to bin_path; throws FILE_SERIALIZATION_FAILED when serializeToBin() returns 0. */
+ ofstream ofs;
+ ostringstream oss; /* NOTE(review): oss is never used in this function. */
+ ofs.open(bin_path, ofstream::out | ofstream::app | ofstream::binary); /* NOTE(review): the result of open() is not checked. */
+
+ size_t sz = gbe_prog->serializeToBin(ofs);
+ ofs.close();
+
+ if (!sz) {
+ throw FILE_SERIALIZATION_FAILED;
+ }
+}
+
+
+void program_build_instance::build_program(void) throw(int)
+{ /* Builds the mapped source with build_opt and keeps the result in gbe_prog; throws FILE_BUILD_FAILED when no program is returned. */
+ gbe_program opaque = gbe_program_new_from_source(code, 0, build_opt.c_str(), NULL, NULL);
+ if (!opaque)
+ throw FILE_BUILD_FAILED;
+ /* The opaque handle is reinterpreted as the gbe::Program that the destructor later deletes. */
+ gbe_prog = reinterpret_cast<gbe::Program*>(opaque);
+
+ assert(gbe_program_get_kernel_num(opaque));
+}
+
+const char* program_build_instance::file_map_open(void) throw(int)
+{ /* Opens prog_path read-only, maps the whole file and returns the mapping; throws FILE_NOT_FIND_ERR or FILE_MAP_ERR. */
+ void * address;
+
+ /* Open the file */
+ fd = ::open(prog_path.c_str(), O_RDONLY);
+ if (fd < 0) {
+ throw FILE_NOT_FIND_ERR;
+ }
+
+ /* Map it: the length is found by seeking to the end, then the offset is rewound. */
+ file_len = lseek(fd, 0, SEEK_END);
+ lseek(fd, 0, SEEK_SET);
+ address = mmap(0, file_len, PROT_READ, MAP_SHARED, fd, 0);
+ if (address == MAP_FAILED) { /* mmap() reports failure with MAP_FAILED, not NULL */
+ throw FILE_MAP_ERR;
+ }
+ /* code is only set on success, so the destructor never unmaps a failed mapping. */
+ code = reinterpret_cast<const char*>(address);
+ return code;
+}
+
+typedef vector<program_build_instance> prog_vector;
+
+int main (int argc, const char **argv)
+{
+ prog_vector prog_insts;
+ vector<string> argv_saved;
+ const char* build_opt;
+ const char* file_path;
+ int i;
+ int oc;
+ deque<int> used_index;
+ /* Flow: parse the options, collect one program_build_instance per source file, then map, build and serialize each of them. */
+ if (argc < 2) {
+ cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
+ return 0;
+ }
+ /* used_index[i] is set to 1 when argv[i] is the separate value of a -p or -o option rather than a file name. */
+ used_index.assign(argc, 0);
+
+ /* getopt may reorder argv, so keep a copy of the original order here. */
+ for (i=0; i< argc; i++) {
+ argv_saved.push_back(string(argv[i]));
+ }
+
+ while ( (oc = getopt(argc, (char * const *)argv, "o:p:")) != -1 ) {
+ switch (oc) {
+ case 'p':
+ {
+ int opt_index;
+
+ if (argv[optind-1][0] == '-') {// "-pXXX" form: option and value share one argv entry
+ opt_index = optind - 1;
+ } else { // Must be the "-p XXXX" form: the value is a separate argv entry
+ opt_index = optind - 2;
+ used_index[opt_index + 1] = 1;
+ }
+
+ /* The option must directly follow the file name it applies to. */
+ if ((opt_index < 2 ) || argv[opt_index-1][0] == '-') {
+ cout << "Usage note: Building option must follow file name" << endl;
+ return 1;
+ }
+
+ file_path = argv[opt_index - 1];
+ build_opt = optarg;
+
+ prog_insts.push_back(program_build_instance(file_path, build_opt));
+ break;
+ }
+
+ case 'o':
+ if (!program_build_instance::set_bin_path(optarg)) {
+ cout << "Can not specify the bin path more than once." << endl;
+ return 1;
+ }
+ used_index[optind-1] = 1;
+ break;
+
+ case ':': /* NOTE(review): getopt returns ':' only when the optstring starts with ':'; with "o:p:" this case looks unreachable. */
+ cout << "Miss the file option argument" << endl;
+ return 1;
+
+ default:
+ cout << "Unknown opt" << endl;
+ }
+ }
+ /* Every remaining non-option argument that is not an option value and is not already queued becomes a file with empty build options. */
+ for (i=1; i < argc; i++) {
+ //cout << argv_saved[i] << endl;
+ if (argv_saved[i].size() && argv_saved[i][0] != '-') {
+ if (used_index[i])
+ continue;
+
+ string file_name = argv_saved[i];
+ prog_vector::iterator result = find_if(prog_insts.begin(), prog_insts.end(),
+ [&](program_build_instance & prog_inst)-> bool {
+ bool result = false;
+ if (prog_inst.get_program_path() == file_name)
+ result = true;
+
+ return result;
+ });
+
+ if (result == prog_insts.end()) {
+ prog_insts.push_back(program_build_instance(file_name.c_str(), ""));
+ }
+ }
+ }
+ /* The first failure is reported on stdout and ends the run with -1. */
+ for (auto& inst : prog_insts) {
+ try {
+ inst.file_map_open();
+ inst.build_program();
+ inst.serialize_program();
+ }
+ catch (int & err_no) {
+ if (err_no == FILE_NOT_FIND_ERR) {
+ cout << "can not open the file " <<
+ inst.get_program_path() << endl;
+ } else if (err_no == FILE_MAP_ERR) {
+ cout << "map the file " <<
+ inst.get_program_path() << " failed" << endl;
+ } else if (err_no == FILE_BUILD_FAILED) {
+ cout << "build the file " <<
+ inst.get_program_path() << " failed" << endl;
+ } else if (err_no == FILE_SERIALIZATION_FAILED) {
+ cout << "Serialize the file " <<
+ inst.get_program_path() << " failed" << endl;
+ }
+ return -1;
+ }
+ }
+
+ //for (auto& inst : prog_insts) {
+ // inst.dump();
+ //}
+
+ return 0;
+}
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index 35e3a2a..0a30738 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -311,7 +311,10 @@ class builtinProto():
if (isPointer(ptype)):
formatStr += '({} {} *)param{} + {:2d}'.format(ptype[2], ptype[0], n, j)
else:
- formatStr += 'param{}.s{:x}'.format(n, j)
+ if (self.functionName == 'select' and n == 2):
+ formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+ else:
+ formatStr += 'param{}.s{:x}'.format(n, j)
formatStr += ')'
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
index 056b529..6cc81f1 100755
--- a/backend/src/gen_convert.sh
+++ b/backend/src/gen_convert.sh
@@ -5,14 +5,33 @@
# For all vector lengths and types, generate conversion functions
for vector_length in $VECTOR_LENGTHS; do
if test $vector_length -eq 1; then
- continue;
- fi
- for ftype in $TYPES; do
+ for ftype in $TYPES; do
+ fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+ for ttype in $TYPES; do
+ tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+ if test $fbasetype = $tbasetype; then
+ continue
+ fi
+ echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+ echo " return ($tbasetype)v;"
+ echo "}"
+ echo
+ done
+ done
+ else
+ for ftype in $TYPES; do
fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
for ttype in $TYPES; do
tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
if test $fbasetype = $tbasetype; then
- continue
+ if test $vector_length -gt 1; then
+ fvectortype=$fbasetype$vector_length
+ tvectortype=$tbasetype$vector_length
+ echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+ else
+ echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+ fi
+ continue
fi
fvectortype=$fbasetype$vector_length
tvectortype=$tbasetype$vector_length
@@ -48,5 +67,6 @@ for vector_length in $VECTOR_LENGTHS; do
echo "}"
echo
done
- done
+ done
+ fi
done
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index c9f5bfe..7a8f80f 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -40,6 +40,107 @@ namespace ir {
for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
}
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ size_t ConstantSet::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+ /*! Emits magic_begin, the raw constant data (size first), one record per constant, magic_end and ret_size; returns ret_size. */
+ OUT_UPDATE_SZ(magic_begin);
+
+ /* output the const data. */
+ OUT_UPDATE_SZ((data.size()*sizeof(char)));
+ if(data.size() > 0) {
+ outs.write(data.data(), data.size()*sizeof(char));
+ ret_size += data.size()*sizeof(char);
+ }
+ /* Each constant record starts with its own byte count, which deserializeFromBin() re-computes and checks. */
+ OUT_UPDATE_SZ(constants.size());
+ for (auto const &cnst : constants) {
+ size_t bytes = sizeof(cnst.getName().size()) //name length self
+ + cnst.getName().size()*sizeof(char) //name
+ + sizeof(cnst.getSize()) //size
+ + sizeof(cnst.getAlignment()) //alignment
+ + sizeof(cnst.getOffset()) //offset
+ + sizeof(cnst.getReg()); //reg
+ OUT_UPDATE_SZ(bytes);
+
+ OUT_UPDATE_SZ(cnst.getName().size());
+ outs.write(cnst.getName().c_str(), cnst.getName().size());
+ ret_size += sizeof(char)*cnst.getName().size();
+ OUT_UPDATE_SZ(cnst.getSize());
+ OUT_UPDATE_SZ(cnst.getAlignment());
+ OUT_UPDATE_SZ(cnst.getOffset());
+ OUT_UPDATE_SZ(cnst.getReg());
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ size_t ConstantSet::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ size_t global_data_sz = 0;
+ size_t const_num;
+ uint32_t magic;
+ /*! Reads back what ConstantSet::serializeToBin() wrote, appending to data and constants. Returns total_size, or 0 when a magic value, a record size or the final size check fails. */
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+ /* The constant data is read back one char at a time. */
+ IN_UPDATE_SZ(global_data_sz);
+ for (size_t i = 0; i < global_data_sz; i++) {
+ char elt;
+ IN_UPDATE_SZ(elt);
+ data.push_back(elt);
+ }
+
+ IN_UPDATE_SZ(const_num);
+ for (size_t i = 0; i < const_num; i++) {
+ size_t bytes;
+ IN_UPDATE_SZ(bytes);
+
+ size_t name_len;
+ IN_UPDATE_SZ(name_len);
+ /* The name is stored without a terminator, so one is added after reading. */
+ char* c_name = new char[name_len+1];
+ ins.read(c_name, name_len);
+ total_size += sizeof(char)*name_len;
+ c_name[name_len] = 0;
+
+ uint32_t size, align, offset;
+ uint16_t reg;
+ IN_UPDATE_SZ(size);
+ IN_UPDATE_SZ(align);
+ IN_UPDATE_SZ(offset);
+ IN_UPDATE_SZ(reg);
+
+ ir::Constant constant(c_name, size, align, offset);
+ constant.setReg(reg);
+ constants.push_back(constant);
+
+ delete[] c_name;
+
+ /* Sanity check: the stored record size must match the fields just read (the constant has already been appended at this point). */
+ if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
+ + sizeof(align) + sizeof(offset) + sizeof(reg))
+ return 0;
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+ /* The stored size must equal the running total_size once the size field itself is discounted. */
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index 0717391..4bb549e 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -52,6 +52,8 @@ namespace ir {
/*! Nothing happens here */
INLINE ~Constant(void) {}
const std::string& getName(void) const { return name; }
+ uint32_t getSize (void) const { return size; }
+ uint32_t getAlignment (void) const { return alignment; }
uint32_t getOffset(void) const { return offset; }
uint16_t getReg(void) const { return reg; }
void setReg(uint16_t reg) { this->reg = reg; }
@@ -67,7 +69,7 @@ namespace ir {
/*! A constant set is a set of immutable data associated to a compilation
* unit
*/
- class ConstantSet
+ class ConstantSet : public Serializable
{
public:
/*! Append a new constant in the constant set */
@@ -93,7 +95,8 @@ namespace ir {
mem[i] = data[i];
}
ConstantSet() {}
- ConstantSet(const ConstantSet& other) : data(other.data), constants(other.constants) {}
+ ConstantSet(const ConstantSet& other) : Serializable(other),
+ data(other.data), constants(other.constants) {}
ConstantSet & operator = (const ConstantSet& other) {
if (&other != this) {
data = other.data;
@@ -101,6 +104,27 @@ namespace ir {
}
return *this;
}
+
+ static const uint32_t magic_begin = TO_MAGIC('C', 'N', 'S', 'T');
+ static const uint32_t magic_end = TO_MAGIC('T', 'S', 'N', 'C');
+
+ /* format:
+ magic_begin |
+ const_data_size |
+ const_data |
+ constant_1_size |
+ constant_1 |
+ ........ |
+ constant_n_size |
+ constant_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+
private:
vector<char> data; //!< The constant data serialized in one array
vector<Constant> constants;//!< Each constant description
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index c286f1d..a7337e6 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -142,6 +142,7 @@ namespace ir {
this->NAME(type, dst, index); \
}
DECL_THREE_SRC_INSN(SEL);
+ DECL_THREE_SRC_INSN(I64MADSAT);
#undef DECL_THREE_SRC_INSN
/*! For all unary functions */
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 486fde1..b901a12 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -110,5 +110,144 @@ namespace ir {
GBE_DELETE(it.second);
}
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ /*! Implements the serialization: emits magic_begin, regMap, indexMap, magic_end and ret_size; returns ret_size. */
+ size_t ImageSet::serializeToBin(std::ostream& outs) {
+ size_t ret_size = 0;
+
+ OUT_UPDATE_SZ(magic_begin);
+ /* regMap: entry count, then the register key and the ImageInfo fields of each entry. */
+ OUT_UPDATE_SZ(regMap.size());
+ for (auto iter : regMap) {
+ OUT_UPDATE_SZ(iter.first);
+ OUT_UPDATE_SZ(iter.second->arg_idx);
+ OUT_UPDATE_SZ(iter.second->idx);
+ OUT_UPDATE_SZ(iter.second->wSlot);
+ OUT_UPDATE_SZ(iter.second->hSlot);
+ OUT_UPDATE_SZ(iter.second->depthSlot);
+ OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+ OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+ OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+ }
+ /* indexMap: same record layout, keyed by the index instead of the register. */
+ OUT_UPDATE_SZ(indexMap.size());
+ for (auto iter : indexMap) {
+ OUT_UPDATE_SZ(iter.first);
+ OUT_UPDATE_SZ(iter.second->arg_idx);
+ OUT_UPDATE_SZ(iter.second->idx);
+ OUT_UPDATE_SZ(iter.second->wSlot);
+ OUT_UPDATE_SZ(iter.second->hSlot);
+ OUT_UPDATE_SZ(iter.second->depthSlot);
+ OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+ OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+ OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+ }
+
+ OUT_UPDATE_SZ(magic_end);
+ OUT_UPDATE_SZ(ret_size);
+
+ return ret_size;
+ }
+
+ size_t ImageSet::deserializeFromBin(std::istream& ins) {
+ size_t total_size = 0;
+ uint32_t magic;
+ size_t image_map_sz = 0;
+ /*! Reads back what ImageSet::serializeToBin() wrote, inserting into regMap and indexMap. Returns total_size, or 0 when a magic value or the final size check fails. */
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_begin)
+ return 0;
+
+ IN_UPDATE_SZ(image_map_sz); //regMap
+ for (size_t i = 0; i < image_map_sz; i++) {
+ ir::Register reg;
+ ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+
+ IN_UPDATE_SZ(reg);
+ IN_UPDATE_SZ(img_info->arg_idx);
+ IN_UPDATE_SZ(img_info->idx);
+ IN_UPDATE_SZ(img_info->wSlot);
+ IN_UPDATE_SZ(img_info->hSlot);
+ IN_UPDATE_SZ(img_info->depthSlot);
+ IN_UPDATE_SZ(img_info->dataTypeSlot);
+ IN_UPDATE_SZ(img_info->channelOrderSlot);
+ IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+ regMap.insert(std::make_pair(reg, img_info));
+ }
+ /* Each indexMap entry gets its own freshly allocated ImageInfo, separate from the regMap ones. */
+ IN_UPDATE_SZ(image_map_sz); //indexMap
+ for (uint32_t i = 0; i < image_map_sz; i++) {
+ uint32_t index;
+ ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+
+ IN_UPDATE_SZ(index);
+ IN_UPDATE_SZ(img_info->arg_idx);
+ IN_UPDATE_SZ(img_info->idx);
+ IN_UPDATE_SZ(img_info->wSlot);
+ IN_UPDATE_SZ(img_info->hSlot);
+ IN_UPDATE_SZ(img_info->depthSlot);
+ IN_UPDATE_SZ(img_info->dataTypeSlot);
+ IN_UPDATE_SZ(img_info->channelOrderSlot);
+ IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+ indexMap.insert(std::make_pair(index, img_info));
+ }
+
+ IN_UPDATE_SZ(magic);
+ if (magic != magic_end)
+ return 0;
+ /* The stored size must equal the running total_size once the size field itself is discounted. */
+ size_t total_bytes;
+ IN_UPDATE_SZ(total_bytes);
+ if (total_bytes + sizeof(total_size) != total_size)
+ return 0;
+
+ return total_size;
+ }
+
+ void ImageSet::printStatus(int indent, std::ostream& outs) {
+ using namespace std;
+ string spaces = indent_to_str(indent);
+ string spaces_nl = indent_to_str(indent + 4);
+ /*! Prints regMap and indexMap, one bracketed row per entry, between begin/end banners. */
+ outs << spaces << "------------ Begin ImageSet ------------" << "\n";
+
+ outs << spaces_nl << " ImageSet Map: [reg, arg_idx, idx, wSlot, hSlot, depthSlot, "
+ "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+ outs << spaces_nl << " regMap size: " << regMap.size() << "\n";
+ for (auto iter : regMap) {
+ outs << spaces_nl << " [" << iter.first << ", "
+ << iter.second->arg_idx << ", "
+ << iter.second->idx << ", "
+ << iter.second->wSlot << ", "
+ << iter.second->hSlot << ", "
+ << iter.second->depthSlot << ", "
+ << iter.second->dataTypeSlot << ", "
+ << iter.second->channelOrderSlot << ", "
+ << iter.second->dimOrderSlot << "]" << "\n";
+ }
+ /* Second table: the same columns, keyed by index; labelled as indexMap so it is not mistaken for regMap. */
+ outs << spaces_nl << " ImageSet Map: [index, arg_idx, idx, wSlot, hSlot, depthSlot, "
+ "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+ outs << spaces_nl << " indexMap size: " << indexMap.size() << "\n";
+ for (auto iter : indexMap) {
+ outs << spaces_nl << " [" << iter.first << ", "
+ << iter.second->arg_idx << ", "
+ << iter.second->idx << ", "
+ << iter.second->wSlot << ", "
+ << iter.second->hSlot << ", "
+ << iter.second->depthSlot << ", "
+ << iter.second->dataTypeSlot << ", "
+ << iter.second->channelOrderSlot << ", "
+ << iter.second->dimOrderSlot << "]" << "\n";
+ }
+
+ outs << spaces << "------------- End ImageSet -------------" << "\n";
+ }
+
+
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index 04e78e6..c084c7d 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -40,7 +40,7 @@ namespace ir {
* for each individual image. And that individual image could be used
* at backend to identify this image's location.
*/
- class ImageSet
+ class ImageSet : public Serializable
{
public:
/*! Append an image argument. */
@@ -60,6 +60,29 @@ namespace ir {
ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
ImageSet() {}
~ImageSet();
+
+ static const uint32_t magic_begin = TO_MAGIC('I', 'M', 'A', 'G');
+ static const uint32_t magic_end = TO_MAGIC('G', 'A', 'M', 'I');
+
+ /* format:
+ magic_begin |
+ regMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ indexMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
private:
map<Register, struct ImageInfo *> regMap;
map<uint32_t, struct ImageInfo *> indexMap;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 48e83b4..7c6c6c6 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -173,6 +173,30 @@ namespace ir {
}
};
+ class ALIGNED_INSTRUCTION TernaryInstruction : // typed instruction: one destination register, sources referenced through a tuple
+ public BasePolicy,
+ public NDstPolicy<TernaryInstruction, 1>,
+ public TupleSrcPolicy<TernaryInstruction>
+ {
+ public:
+ TernaryInstruction(Opcode opcode, // stored as given; validation happens in wellFormed()
+ Type type,
+ Register dst,
+ Tuple src) {
+ this->opcode = opcode;
+ this->type = type;
+ this->dst[0] = dst;
+ this->src = src;
+ }
+ Type getType(void) const { return type; }
+ bool wellFormed(const Function &fn, std::string &whyNot) const;
+ INLINE void out(std::ostream &out, const Function &fn) const;
+ Type type; //!< Type used to check both the destination and the sources
+ Register dst[1]; //!< Single destination register
+ Tuple src; //!< Tuple referencing the source registers
+ static const uint32_t srcNum = 3; //!< Number of sources taken from the tuple
+ };
+
/*! Three sources mean we need a tuple to encode it */
class ALIGNED_INSTRUCTION SelectInstruction :
public BasePolicy,
@@ -462,7 +486,7 @@ namespace ir {
INLINE Type getSrcType(void) const { return this->srcType; }
INLINE Type getDstType(void) const { return this->dstType; }
- static const uint32_t srcNum = 5;
+ static const uint32_t srcNum = 6;
static const uint32_t dstNum = 4;
};
@@ -504,20 +528,48 @@ namespace ir {
Register dst[0]; //!< No dest register
};
+ class ALIGNED_INSTRUCTION GetSamplerInfoInstruction : // one source register (the sampler) and one destination register
+ public BasePolicy,
+ public NSrcPolicy<GetSamplerInfoInstruction, 1>,
+ public NDstPolicy<GetSamplerInfoInstruction, 1>
+ {
+ public:
+ GetSamplerInfoInstruction( Register dst,
+ Register src)
+ {
+ this->opcode = OP_GET_SAMPLER_INFO; // the opcode is fixed for this instruction class
+ this->dst[0] = dst;
+ this->src[0] = src;
+ }
+
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const { // prints the opcode, then the source and destination registers
+ this->outOpcode(out);
+ out << " sampler id %" << this->getSrc(fn, 0)
+ << " %" << this->getDst(fn, 0);
+ }
+
+ Register src[1]; //!< Sampler register to get info from
+ Register dst[1]; //!< Register receiving the returned value
+ static const uint32_t dstNum = 1;
+ };
+
class ALIGNED_INSTRUCTION GetImageInfoInstruction :
public BasePolicy,
- public NSrcPolicy<GetImageInfoInstruction, 1>,
- public TupleDstPolicy<GetImageInfoInstruction>
+ public NSrcPolicy<GetImageInfoInstruction, 2>,
+ public NDstPolicy<GetImageInfoInstruction, 1>
{
public:
GetImageInfoInstruction( int type,
- Tuple dst,
- Register src)
+ Register dst,
+ Register src,
+ Register infoReg)
{
this->opcode = OP_GET_IMAGE_INFO;
this->infoType = type;
- this->dst = dst;
+ this->dst[0] = dst;
this->src[0] = src;
+ this->src[1] = infoReg;
}
INLINE uint32_t getInfoType(void) const { return infoType; }
@@ -530,11 +582,9 @@ namespace ir {
}
uint8_t infoType; //!< Type of the requested information.
- Register src[1]; //!< Surface to get info
- Tuple dst; //!< dest register to put the information.
- static const uint32_t dstNum = 4; //! The maximum dst number. Not the actual number
- // of destination tuple. We use the infoType to determin
- // the actual num.
+ Register src[2]; //!< Surface to get info
+ Register dst[1]; //!< dest register to put the information.
+ static const uint32_t dstNum = 1;
};
class ALIGNED_INSTRUCTION LoadImmInstruction :
@@ -788,6 +838,25 @@ namespace ir {
return true;
}
+ /*! Check that the destination and the three tuple sources all belong to the
+  *  register family implied by the instruction type. On failure whyNot holds
+  *  the reason and false is returned. */
+ INLINE bool TernaryInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+ {
+   const RegisterFamily expected = getFamily(this->type);
+
+   // Destination first: it must be writable, then of the expected family.
+   if (UNLIKELY(!checkSpecialRegForWrite(dst[0], fn, whyNot) ||
+                !checkRegisterData(expected, dst[0], fn, whyNot)))
+     return false;
+
+   // The source tuple must lie entirely inside the function's tuple storage.
+   if (UNLIKELY(src + 3u > fn.tupleNum())) {
+     whyNot = "Out-of-bound index for ternary instruction";
+     return false;
+   }
+
+   uint32_t srcID = 0;
+   while (srcID < 3) {
+     const Register source = fn.getRegister(src, srcID);
+     if (UNLIKELY(!checkRegisterData(expected, source, fn, whyNot)))
+       return false;
+     ++srcID;
+   }
+   return true;
+ }
+
/*! Loads and stores follow the same restrictions */
template <typename T>
INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
@@ -843,6 +912,9 @@ namespace ir {
{ return true; }
INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+ INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const // always accepted: no register or type checks are performed, why is left untouched
+ { return true; }
+
// Ensure that types and register family match
INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
@@ -934,6 +1006,10 @@ namespace ir {
ternaryOrSelectOut(*this, out, fn);
}
+ INLINE void TernaryInstruction::out(std::ostream &out, const Function &fn) const { // textual dump of the instruction
+ ternaryOrSelectOut(*this, out, fn); // delegates all formatting to the ternaryOrSelectOut helper
+ }
+
INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
out << "." << addrSpace;
@@ -1077,6 +1153,10 @@ START_INTROSPECTION(SelectInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(SelectInstruction)
+START_INTROSPECTION(TernaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TernaryInstruction)
+
START_INTROSPECTION(BranchInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(BranchInstruction)
@@ -1093,6 +1173,10 @@ START_INTROSPECTION(GetImageInfoInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(GetImageInfoInstruction)
+START_INTROSPECTION(GetSamplerInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetSamplerInfoInstruction)
+
START_INTROSPECTION(LoadImmInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(LoadImmInstruction)
@@ -1259,6 +1343,7 @@ DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
+DECL_MEM_FN(TernaryInstruction, Type, getType(void), getType())
DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
@@ -1333,6 +1418,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
DECL_EMIT_FUNCTION(SUB)
DECL_EMIT_FUNCTION(SUBSAT)
DECL_EMIT_FUNCTION(MUL_HI)
+ DECL_EMIT_FUNCTION(I64_MUL_HI)
DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
DECL_EMIT_FUNCTION(UPSAMPLE_INT)
DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
@@ -1348,6 +1434,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
DECL_EMIT_FUNCTION(AND)
DECL_EMIT_FUNCTION(HADD)
DECL_EMIT_FUNCTION(RHADD)
+ DECL_EMIT_FUNCTION(I64HADD)
+ DECL_EMIT_FUNCTION(I64RHADD)
#undef DECL_EMIT_FUNCTION
@@ -1356,6 +1444,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
return internal::SelectInstruction(type, dst, src).convert();
}
+ /*! Build an OP_I64MADSAT ternary instruction writing dst from the sources
+  *  referenced by the tuple src. */
+ Instruction I64MADSAT(Type type, Register dst, Tuple src) {
+   internal::TernaryInstruction insn(OP_I64MADSAT, type, dst, src);
+   return insn.convert();
+ }
+
// All compare functions
#define DECL_EMIT_FUNCTION(NAME) \
Instruction NAME(Type type, Register dst, Register src0, Register src1) { \
@@ -1436,8 +1528,12 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
return internal::TypedWriteInstruction(src, srcType, coordType).convert();
}
- Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src) {
- return internal::GetImageInfoInstruction(infoType, dst, src).convert();
+ Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg) { // single destination register; infoReg is passed on as an extra source
+ return internal::GetImageInfoInstruction(infoType, dst, src, infoReg).convert();
+ }
+
+ Instruction GET_SAMPLER_INFO(Register dst, Register src) { // src is the register queried, dst receives the result
+ return internal::GetSamplerInfoInstruction(dst, src).convert();
 }
std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 40a3d40..27a34d1 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -206,6 +206,13 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ /*! Ternary instructions are typed. The destination and the sources share the same type */
+ class TernaryInstruction : public Instruction {
+ public:
+ Type getType(void) const; // type shared by the destination and the sources
+ static bool isClassOf(const Instruction &insn); // true if insn is an instance of this class
+ };
+
/*! Select instructions writes src0 to dst if cond is true. Otherwise, it
* writes src1
*/
@@ -353,11 +360,12 @@ namespace ir {
typedef union {
struct {
- uint16_t index; /*! the allocated image index */
- uint16_t type; /*! the information type */
+ uint8_t index; /*! the allocated image index */
+ uint8_t type; /*! the information type */
};
uint32_t data;
} ImageInfoKey;
+
/*! Get image information */
class GetImageInfoInstruction : public Instruction {
public:
@@ -392,6 +400,14 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ /*! Get sampler information */
+ class GetSamplerInfoInstruction : public Instruction {
+ public:
+
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
/*! Branch instruction is the unified way to branch (with or without
* predicate)
*/
@@ -519,6 +535,10 @@ namespace ir {
Instruction SIN(Type type, Register dst, Register src);
/*! mul_hi.type dst src */
Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
+ /*! i64_mul_hi.type dst src */
+ Instruction I64_MUL_HI(Type type, Register dst, Register src0, Register src1);
+ /*! i64madsat.type dst src */
+ Instruction I64MADSAT(Type type, Register dst, Tuple src);
/*! upsample_short.type dst src */
Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
/*! upsample_int.type dst src */
@@ -533,6 +553,10 @@ namespace ir {
Instruction HADD(Type type, Register dst, Register src0, Register src1);
/*! rhadd.type dst src */
Instruction RHADD(Type type, Register dst, Register src0, Register src1);
+ /*! i64hadd.type dst src */
+ Instruction I64HADD(Type type, Register dst, Register src0, Register src1);
+ /*! i64rhadd.type dst src */
+ Instruction I64RHADD(Type type, Register dst, Register src0, Register src1);
/*! tan.type dst src */
Instruction RCP(Type type, Register dst, Register src);
/*! abs.type dst src */
@@ -620,7 +644,9 @@ namespace ir {
/*! sample textures */
Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
/*! get image information , such as width/height/depth/... */
- Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
+ Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg);
+ /*! get sampler information */
+ Instruction GET_SAMPLER_INFO(Register dst, Register src);
/*! label labelIndex */
Instruction LABEL(LabelIndex labelIndex);
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index c15e912..1a9f867 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -72,11 +72,16 @@ DECL_INSN(SAMPLE, SampleInstruction)
DECL_INSN(SYNC, SyncInstruction)
DECL_INSN(LABEL, LabelInstruction)
DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction)
DECL_INSN(MUL_HI, BinaryInstruction)
+DECL_INSN(I64_MUL_HI, BinaryInstruction)
DECL_INSN(FBH, UnaryInstruction)
DECL_INSN(FBL, UnaryInstruction)
DECL_INSN(HADD, BinaryInstruction)
DECL_INSN(RHADD, BinaryInstruction)
+DECL_INSN(I64HADD, BinaryInstruction)
+DECL_INSN(I64RHADD, BinaryInstruction)
DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
+DECL_INSN(I64MADSAT, TernaryInstruction)
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 675018a..10e0c59 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,8 +40,7 @@ namespace ir {
"stack_pointer",
"block_ip",
"barrier_id", "thread_number",
- "const_curbe_offset",
- "work_dimension",
+ "work_dimension", "sampler_info"
};
#if GBE_DEBUG
@@ -76,8 +75,8 @@ namespace ir {
DECL_NEW_REG(FAMILY_WORD, blockip);
DECL_NEW_REG(FAMILY_DWORD, barrierid);
DECL_NEW_REG(FAMILY_DWORD, threadn);
- DECL_NEW_REG(FAMILY_DWORD, constoffst);
DECL_NEW_REG(FAMILY_DWORD, workdim);
+ DECL_NEW_REG(FAMILY_WORD, samplerinfo);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4b0ef5e..89dd69f 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -63,8 +63,8 @@ namespace ir {
static const Register blockip = Register(19); // blockip
static const Register barrierid = Register(20);// barrierid
static const Register threadn = Register(21); // number of threads
- static const Register constoffst = Register(22); // offset of global constant array's curbe
- static const Register workdim = Register(23); // work dimention.
+ static const Register workdim = Register(22); // work dimension.
+ static const Register samplerinfo = Register(23); // stores sampler info.
static const uint32_t regNum = 24; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index 62bdc16..cff1012 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -74,5 +74,103 @@ namespace ir {
appendReg(samplerReg, SAMPLER_ID(id), ctx);
}
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+ /*! Implements the serialization. */
+ /*! Write the sampler set to outs in this order: magic_begin, the sampler
+  *  map (element count, then key/reg/slot per entry), the register map
+  *  (same shape), magic_end, and finally the size accumulated so far.
+  *  \return the size accumulated by the OUT_UPDATE_SZ calls. */
+ size_t SamplerSet::serializeToBin(std::ostream& outs) {
+   size_t ret_size = 0;  // name is fixed: OUT_UPDATE_SZ accumulates into it
+
+   OUT_UPDATE_SZ(magic_begin);
+
+   // Entries keyed by sampler value.
+   OUT_UPDATE_SZ(samplerMap.size());
+   for (auto it = samplerMap.begin(); it != samplerMap.end(); ++it) {
+     OUT_UPDATE_SZ(it->first);
+     OUT_UPDATE_SZ(it->second.reg);
+     OUT_UPDATE_SZ(it->second.slot);
+   }
+
+   // Entries keyed by register.
+   OUT_UPDATE_SZ(regMap.size());
+   for (auto it = regMap.begin(); it != regMap.end(); ++it) {
+     OUT_UPDATE_SZ(it->first);
+     OUT_UPDATE_SZ(it->second.reg);
+     OUT_UPDATE_SZ(it->second.slot);
+   }
+
+   OUT_UPDATE_SZ(magic_end);
+   OUT_UPDATE_SZ(ret_size);
+
+   return ret_size;
+ }
+
+ /*! Read back what serializeToBin wrote and insert the entries into
+  *  samplerMap and regMap.
+  *  \return the size accumulated by the IN_UPDATE_SZ calls, or 0 when a magic
+  *          number or the trailing size field does not match. */
+ size_t SamplerSet::deserializeFromBin(std::istream& ins) {
+   size_t total_size = 0;  // name is fixed: IN_UPDATE_SZ accumulates into it
+   uint32_t magic;
+
+   IN_UPDATE_SZ(magic);
+   if (magic != magic_begin)
+     return 0;
+
+   // Sampler-keyed entries.
+   size_t sampler_count = 0;
+   IN_UPDATE_SZ(sampler_count);
+   for (size_t n = 0; n != sampler_count; ++n) {
+     uint32_t sampler_key;
+     ir::SamplerRegSlot entry;
+
+     IN_UPDATE_SZ(sampler_key);
+     IN_UPDATE_SZ(entry.reg);
+     IN_UPDATE_SZ(entry.slot);
+     samplerMap.insert(std::make_pair(sampler_key, entry));
+   }
+
+   // Register-keyed entries.
+   size_t reg_count = 0;
+   IN_UPDATE_SZ(reg_count);
+   for (size_t n = 0; n != reg_count; ++n) {
+     ir::Register reg_key;
+     ir::SamplerRegSlot entry;
+
+     IN_UPDATE_SZ(reg_key);
+     IN_UPDATE_SZ(entry.reg);
+     IN_UPDATE_SZ(entry.slot);
+     regMap.insert(std::make_pair(reg_key, entry));
+   }
+
+   IN_UPDATE_SZ(magic);
+   if (magic != magic_end)
+     return 0;
+
+   // The stored size does not count its own field, hence the sizeof term.
+   size_t total_bytes;
+   IN_UPDATE_SZ(total_bytes);
+   if (total_bytes + sizeof(total_size) != total_size)
+     return 0;
+
+   return total_size;
+ }
+
+ /*! Print both sampler maps to outs. The begin/end banners are indented by
+  *  indent, the table rows by indent + 4. */
+ void SamplerSet::printStatus(int indent, std::ostream& outs) {
+   const std::string banner_pad = indent_to_str(indent);
+   const std::string row_pad = indent_to_str(indent + 4);
+
+   outs << banner_pad << "------------ Begin SamplerSet ------------" << "\n";
+
+   // Table keyed by sampler value.
+   outs << row_pad << " SamplerSet Map: [index, sampler_reg, sampler_slot]\n";
+   outs << row_pad << " samplerMap size: " << samplerMap.size() << "\n";
+   for (auto it = samplerMap.begin(); it != samplerMap.end(); ++it) {
+     outs << row_pad << " [" << it->first << ", "
+          << it->second.reg << ", " << it->second.slot << "]\n";
+   }
+
+   // Table keyed by register.
+   outs << row_pad << " SamplerSet Map: [reg, sampler_reg, sampler_slot]\n";
+   outs << row_pad << " regMap size: " << regMap.size() << "\n";
+   for (auto it = regMap.begin(); it != regMap.end(); ++it) {
+     outs << row_pad << " [" << it->first << ", "
+          << it->second.reg << ", " << it->second.slot << "]\n";
+   }
+
+   outs << banner_pad << "------------- End SamplerSet -------------" << "\n";
+ }
+
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index f968299..3c72e3e 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -41,7 +41,7 @@ namespace ir {
uint32_t slot;
};
- class SamplerSet
+ class SamplerSet : public Serializable
{
public:
/*! Append the specified sampler and return the allocated offset.
@@ -66,6 +66,29 @@ namespace ir {
SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
SamplerSet() {}
+
+ static const uint32_t magic_begin = TO_MAGIC('S', 'A', 'M', 'P');
+ static const uint32_t magic_end = TO_MAGIC('P', 'M', 'A', 'S');
+
+ /* format:
+ magic_begin |
+ samplerMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ regMap_size |
+ element_1 |
+ ........ |
+ element_n |
+ magic_end |
+ total_size
+ */
+
+ /*! Implements the serialization. */
+ virtual size_t serializeToBin(std::ostream& outs);
+ virtual size_t deserializeFromBin(std::istream& ins);
+ virtual void printStatus(int indent, std::ostream& outs);
+
private:
void appendReg(const Register reg, uint32_t key, Context *ctx);
map<uint32_t, SamplerRegSlot> samplerMap;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 12d809d..8b73ac9 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -460,7 +460,8 @@ namespace gbe
}
virtual bool doInitialization(Module &M);
-
+ /*! helper function for parsing global constant data */
+ void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
void collectGlobalConstant(void) const;
bool runOnFunction(Function &F) {
@@ -539,6 +540,8 @@ namespace gbe
// Emit unary instructions from gen native function
void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
+ ir::Register appendSampler(CallSite::arg_iterator AI);
+
// These instructions are not supported at all
void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
@@ -559,6 +562,101 @@ namespace gbe
};
char GenWriter::ID = 0;
+ /*! Copy the raw bytes of cda to ptr + offset and advance offset by the
+  *  number of bytes copied. */
+ void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
+   const StringRef raw = cda->getRawDataValues();
+   char *const dest = (char*)ptr + offset;
+   memcpy(dest, raw.data(), raw.size());
+   offset += raw.size();
+ }
+
+ /*! Recursively flatten the constant initializer c into the buffer mem.
+  *  \param c      constant to serialize; must not be NULL.
+  *  \param mem    destination buffer; the caller sizes it for the whole
+  *                top-level constant.
+  *  \param offset byte position in mem where c starts; on return it has been
+  *                advanced past the bytes belonging to c.
+  *  Structs and arrays are walked element by element with the padding
+  *  computed by getPadding(); vectors and data arrays are copied as raw
+  *  bytes; integer, float and double scalars are stored in host byte order.
+  *  Any other type hits NOT_IMPLEMENTED. */
+ void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+   // Validate the pointer before it is dereferenced for the first time.
+   GBE_ASSERT(c);
+   Type * type = c->getType();
+   Type::TypeID id = type->getTypeID();
+
+   if(isa<UndefValue>(c)) {
+     // An undef value carries no operands, so its footprint has to come from
+     // the type itself. The bytes are skipped and left as the caller
+     // allocated them.
+     // NOTE(review): assumes getTypeByteSize() matches the layout produced by
+     // the struct/array cases below -- confirm.
+     offset += getTypeByteSize(unit, type);
+     return;
+   }
+   switch(id) {
+     case Type::TypeID::StructTyID:
+       {
+         const StructType * strTy = cast<StructType>(c->getType());
+         uint32_t size = 0;  // running struct size in bits, used to derive member padding
+
+         for(uint32_t op=0; op < strTy->getNumElements(); op++)
+         {
+           Type* elementType = strTy->getElementType(op);
+           uint32_t align = 8 * getAlignmentByte(unit, elementType);
+           uint32_t padding = getPadding(size, align);
+           size += padding;
+           size += getTypeBitSize(unit, elementType);
+
+           offset += padding/8;
+           const Constant* sub = cast<Constant>(c->getOperand(op));
+           GBE_ASSERT(sub);
+           getConstantData(sub, mem, offset);
+         }
+         break;
+       }
+     case Type::TypeID::ArrayTyID:
+       {
+         const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+         if(cds)
+           getSequentialData(cds, mem, offset);
+         else {
+           const ConstantArray *ca = dyn_cast<ConstantArray>(c);
+           // Fail loudly instead of dereferencing NULL for other array constants.
+           GBE_ASSERT(ca);
+           const ArrayType *arrTy = ca->getType();
+           Type* elemTy = arrTy->getElementType();
+           uint32_t elemSize = getTypeBitSize(unit, elemTy);
+           uint32_t padding = getPadding(elemSize, 8 * getAlignmentByte(unit, elemTy));
+           padding /= 8;
+           uint32_t ops = c->getNumOperands();
+           for(uint32_t op = 0; op < ops; ++op) {
+             // Named differently from the enclosing array to avoid shadowing it.
+             Constant * elem = dyn_cast<Constant>(c->getOperand(op));
+             GBE_ASSERT(elem);
+             getConstantData(elem, mem, offset);
+             offset += padding;
+           }
+         }
+         break;
+       }
+     case Type::TypeID::VectorTyID:
+       {
+         const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+         GBE_ASSERT(cds);
+         getSequentialData(cds, mem, offset);
+         break;
+       }
+     case Type::TypeID::IntegerTyID:
+       {
+         const ConstantInt *ci = dyn_cast<ConstantInt>(c);
+         GBE_ASSERT(ci);
+         uint32_t size = ci->getBitWidth() / 8;
+         uint64_t data = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
+         // Only the low `size` bytes of the 64-bit value are copied.
+         GBE_ASSERT(size <= sizeof(data));
+         memcpy((char*)mem+offset, &data, size);
+         offset += size;
+         break;
+       }
+     case Type::TypeID::FloatTyID:
+       {
+         const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+         GBE_ASSERT(cf);
+         // memcpy, like the integer case, so no alignment is assumed for mem + offset.
+         const float value = cf->getValueAPF().convertToFloat();
+         memcpy((char*)mem + offset, &value, sizeof(float));
+         offset += sizeof(float);
+         break;
+       }
+     case Type::TypeID::DoubleTyID:
+       {
+         const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+         GBE_ASSERT(cf);
+         const double value = cf->getValueAPF().convertToDouble();
+         memcpy((char*)mem + offset, &value, sizeof(double));
+         offset += sizeof(double);
+         break;
+       }
+     default:
+       NOT_IMPLEMENTED;
+   }
+ }
void GenWriter::collectGlobalConstant(void) const {
const Module::GlobalListType &globalList = TheModule->getGlobalList();
@@ -569,69 +667,13 @@ namespace gbe
if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
GBE_ASSERT(v.hasInitializer());
const Constant *c = v.getInitializer();
- if (c->getType()->getTypeID() != Type::ArrayTyID) {
- void *mem = malloc(sizeof(double));
- int size = 0;
- switch(c->getType()->getTypeID()) {
- case Type::TypeID::IntegerTyID: {
- const ConstantInt *ci = dyn_cast<ConstantInt>(c);
- *(int *)mem = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
- size = sizeof(int);
- break;
- }
- case Type::TypeID::FloatTyID: {
- const ConstantFP *cf = dyn_cast<ConstantFP>(c);
- *(float *)mem = cf->getValueAPF().convertToFloat();
- size = sizeof(float);
- break;
- }
- case Type::TypeID::DoubleTyID: {
- const ConstantFP *cf = dyn_cast<ConstantFP>(c);
- *(double *)mem = cf->getValueAPF().convertToDouble();
- size = sizeof(double);
- break;
- }
- default:
- NOT_IMPLEMENTED;
- }
- unit.newConstant((char *)mem, name, size, size);
- free(mem);
- continue;
- }
- GBE_ASSERT(c->getType()->getTypeID() == Type::ArrayTyID);
- const ConstantDataArray *cda = dyn_cast<ConstantDataArray>(c);
- GBE_ASSERT(cda);
- unsigned len = cda->getNumElements();
- uint64_t elementSize = cda->getElementByteSize();
- Type::TypeID typeID = cda->getElementType()->getTypeID();
- if(typeID == Type::TypeID::IntegerTyID)
- elementSize = sizeof(unsigned);
- void *mem = malloc(elementSize * len);
- for(unsigned j = 0; j < len; j ++) {
- switch(typeID) {
- case Type::TypeID::FloatTyID:
- {
- float f = cda->getElementAsFloat(j);
- memcpy((float *)mem + j, &f, elementSize);
- }
- break;
- case Type::TypeID::DoubleTyID:
- {
- double d = cda->getElementAsDouble(j);
- memcpy((double *)mem + j, &d, elementSize);
- }
- break;
- case Type::TypeID::IntegerTyID:
- {
- unsigned u = (unsigned) cda->getElementAsInteger(j);
- memcpy((unsigned *)mem + j, &u, elementSize);
- }
- break;
- default:
- NOT_IMPLEMENTED;
- }
- }
- unit.newConstant((char *)mem, name, elementSize * len, sizeof(unsigned));
+ Type * type = c->getType();
+
+ uint32_t size = getTypeByteSize(unit, type);
+ void* mem = malloc(size);
+ uint32_t offset = 0;
+ getConstantData(c, mem, offset);
+ unit.newConstant((char *)mem, name, size, sizeof(unsigned));
free(mem);
}
}
@@ -819,18 +861,38 @@ namespace gbe
return ir::Register(reg);
}
if (isa<ConstantExpr>(CPV)) {
+ uint32_t TypeIndex;
+ uint32_t constantOffset = 0;
+ uint32_t offset = 0;
ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV);
- GBE_ASSERT(CE->isGEPWithNoNotionalOverIndexing());
- auto pointer = CE->getOperand(0);
- auto offset1 = dyn_cast<ConstantInt>(CE->getOperand(1));
- GBE_ASSERT(offset1->getZExtValue() == 0);
- auto offset2 = dyn_cast<ConstantInt>(CE->getOperand(2));
- int type_size = pointer->getType()->getTypeID() == Type::TypeID::DoubleTyID ? sizeof(double) : sizeof(int);
- int type_offset = offset2->getSExtValue() * type_size;
- auto pointer_name = pointer->getName().str();
+
+ // currently only GetElementPtr is handled
+ GBE_ASSERT(CE->getOpcode() == Instruction::GetElementPtr);
+ Value *pointer = CE->getOperand(0);
+ CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+ for(uint32_t op=1; op<CE->getNumOperands(); ++op) {
+ ConstantInt* ConstOP = dyn_cast<ConstantInt>(CE->getOperand(op));
+ GBE_ASSERT(ConstOP);
+ TypeIndex = ConstOP->getZExtValue();
+ for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+ {
+ Type* elementType = CompTy->getTypeAtIndex(ty_i);
+ uint32_t align = getAlignmentByte(unit, elementType);
+ offset += getPadding(offset, align);
+ offset += getTypeByteSize(unit, elementType);
+ }
+
+ const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+ offset += getPadding(offset, align);
+
+ constantOffset += offset;
+ CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+ }
+
+ const std::string &pointer_name = pointer->getName().str();
ir::Register pointer_reg = ir::Register(unit.getConstantSet().getConstant(pointer_name).getReg());
ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
- ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(type_offset, ir::Type::TYPE_S32));
+ ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
return reg;
@@ -1243,12 +1305,7 @@ namespace gbe
ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
ir::Constant &con = unit.getConstantSet().getConstant(j ++);
con.setReg(reg.value());
- if(con.getOffset() != 0) {
- ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
- ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
- } else {
- ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
- }
+ ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
}
// Visit all the instructions and emit the IR registers or the value to
@@ -1521,7 +1578,7 @@ namespace gbe
Type *llvmSrcType = I.getOperand(0)->getType();
const ir::Type dstType = getType(ctx, llvmDstType);
ir::Type srcType;
- if (I.getOpcode() == Instruction::ZExt) {
+ if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
srcType = getUnsignedType(ctx, llvmSrcType);
} else {
srcType = getType(ctx, llvmSrcType);
@@ -1754,6 +1811,7 @@ namespace gbe
case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
case GEN_OCL_GET_IMAGE_DEPTH:
+ case GEN_OCL_GET_SAMPLER_INFO:
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1:
case GEN_OCL_ATOMIC_SUB0:
@@ -1825,6 +1883,8 @@ namespace gbe
}
case GEN_OCL_MUL_HI_INT:
case GEN_OCL_MUL_HI_UINT:
+ case GEN_OCL_MUL_HI_I64:
+ case GEN_OCL_MUL_HI_UI64:
case GEN_OCL_UPSAMPLE_SHORT:
case GEN_OCL_UPSAMPLE_INT:
case GEN_OCL_UPSAMPLE_LONG:
@@ -1846,6 +1906,10 @@ namespace gbe
case GEN_OCL_USUB_SAT_LONG:
case GEN_OCL_HADD:
case GEN_OCL_RHADD:
+ case GEN_OCL_I64HADD:
+ case GEN_OCL_I64RHADD:
+ case GEN_OCL_I64_MAD_SAT:
+ case GEN_OCL_I64_MAD_SATU:
this->newRegister(&I);
break;
default:
@@ -1891,6 +1955,25 @@ namespace gbe
ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
}
+ /* append a new sampler. should be called before any reference to
+ * a sampler_t value. */
+ /*! Register the sampler referenced by the call argument AI in the
+  *  function's sampler set and return the register that designates it.
+  *  A constant sampler is appended by value; otherwise the register already
+  *  bound to the argument is appended and returned. */
+ ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) {
+   Constant *CPV = dyn_cast<Constant>(*AI);
+   if (CPV == NULL) {
+     const ir::Register sampler = this->getRegister(*AI);
+     ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+     return sampler;
+   }
+
+   // This is not a kernel argument sampler, we need to append it to sampler set,
+   // and allocate a sampler slot for it.
+   auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+   GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+   const ir::Register sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+   return sampler;
+ }
+
void GenWriter::emitCallInst(CallInst &I) {
if (Function *F = I.getCalledFunction()) {
if (F->getIntrinsicID() != 0) {
@@ -2014,21 +2097,18 @@ namespace gbe
GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
uint32_t elemNum;
(void)getVectorInfo(ctx, I.getType(), &I, elemNum);
- vector<ir::Register> dstTupleData;
- ir::Register lastReg;
- for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register reg = this->getRegister(&I, elemID);
- dstTupleData.push_back(reg);
- lastReg = reg;
- }
- // A walk around for the gen IR limitation.
- for (uint32_t elemID = elemNum; elemID < 4; ++ elemID) {
- dstTupleData.push_back(lastReg);
- }
- const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], 4);
+ const ir::Register reg = this->getRegister(&I, 0);
int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
- ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
+ ctx.GET_IMAGE_INFO(infoType, reg, surface_id, ctx.reg(ir::FAMILY_DWORD));
+ break;
+ }
+ case GEN_OCL_GET_SAMPLER_INFO:
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register sampler = this->appendSampler(AI); ++AI;
+ const ir::Register reg = this->getRegister(&I, 0);
+ ctx.GET_SAMPLER_INFO(reg, sampler);
break;
}
case GEN_OCL_READ_IMAGE0:
@@ -2046,29 +2126,13 @@ namespace gbe
{
GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE);
- Constant *CPV = dyn_cast<Constant>(*AI);
- ir::Register sampler;
- if (CPV != NULL)
- {
- // This is not a kernel argument sampler, we need to append it to sampler set,
- // and allocate a sampler slot for it.
- auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
- GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
- sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
- } else {
- sampler = this->getRegister(*AI);
- ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
- }
+ const ir::Register sampler = this->appendSampler(AI);
++AI;
GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
ir::Register wcoord;
- if (it->second == GEN_OCL_READ_IMAGE10 ||
- it->second == GEN_OCL_READ_IMAGE11 ||
- it->second == GEN_OCL_READ_IMAGE12 ||
- it->second == GEN_OCL_READ_IMAGE13 ||
- it->second == GEN_OCL_READ_IMAGE14) {
+ if (it->second >= GEN_OCL_READ_IMAGE10 && it->second <= GEN_OCL_READ_IMAGE15) {
GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
} else
wcoord = ir::Register(0);
@@ -2084,8 +2148,19 @@ namespace gbe
srcTupleData.push_back(ucoord);
srcTupleData.push_back(vcoord);
srcTupleData.push_back(wcoord);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ // With the workaround enabled the next call argument must be a constant;
+ // its 32-bit value is wrapped in a Register and carried as the sixth source.
+ GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+ assert(CPV);
+ auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+ GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+ ir::Register offsetReg(x.data.u32);
+#else
+ ir::Register offsetReg(0);
+#endif
+ // Push the offset exactly once in both configurations: the source tuple
+ // built below takes six entries.
+ srcTupleData.push_back(offsetReg);
const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
- const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5);
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6);
ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
@@ -2137,11 +2212,7 @@ namespace gbe
GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
ir::Register wcoord;
- if(it->second == GEN_OCL_WRITE_IMAGE10 ||
- it->second == GEN_OCL_WRITE_IMAGE11 ||
- it->second == GEN_OCL_WRITE_IMAGE12 ||
- it->second == GEN_OCL_WRITE_IMAGE13 ||
- it->second == GEN_OCL_WRITE_IMAGE14) {
+ if(it->second >= GEN_OCL_WRITE_IMAGE10 && it->second <= GEN_OCL_WRITE_IMAGE15) {
GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
} else
wcoord = ir::Register(0);
@@ -2208,6 +2279,22 @@ namespace gbe
ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_MUL_HI_I64: // declared as MUL_HI_I64 -> _Z16__gen_ocl_mul_hill (two long operands)
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI; // operands are read in call order
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I); // destination is the register of I itself
+ ctx.I64_MUL_HI(getType(ctx, I.getType()), dst, src0, src1); // type comes from I's type via getType
+ break;
+ }
+ case GEN_OCL_MUL_HI_UI64: // declared as MUL_HI_UI64 -> _Z16__gen_ocl_mul_himm (two unsigned long operands)
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64_MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1); // same emission, type via getUnsignedType
+ break;
+ }
case GEN_OCL_UPSAMPLE_SHORT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
@@ -2276,6 +2363,24 @@ namespace gbe
ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_I64_MAD_SAT: // declared as I64_MAD_SAT -> _Z17__gen_ocl_mad_satlll (three long operands)
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI; // operands are read in call order
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I); // destination is the register of I itself
+ ctx.I64MADSAT(getType(ctx, I.getType()), dst, src0, src1, src2); // type comes from I's type via getType
+ break;
+ }
+ case GEN_OCL_I64_MAD_SATU: // declared as I64_MAD_SATU -> _Z17__gen_ocl_mad_satmmm (three unsigned long operands)
+ {
+ GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+ GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+ const ir::Register dst = this->getRegister(&I);
+ ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2); // same emission, type via getUnsignedType
+ break;
+ }
case GEN_OCL_HADD: {
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2283,6 +2388,16 @@ namespace gbe
ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_I64HADD: // declared as I64HADD -> _Z14__gen_ocl_haddmm (two unsigned long operands)
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register src0 = this->getRegister(*(AI++)); // post-increment: read the operand, then advance
+ GBE_ASSERT(AI != AE);
+ const ir::Register src1 = this->getRegister(*(AI++));
+ const ir::Register dst = this->getRegister(&I); // destination is the register of I itself
+ ctx.I64HADD(ir::TYPE_U64, dst, src0, src1); // type is fixed to TYPE_U64 rather than derived from I
+ break;
+ }
case GEN_OCL_RHADD: {
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2290,6 +2405,16 @@ namespace gbe
ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
+ case GEN_OCL_I64RHADD: // declared as I64RHADD -> _Z15__gen_ocl_rhaddmm (two unsigned long operands)
+ {
+ GBE_ASSERT(AI != AE);
+ const ir::Register src0 = this->getRegister(*(AI++)); // post-increment: read the operand, then advance
+ GBE_ASSERT(AI != AE);
+ const ir::Register src1 = this->getRegister(*(AI++));
+ const ir::Register dst = this->getRegister(&I); // destination is the register of I itself
+ ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1); // type is fixed to TYPE_U64 rather than derived from I
+ break;
+ }
default: break;
}
}
@@ -2407,13 +2532,19 @@ namespace gbe
const ir::Type type = getType(ctx, elemType);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
- if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
+ if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
// Build the tuple data in the vector
vector<ir::Register> tupleData; // put registers here
for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
- const ir::Register reg = this->getRegister(llvmValues, elemID);
+ ir::Register reg;
+ if(regTranslator.isUndefConst(llvmValues, elemID)) {
+ Value *v = Constant::getNullValue(elemType);
+ reg = this->getRegister(v);
+ } else
+ reg = this->getRegister(llvmValues, elemID);
+
tupleData.push_back(reg);
}
const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
@@ -2433,7 +2564,13 @@ namespace gbe
// Build the tuple data in the vector
vector<ir::Register> tupleData; // put registers here
for (uint32_t elemID = 0; elemID < 4; ++elemID) {
- const ir::Register reg = this->getRegister(llvmValues, 4*msg+elemID);
+ ir::Register reg;
+ if(regTranslator.isUndefConst(llvmValues, 4*msg+elemID)) { // test the element this message stores, not element elemID of message 0
+ Value *v = Constant::getNullValue(elemType); // undef lanes are replaced by a null value of the element type
+ reg = this->getRegister(v);
+ } else
+ reg = this->getRegister(llvmValues, 4*msg+elemID);
+
tupleData.push_back(reg);
}
const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], 4);
@@ -2468,6 +2605,9 @@ namespace gbe
}
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+ if(regTranslator.isUndefConst(llvmValues, elemID))
+ continue;
+
const ir::Register reg = this->getRegister(llvmValues, elemID);
ir::Register addr;
if (elemID == 0)
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index b712860..321fc4e 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -42,19 +42,19 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8, __gen_ocl_force_simd8)
DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
// To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj)
// To write_image functions.
DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
@@ -125,14 +125,24 @@ DECL_LLVM_GEN_FUNCTION(USUB_SAT_SHORT, _Z12ocl_usub_sattt)
DECL_LLVM_GEN_FUNCTION(USUB_SAT_INT, _Z12ocl_usub_satjj)
DECL_LLVM_GEN_FUNCTION(USUB_SAT_LONG, _Z12ocl_usub_satmm)
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SAT, _Z17__gen_ocl_mad_satlll)
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SATU, _Z17__gen_ocl_mad_satmmm)
+
// integer built-in functions
DECL_LLVM_GEN_FUNCTION(MUL_HI_INT, _Z16__gen_ocl_mul_hiii)
DECL_LLVM_GEN_FUNCTION(MUL_HI_UINT, _Z16__gen_ocl_mul_hijj)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_I64, _Z16__gen_ocl_mul_hill)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UI64, _Z16__gen_ocl_mul_himm)
DECL_LLVM_GEN_FUNCTION(FBH, __gen_ocl_fbh)
DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
-DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
-DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
+DECL_LLVM_GEN_FUNCTION(HADD, _Z14__gen_ocl_haddjj)
+DECL_LLVM_GEN_FUNCTION(RHADD, _Z15__gen_ocl_rhaddjj)
+DECL_LLVM_GEN_FUNCTION(I64HADD, _Z14__gen_ocl_haddmm)
+DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// get sampler info
+DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 41674b6..7a40616 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -383,7 +383,6 @@ namespace gbe {
if (PHINode* phi = dyn_cast<PHINode>(inst)) {
PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
- assert(args.size() % 2 == 0 && "Odd number of arguments for a PHI");
// Loop over pairs of operands: [Value*, BasicBlock*]
for (unsigned int i = 0; i < args.size(); i++) {
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 1ea150b..b736a88 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -4,6 +4,7 @@
//
// Common defines for Image intrinsics
// Channel order
+#define CLK_HAS_ALPHA(color) ((color) == CLK_A || (color) == CLK_RA || (color) == CLK_RGBA || (color) == CLK_BGRA || (color) == CLK_ARGB) // argument parenthesized; note it is still evaluated more than once
enum {
CLK_R = 0x10B0,
CLK_A = 0x10B1,
@@ -66,54 +67,52 @@ typedef enum clk_channel_type {
typedef enum clk_sampler_type {
__CLK_ADDRESS_BASE = 0,
- CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
- CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+ CLK_ADDRESS_NONE = (0 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP = (1 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_CLAMP_TO_EDGE = (2 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_REPEAT = (3 << __CLK_ADDRESS_BASE),
+ CLK_ADDRESS_MIRROR = (4 << __CLK_ADDRESS_BASE),
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
#endif
- __CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+ __CLK_ADDRESS_MASK = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
CLK_ADDRESS_CLAMP_TO_EDGE |
- CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+ CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
__CLK_ADDRESS_BITS = 3, // number of bits required to
// represent address info
__CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
CLK_NORMALIZED_COORDS_FALSE = 0,
- CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
- __CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE |
- CLK_NORMALIZED_COORDS_TRUE,
+ CLK_NORMALIZED_COORDS_TRUE = (1 << __CLK_NORMALIZED_BASE),
+ __CLK_NORMALIZED_MASK = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_NORMALIZED_COORDS_TRUE),
__CLK_NORMALIZED_BITS = 1, // number of bits required to
// represent normalization
-
- __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE +
- __CLK_NORMALIZED_BITS,
- CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
- CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
- CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
- __CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
- CLK_FILTER_ANISOTROPIC,
+ __CLK_FILTER_BASE = (__CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS),
+ CLK_FILTER_NEAREST = (0 << __CLK_FILTER_BASE),
+ CLK_FILTER_LINEAR = (1 << __CLK_FILTER_BASE),
+ CLK_FILTER_ANISOTROPIC = (2 << __CLK_FILTER_BASE),
+ __CLK_FILTER_MASK = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+ CLK_FILTER_ANISOTROPIC),
__CLK_FILTER_BITS = 2, // number of bits required to
// represent address info
- __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
- CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
- CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
- CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
- __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
- CLK_MIP_ANISOTROPIC,
+ __CLK_MIP_BASE = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+ CLK_MIP_NEAREST = (0 << __CLK_MIP_BASE),
+ CLK_MIP_LINEAR = (1 << __CLK_MIP_BASE),
+ CLK_MIP_ANISOTROPIC = (2 << __CLK_MIP_BASE),
+ __CLK_MIP_MASK = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+ CLK_MIP_ANISOTROPIC),
__CLK_MIP_BITS = 2,
- __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
- __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
- __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+ __CLK_SAMPLER_BITS = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+ __CLK_SAMPLER_MASK = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+ __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
- __CLK_SAMPLER_ARG_BASE = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+ __CLK_SAMPLER_ARG_BASE = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
__CLK_SAMPLER_ARG_BITS = 8,
- __CLK_SAMPLER_ARG_MASK = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+ __CLK_SAMPLER_ARG_MASK = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
__CLK_SAMPLER_ARG_KEY_BIT = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
__CLK_SAMPLER_ARG_KEY_BITS = 1,
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
index 4063788..13ae5ba 100644
--- a/backend/src/ocl_convert.h
+++ b/backend/src/ocl_convert.h
@@ -1,5 +1,366 @@
// This file is autogenerated by gen_convert.sh.
// Don't modify it manually.
+INLINE OVERLOADABLE ulong convert_ulong(long v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(long v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(long v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(long v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(long v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(long v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(long v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(long v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(long v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ulong v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ulong v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ulong v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ulong v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ulong v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ulong v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ulong v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ulong v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ulong v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(int v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(int v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(int v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(int v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(int v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(int v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(int v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(int v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(int v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uint v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uint v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uint v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uint v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uint v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uint v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uint v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uint v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uint v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(short v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(short v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(short v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(short v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(short v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(short v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(short v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(short v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(short v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ushort v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ushort v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ushort v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ushort v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ushort v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ushort v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ushort v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ushort v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ushort v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(char v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(char v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(char v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(char v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(char v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(char v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(char v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(char v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(char v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uchar v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uchar v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uchar v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uchar v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uchar v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uchar v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uchar v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uchar v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uchar v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(double v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(double v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(double v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(double v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(double v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(double v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(double v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(double v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE float convert_float(double v) {
+ return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(float v) {
+ return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(float v) {
+ return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(float v) {
+ return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(float v) {
+ return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(float v) {
+ return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(float v) {
+ return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(float v) {
+ return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(float v) {
+ return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(float v) {
+ return (double)v;
+}
+
+INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
}
@@ -40,6 +401,7 @@ INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
return (long2)((long)(v.s0), (long)(v.s1));
}
+INLINE OVERLOADABLE ulong2 convert_ulong2(ulong2 v) { return v; }
INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
return (int2)((int)(v.s0), (int)(v.s1));
}
@@ -80,6 +442,7 @@ INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
}
+INLINE OVERLOADABLE int2 convert_int2(int2 v) { return v; }
INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
return (uint2)((uint)(v.s0), (uint)(v.s1));
}
@@ -120,6 +483,7 @@ INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
return (int2)((int)(v.s0), (int)(v.s1));
}
+INLINE OVERLOADABLE uint2 convert_uint2(uint2 v) { return v; }
INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
return (short2)((short)(v.s0), (short)(v.s1));
}
@@ -160,6 +524,7 @@ INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
return (uint2)((uint)(v.s0), (uint)(v.s1));
}
+INLINE OVERLOADABLE short2 convert_short2(short2 v) { return v; }
INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
}
@@ -200,6 +565,7 @@ INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
return (short2)((short)(v.s0), (short)(v.s1));
}
+INLINE OVERLOADABLE ushort2 convert_ushort2(ushort2 v) { return v; }
INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
return (char2)((char)(v.s0), (char)(v.s1));
}
@@ -240,6 +606,7 @@ INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
}
+INLINE OVERLOADABLE char2 convert_char2(char2 v) { return v; }
INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
}
@@ -280,6 +647,7 @@ INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
return (char2)((char)(v.s0), (char)(v.s1));
}
+INLINE OVERLOADABLE uchar2 convert_uchar2(uchar2 v) { return v; }
INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
return (double2)((double)(v.s0), (double)(v.s1));
}
@@ -320,6 +688,7 @@ INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
}
+INLINE OVERLOADABLE double2 convert_double2(double2 v) { return v; }
INLINE OVERLOADABLE float2 convert_float2(double2 v) {
return (float2)((float)(v.s0), (float)(v.s1));
}
@@ -360,6 +729,8 @@ INLINE OVERLOADABLE double2 convert_double2(float2 v) {
return (double2)((double)(v.s0), (double)(v.s1));
}
+INLINE OVERLOADABLE float2 convert_float2(float2 v) { return v; }
+INLINE OVERLOADABLE long3 convert_long3(long3 v) { return v; }
INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
}
@@ -400,6 +771,7 @@ INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
}
+INLINE OVERLOADABLE ulong3 convert_ulong3(ulong3 v) { return v; }
INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
}
@@ -440,6 +812,7 @@ INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
}
+INLINE OVERLOADABLE int3 convert_int3(int3 v) { return v; }
INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
}
@@ -480,6 +853,7 @@ INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
}
+INLINE OVERLOADABLE uint3 convert_uint3(uint3 v) { return v; }
INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
}
@@ -520,6 +894,7 @@ INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
}
+INLINE OVERLOADABLE short3 convert_short3(short3 v) { return v; }
INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
}
@@ -560,6 +935,7 @@ INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
}
+INLINE OVERLOADABLE ushort3 convert_ushort3(ushort3 v) { return v; }
INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
}
@@ -600,6 +976,7 @@ INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
}
+INLINE OVERLOADABLE char3 convert_char3(char3 v) { return v; }
INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
}
@@ -640,6 +1017,7 @@ INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
}
+INLINE OVERLOADABLE uchar3 convert_uchar3(uchar3 v) { return v; }
INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
}
@@ -680,6 +1058,7 @@ INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
}
+INLINE OVERLOADABLE double3 convert_double3(double3 v) { return v; }
INLINE OVERLOADABLE float3 convert_float3(double3 v) {
return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
}
@@ -720,6 +1099,8 @@ INLINE OVERLOADABLE double3 convert_double3(float3 v) {
return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
}
+INLINE OVERLOADABLE float3 convert_float3(float3 v) { return v; }
+INLINE OVERLOADABLE long4 convert_long4(long4 v) { return v; }
INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
}
@@ -760,6 +1141,7 @@ INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
}
+INLINE OVERLOADABLE ulong4 convert_ulong4(ulong4 v) { return v; }
INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
}
@@ -800,6 +1182,7 @@ INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
}
+INLINE OVERLOADABLE int4 convert_int4(int4 v) { return v; }
INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
}
@@ -840,6 +1223,7 @@ INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
}
+INLINE OVERLOADABLE uint4 convert_uint4(uint4 v) { return v; }
INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
}
@@ -880,6 +1264,7 @@ INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
}
+INLINE OVERLOADABLE short4 convert_short4(short4 v) { return v; }
INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
}
@@ -920,6 +1305,7 @@ INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
}
+INLINE OVERLOADABLE ushort4 convert_ushort4(ushort4 v) { return v; }
INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
}
@@ -960,6 +1346,7 @@ INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
}
+INLINE OVERLOADABLE char4 convert_char4(char4 v) { return v; }
INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
}
@@ -1000,6 +1387,7 @@ INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
}
+INLINE OVERLOADABLE uchar4 convert_uchar4(uchar4 v) { return v; }
INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
}
@@ -1040,6 +1428,7 @@ INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
}
+INLINE OVERLOADABLE double4 convert_double4(double4 v) { return v; }
INLINE OVERLOADABLE float4 convert_float4(double4 v) {
return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
}
@@ -1080,6 +1469,8 @@ INLINE OVERLOADABLE double4 convert_double4(float4 v) {
return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
}
+INLINE OVERLOADABLE float4 convert_float4(float4 v) { return v; }
+INLINE OVERLOADABLE long8 convert_long8(long8 v) { return v; }
INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
}
@@ -1120,6 +1511,7 @@ INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
}
+INLINE OVERLOADABLE ulong8 convert_ulong8(ulong8 v) { return v; }
INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
}
@@ -1160,6 +1552,7 @@ INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
}
+INLINE OVERLOADABLE int8 convert_int8(int8 v) { return v; }
INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
}
@@ -1200,6 +1593,7 @@ INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
}
+INLINE OVERLOADABLE uint8 convert_uint8(uint8 v) { return v; }
INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
}
@@ -1240,6 +1634,7 @@ INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
}
+INLINE OVERLOADABLE short8 convert_short8(short8 v) { return v; }
INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
}
@@ -1280,6 +1675,7 @@ INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
}
+INLINE OVERLOADABLE ushort8 convert_ushort8(ushort8 v) { return v; }
INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
}
@@ -1320,6 +1716,7 @@ INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
}
+INLINE OVERLOADABLE char8 convert_char8(char8 v) { return v; }
INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
}
@@ -1360,6 +1757,7 @@ INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
}
+INLINE OVERLOADABLE uchar8 convert_uchar8(uchar8 v) { return v; }
INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
}
@@ -1400,6 +1798,7 @@ INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
}
+INLINE OVERLOADABLE double8 convert_double8(double8 v) { return v; }
INLINE OVERLOADABLE float8 convert_float8(double8 v) {
return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
}
@@ -1440,6 +1839,8 @@ INLINE OVERLOADABLE double8 convert_double8(float8 v) {
return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
}
+INLINE OVERLOADABLE float8 convert_float8(float8 v) { return v; }
+INLINE OVERLOADABLE long16 convert_long16(long16 v) { return v; }
INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
}
@@ -1480,6 +1881,7 @@ INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
}
+INLINE OVERLOADABLE ulong16 convert_ulong16(ulong16 v) { return v; }
INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
}
@@ -1520,6 +1922,7 @@ INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
}
+INLINE OVERLOADABLE int16 convert_int16(int16 v) { return v; }
INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
}
@@ -1560,6 +1963,7 @@ INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
}
+INLINE OVERLOADABLE uint16 convert_uint16(uint16 v) { return v; }
INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
}
@@ -1600,6 +2004,7 @@ INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
}
+INLINE OVERLOADABLE short16 convert_short16(short16 v) { return v; }
INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
}
@@ -1640,6 +2045,7 @@ INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
}
+INLINE OVERLOADABLE ushort16 convert_ushort16(ushort16 v) { return v; }
INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
}
@@ -1680,6 +2086,7 @@ INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
}
+INLINE OVERLOADABLE char16 convert_char16(char16 v) { return v; }
INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
}
@@ -1720,6 +2127,7 @@ INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
}
+INLINE OVERLOADABLE uchar16 convert_uchar16(uchar16 v) { return v; }
INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
}
@@ -1760,6 +2168,7 @@ INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
}
+INLINE OVERLOADABLE double16 convert_double16(double16 v) { return v; }
INLINE OVERLOADABLE float16 convert_float16(double16 v) {
return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
}
@@ -1799,3 +2208,5 @@ INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
INLINE OVERLOADABLE double16 convert_double16(float16 v) {
return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
}
+
+INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 8d4220c..170ec70 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -182,9 +182,31 @@ INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); }
INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
SDEF(char);
SDEF(short);
-SDEF(int);
-SDEF(long);
#undef SDEF
+OVERLOADABLE int ocl_sadd_sat(int x, int y);
+INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
+OVERLOADABLE int ocl_ssub_sat(int x, int y);
+INLINE_OVERLOADABLE int sub_sat(int x, int y) { /* saturating x - y for int; the y == INT_MIN case is resolved here instead of being passed to ocl_ssub_sat */
+ return (y == 0x80000000u) ? ((x < 0) ? (x & 0x7FFFFFFF) : 0x7FFFFFFF) : ocl_ssub_sat(x, y); /* x - INT_MIN equals x + 2^31: exact (sign bit cleared) when x < 0, clamps to INT_MAX when x >= 0 */
+}
+OVERLOADABLE long ocl_sadd_sat(long x, long y);
+INLINE_OVERLOADABLE long add_sat(long x, long y) { /* saturating add for long */
+ union {long l; uint i[2];} ux, uy; /* view each operand as two 32-bit words */
+ ux.l = x;
+ uy.l = y;
+ if((ux.i[1] ^ uy.i[1]) & 0x80000000u) /* bit 31 of word [1] differs; presumably word [1] is the high half, so this is a sign mismatch - confirm layout */
+ return x + y; /* operands of opposite sign: the plain sum cannot overflow */
+ return ocl_sadd_sat(x, y); /* same sign: defer to the saturating builtin */
+}
+OVERLOADABLE long ocl_ssub_sat(long x, long y);
+INLINE_OVERLOADABLE long sub_sat(long x, long y) { /* saturating subtract for long */
+ union {long l; uint i[2];} ux, uy; /* view each operand as two 32-bit words */
+ ux.l = x;
+ uy.l = y;
+ if((ux.i[1] ^ uy.i[1]) & 0x80000000u) /* bit 31 of word [1] differs; presumably word [1] is the high half, so this is a sign mismatch - confirm layout */
+ return ocl_ssub_sat(x, y); /* opposite signs: the difference may overflow, use the saturating builtin */
+ return x - y; /* same sign: the plain difference cannot overflow */
+}
#define UDEF(TYPE) \
OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y); \
OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y); \
@@ -196,7 +218,6 @@ UDEF(uint);
UDEF(ulong);
#undef UDEF
-
uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
return add_sat((uchar)x, (uchar)0);
}
@@ -258,6 +279,7 @@ DEC(16);
#define DEF(type) INLINE_OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
DEF(char); DEF(uchar); DEF(short); DEF(ushort); DEF(int); DEF(uint)
+DEF(long); DEF(ulong)
#undef DEF
INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
return as_float(bitselect(as_int(a), as_int(b), as_int(c)));
@@ -274,13 +296,13 @@ INLINE_OVERLOADABLE char clz(char x) {
return 0;
if (x == 0)
return 8;
- return __gen_ocl_fbl(x) - 24;
+ return __gen_ocl_fbh(x) - 24;
}
INLINE_OVERLOADABLE uchar clz(uchar x) {
if (x == 0)
return 8;
- return __gen_ocl_fbl(x) - 24;
+ return __gen_ocl_fbh(x) - 24;
}
INLINE_OVERLOADABLE short clz(short x) {
@@ -312,15 +334,33 @@ INLINE_OVERLOADABLE uint clz(uint x) {
}
INLINE_OVERLOADABLE long clz(long x) {
- return 0;
+ union { int i[2]; long x; } u; /* split x into two 32-bit words; the code treats i[1] as the high word */
+ u.x = x;
+ if (u.i[1] & 0x80000000u) /* top bit of the high word is set: no leading zeros */
+ return 0;
+ if (u.i[1] == 0 && u.i[0] == 0) /* x == 0 */
+ return 64;
+ uint v = clz(u.i[1]); /* leading zeros within the high word */
+ if(v == 32) /* high word is entirely zero: keep counting in the low word */
+ v += clz(u.i[0]);
+ return v;
}
INLINE_OVERLOADABLE ulong clz(ulong x) {
- return 0;
+ if (x == 0) /* all 64 bits are zero */
+ return 64;
+ union { uint i[2]; ulong x; } u; /* split x into two 32-bit words; the code treats i[1] as the high word */
+ u.x = x;
+ uint v = clz(u.i[1]); /* leading zeros within the high word */
+ if(v == 32) /* high word is entirely zero: keep counting in the low word */
+ v += clz(u.i[0]);
+ return v;
}
OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
+OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
@@ -328,10 +368,10 @@ INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
INLINE_OVERLOADABLE long mul_hi(long x, long y) {
- return 0;
+ return __gen_ocl_mul_hi(x, y);
}
INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
- return 0;
+ return __gen_ocl_mul_hi(x, y);
}
#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
@@ -399,12 +439,15 @@ INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
return (uint)x;
}
+OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
+OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
+
INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
- return 0;
+ return __gen_ocl_mad_sat(a, b, c);
}
INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
- return 0;
+ return __gen_ocl_mad_sat(a, b, c);
}
INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
@@ -413,6 +456,8 @@ INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) |
INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+INLINE_OVERLOADABLE ulong __rotate_left(ulong x, ulong y) { return (x << y) | (x >> (64 - y)); }
+INLINE_OVERLOADABLE long __rotate_left(long x, long y) { return __rotate_left((ulong)x, (ulong)y); }
#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
DEF(char, 7)
DEF(uchar, 7)
@@ -420,13 +465,9 @@ DEF(short, 15)
DEF(ushort, 15)
DEF(int, 31)
DEF(uint, 31)
+DEF(long, 63)
+DEF(ulong, 63)
#undef DEF
-INLINE_OVERLOADABLE long rotate(long x, long y) {
- return 0;
-}
-INLINE_OVERLOADABLE ulong rotate(ulong x, ulong y) {
- return 0;
-}
OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
@@ -442,8 +483,8 @@ INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
return __gen_ocl_upsample((long)hi, (long)lo);
}
-PURE CONST uint __gen_ocl_hadd(uint x, uint y);
-PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
DEC
@@ -452,21 +493,35 @@ DEC
DEC
#undef DEF
#undef DEC
-INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y) >> 1) : __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int hadd(int x, int y) { /* int overload of hadd */
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ? /* strictly opposite signs: x + y cannot overflow */
+ ((x + y) >> 1) :
+ __gen_ocl_hadd((uint)x, (uint)y); /* otherwise use the uint builtin on the converted operands */
+}
INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
-INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) { /* int overload of rhadd */
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ? /* strictly opposite signs: x + y cannot overflow */
+ ((x + y + 1) >> 1) :
+ __gen_ocl_rhadd((uint)x, (uint)y); /* otherwise use the uint builtin on the converted operands */
+ }
INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
+OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
INLINE_OVERLOADABLE long hadd(long x, long y) {
- return 0;
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ? /* strictly opposite signs: x + y cannot overflow */
+ ((x + y) >> 1) :
+ __gen_ocl_hadd((ulong)x, (ulong)y); /* otherwise use the ulong builtin on the converted operands */
}
INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
- return 0;
+ return __gen_ocl_hadd(x, y);
}
INLINE_OVERLOADABLE long rhadd(long x, long y) {
- return 0;
+ return (x < 0 && y > 0) || (x > 0 && y < 0) ? /* strictly opposite signs: x + y cannot overflow */
+ ((x + y + 1) >> 1) :
+ __gen_ocl_rhadd((ulong)x, (ulong)y); /* otherwise use the ulong builtin on the converted operands */
}
INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
- return 0;
+ return __gen_ocl_rhadd(x, y);
}
int __gen_ocl_abs(int x);
@@ -508,10 +563,12 @@ INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
}
INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
- return 0;
+ if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0)) /* same sign (or a zero operand): x - y cannot overflow */
+ return abs(x - y);
+ return abs(x) + abs(y); /* opposite signs: the distance is the sum of the two magnitudes */
}
INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
- return 0;
+ return y > x ? (y - x) : (x - y);
}
/////////////////////////////////////////////////////////////////////////////
@@ -552,7 +609,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
#undef DECL_PUBLIC_WORK_ITEM_FN
INLINE uint get_global_id(uint dim) {
- return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
+ return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
}
/////////////////////////////////////////////////////////////////////////////
@@ -577,7 +634,61 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
}
INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
- return __gen_ocl_sin(x * M_PI_F);
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float y, z; /* computes sin(pi * x): reduces y = -x modulo 2, evaluates on a small argument, negates the result back */
+ int n, ix; /* n selects the evaluation formula below; ix holds the bits of |x| */
+ ix = *(int *) (&x) & 0x7fffffff;
+ if (ix < 0x3e800000) /* |x| < 0.25: small enough to evaluate directly */
+ return __gen_ocl_sin(M_PI_F * x);
+ y = -x;
+ z = __gen_ocl_rndd(y); /* presumably rounds toward -infinity (floor) - confirm */
+ if (z != y) { /* y is not an integer */
+ y *= 0.5f;
+ y = 2.f * (y - __gen_ocl_rndd(y)); /* y reduced modulo 2 (given floor semantics above) */
+ n = y * 4.f; /* which quarter-unit interval y falls in */
+ } else {
+ if (ix >= 0x4b800000) { /* |x| >= 2^24: every such float is an even integer */
+ y = 0;
+ n = 0;
+ } else {
+ if (ix < 0x4b000000) /* |x| < 2^23 */
+ z = y + 8.3886080000e+06f; /* adding 2^23 moves the integer's parity into the lowest mantissa bit */
+ n = *(int *) (&z); /* assign the function-level n; redeclaring it here would leave switch (n) reading an uninitialized value */
+ n &= 1; /* parity of the integer */
+ y = n;
+ n <<= 2; /* 0 for even, 4 for odd */
+ }
+ }
+ switch (n) {
+ case 0:
+ y = __gen_ocl_sin(M_PI_F * y);
+ break;
+ case 1:
+ case 2:
+ y = __gen_ocl_cos(M_PI_F * (0.5f - y));
+ break;
+ case 3:
+ case 4:
+ y = __gen_ocl_sin(M_PI_F * (1.f - y));
+ break;
+ case 5:
+ case 6:
+ y = -__gen_ocl_cos(M_PI_F * (y - 1.5f));
+ break;
+ default:
+ y = __gen_ocl_sin(M_PI_F * (y - 2.f));
+ break;
+ }
+ return -y; /* undo the y = -x substitution */
}
INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
@@ -585,6 +696,572 @@ INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
INLINE_OVERLOADABLE float native_log(float x) {
return native_log2(x) * 0.6931472002f;
}
+INLINE_OVERLOADABLE float tgamma(float x) { /* NOTE(review): every finite path below returns r, which is built as a log-gamma value (x == 1 or x == 2 yields r = 0, whereas Gamma(1) = Gamma(2) = 1) - confirm whether the result should be exponentiated */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float pi = 3.1415927410e+00,
+ a0 = 7.7215664089e-02,
+ a1 = 3.2246702909e-01,
+ a2 = 6.7352302372e-02,
+ a3 = 2.0580807701e-02,
+ a4 = 7.3855509982e-03,
+ a5 = 2.8905137442e-03,
+ a6 = 1.1927076848e-03,
+ a7 = 5.1006977446e-04,
+ a8 = 2.2086278477e-04,
+ a9 = 1.0801156895e-04,
+ a10 = 2.5214456400e-05,
+ a11 = 4.4864096708e-05,
+ tc = 1.4616321325e+00,
+ tf = -1.2148628384e-01,
+ tt = 6.6971006518e-09,
+ t0 = 4.8383611441e-01,
+ t1 = -1.4758771658e-01,
+ t2 = 6.4624942839e-02,
+ t3 = -3.2788541168e-02,
+ t4 = 1.7970675603e-02,
+ t5 = -1.0314224288e-02,
+ t6 = 6.1005386524e-03,
+ t7 = -3.6845202558e-03,
+ t8 = 2.2596477065e-03,
+ t9 = -1.4034647029e-03,
+ t10 = 8.8108185446e-04,
+ t11 = -5.3859531181e-04,
+ t12 = 3.1563205994e-04,
+ t13 = -3.1275415677e-04,
+ t14 = 3.3552918467e-04,
+ u0 = -7.7215664089e-02,
+ u1 = 6.3282704353e-01,
+ u2 = 1.4549225569e+00,
+ u3 = 9.7771751881e-01,
+ u4 = 2.2896373272e-01,
+ u5 = 1.3381091878e-02,
+ v1 = 2.4559779167e+00,
+ v2 = 2.1284897327e+00,
+ v3 = 7.6928514242e-01,
+ v4 = 1.0422264785e-01,
+ v5 = 3.2170924824e-03,
+ s0 = -7.7215664089e-02,
+ s1 = 2.1498242021e-01,
+ s2 = 3.2577878237e-01,
+ s3 = 1.4635047317e-01,
+ s4 = 2.6642270386e-02,
+ s5 = 1.8402845599e-03,
+ s6 = 3.1947532989e-05,
+ r1 = 1.3920053244e+00,
+ r2 = 7.2193557024e-01,
+ r3 = 1.7193385959e-01,
+ r4 = 1.8645919859e-02,
+ r5 = 7.7794247773e-04,
+ r6 = 7.3266842264e-06,
+ w0 = 4.1893854737e-01,
+ w1 = 8.3333335817e-02,
+ w2 = -2.7777778450e-03,
+ w3 = 7.9365057172e-04,
+ w4 = -5.9518753551e-04,
+ w5 = 8.3633989561e-04,
+ w6 = -1.6309292987e-03;
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+ int i, hx, ix;
+ nadj = 0;
+ hx = *(int *) (&x); /* raw bits of x; negative when the sign bit is set */
+ ix = hx & 0x7fffffff; /* raw bits of |x| */
+ if (ix >= 0x7f800000) /* x is Inf or NaN */
+ return x * x;
+ if (ix == 0) /* x is +0 or -0 */
+ return INFINITY;
+ if (ix < 0x1c800000) { /* |x| < 2^-70 */
+ if (hx < 0) {
+ return - native_log(-x);
+ } else
+ return - native_log(x);
+ }
+ if (hx < 0) { /* negative x: compute the adjustment nadj, then continue with |x| */
+ if (ix >= 0x4b000000) /* |x| >= 2^23: x is a negative integer */
+ return INFINITY;
+ t = __gen_ocl_internal_sinpi(x);
+ if (__gen_ocl_fabs(t) < 1e-8f) /* sinpi(x) is (nearly) zero: treated like a negative integer */
+ return INFINITY;
+ nadj = native_log(M_PI_F / __gen_ocl_fabs(t * x));
+ x = -x;
+ }
+
+ if (ix == 0x3f800000 || ix == 0x40000000) /* |x| is exactly 1 or 2 */
+ r = 0;
+ else if (ix < 0x40000000) { /* |x| < 2: choose one of three approximations via i */
+ if (ix <= 0x3f666666) { /* |x| <= ~0.9 */
+ r = - native_log(x);
+ if (ix >= 0x3f3b4a20) { /* |x| >= ~0.7316 */
+ y = 1 - x;
+ i = 0;
+ } else if (ix >= 0x3e6d3308) { /* |x| >= ~0.2316 */
+ y = x - (tc - 1);
+ i = 1;
+ } else {
+ y = x;
+ i = 2;
+ }
+ } else {
+ r = 0;
+ if (ix >= 0x3fdda618) { /* |x| >= ~1.7316 */
+ y = 2 - x;
+ i = 0;
+ } else if (ix >= 0x3F9da620) { /* |x| >= ~1.2316 */
+ y = x - tc;
+ i = 1;
+ } else {
+ y = x - 1;
+ i = 2;
+ }
+ }
+ switch (i) {
+ case 0: /* polynomial in y with the a* coefficients */
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ p = y * p1 + p2;
+ r += (p - .5f * y);
+ break;
+ case 1: /* polynomial in y with the t* coefficients */
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ p = z * p1 - (tt - w * (p2 + y * p3));
+ r += (tf + p);
+ break;
+ case 2: /* rational approximation u(y) / v(y) */
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = 1 + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ r += (-.5f * y + p1 / p2);
+ }
+ } else if (ix < 0x41000000) { /* 2 <= |x| < 8 */
+ i = x; /* integer part of x */
+ t = 0;
+ y = x - i; /* fractional part of x */
+ p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
+ q = 1 + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+ r = .5f * y + p / q;
+ z = 1;
+ switch (i) { /* cases fall through on purpose, accumulating the product (y+2)...(y+i-1) in z */
+ case 7:
+ z *= (y + 6.f);
+ case 6:
+ z *= (y + 5.f);
+ case 5:
+ z *= (y + 4.f);
+ case 4:
+ z *= (y + 3.f);
+ case 3:
+ z *= (y + 2.f);
+ r += native_log(z);
+ break;
+ }
+ } else if (ix < 0x5c800000) { /* 8 <= |x| < 2^58 */
+ t = native_log(x);
+ z = 1 / x;
+ y = z * z;
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ r = (x - .5f) * (t - 1) + w;
+ } else /* |x| >= 2^58 */
+ r = x * (native_log(x) - 1);
+ if (hx < 0) /* original x was negative: apply the adjustment computed above */
+ r = nadj - r;
+ return r;
+}
+
+INLINE_OVERLOADABLE float lgamma(float x) { /* log-gamma of x, selected by range of |x|; same computation as the BODY macro used for lgamma_r, minus the sign output */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ const float
+ zero= 0.,
+ one = 1.0000000000e+00,
+ pi = 3.1415927410e+00,
+ a0 = 7.7215664089e-02,
+ a1 = 3.2246702909e-01,
+ a2 = 6.7352302372e-02,
+ a3 = 2.0580807701e-02,
+ a4 = 7.3855509982e-03,
+ a5 = 2.8905137442e-03,
+ a6 = 1.1927076848e-03,
+ a7 = 5.1006977446e-04,
+ a8 = 2.2086278477e-04,
+ a9 = 1.0801156895e-04,
+ a10 = 2.5214456400e-05,
+ a11 = 4.4864096708e-05,
+ tc = 1.4616321325e+00,
+ tf = -1.2148628384e-01,
+ tt = 6.6971006518e-09,
+ t0 = 4.8383611441e-01,
+ t1 = -1.4758771658e-01,
+ t2 = 6.4624942839e-02,
+ t3 = -3.2788541168e-02,
+ t4 = 1.7970675603e-02,
+ t5 = -1.0314224288e-02,
+ t6 = 6.1005386524e-03,
+ t7 = -3.6845202558e-03,
+ t8 = 2.2596477065e-03,
+ t9 = -1.4034647029e-03,
+ t10 = 8.8108185446e-04,
+ t11 = -5.3859531181e-04,
+ t12 = 3.1563205994e-04,
+ t13 = -3.1275415677e-04,
+ t14 = 3.3552918467e-04,
+ u0 = -7.7215664089e-02,
+ u1 = 6.3282704353e-01,
+ u2 = 1.4549225569e+00,
+ u3 = 9.7771751881e-01,
+ u4 = 2.2896373272e-01,
+ u5 = 1.3381091878e-02,
+ v1 = 2.4559779167e+00,
+ v2 = 2.1284897327e+00,
+ v3 = 7.6928514242e-01,
+ v4 = 1.0422264785e-01,
+ v5 = 3.2170924824e-03,
+ s0 = -7.7215664089e-02,
+ s1 = 2.1498242021e-01,
+ s2 = 3.2577878237e-01,
+ s3 = 1.4635047317e-01,
+ s4 = 2.6642270386e-02,
+ s5 = 1.8402845599e-03,
+ s6 = 3.1947532989e-05,
+ r1 = 1.3920053244e+00,
+ r2 = 7.2193557024e-01,
+ r3 = 1.7193385959e-01,
+ r4 = 1.8645919859e-02,
+ r5 = 7.7794247773e-04,
+ r6 = 7.3266842264e-06,
+ w0 = 4.1893854737e-01,
+ w1 = 8.3333335817e-02,
+ w2 = -2.7777778450e-03,
+ w3 = 7.9365057172e-04,
+ w4 = -5.9518753551e-04,
+ w5 = 8.3633989561e-04,
+ w6 = -1.6309292987e-03;
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+ int i, hx, ix;
+ nadj = 0;
+ hx = *(int *)&x; /* raw bits of x; negative when the sign bit is set */
+ ix = hx & 0x7fffffff; /* raw bits of |x| */
+ if (ix >= 0x7f800000) /* x is Inf or NaN */
+ return x * x;
+ if (ix == 0) /* x is +0 or -0: division by zero produces the infinite result */
+ return ((x + one) / zero);
+ if (ix < 0x1c800000) { /* |x| < 2^-70 */
+ if (hx < 0) {
+ return -native_log(-x);
+ } else
+ return -native_log(x);
+ }
+ if (hx < 0) { /* negative x: compute the adjustment nadj, then continue with |x| */
+ if (ix >= 0x4b000000) /* |x| >= 2^23: x is a negative integer */
+ return ((-x) / zero);
+ t = __gen_ocl_internal_sinpi(x);
+ if (t == zero) /* sinpi(x) == 0: x is a negative integer */
+ return ((-x) / zero);
+ nadj = native_log(pi / __gen_ocl_fabs(t * x));
+ x = -x;
+ }
+ if (ix == 0x3f800000 || ix == 0x40000000) /* |x| is exactly 1 or 2 */
+ r = 0;
+ else if (ix < 0x40000000) { /* |x| < 2: choose one of three approximations via i */
+ if (ix <= 0x3f666666) { /* |x| <= ~0.9 */
+ r = -native_log(x);
+ if (ix >= 0x3f3b4a20) { /* |x| >= ~0.7316 */
+ y = one - x;
+ i = 0;
+ } else if (ix >= 0x3e6d3308) { /* |x| >= ~0.2316 */
+ y = x - (tc - one);
+ i = 1;
+ } else {
+ y = x;
+ i = 2;
+ }
+ } else {
+ r = zero;
+ if (ix >= 0x3fdda618) { /* |x| >= ~1.7316 */
+ y = (float) 2.0 - x;
+ i = 0;
+ }
+ else if (ix >= 0x3F9da620) { /* |x| >= ~1.2316 */
+ y = x - tc;
+ i = 1;
+ }
+ else {
+ y = x - one;
+ i = 2;
+ }
+ }
+ switch (i) {
+ case 0: /* polynomial in y with the a* coefficients */
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ p = y * p1 + p2;
+ r += (p - (float) 0.5 * y);
+ break;
+ case 1: /* polynomial in y with the t* coefficients */
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ p = z * p1 - (tt - w * (p2 + y * p3));
+ r += (tf + p);
+ break;
+ case 2: /* rational approximation u(y) / v(y) */
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ r += (-(float) 0.5 * y + p1 / p2);
+ }
+ } else if (ix < 0x41000000) { /* 2 <= |x| < 8 */
+ i = (int) x; /* integer part of x */
+ t = zero;
+ y = x - (float) i; /* fractional part of x */
+ p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+ q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+ r = .5f * y + p / q;
+ z = one;
+ switch (i) { /* cases fall through on purpose, accumulating the product (y+2)...(y+i-1) in z */
+ case 7:
+ z *= (y + (float) 6.0);
+ case 6:
+ z *= (y + (float) 5.0);
+ case 5:
+ z *= (y + (float) 4.0);
+ case 4:
+ z *= (y + (float) 3.0);
+ case 3:
+ z *= (y + (float) 2.0);
+ r += native_log(z);
+ break;
+ }
+
+ } else if (ix < 0x5c800000) { /* 8 <= |x| < 2^58 */
+ t = native_log(x);
+ z = one / x;
+ y = z * z;
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+ r = (x - .5f) * (t - one) + w;
+ } else /* |x| >= 2^58 */
+ r = x * (native_log(x) - one);
+ if (hx < 0) /* original x was negative: apply the adjustment computed above */
+ r = nadj - r;
+ return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#define BODY \
+ const float \
+ zero= 0., \
+ one = 1.0000000000e+00, \
+ pi = 3.1415927410e+00, \
+ a0 = 7.7215664089e-02, \
+ a1 = 3.2246702909e-01, \
+ a2 = 6.7352302372e-02, \
+ a3 = 2.0580807701e-02, \
+ a4 = 7.3855509982e-03, \
+ a5 = 2.8905137442e-03, \
+ a6 = 1.1927076848e-03, \
+ a7 = 5.1006977446e-04, \
+ a8 = 2.2086278477e-04, \
+ a9 = 1.0801156895e-04, \
+ a10 = 2.5214456400e-05, \
+ a11 = 4.4864096708e-05, \
+ tc = 1.4616321325e+00, \
+ tf = -1.2148628384e-01, \
+ tt = 6.6971006518e-09, \
+ t0 = 4.8383611441e-01, \
+ t1 = -1.4758771658e-01, \
+ t2 = 6.4624942839e-02, \
+ t3 = -3.2788541168e-02, \
+ t4 = 1.7970675603e-02, \
+ t5 = -1.0314224288e-02, \
+ t6 = 6.1005386524e-03, \
+ t7 = -3.6845202558e-03, \
+ t8 = 2.2596477065e-03, \
+ t9 = -1.4034647029e-03, \
+ t10 = 8.8108185446e-04, \
+ t11 = -5.3859531181e-04, \
+ t12 = 3.1563205994e-04, \
+ t13 = -3.1275415677e-04, \
+ t14 = 3.3552918467e-04, \
+ u0 = -7.7215664089e-02, \
+ u1 = 6.3282704353e-01, \
+ u2 = 1.4549225569e+00, \
+ u3 = 9.7771751881e-01, \
+ u4 = 2.2896373272e-01, \
+ u5 = 1.3381091878e-02, \
+ v1 = 2.4559779167e+00, \
+ v2 = 2.1284897327e+00, \
+ v3 = 7.6928514242e-01, \
+ v4 = 1.0422264785e-01, \
+ v5 = 3.2170924824e-03, \
+ s0 = -7.7215664089e-02, \
+ s1 = 2.1498242021e-01, \
+ s2 = 3.2577878237e-01, \
+ s3 = 1.4635047317e-01, \
+ s4 = 2.6642270386e-02, \
+ s5 = 1.8402845599e-03, \
+ s6 = 3.1947532989e-05, \
+ r1 = 1.3920053244e+00, \
+ r2 = 7.2193557024e-01, \
+ r3 = 1.7193385959e-01, \
+ r4 = 1.8645919859e-02, \
+ r5 = 7.7794247773e-04, \
+ r6 = 7.3266842264e-06, \
+ w0 = 4.1893854737e-01, \
+ w1 = 8.3333335817e-02, \
+ w2 = -2.7777778450e-03, \
+ w3 = 7.9365057172e-04, \
+ w4 = -5.9518753551e-04, \
+ w5 = 8.3633989561e-04, \
+ w6 = -1.6309292987e-03; \
+ float t, y, z, nadj, p, p1, p2, p3, q, r, w; \
+ int i, hx, ix; \
+ nadj = 0; \
+ hx = *(int *)&x; /* raw bits of x; negative when the sign bit is set */ \
+ *signgamp = 1; /* default sign; overwritten with -1 on the negative paths below */ \
+ ix = hx & 0x7fffffff; /* raw bits of |x| */ \
+ if (ix >= 0x7f800000) /* x is Inf or NaN */ \
+ return x * x; \
+ if (ix == 0) /* x is +0 or -0: division by zero produces the infinite result */ \
+ return ((x + one) / zero); \
+ if (ix < 0x1c800000) { /* |x| < 2^-70 */ \
+ if (hx < 0) { \
+ *signgamp = -1; \
+ return -native_log(-x); \
+ } else \
+ return -native_log(x); \
+ } \
+ if (hx < 0) { /* negative x: compute the adjustment nadj and the sign, then continue with |x| */ \
+ if (ix >= 0x4b000000) /* |x| >= 2^23: x is a negative integer */ \
+ return ((-x) / zero); \
+ t = __gen_ocl_internal_sinpi(x); \
+ if (t == zero) /* sinpi(x) == 0: x is a negative integer */ \
+ return ((-x) / zero); \
+ nadj = native_log(pi / __gen_ocl_fabs(t * x)); \
+ if (t < zero) /* sign of sinpi(x) decides the reported sign */ \
+ *signgamp = -1; \
+ x = -x; \
+ } \
+ if (ix == 0x3f800000 || ix == 0x40000000) /* |x| is exactly 1 or 2 */ \
+ r = 0; \
+ else if (ix < 0x40000000) { /* |x| < 2: choose one of three approximations via i */ \
+ if (ix <= 0x3f666666) { /* |x| <= ~0.9 */ \
+ r = -native_log(x); \
+ if (ix >= 0x3f3b4a20) { /* |x| >= ~0.7316 */ \
+ y = one - x; \
+ i = 0; \
+ } else if (ix >= 0x3e6d3308) { /* |x| >= ~0.2316 */ \
+ y = x - (tc - one); \
+ i = 1; \
+ } else { \
+ y = x; \
+ i = 2; \
+ } \
+ } else { \
+ r = zero; \
+ if (ix >= 0x3fdda618) { /* |x| >= ~1.7316 */ \
+ y = (float) 2.0 - x; \
+ i = 0; \
+ } \
+ else if (ix >= 0x3F9da620) { /* |x| >= ~1.2316 */ \
+ y = x - tc; \
+ i = 1; \
+ } \
+ else { \
+ y = x - one; \
+ i = 2; \
+ } \
+ } \
+ switch (i) { \
+ case 0: /* polynomial in y with the a* coefficients */ \
+ z = y * y; \
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); \
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); \
+ p = y * p1 + p2; \
+ r += (p - (float) 0.5 * y); \
+ break; \
+ case 1: /* polynomial in y with the t* coefficients */ \
+ z = y * y; \
+ w = z * y; \
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \
+ p = z * p1 - (tt - w * (p2 + y * p3)); \
+ r += (tf + p); \
+ break; \
+ case 2: /* rational approximation u(y) / v(y) */ \
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); \
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); \
+ r += (-(float) 0.5 * y + p1 / p2); \
+ } \
+ } else if (ix < 0x41000000) { /* 2 <= |x| < 8 */ \
+ i = (int) x; /* integer part of x */ \
+ t = zero; \
+ y = x - (float) i; /* fractional part of x */ \
+ p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); \
+ q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); \
+ r = .5f * y + p / q; \
+ z = one; \
+ switch (i) { /* cases fall through on purpose, accumulating the product (y+2)...(y+i-1) in z */ \
+ case 7: \
+ z *= (y + (float) 6.0); \
+ case 6: \
+ z *= (y + (float) 5.0); \
+ case 5: \
+ z *= (y + (float) 4.0); \
+ case 4: \
+ z *= (y + (float) 3.0); \
+ case 3: \
+ z *= (y + (float) 2.0); \
+ r += native_log(z); \
+ break; \
+ } \
+ \
+ } else if (ix < 0x5c800000) { /* 8 <= |x| < 2^58 */ \
+ t = native_log(x); \
+ z = one / x; \
+ y = z * z; \
+ w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); \
+ r = (x - .5f) * (t - one) + w; \
+ } else /* |x| >= 2^58 */ \
+ r = x * (native_log(x) - one); \
+ if (hx < 0) /* original x was negative: apply the adjustment computed above */ \
+ r = nadj - r; \
+ return r;
+INLINE_OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; } /* one overload per address space of the sign output pointer; all share BODY */
+INLINE_OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
+#undef BODY
+
INLINE_OVERLOADABLE float native_log10(float x) {
return native_log2(x) * 0.3010299956f;
}
@@ -689,7 +1366,13 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
x = 1 / x;
c = -1;
}
- return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
+ a += c*x;
+ int i;
+ int sign;
+ for(i=3, sign=-1; i<63; i+=2, sign=-sign) {
+ a += c*sign*__gen_ocl_pow(x,i)/i;
+ }
+ return a;
}
INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
return __gen_ocl_internal_atan(x) / M_PI_F;
@@ -716,6 +1399,86 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
// XXX work-around PTX profile
#define sqrt native_sqrt
INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) { // angle of the point (x, y), in radians
+ uint hx = *(uint *)(&x), ix = hx & 0x7FFFFFFF; // raw bits of x and the same bits without the sign
+ uint hy = *(uint *)(&y), iy = hy & 0x7FFFFFFF; // raw bits of y and the same bits without the sign
+ if (ix > 0x7F800000 || iy > 0x7F800000) // magnitude above the infinity pattern: an argument is NaN
+ return nan(0u);
+ if (ix == 0) { // x is +0 or -0
+ if (y > 0)
+ return M_PI_2_F;
+ if (y < 0)
+ return - M_PI_2_F;
+ return (hx >> 31) ? ((hy >> 31) ? - M_PI_F : M_PI_F) : y; // y is +-0 here: (+-0, -0) -> +-pi, (+-0, +0) -> +-0
+ } else {
+ float z = __gen_ocl_internal_atan(y / x);
+ if (x > 0)
+ return z;
+ if ((hy >> 31) == 0) // test the sign bit, not y >= 0, so that y == -0 with x < 0 yields -pi
+ return M_PI_F + z;
+ return - M_PI_F + z;
+ }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) { // atan2(y, x) / pi; zero and infinity inputs are answered explicitly first
+ uint ix = as_uint(x), iy = as_uint(y),
+ pos_zero = 0, neg_zero = 0x80000000u,
+ pos_inf = 0x7f800000, neg_inf = 0xff800000u;
+ if(iy == pos_zero) { // y is +0: 0 when x is +0 or positive, 1 when x is -0 or negative
+ if(ix == pos_zero)
+ return 0;
+ if(ix == neg_zero)
+ return 1;
+ if(x < 0)
+ return 1;
+ if(x > 0)
+ return 0;
+ }
+ if(iy == neg_zero) { // y is -0: same table as above with the sign flipped
+ if(ix == pos_zero)
+ return -0.f;
+ if(ix == neg_zero)
+ return -1;
+ if(x < 0)
+ return -1;
+ if(x > 0)
+ return -0.f;
+ }
+ if((ix & 0x7fffffff) == 0) { // x is +-0 and y is not a zero: +-0.5 following the sign of y
+ if(y < 0)
+ return -.5f;
+ if(y > 0)
+ return .5f;
+ }
+ if(ix == pos_inf) { // x is +inf and y is finite and non-zero: signed zero
+ if(y > 0 && iy != pos_inf)
+ return 0;
+ if(y < 0 && iy != neg_inf)
+ return -0.f;
+ }
+ if(ix == neg_inf) { // x is -inf and y is finite and non-zero: +-1
+ if(y > 0 && iy != pos_inf)
+ return 1;
+ if(y < 0 && iy != neg_inf)
+ return -1;
+ }
+ if(iy == pos_inf) { // y is +inf: 0.25 / 0.75 for x = +-inf, 0.5 for any other non-NaN x
+ if(ix == pos_inf)
+ return 0.25f;
+ if(ix == neg_inf)
+ return 0.75f;
+ if(x >= 0 || x <= 0) // false only when x is NaN
+ return 0.5f;
+ }
+ if(iy == neg_inf) { // y is -inf: mirror of the +inf table
+ if(ix == pos_inf)
+ return -0.25f;
+ if(ix == neg_inf)
+ return -0.75f;
+ if(x >= 0 || x <= 0) // false only when x is NaN
+ return -0.5f;
+ }
+ return __gen_ocl_internal_atan2(y, x) / M_PI_F; // every input not matched above, NaN included
+}
INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x) { return __gen_ocl_fabs(x); }
INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
@@ -748,6 +1511,8 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
#define tanpi __gen_ocl_internal_tanpi
#define tanh __gen_ocl_internal_tanh
#define atan __gen_ocl_internal_atan
+#define atan2 __gen_ocl_internal_atan2
+#define atan2pi __gen_ocl_internal_atan2pi
#define atanpi __gen_ocl_internal_atanpi
#define atanh __gen_ocl_internal_atanh
#define pow powr
@@ -761,48 +1526,29 @@ INLINE_OVERLOADABLE float mad(float a, float b, float c) {
return a*b+c;
}
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
- return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
- return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
- return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
- return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
- return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
- return cond ? src1 : src0;
-}
-
-// This will be optimized out by LLVM and will output LLVM select instructions
-#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
- TYPE4 dst; \
- const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
- const TYPE x1 = src1.x; \
- const TYPE y0 = src0.y; \
- const TYPE y1 = src1.y; \
- const TYPE z0 = src0.z; \
- const TYPE z1 = src1.z; \
- const TYPE w0 = src0.w; \
- const TYPE w1 = src1.w; \
- dst.x = (cond.x & MASK) ? x1 : x0; \
- dst.y = (cond.y & MASK) ? y1 : y0; \
- dst.z = (cond.z & MASK) ? z1 : z0; \
- dst.w = (cond.w & MASK) ? w1 : w0; \
- return dst; \
-}
-DECL_SELECT4(int4, int, int4, 0x80000000)
-DECL_SELECT4(int4, int, uint4, 0x80000000)
-DECL_SELECT4(float4, float, int4, 0x80000000)
-DECL_SELECT4(float4, float, uint4, 0x80000000)
-#undef DECL_SELECT4
+#define DEF(TYPE1, TYPE2) /* scalar select(): yields src1 when cond is non-zero, src0 otherwise */ \
+ INLINE_OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond) { \
+ return cond ? src1 : src0; \
+ }
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
/////////////////////////////////////////////////////////////////////////////
// Common Functions (see 6.11.4 of OCL 1.1 spec)
@@ -1034,9 +1780,19 @@ INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p)
*(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
}
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) /* vload3/vstore3 on packed data: element stride is 3, not 4 */ \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p + 3 * offset), *(p + 3 * offset + 1), *(p + 3 * offset + 2)); /* element-wise like vstore3: a TYPE##3 dereference would read a 4th element */ \
+}
+
#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
- DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
+ DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
@@ -1151,6 +1907,8 @@ DEF(ushort)
DEF(int)
DEF(uint)
DEF(float)
+DEF(long)
+DEF(ulong)
#undef DEF
#undef DEC2
#undef DEC4
@@ -1256,6 +2014,8 @@ DEF(ushort)
DEF(int)
DEF(uint)
DEF(float)
+DEF(long)
+DEF(ulong)
#undef DEF
#undef DEC2
#undef DEC2X
@@ -1395,10 +2155,10 @@ OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
- DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
+ DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
#define DECL_ATOMIC_OP(NAME) \
- DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_) \
+ DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_) \
DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
DECL_ATOMIC_OP(add)
@@ -1407,12 +2167,20 @@ DECL_ATOMIC_OP(and)
DECL_ATOMIC_OP(or)
DECL_ATOMIC_OP(xor)
DECL_ATOMIC_OP(xchg)
-DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX) \
+ INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+ return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+ }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
#undef DECL_ATOMIC_OP
#undef DECL_ATOMIC_OP_TYPE
#undef DECL_ATOMIC_OP_SPACE
@@ -1456,6 +2224,21 @@ DECL_ATOMIC_OP(cmpxchg)
#undef DECL_ATOMIC_OP_TYPE
#undef DECL_ATOMIC_OP_SPACE
+// XXX for conformance test
+// The atom_xxx names below belong to the OpenCL 1.0 spec.
+// They are aliased to atomic_xxx here because the conformance test suite still tests them.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
/////////////////////////////////////////////////////////////////////////////
// Force the compilation to SIMD8 or SIMD16
/////////////////////////////////////////////////////////////////////////////
@@ -1472,19 +2255,19 @@ int __gen_ocl_force_simd16(void);
// Image access functions
/////////////////////////////////////////////////////////////////////////////
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
@@ -1504,46 +2287,145 @@ int __gen_ocl_get_image_height(uint surface_id);
int __gen_ocl_get_image_channel_data_type(uint surface_id);
int __gen_ocl_get_image_channel_order(uint surface_id);
int __gen_ocl_get_image_depth(uint surface_id);
+ushort __gen_ocl_get_sampler_info(uint sampler_id);
#define GET_IMAGE(cl_image, surface_id) \
uint surface_id = (uint)cl_image
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ##suffix(surface_id, sampler, coord.s0, coord.s1);\
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix, \
+ image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ sampler_t sampler, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ coord_type tmpCoord = coord; \
+ ushort samplerValue; \
+ if (float_coord_rounding_fix | int_clamping_fix) { \
+ samplerValue = __gen_ocl_get_sampler_info(sampler); \
+ if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) \
+ && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) { \
+ if (float_coord_rounding_fix \
+ && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) { \
+ FIXUP_FLOAT_COORD(tmpCoord); \
+ } \
+ if (int_clamping_fix) { \
+ if (OUT_OF_BOX(tmpCoord, surface_id, \
+ (samplerValue & CLK_NORMALIZED_COORDS_TRUE))) { \
+ unsigned int border_alpha; \
+ int order = __gen_ocl_get_image_channel_order(surface_id); \
+ if (!CLK_HAS_ALPHA(order)) { \
+ border_alpha = 1; \
+ } else \
+ border_alpha = 0; \
+ return (type)(0, 0, 0, border_alpha); \
+ } else \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\
+ } \
+ } \
+ } \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 0);\
}
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image, \
+ coord_type coord) \
+ { \
+ GET_IMAGE(cl_image, surface_id); \
+ return __gen_ocl_read_image ##suffix( \
+ EXPEND_READ_COORD(surface_id, \
+ CLK_NORMALIZED_COORDS_FALSE \
+ | CLK_ADDRESS_NONE \
+ | CLK_FILTER_NEAREST, coord), 0); \
}
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
- INLINE_OVERLOADABLE void write_image ##suffix(image2d_t cl_image, coord_type coord, type color)\
+#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
+ INLINE_OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
{\
GET_IMAGE(cl_image, surface_id);\
- __gen_ocl_write_image ##suffix(surface_id, coord.s0, coord.s1, color);\
+ __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
}
-#define DECL_IMAGE(type, suffix) \
- DECL_READ_IMAGE(type, suffix, int2) \
- DECL_READ_IMAGE(type, suffix, float2) \
- DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
- DECL_WRITE_IMAGE(type, suffix, int2) \
- DECL_WRITE_IMAGE(type, suffix, float2)
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+
+#define OUT_OF_BOX(coord, surface, normalized) \
+ (coord.s0 < 0 || coord.s1 < 0 || \
+ ((normalized == 0) \
+ && (coord.s0 >= __gen_ocl_get_image_width(surface) \
+ || coord.s1 >= __gen_ocl_get_image_height(surface))) \
+ || ((normalized != 0) && (coord.s0 > 0x1p0 || coord.s1 > 0x1p0)))
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* moves coordinates in (-2^-20, 0) down by 2^-9; float literals keep the math single-precision */ \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f) \
+ tmpCoord.s0 += -0x1p-9f; \
+ if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f) \
+ tmpCoord.s1 += -0x1p-9f; \
+ }
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) \
+ DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n) \
+ DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \
+ DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, int ## n) \
+ DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
+#define OUT_OF_BOX(coord, surface, normalized) \
+ (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0 || \
+ ((normalized == 0) \
+ && (coord.s0 >= __gen_ocl_get_image_width(surface) \
+ || coord.s1 >= __gen_ocl_get_image_height(surface) \
+ || coord.s2 >= __gen_ocl_get_image_depth(surface))) \
+ || ((normalized != 0) \
+ &&(coord.s0 > 1 || coord.s1 > 1 || coord.s2 > 1)))
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* moves coordinates in (-2^-20, 0) down by 2^-9; float literals keep the math single-precision */ \
+ { \
+ if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f) \
+ tmpCoord.s0 += -0x1p-9f; \
+ if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f) \
+ tmpCoord.s1 += -0x1p-9f; \
+ if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f) \
+ tmpCoord.s2 += -0x1p-9f; \
+ }
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
+#undef EXPEND_READ_COORD
+#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
#undef DECL_IMAGE
#undef DECL_READ_IMAGE
#undef DECL_READ_IMAGE_NOSAMPLER
#undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
#define DECL_IMAGE_INFO(image_type) \
INLINE_OVERLOADABLE int get_image_width(image_type image) \
@@ -1601,40 +2483,6 @@ INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
{ return __gen_ocl_get_image_array_size(image); }
#endif
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
- }
-
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
- INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
- {\
- GET_IMAGE(cl_image, surface_id);\
- return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
- }
-
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
- INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
- {\
- GET_IMAGE(cl_image, surface_id);\
- __gen_ocl_write_image ## suffix(surface_id, coord.s0, coord.s1, coord.s2, color);\
- }
-
-#define DECL_IMAGE(type, suffix) \
- DECL_READ_IMAGE(type, suffix, int4) \
- DECL_READ_IMAGE(type, suffix, float4) \
- DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
- DECL_WRITE_IMAGE(type, suffix, int4) \
- DECL_WRITE_IMAGE(type, suffix, float4)
-
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
-
-
-
#pragma OPENCL EXTENSION cl_khr_fp64 : disable
#undef DECL_IMAGE
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
index a665356..b8a2841 100644
--- a/backend/src/sys/platform.hpp
+++ b/backend/src/sys/platform.hpp
@@ -24,6 +24,9 @@
#include <cstdlib>
#include <cstdio>
#include <iostream>
+#include <ostream>
+#include <istream>
+#include <string>
#include <cassert>
#include <new>
@@ -323,6 +326,47 @@ private:
INLINE NonCopyable& operator= (const NonCopyable&) {return *this;}
};
+#define TO_MAGIC(A, B, C, D) (((A)<<24) | ((B)<<16) | ((C)<<8) | (D)) /* packs four byte values into one word, A in the top byte; arguments parenthesized so expressions expand safely */
+
+class Serializable // abstract interface for objects that can be written to and read back from a binary stream
+{
+public:
+ INLINE Serializable(void) = default;
+ INLINE Serializable(const Serializable&) = default;
+ INLINE Serializable& operator= (const Serializable&) = default;
+
+ virtual size_t serializeToBin(std::ostream& outs) = 0; // NOTE(review): return value is presumably the byte count written - confirm against implementers
+ virtual size_t deserializeFromBin(std::istream& ins) = 0; // NOTE(review): return value is presumably the byte count read - confirm against implementers
+
+ /* These two are meant to follow LLVM's ABI; both are stubs that return 0 for now. */
+ virtual size_t serializeToLLVM(void) { return 0;/* not implemented now. */}
+ virtual size_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
+
+ virtual void printStatus(int indent = 0, std::ostream& outs = std::cout) { } // default implementation prints nothing
+
+ virtual ~Serializable(void) { }
+
+protected:
+ static std::string indent_to_str(int indent) { // builds a string of `indent` spaces
+ std::string ind(indent, ' ');
+ return ind;
+ }
+};
+
+/* Help Macro for serialization. */
+#define SERIALIZE_OUT(elt, out, sz) /* writes the raw bytes of elt to stream out and adds sizeof(elt) to sz */ \
+ do { \
+ auto tmp_val = (elt); /* local copy gives an addressable object even when elt is an rvalue */ \
+ (out).write((char *)(&tmp_val), sizeof(elt)); \
+ (sz) += sizeof(elt); \
+ } while(0)
+
+#define DESERIALIZE_IN(elt, in, sz) /* reads sizeof(elt) raw bytes from stream in into elt and adds that size to sz */ \
+ do { \
+ (in).read((char *)(&(elt)), sizeof(elt)); \
+ (sz) += sizeof(elt); \
+ } while(0)
+
////////////////////////////////////////////////////////////////////////////////
/// Disable some compiler warnings
////////////////////////////////////////////////////////////////////////////////
diff --git a/kernels/builtin_atan2.cl b/kernels/builtin_atan2.cl
new file mode 100644
index 0000000..aba73be
--- /dev/null
+++ b/kernels/builtin_atan2.cl
@@ -0,0 +1,4 @@
+kernel void builtin_atan2(global float *y, global float *x, global float *dst) {
+ int i = get_global_id(0); // each work-item handles the element at its global id
+ dst[i] = atan2(y[i], x[i]);
+}
diff --git a/kernels/builtin_lgamma.cl b/kernels/builtin_lgamma.cl
new file mode 100644
index 0000000..85bf859
--- /dev/null
+++ b/kernels/builtin_lgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma(global float *src, global float *dst) {
+ int i = get_global_id(0); // each work-item handles the element at its global id
+ dst[i] = lgamma(src[i]);
+}
diff --git a/kernels/builtin_lgamma_r.cl b/kernels/builtin_lgamma_r.cl
new file mode 100644
index 0000000..71fcc36
--- /dev/null
+++ b/kernels/builtin_lgamma_r.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma_r(global float *src, global float *dst, global int *signp) {
+ int i = get_global_id(0); // each work-item handles the element at its global id
+ dst[i] = lgamma_r(src[i], signp+i); // signp+i is the per-element int slot handed to lgamma_r
+}
diff --git a/kernels/builtin_sinpi.cl b/kernels/builtin_sinpi.cl
new file mode 100644
index 0000000..134152d
--- /dev/null
+++ b/kernels/builtin_sinpi.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sinpi(global float *src, global float *dst) {
+ int i = get_global_id(0); // each work-item handles the element at its global id
+ dst[i] = sinpi(src[i]);
+}
diff --git a/kernels/builtin_tgamma.cl b/kernels/builtin_tgamma.cl
new file mode 100644
index 0000000..1f7abc3
--- /dev/null
+++ b/kernels/builtin_tgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_tgamma(global float *src, global float *dst) {
+ int i = get_global_id(0); // each work-item handles the element at its global id
+ dst[i] = tgamma(src[i]);
+}
diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
index 583ba2b..1f30df4 100644
--- a/kernels/compiler_abs_diff.cl
+++ b/kernels/compiler_abs_diff.cl
@@ -26,3 +26,5 @@ COMPILER_ABS(char, uchar)
COMPILER_ABS(uchar, uchar)
COMPILER_ABS(short, ushort)
COMPILER_ABS(ushort, ushort)
+COMPILER_ABS(long, ulong)
+COMPILER_ABS(ulong, ulong)
diff --git a/kernels/compiler_bool_cross_basic_block.cl b/kernels/compiler_bool_cross_basic_block.cl
new file mode 100644
index 0000000..9aeb16d
--- /dev/null
+++ b/kernels/compiler_bool_cross_basic_block.cl
@@ -0,0 +1,21 @@
+__kernel // test kernel: bool values are carried across loop iterations and nested blocks
+void compiler_bool_cross_basic_block(__global int *src,
+ __global int *dst,
+ int scale){
+ int id = (int)get_global_id(0);
+
+ bool isRedRow = false;
+ bool isRed;
+ int val = src[id];
+ for (unsigned int i=0; i<scale; i++, isRedRow = !isRedRow) { // isRedRow flips every iteration, so it is true for odd i
+ if (isRedRow) {
+ isRed= false;
+ for (unsigned int j=0; j < scale; j++, isRed=!isRed) { // isRed flips every iteration, so it is true for odd j
+ if (isRed) {
+ val++; // executed once per (odd i, odd j) pair
+ }
+ }
+ }
+ }
+ dst[id] = val; // src[id] plus the number of increments above
+}
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
index 7bcbdeb..42f463b 100644
--- a/kernels/compiler_box_blur_image.cl
+++ b/kernels/compiler_box_blur_image.cl
@@ -10,7 +10,7 @@ __kernel void compiler_box_blur_image(__read_only image2d_t src,
for (offset.y = -1; offset.y <= 1; offset.y++) {
for (offset.x = -1; offset.x <= 1; offset.x++) {
- sum += read_imagef(src, sampler, coord + offset);
+ sum += read_imagef(src, sampler, coord + offset);
}
}
diff --git a/kernels/compiler_function_constant0.cl b/kernels/compiler_function_constant0.cl
index 363d84e..5340352 100644
--- a/kernels/compiler_function_constant0.cl
+++ b/kernels/compiler_function_constant0.cl
@@ -1,5 +1,5 @@
__kernel void
-compiler_function_constant0(__constant short *c0, __constant char *c1, __global int *dst, int value)
+compiler_function_constant0(__constant int *c0, __constant char *c1, __global int *dst, int value)
{
int id = (int)get_global_id(0);
dst[id] = value + c0[id%69] + c1[0];
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
index 5db58d6..71fe86c 100644
--- a/kernels/compiler_global_constant.cl
+++ b/kernels/compiler_global_constant.cl
@@ -1,10 +1,65 @@
constant int m[3] = {71,72,73};
constant int n = 1;
-constant int o[3] = {1, 1, 1};
+constant int o[3] = {3, 2, 1};
+
+constant int4 a= {1, 2, 3, 4};
+constant int4 b = {0, -1, -2, -3};
+
+struct Person {
+ char name[7];
+ int3 idNumber;
+};
+
+struct Test1 {
+ int a0;
+ char a1;
+};
+
+struct Test2 {
+ char a0;
+ int a1;
+};
+
+constant struct Person james= {{"james"}, (int3)(1, 2, 3)};
+
+constant struct Test1 t0 = {1, 2};
+constant struct Test2 t1 = {1, 2};
+
+constant int3 c[3] = {(int3)(0, 1, 2), (int3)(3, 4, 5), (int3)(6,7,8) };
+constant char4 d[3] = {(char4)(0, 1, 2, 3), (char4)(4, 5, 6, 7), (char4)(8, 9, 10, 11)};
+
+constant struct Person members[3] = {{{"abc"}, (int3)(1, 2, 3)}, { {"defg"}, (int3)(4,5,6)}, { {"hijk"}, (int3)(7,8,9)} };
__kernel void
compiler_global_constant(__global int *dst, int e, int r)
{
int id = (int)get_global_id(0);
- dst[id] = m[id%3] * n * o[2] + e + r;
+
+ int4 x = a + b;
+ dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x;
+}
+// array of vectors
+__kernel void
+compiler_global_constant1(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = c[id%3].y + d[id%3].w;
+}
+
+// structure
+__kernel void
+compiler_global_constant2(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+
+ dst[id] = james.idNumber.y + t0.a1 + t1.a1;
+}
+
+//array of structure
+__kernel void
+compiler_global_constant3(__global int *dst)
+{
+ int id = (int)get_global_id(0);
+
+ dst[id] = members[id%3].idNumber.z + members[id%3].name[2];
}
diff --git a/kernels/compiler_global_constant_2.cl b/kernels/compiler_global_constant_2.cl
index 353ebd7..04536c7 100644
--- a/kernels/compiler_global_constant_2.cl
+++ b/kernels/compiler_global_constant_2.cl
@@ -1,5 +1,9 @@
constant int m[3] = {0x15b,0x25b,0x35b};
-constant int t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant short t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant long n[3] = {0x15b,0x25b,0xFFFFFFFFF};
+constant long p[3] = {1,1,1};
+constant long s = 1;
+
__kernel void
compiler_global_constant_2(__global int *dst, int e, int r)
@@ -7,3 +11,10 @@ compiler_global_constant_2(__global int *dst, int e, int r)
int id = (int)get_global_id(0);
dst[id] = m[id%3] + t[id%5] + e + r;
}
+
+__kernel void
+compiler_global_constant_2_long(__global long *dst, int e, int r)
+{
+ int id = (int)get_global_id(0);
+ dst[id] = n[id%3]*p[1] + e*s + r;
+}
diff --git a/kernels/compiler_group_size.cl b/kernels/compiler_group_size.cl
index 9dba236..4e2c333 100644
--- a/kernels/compiler_group_size.cl
+++ b/kernels/compiler_group_size.cl
@@ -10,3 +10,20 @@ compiler_group_size(__global unsigned int *dst)
dst[idz*size_x*size_y + idy*size_x + idx] = idz*size_x*size_y + idy*size_x +idx;
}
+struct xyz{
+ unsigned short b;
+ unsigned short e;
+ unsigned int o;
+};
+
+__kernel void
+compiler_group_size4(__global struct xyz *src, __global unsigned int *dst, unsigned int num, unsigned int c)
+{
+ uint idx = (uint)get_global_id(0);
+ if(idx>=num)
+ return;
+ struct xyz td = src[idx];
+ for(unsigned x = td.b;x<=td.e;x++)
+ dst[td.o+x] = c;
+}
+
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
index f22914f..e5f7939 100644
--- a/kernels/compiler_long_convert.cl
+++ b/kernels/compiler_long_convert.cl
@@ -5,3 +5,15 @@ kernel void compiler_long_convert(global char *src1, global short *src2, global
dst2[i] = src2[i];
dst3[i] = src3[i];
}
+
+kernel void compiler_long_convert_2(global char *dst1, global short *dst2, global int *dst3, global long *src) {
+ int i = get_global_id(0);
+ dst1[i] = src[i];
+ dst2[i] = src[i];
+ dst3[i] = src[i];
+}
+
+kernel void compiler_long_convert_to_float(global float *dst, global long *src) {
+ int i = get_global_id(0);
+ dst[i] = src[i];
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
index 16f806b..8f914e4 100644
--- a/kernels/compiler_upsample_long.cl
+++ b/kernels/compiler_upsample_long.cl
@@ -1,4 +1,4 @@
-kernel void compiler_upsample_int(global int *src1, global uint *src2, global long *dst) {
+kernel void compiler_upsample_long(global int *src1, global uint *src2, global long *dst) {
int i = get_global_id(0);
dst[i] = upsample(src1[i], src2[i]);
}
diff --git a/kernels/compiler_vector_inc.cl b/kernels/compiler_vector_inc.cl
new file mode 100644
index 0000000..548dcb4
--- /dev/null
+++ b/kernels/compiler_vector_inc.cl
@@ -0,0 +1,13 @@
+kernel void compiler_vector_inc(global char *dst, global char *src) { // test kernel: ++/-- operators applied to a char2 vector
+ size_t i = get_global_id(0);
+ char2 dst2 = vload2(i, dst); // loads the i-th char2 from dst
+ if (src[i] == 0) // src[i] picks the operator: 0 post-increment, 1 pre-increment, 2 post-decrement, anything else pre-decrement
+ dst2++;
+ else if(src[i] == 1)
+ ++dst2;
+ else if(src[i] == 2)
+ dst2--;
+ else
+ --dst2;
+ vstore2(dst2, i, dst); // writes the updated pair back to the same position
+}
diff --git a/kernels/test_copy_image_3d.cl b/kernels/test_copy_image_3d.cl
index 766227a..103fb69 100644
--- a/kernels/test_copy_image_3d.cl
+++ b/kernels/test_copy_image_3d.cl
@@ -1,11 +1,28 @@
__kernel void
-test_copy_image_3d(__read_only image3d_t src, __write_only image3d_t dst, sampler_t sampler)
+test_copy_image_3d(__read_only image3d_t src,
+ __write_only image3d_t dst,
+ sampler_t sampler,
+ __write_only image2d_t buf0,
+ __write_only image2d_t buf1,
+ __write_only image2d_t buf2,
+ __write_only image2d_t buf3)
{
int4 coord;
- int4 color;
+ int2 coord2;
+ float4 color;
coord.x = (int)get_global_id(0);
coord.y = (int)get_global_id(1);
- coord.z = 0;
- color = read_imagei(src, sampler, coord);
- write_imagei(dst, coord, color);
+ coord.z = (int)get_global_id(2);
+ coord2.x = coord.x;
+ coord2.y = coord.y;
+ color = read_imagef(src, sampler, coord);
+ write_imagef(dst, coord, color);
+ if (coord.z == 0)
+ write_imagef(buf0, coord2, color);
+ else if (coord.z == 1)
+ write_imagef(buf1, coord2, color);
+ else if (coord.z == 2)
+ write_imagef(buf2, coord2, color);
+ else if (coord.z == 3)
+ write_imagef(buf3, coord2, color);
}
diff --git a/kernels/test_fill_image_3d.cl b/kernels/test_fill_image_3d.cl
index 0f0c6fd..4988f69 100644
--- a/kernels/test_fill_image_3d.cl
+++ b/kernels/test_fill_image_3d.cl
@@ -9,6 +9,6 @@ test_fill_image_3d(__write_only image3d_t dst, uint color)
color4.s3 = color & 0xFF;
coord.x = (int)get_global_id(0);
coord.y = (int)get_global_id(1);
- coord.z = 0;
+ coord.z = (int)get_global_id(2);
write_imagei(dst, coord, color4);
}
diff --git a/kernels/test_fill_image_3d_2.cl b/kernels/test_fill_image_3d_2.cl
index 22b6452..1f9eaa1 100644
--- a/kernels/test_fill_image_3d_2.cl
+++ b/kernels/test_fill_image_3d_2.cl
@@ -5,6 +5,6 @@ test_fill_image_3d_2(__write_only image3d_t dst)
int4 color4 = {0x12, 0x34, 0x56, 0x78};
coord.x = (int)get_global_id(0);
coord.y = (int)get_global_id(1);
- coord.z = 0;
+ coord.z = (int)get_global_id(2);
write_imagei(dst, coord, color4);
}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58d23cb..3fc8689 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,8 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR}
${DRM_INCLUDE_PATH}
${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
- ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include
+ ${MESA_SOURCE_INCLUDES})
set(OPENCL_SRC
cl_api.c
@@ -29,16 +30,14 @@ set(OPENCL_SRC
x11/dricommon.c
x11/va_dri2.c)
-if (EGL_FOUND AND GBM_FOUND)
-set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/gbm_dri2_x11_platform.c)
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
-SET(OPTIONAL_GBM_LIBRARY "${GBM_LIBRARY}")
-else(EGL_FOUND AND GBM_FOUND)
+else(EGL_FOUND AND MESA_SOURCE_FOUND)
SET(OPTIONAL_EGL_LIBRARY "")
-SET(OPTIONAL_GBM_LIBRARY "")
-endif (EGL_FOUND AND GBM_FOUND)
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
if (OCLIcd_FOUND)
set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -46,7 +45,7 @@ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
endif (OCLIcd_FOUND)
-SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic")
+SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
link_directories (${LLVM_LIBRARY_DIR})
add_library(cl SHARED ${OPENCL_SRC})
@@ -59,6 +58,5 @@ target_link_libraries(
${DRM_INTEL_LIBRARY}
${DRM_LIBRARY}
${OPENGL_LIBRARIES}
- ${OPTIONAL_EGL_LIBRARY}
- ${OPTIONAL_GBM_LIBRARY})
+ ${OPTIONAL_EGL_LIBRARY})
install (TARGETS cl LIBRARY DESTINATION lib)
diff --git a/src/cl_api.c b/src/cl_api.c
index 4f048ee..ded0e0c 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -66,7 +66,7 @@ inline cl_int
handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
cl_event* event, enqueue_data* data, cl_command_type type)
{
- cl_int status = cl_event_wait_events(num, wait_list);
+ cl_int status = cl_event_wait_events(num, wait_list, queue);
cl_event e;
if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
e = cl_event_new(queue->ctx, queue, type, event!=NULL);
@@ -79,6 +79,66 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
return status;
}
+/* Returns CL_TRUE when the source and destination regions of a rectangular copy overlap; both regions share row_pitch and slice_pitch. The checking code is from the Appendix of the OpenCL 1.1 spec. */
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+ const size_t dst_offset[3],
+ const size_t region[3],
+ size_t row_pitch, size_t slice_pitch)
+{
+ const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+ const size_t src_max[] = {src_offset[0] + region[0],
+ src_offset[1] + region[1],
+ src_offset[2] + region[2]};
+ const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+ const size_t dst_max[] = {dst_offset[0] + region[0],
+ dst_offset[1] + region[1],
+ dst_offset[2] + region[2]};
+ // Start from "overlapping"; the per-axis loop below clears it as soon as one axis is disjoint
+ cl_bool overlap = CL_TRUE;
+ unsigned i;
+ size_t dst_start = dst_offset[2] * slice_pitch +
+ dst_offset[1] * row_pitch + dst_offset[0]; /* linear offset of the first destination element */
+ size_t dst_end = dst_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+ size_t src_start = src_offset[2] * slice_pitch +
+ src_offset[1] * row_pitch + src_offset[0]; /* linear offset of the first source element */
+ size_t src_end = src_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+
+ for (i=0; i != 3; ++i) { /* half-open interval intersection test on x, y and z */
+ overlap = overlap && (src_min[i] < dst_max[i])
+ && (src_max[i] > dst_min[i]);
+ }
+
+ if (!overlap) { /* the boxes are disjoint; still look for rows/slices that run past the pitch */
+ size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
+ src_offset[0] + region[0] - row_pitch : 0; /* amount by which a source row exceeds row_pitch */
+ size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
+ dst_offset[0] + region[0] - row_pitch : 0; /* same for a destination row */
+ if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+ (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) ) /* linear ranges intersect */
+ overlap = CL_TRUE;
+ }
+ if (region[2] > 1) { /* more than one slice: repeat the excess test along y */
+ size_t src_height = slice_pitch / row_pitch; /* rows per slice */
+ size_t dst_height = slice_pitch / row_pitch;
+ size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
+ src_offset[1] + region[1] - src_height : 0;
+ size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+ dst_offset[1] + region[1] - dst_height : 0;
+ if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+ (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) )
+ overlap = CL_TRUE;
+ }
+ }
+ }
+ return overlap;
+}
+
static cl_int
cl_check_device_type(cl_device_type device_type)
{
@@ -408,7 +468,7 @@ clCreateBuffer(cl_context context,
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
- mem = cl_mem_new(context, flags, size, host_ptr, &err);
+ mem = cl_mem_new_buffer(context, flags, size, host_ptr, &err);
error:
if (errcode_ret)
*errcode_ret = err;
@@ -469,6 +529,7 @@ clCreateImage2D(cl_context context,
cl_int err = CL_SUCCESS;
CHECK_CONTEXT (context);
cl_image_desc image_desc;
+ memset(&image_desc, 0, sizeof(image_desc));
image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
image_desc.image_width = image_width;
@@ -592,13 +653,13 @@ error:
}
cl_int
-clGetImageInfo(cl_mem image,
+clGetImageInfo(cl_mem mem,
cl_image_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret)
{
- return cl_get_image_info(image,
+ return cl_get_image_info(mem,
param_name,
param_value_size,
param_value,
@@ -777,7 +838,8 @@ clBuildProgram(cl_program program,
/* TODO support create program from binary */
assert(program->source_type == FROM_LLVM ||
- program->source_type == FROM_SOURCE);
+ program->source_type == FROM_SOURCE ||
+ program->source_type == FROM_BINARY);
if((err = cl_program_build(program, options)) != CL_SUCCESS) {
goto error;
}
@@ -1015,7 +1077,7 @@ clWaitForEvents(cl_uint num_events,
TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
- while(cl_event_wait_events(num_events, event_list) == CL_ENQUEUE_EXECUTE_DEFER) {
+ while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
usleep(8000); //sleep 8ms to wait other thread
}
@@ -1034,11 +1096,6 @@ clGetEventInfo(cl_event event,
CHECK_EVENT(event);
if (param_name == CL_EVENT_COMMAND_QUEUE) {
- if(event->queue == NULL) {
- param_value_size_ret = 0;
- param_value = NULL;
- return err;
- }
FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
} else if (param_name == CL_EVENT_CONTEXT) {
FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
@@ -1243,8 +1300,74 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (blocking_read != CL_TRUE)
+ NOT_IMPLEMENTED;
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if(buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if(host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if(host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_origin[2]+region[2])*buffer_slice_pitch + (buffer_origin[1]+region[1])*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueReadBufferRect;
+ data->mem_obj = buffer;
+ data->ptr = ptr;
+ data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+ error:
+ return err;
}
cl_int
@@ -1291,7 +1414,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
data->size = size;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1316,8 +1439,75 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(buffer);
+
+ if (command_queue->ctx != buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (blocking_write != CL_TRUE)
+ NOT_IMPLEMENTED;
+
+
+ if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(buffer_row_pitch == 0)
+ buffer_row_pitch = region[0];
+ if(buffer_slice_pitch == 0)
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+ if(host_row_pitch == 0)
+ host_row_pitch = region[0];
+ if(host_slice_pitch == 0)
+ host_slice_pitch = region[1] * host_row_pitch;
+
+ if (buffer_row_pitch < region[0] ||
+ host_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+ (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((buffer_origin[2]+region[2])*buffer_slice_pitch + (buffer_origin[1]+region[1])*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueWriteBufferRect;
+ data->mem_obj = buffer;
+ data->const_ptr = ptr;
+ data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+ data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+ data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
+ data->row_pitch = buffer_row_pitch;
+ data->slice_pitch = buffer_slice_pitch;
+ data->host_row_pitch = host_row_pitch;
+ data->host_slice_pitch = host_slice_pitch;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
}
cl_int
@@ -1350,13 +1540,84 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_MEM(dst_buffer);
+
+ if ((command_queue->ctx != src_buffer->ctx) ||
+ (command_queue->ctx != dst_buffer->ctx)) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(src_row_pitch == 0)
+ src_row_pitch = region[0];
+ if(src_slice_pitch == 0)
+ src_slice_pitch = region[1] * src_row_pitch;
+
+ if(dst_row_pitch == 0)
+ dst_row_pitch = region[0];
+ if(dst_slice_pitch == 0)
+ dst_slice_pitch = region[1] * dst_row_pitch;
+
+ if (src_row_pitch < region[0] ||
+ dst_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+ (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_origin[2]+region[2])*src_slice_pitch + (src_origin[1]+region[1])*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
+ (dst_origin[2]+region[2])*dst_slice_pitch + (dst_origin[1]+region[1])*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer &&
+ check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+
+ cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+ src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBufferRect;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_command_queue_flush(command_queue);
+ }
+
+error:
+ return err;
}
cl_int
clEnqueueReadImage(cl_command_queue command_queue,
- cl_mem image,
+ cl_mem mem,
cl_bool blocking_read,
const size_t * origin,
const size_t * region,
@@ -1371,8 +1632,8 @@ clEnqueueReadImage(cl_command_queue command_queue,
enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
- CHECK_IMAGE(image);
- if (command_queue->ctx != image->ctx) {
+ CHECK_IMAGE(mem, image);
+ if (command_queue->ctx != mem->ctx) {
err = CL_INVALID_CONTEXT;
goto error;
}
@@ -1410,16 +1671,16 @@ clEnqueueReadImage(cl_command_queue command_queue,
goto error;
}
- if (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
err = CL_INVALID_OPERATION;
goto error;
}
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
data = &no_wait_data;
data->type = EnqueueReadImage;
- data->mem_obj = image;
+ data->mem_obj = mem;
data->ptr = ptr;
data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
@@ -1427,7 +1688,7 @@ clEnqueueReadImage(cl_command_queue command_queue,
data->slice_pitch = slice_pitch;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1438,7 +1699,7 @@ error:
cl_int
clEnqueueWriteImage(cl_command_queue command_queue,
- cl_mem image,
+ cl_mem mem,
cl_bool blocking_write,
const size_t * origin,
const size_t * region,
@@ -1453,8 +1714,8 @@ clEnqueueWriteImage(cl_command_queue command_queue,
enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
- CHECK_IMAGE(image);
- if (command_queue->ctx != image->ctx) {
+ CHECK_IMAGE(mem, image);
+ if (command_queue->ctx != mem->ctx) {
err = CL_INVALID_CONTEXT;
goto error;
}
@@ -1492,16 +1753,16 @@ clEnqueueWriteImage(cl_command_queue command_queue,
goto error;
}
- if (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+ if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
err = CL_INVALID_OPERATION;
goto error;
}
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
data = &no_wait_data;
data->type = EnqueueWriteImage;
- data->mem_obj = image;
+ data->mem_obj = mem;
data->const_ptr = ptr;
data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
@@ -1509,7 +1770,7 @@ clEnqueueWriteImage(cl_command_queue command_queue,
data->slice_pitch = slice_pitch;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1520,8 +1781,8 @@ error:
cl_int
clEnqueueCopyImage(cl_command_queue command_queue,
- cl_mem src_image,
- cl_mem dst_image,
+ cl_mem src_mem,
+ cl_mem dst_mem,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
@@ -1529,13 +1790,74 @@ clEnqueueCopyImage(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS; /* returned through the error: label on every path */
+ enqueue_data *data, no_wait_data = { 0 };
+ cl_bool overlap = CL_TRUE; /* cleared by the per-axis test below when any axis is disjoint */
+ cl_int i = 0;
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(src_mem, src_image); /* NOTE(review): src_image/dst_image appear to be introduced by this macro, defined elsewhere - confirm */
+ CHECK_IMAGE(dst_mem, dst_image);
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+ src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) { /* both channel order and data type must match */
+ err = CL_IMAGE_FORMAT_MISMATCH;
+ goto error;
+ }
+
+ if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) { /* source region must lie inside the source image */
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) { /* destination region must lie inside the destination image */
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+ (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) { /* a 2D image requires z origin 0 and depth 1 */
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image == dst_image) { /* copying within one image: the two regions must not intersect */
+ for(i = 0; i < 3; i++)
+ overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
+ && (dst_origin[i] < src_origin[i] + region[i]);
+ if(overlap == CL_TRUE) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+ }
+
+ cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyImage;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { /* nothing to wait for: flush now */
+ err = cl_command_queue_flush(command_queue);
+ }
+
+error:
+ return err;
}
cl_int
clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
- cl_mem src_image,
+ cl_mem src_mem,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * region,
@@ -1544,14 +1866,55 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_IMAGE(src_mem, src_image);
+ CHECK_MEM(dst_buffer);
+ if (command_queue->ctx != src_mem->ctx ||
+ command_queue->ctx != dst_buffer->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+ src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyImageToBuffer;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_command_queue_flush(command_queue);
+ }
+
+error:
+ return err;
}
cl_int
clEnqueueCopyBufferToImage(cl_command_queue command_queue,
cl_mem src_buffer,
- cl_mem dst_image,
+ cl_mem dst_mem,
size_t src_offset,
const size_t * dst_origin,
const size_t * region,
@@ -1559,8 +1922,113 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_IMAGE(dst_mem, dst_image);
+ if (command_queue->ctx != src_buffer->ctx ||
+ command_queue->ctx != dst_mem->ctx) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+ dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBufferToImage;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_command_queue_flush(command_queue);
+ }
+
+error:
+ return err;
+}
+
+static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset, size_t size) /* maps mem, stores the mapped address (+offset) in *ptr and the address to return to the caller in *mem_ptr, and records the mapping in mem->mapped_ptr */
+{
+ cl_int slot = -1; /* index in mem->mapped_ptr that will hold this mapping */
+ int err = CL_SUCCESS;
+ if (!(*ptr = cl_mem_map_gtt_unsync(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+ *ptr = (char*)(*ptr) + offset;
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ //only calc ptr here, will do memcpy in enqueue
+ *mem_ptr = mem->host_ptr + offset;
+ } else {
+ *mem_ptr = *ptr;
+ }
+ /* Record the mapped address. */
+ if (!mem->mapped_ptr_sz) {
+ mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * 16); /* first mapping on this object: start with 16 slots */
+ if (!mem->mapped_ptr) {
+ cl_mem_unmap_gtt(mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ mem->mapped_ptr_sz = 16; /* set only after malloc succeeded, so a failed call never leaves a non-zero size with a NULL table for the next call to scan */
+ memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = 0;
+ } else {
+ int i = 0;
+ for (; i < mem->mapped_ptr_sz; i++) { /* reuse the first free slot */
+ if (mem->mapped_ptr[i].ptr == NULL) {
+ slot = i;
+ break;
+ }
+ }
+ if (i == mem->mapped_ptr_sz) { /* table full: double it and take the first new slot */
+ cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+ sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
+ if (!new_ptr) {
+ cl_mem_unmap_gtt (mem);
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ memcpy(new_ptr, mem->mapped_ptr,
+ mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+ slot = mem->mapped_ptr_sz;
+ mem->mapped_ptr_sz *= 2;
+ free(mem->mapped_ptr);
+ mem->mapped_ptr = new_ptr;
+ }
+ }
+ assert(slot != -1);
+ mem->mapped_ptr[slot].ptr = *mem_ptr;
+ mem->mapped_ptr[slot].v_ptr = *ptr;
+ mem->mapped_ptr[slot].size = size;
+ mem->map_ref++;
+error:
+ if (err != CL_SUCCESS) /* never hand a pointer back on failure */
+ *mem_ptr = NULL;
+ return err;
}
void *
@@ -1576,6 +2044,8 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
cl_int * errcode_ret)
{
cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
@@ -1602,6 +2072,10 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
goto error;
}
+ err = _cl_map_mem(buffer, &ptr, &mem_ptr, offset, size);
+ if (err != CL_SUCCESS)
+ goto error;
+
TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
data = &no_wait_data;
@@ -1609,10 +2083,10 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
data->mem_obj = buffer;
data->offset = offset;
data->size = size;
- data->map_flags = map_flags;
+ data->ptr = ptr;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1620,12 +2094,12 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
error:
if (errcode_ret)
*errcode_ret = err;
- return data->ptr;
+ return mem_ptr;
}
void *
clEnqueueMapImage(cl_command_queue command_queue,
- cl_mem image,
+ cl_mem mem,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t * origin,
@@ -1638,11 +2112,13 @@ clEnqueueMapImage(cl_command_queue command_queue,
cl_int * errcode_ret)
{
cl_int err = CL_SUCCESS;
+ void *ptr = NULL;
+ void *mem_ptr = NULL;
enqueue_data *data, no_wait_data = { 0 };
CHECK_QUEUE(command_queue);
- CHECK_IMAGE(image);
- if (command_queue->ctx != image->ctx) {
+ CHECK_IMAGE(mem, image);
+ if (command_queue->ctx != mem->ctx) {
err = CL_INVALID_CONTEXT;
goto error;
}
@@ -1665,27 +2141,51 @@ clEnqueueMapImage(cl_command_queue command_queue,
*image_slice_pitch = image->slice_pitch;
if ((map_flags & CL_MAP_READ &&
- image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+ mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
(map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
- image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+ mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
{
err = CL_INVALID_OPERATION;
goto error;
}
- TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+ if (!(ptr = cl_mem_map_gtt_unsync(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+ size_t size;
+ if(region[2] == 1) {
+ if(region[1] == 1)
+ size = image->bpp * region[0];
+ else
+ size = image->row_pitch * (region[1] - 1) + (image->bpp * (origin[0] + region[0]));
+ } else {
+ size = image->slice_pitch * (region[2] - 1);
+ size += image->row_pitch * (origin[1] + region[1]);
+ size += image->bpp * (origin[0] + region[0]);
+ }
+
+ err = _cl_map_mem(mem, &ptr, &mem_ptr, offset, size);
+ if (err != CL_SUCCESS)
+ goto error;
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
data = &no_wait_data;
data->type = EnqueueMapImage;
- data->mem_obj = image;
+ data->mem_obj = mem;
data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2];
data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2];
data->row_pitch = *image_row_pitch;
- data->slice_pitch = *image_slice_pitch;
- data->map_flags = map_flags;
+ if (image_slice_pitch)
+ data->slice_pitch = *image_slice_pitch;
+ data->ptr = ptr;
+ data->offset = offset;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1693,7 +2193,7 @@ clEnqueueMapImage(cl_command_queue command_queue,
error:
if (errcode_ret)
*errcode_ret = err;
- return data->ptr; //TODO: map and unmap first
+ return mem_ptr; //TODO: map and unmap first
}
cl_int
@@ -1722,7 +2222,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
data->ptr = mapped_ptr;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_enqueue_handle(data);
if(event) cl_event_set_status(*event, CL_COMPLETE);
}
@@ -1764,19 +2264,12 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
goto error;
}
- /* Check offset values. We add a non standard restriction. The offsets must
- * also be evenly divided by the local sizes
- */
if (global_work_offset != NULL)
for (i = 0; i < work_dim; ++i) {
if (UNLIKELY(~0LL - global_work_offset[i] > global_work_size[i])) {
err = CL_INVALID_GLOBAL_OFFSET;
goto error;
}
- if (UNLIKELY(local_work_size != NULL && global_work_offset[i] % local_work_size[i])) {
- err = CL_INVALID_GLOBAL_OFFSET;
- goto error;
- }
}
/* Local sizes must be non-null and divide global sizes */
@@ -1824,7 +2317,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
data->queue = command_queue;
if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
- event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+ event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
err = cl_command_queue_flush(command_queue);
}
@@ -1839,8 +2332,11 @@ clEnqueueTask(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ const size_t global_size[3] = {1, 0, 0}; /* a task is enqueued as an ND-range with a single work-item */
+ const size_t local_size[3] = {1, 0, 0};
+
+ return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size, /* work_dim = 1, no global offset */
+ num_events_in_wait_list, event_wait_list, event);
}
cl_int
@@ -1855,16 +2351,74 @@ clEnqueueNativeKernel(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ void *new_args = NULL;
+ enqueue_data *data, no_wait_data = { 0 };
+ cl_int i;
+
+ if(user_func == NULL ||
+ (args == NULL && cb_args > 0) ||
+ (args == NULL && num_mem_objects ==0) ||
+ (args != NULL && cb_args == 0) ||
+ (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+ (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ //Per spec, need copy args
+ if (cb_args)
+ {
+ new_args = malloc(cb_args);
+ if (!new_args)
+ {
+ err = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ memcpy(new_args, args, cb_args);
+
+ for (i=0; i<num_mem_objects; ++i)
+ {
+ CHECK_MEM(mem_list[i]);
+ args_mem_loc[i] = new_args + (args_mem_loc[i] - args); //change to new args
+ }
+ }
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueNativeKernel;
+ data->mem_list = mem_list;
+ data->ptr = new_args;
+ data->size = cb_args;
+ data->offset = (size_t)num_mem_objects;
+ data->const_ptr = args_mem_loc;
+ data->user_func = user_func;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_enqueue_handle(data);
+ if(event) cl_event_set_status(*event, CL_COMPLETE);
+ }
+
+error:
+ return err;
}
cl_int
clEnqueueMarker(cl_command_queue command_queue,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS; /* returned through the error: label on every path */
+ CHECK_QUEUE(command_queue);
+ if(event == NULL) { /* an output event is required: a NULL event is rejected */
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ cl_event_marker(command_queue, event);
+error:
+ return err;
}
cl_int
@@ -1883,9 +2437,12 @@ error:
cl_int
clEnqueueBarrier(cl_command_queue command_queue)
{
- NOT_IMPLEMENTED;
- return 0;
- //return clFinish(command_queue);
+ cl_int err = CL_SUCCESS; /* returned through the error: label on every path */
+ CHECK_QUEUE(command_queue);
+ cl_command_queue_set_barrier(command_queue); /* the barrier is recorded on the queue itself; no event is created in this function */
+
+error:
+ return err;
}
#define EXTFUNC(x) \
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index e82f75c..ff78770 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -87,6 +87,7 @@ cl_command_queue_delete(cl_command_queue queue)
cl_mem_delete(queue->perf);
cl_context_delete(queue->ctx);
cl_gpgpu_delete(queue->gpgpu);
+ cl_free(queue->wait_events);
queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(queue);
}
@@ -98,7 +99,9 @@ cl_command_queue_add_ref(cl_command_queue queue)
}
static void
-set_image_info(char *curbe, struct ImageInfo * image_info, cl_mem image)
+set_image_info(char *curbe,
+ struct ImageInfo * image_info,
+ struct _cl_mem_image *image)
{
if (image_info->wSlot >= 0)
*(uint32_t*)(curbe + image_info->wSlot) = image->w;
@@ -118,12 +121,14 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
uint32_t i;
for (i = 0; i < k->image_sz; i++) {
int id = k->images[i].arg_idx;
+ struct _cl_mem_image *image;
assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
- set_image_info(k->curbe, &k->images[i], k->args[id].mem);
- cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, k->args[id].mem->bo,
- k->args[id].mem->intel_fmt, k->args[id].mem->type,
- k->args[id].mem->w, k->args[id].mem->h,
- k->args[id].mem->row_pitch, k->args[id].mem->tiling);
+ image = cl_mem_image(k->args[id].mem);
+ set_image_info(k->curbe, &k->images[i], image);
+ cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo,
+ image->intel_fmt, image->image_type,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->tiling);
}
return CL_SUCCESS;
}
@@ -146,24 +151,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
return CL_SUCCESS;
}
-LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
- char * dst)
-{
- int i;
- for(i = 0; i < k->arg_n; i++) {
- enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-
- if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
- uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
- cl_mem mem = k->args[i].mem;
- cl_buffer_map(mem->bo, 1);
- void * addr = cl_buffer_get_virtual(mem->bo);
- memcpy(dst + offset, addr, mem->size);
- cl_buffer_unmap(mem->bo);
- }
- }
- return CL_SUCCESS;
-}
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
@@ -426,3 +413,76 @@ cl_command_queue_finish(cl_command_queue queue)
return CL_SUCCESS;
}
+#define DEFAULT_WAIT_EVENTS_SIZE 16
+/*
+ * Record `event` in the queue's wait_events array.
+ *
+ * The array is allocated on first use with DEFAULT_WAIT_EVENTS_SIZE slots and
+ * is doubled whenever it is full. An event that is already present is not
+ * inserted a second time.
+ *
+ * The function cannot report failure. If the first allocation fails the
+ * bookkeeping fields are reset and nothing is recorded. If growing the array
+ * fails, the events that are already registered are kept untouched and only
+ * the new event is left out.
+ */
+LOCAL void
+cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+  cl_int new_size;
+  cl_event *new_list;
+
+  assert(queue != NULL);
+  if(queue->wait_events == NULL) {
+    queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+    TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+  }
+
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      return;   //already in wait_events, no need to insert it again
+  }
+
+  if(queue->wait_events_num < queue->wait_events_size) {
+    queue->wait_events[queue->wait_events_num++] = event;
+    return;
+  }
+
+  //wait_events_num == wait_events_size, array is full: grow into a copy.
+  //The new size is only committed once the allocation has succeeded, and a
+  //failed allocation must not discard the events that are already tracked.
+  new_size = queue->wait_events_size * 2;
+  new_list = CALLOC_ARRAY(cl_event, new_size);
+  if(new_list == NULL)
+    return;
+  memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
+  cl_free(queue->wait_events);
+  queue->wait_events = new_list;
+  queue->wait_events_size = new_size;
+  queue->wait_events[queue->wait_events_num++] = event;
+  return;
+
+exit:
+  return;
+error:
+  //only reached when the initial allocation failed
+  if(queue->wait_events)
+    cl_free(queue->wait_events);
+  queue->wait_events = NULL;
+  queue->wait_events_size = 0;
+  queue->wait_events_num = 0;
+  goto exit;
+
+}
+
+/*
+ * Remove `event` from the queue's wait_events array, if it is there.
+ *
+ * The entries behind the removed slot are moved down by one so the array
+ * stays dense, and wait_events_num is decremented. barrier_index is the
+ * number of leading entries of wait_events that act as barrier events, so it
+ * is decremented only when the removed entry is one of those leading entries.
+ * Nothing happens when the event is not in the array.
+ */
+LOCAL void
+cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+
+  assert(queue->wait_events);
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      break;
+  }
+
+  if(i == queue->wait_events_num)
+    return;
+
+  /* Slots [0, barrier_index) are the barrier events. Removing slot i shrinks
+   * that range only when i lies inside it; using ">=" here would also shrink
+   * it for the first non-barrier slot and could push the count below zero. */
+  if(queue->barrier_index > i)
+    queue->barrier_index -= 1;
+
+  for(; i<queue->wait_events_num-1; i++) {
+    queue->wait_events[i] = queue->wait_events[i+1];
+  }
+  queue->wait_events_num -= 1;
+}
+
+LOCAL void
+cl_command_queue_set_barrier(cl_command_queue queue)
+{ /* Turn the events currently recorded in queue->wait_events into barrier events. */
+ queue->barrier_index = queue->wait_events_num; /* barrier_index is a count: it covers slots [0, wait_events_num) as of this call */
+}
+
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 135d659..9396fd7 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -33,6 +33,11 @@ struct _cl_command_queue {
uint64_t magic; /* To identify it as a command queue */
volatile int ref_n; /* We reference count this object */
cl_context ctx; /* Its parent context */
+ cl_event* wait_events; /* Point to array of non-complete user events that block this command queue */
+ cl_int wait_events_num; /* Number of Non-complete user events */
+ cl_int wait_events_size; /* The size of array that wait_events point to */
+ cl_int barrier_index; /* Indicate event count in wait_events as barrier events */
+ cl_event last_event; /* The last event in the queue, for enqueue mark used */
cl_command_queue_properties props; /* Queue properties */
cl_command_queue prev, next; /* We chain the command queues together */
cl_gpgpu gpgpu; /* Setup all GEN commands */
@@ -77,7 +82,14 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
/* Bind all the image surfaces in the GPGPU state */
extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
-/*update constant buffer to final curbe */
-extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
+/* Insert a user event to command's wait_events */
+extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
+
+/* Remove a user event from command's wait_events */
+extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
+
+/* Set the barrier index */
+extern void cl_command_queue_set_barrier(cl_command_queue);
+
#endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 1d415d4..b85c0cd 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -76,7 +76,7 @@ cl_set_varying_payload(const cl_kernel ker,
block_ips[curr] = 0;
}
- /* Copy them to the constant buffer */
+ /* Copy them to the curbe buffer */
curr = 0;
for (i = 0; i < thread_n; ++i, data += cst_sz) {
uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
@@ -95,6 +95,62 @@ error:
return err;
}
+/*
+ * Build the constant buffer for one kernel launch.
+ *
+ * The buffer obtained from cl_gpgpu_alloc_constant_buffer() is filled with the
+ * program's global constant data first, followed by the contents of every
+ * GBE_ARG_CONSTANT_PTR argument that has a memory object bound. Each chunk
+ * starts on a 4-byte boundary, and the offset of every argument's chunk is
+ * written into the kernel's curbe at that argument's curbe offset.
+ * Returns without allocating anything when there is no constant data at all.
+ */
+static void
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+  gbe_program program = ker->program->opaque;
+  const int32_t n_args = gbe_kernel_get_arg_num(ker->opaque);
+  const size_t global_sz = gbe_program_get_global_constant_size(program);
+  uint32_t args_sz = 0;
+  size_t cursor = 0;
+  int32_t idx;
+
+  /* First pass: add up the (4-byte aligned) sizes of the constant arguments */
+  for (idx = 0; idx < n_args; idx++) {
+    if (gbe_kernel_get_arg_type(ker->opaque, idx) != GBE_ARG_CONSTANT_PTR)
+      continue;
+    if (!ker->args[idx].mem)
+      continue;
+    args_sz += ALIGN(ker->args[idx].mem->size, 4);
+  }
+  if (global_sz == 0 && args_sz == 0)
+    return;
+
+  /* The extra 4 bytes cover the padding slot reserved below */
+  cl_buffer cst_bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, args_sz + global_sz + 4);
+  cl_buffer_map(cst_bo, 1);
+  char *base = cl_buffer_get_virtual(cst_bo);
+
+  /* The global constant arrays sit at the very start of the buffer */
+  if (global_sz > 0)
+    gbe_program_get_global_constant_data(program, (char*)base);
+  cursor += ALIGN(global_sz, 4);
+
+  /* Without global data, skip 4 bytes so no argument gets offset 0 */
+  if (global_sz == 0)
+    cursor += 4;
+
+  /* Second pass: copy each constant argument and publish its offset */
+  for (idx = 0; idx < n_args; idx++) {
+    if (gbe_kernel_get_arg_type(ker->opaque, idx) != GBE_ARG_CONSTANT_PTR)
+      continue;
+    if (!ker->args[idx].mem)
+      continue;
+
+    cl_mem src = ker->args[idx].mem;
+    int32_t slot = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, idx);
+    assert(slot >= 0);
+    *(uint32_t *) (ker->curbe + slot) = cursor;
+
+    cl_buffer_map(src->bo, 1);
+    void *src_addr = cl_buffer_get_virtual(src->bo);
+    memcpy(base + cursor, src_addr, src->size);
+    cl_buffer_unmap(src->bo);
+    cursor += ALIGN(src->size, 4);
+  }
+  cl_buffer_unmap(cst_bo);
+}
+
/* Will return the total amount of slm used */
static int32_t
cl_curbe_fill(cl_kernel ker,
@@ -122,9 +178,17 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
- UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
#undef UPLOAD
+ /* Upload sampler information. */
+ offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
+ if (offset >= 0) {
+ uint32_t i;
+ for(i = 0; i < ker->sampler_sz; i++, offset += 2) {
+ *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
+ }
+ }
+
/* Write identity for the stack pointer. This is required by the stack pointer
* computation in the kernel
*/
@@ -134,14 +198,6 @@ cl_curbe_fill(cl_kernel ker,
int32_t i;
for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
}
-
- /* Write global constant arrays */
- if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
- /* Write the global constant arrays */
- gbe_program prog = ker->program->opaque;
- gbe_program_get_global_constant_data(prog, ker->curbe + offset);
- }
-
/* Handle the various offsets to SLM */
const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
int32_t arg, slm_offset = 0;
@@ -220,9 +276,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Compute the number of HW threads we need */
TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
- kernel.cst_sz = cst_sz;
+ kernel.curbe_sz = cst_sz;
- /* Curbe step 1: fill the constant buffer data shared by all threads */
+ /* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
@@ -242,6 +298,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_setup_scratch(gpgpu, ker);
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
+
+ cl_upload_constant_buffer(queue, ker);
+
cl_gpgpu_states_setup(gpgpu, &kernel);
/* Curbe step 2. Give the localID and upload it to video memory */
@@ -250,10 +309,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
- cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
}
TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
- cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+ cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
}
/* Start a new batch buffer */
diff --git a/src/cl_context.c b/src/cl_context.c
index a48436c..4f1c611 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -26,6 +26,8 @@
#include "cl_utils.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_program.h"
#include "CL/cl.h"
#include "CL/cl_gl.h"
@@ -123,7 +125,6 @@ cl_create_context(const cl_context_properties * properties,
cl_int err = CL_SUCCESS;
cl_uint prop_len = 0;
/* XXX */
- FATAL_IF (pfn_notify != NULL || user_data != NULL, "Unsupported call back");
FATAL_IF (num_devices != 1, "Only one device is supported");
/* Check that we are getting the right platform */
@@ -144,6 +145,10 @@ cl_create_context(const cl_context_properties * properties,
/* Attach the device to the context */
ctx->device = *devices;
+ /* Save the user callback and user data*/
+ ctx->pfn_notify = pfn_notify;
+ ctx->user_data = user_data;
+
exit:
if (errcode_ret != NULL)
*errcode_ret = err;
@@ -240,3 +245,26 @@ cl_context_get_bufmgr(cl_context ctx)
return cl_driver_get_bufmgr(ctx->drv);
}
+/*
+ * Return the internal kernel stored in slot `index` of the context, creating
+ * it on first use.
+ *
+ * On the first call for a slot the program is created from `str_kernel`,
+ * built with `str_option`, and its first kernel is duplicated into
+ * internel_kernels[index]. Returns NULL when program creation or the build
+ * fails. Note that after a failed build the program stays stored in
+ * internal_prgs[index], so later calls take the cached path and return
+ * whatever internel_kernels[index] holds.
+ */
+cl_kernel
+cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
+{
+  /* Cached path: a program already occupies this slot */
+  if (ctx->internal_prgs[index])
+    return ctx->internel_kernels[index];
+
+  /* The length handed to the program includes the terminating NUL */
+  size_t src_len = strlen(str_kernel) + 1;
+  ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &src_len, NULL);
+  if (ctx->internal_prgs[index] == NULL)
+    return NULL;
+
+  cl_int build_status = cl_program_build(ctx->internal_prgs[index], str_option);
+  if (build_status != CL_SUCCESS)
+    return NULL;
+
+  ctx->internal_prgs[index]->is_built = 1;
+  ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+
+  return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h
index 718d589..7016733 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -39,18 +39,35 @@ enum _cl_gl_context_type {
CL_GL_CGL_SHAREGROUP
};
+enum _cl_internal_ker_type { /* slot numbers used to index the context's internal_prgs / internel_kernels arrays */
+ CL_ENQUEUE_COPY_BUFFER = 0,
+ CL_ENQUEUE_COPY_BUFFER_RECT = 1,
+ CL_ENQUEUE_COPY_IMAGE_0 = 2, /* copy 2D image to 2D image */
+ CL_ENQUEUE_COPY_IMAGE_1 = 3, /* copy 3D image to 2D image */
+ CL_ENQUEUE_COPY_IMAGE_2 = 4, /* copy 2D image to 3D image */
+ CL_ENQUEUE_COPY_IMAGE_3 = 5, /* copy 3D image to 3D image */
+ CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0 = 6, /* copy 2D image to buffer */
+ CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1 = 7, /* copy 3D image to buffer */
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0 = 8, /* copy buffer to 2D image */
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1 = 9, /* copy buffer to 3D image */
+ CL_INTERNAL_KERNEL_MAX = 10 /* number of slots; must stay one past the last kernel id above */
+};
+
struct _cl_context_prop {
cl_context_properties platform_id;
enum _cl_gl_context_type gl_type;
cl_context_properties gl_context;
union {
- cl_context_properties egl_display;
- cl_context_properties glx_display;
+ cl_context_properties egl_display;
+ cl_context_properties glx_display;
cl_context_properties wgl_hdc;
cl_context_properties cgl_sharegroup;
};
};
+#define IS_EGL_CONTEXT(ctx) (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
+#define EGL_DISP(ctx) (EGLDisplay)(ctx->props.egl_display)
+#define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context)
/* Encapsulate the whole device */
struct _cl_context {
DEFINE_ICD(dispatch)
@@ -68,10 +85,18 @@ struct _cl_context {
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
pthread_mutex_t event_lock; /* To allocate and deallocate events */
+ cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
+ /* All programs internal used, for example clEnqueuexxx api use */
+ cl_kernel internel_kernels[CL_INTERNAL_KERNEL_MAX];
+ /* All kernels for clenqueuexxx api, for example clEnqueuexxx api use */
uint32_t ver; /* Gen version */
struct _cl_context_prop props;
cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
cl_uint prop_len; /* count of the properties */
+ void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
+ /* User's callback when error occur in context */
+ void *user_data; /* A pointer to user supplied data */
+
};
/* Implement OpenCL function */
@@ -109,5 +134,8 @@ extern cl_int cl_context_ND_kernel(cl_context,
/* Used for allocation */
extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
+/* Get the internal used kernel */
+extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
+
#endif /* __CL_CONTEXT_H__ */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index a2c3ed2..16b343d 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -39,7 +39,7 @@ static struct _cl_device_id intel_ivb_gt2_device = {
.max_compute_unit = 128,
.max_thread_per_unit = 8,
.max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
+ .max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 1024,
.compile_wg_sz = {0},
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1a0ec38..100b38d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -22,7 +22,7 @@
#include <stdint.h>
#include <stdlib.h>
-
+#include "cl_driver_type.h"
/* Various limitations we should remove actually */
#define GEN_MAX_SURFACES 128
#define GEN_MAX_SAMPLERS 16
@@ -33,28 +33,6 @@
* will allow us to make the use of a software performance simulator easier and
* to minimize the code specific for the HW and for the simulator
**************************************************************************/
-
-/* Encapsulates command buffer / data buffer / kernels */
-typedef struct _cl_buffer *cl_buffer;
-
-/* Encapsulates buffer manager */
-typedef struct _cl_buffer_mgr *cl_buffer_mgr;
-
-/* Encapsulates the driver backend functionalities */
-typedef struct _cl_driver *cl_driver;
-
-/* Encapsulates the gpgpu stream of commands */
-typedef struct _cl_gpgpu *cl_gpgpu;
-
-/* Encapsulates the event of a command stream */
-typedef struct _cl_gpgpu_event *cl_gpgpu_event;
-
-typedef struct _cl_context_prop *cl_context_prop;
-typedef struct _cl_sampler *cl_sampler;
-
-/**************************************************************************
- * Driver
- **************************************************************************/
/* Create a new driver */
typedef cl_driver (cl_driver_new_cb)(cl_context_prop);
extern cl_driver_new_cb *cl_driver_new;
@@ -100,7 +78,7 @@ typedef enum gpu_command_status {
typedef struct cl_gpgpu_kernel {
const char *name; /* kernel name and bo name */
uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) */
- uint32_t cst_sz; /* total size of all constants */
+ uint32_t curbe_sz; /* total size of all curbes */
cl_buffer bo; /* kernel code in the proper addr space */
int32_t barrierID; /* barrierID for _this_ kernel */
uint32_t use_slm:1; /* For gen7 (automatic barrier management) */
@@ -136,6 +114,7 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
uint32_t type,
int32_t w,
int32_t h,
+ int32_t depth,
int pitch,
cl_gpgpu_tiling tiling);
@@ -157,9 +136,12 @@ extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
-/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
-extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+/* Fills current curbe buffer with data */
+typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
/* Setup all indirect states */
typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
@@ -231,14 +213,18 @@ typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride
extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
#include "cl_context.h"
+#include "cl_mem.h"
typedef struct _cl_context *cl_context;
-typedef cl_buffer (cl_buffer_alloc_from_eglimage_cb)(cl_context, void*, unsigned int *,
- int *, int *, int *, int *);
-extern cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage;
+typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, int, unsigned int,
+ struct _cl_mem_image *gl_image);
+extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
+
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
/* Unref a buffer and destroy it if no more ref */
-typedef void (cl_buffer_unreference_cb)(cl_buffer);
+typedef int (cl_buffer_unreference_cb)(cl_buffer);
extern cl_buffer_unreference_cb *cl_buffer_unreference;
/* Add one more ref on a buffer */
@@ -257,6 +243,10 @@ extern cl_buffer_unmap_cb *cl_buffer_unmap;
typedef int (cl_buffer_map_gtt_cb)(cl_buffer);
extern cl_buffer_map_gtt_cb *cl_buffer_map_gtt;
+/* Map a buffer in the GTT domain, non waiting the GPU read or write*/
+typedef int (cl_buffer_map_gtt_unsync_cb)(cl_buffer);
+extern cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync;
+
/* Unmap a buffer in the GTT domain */
typedef int (cl_buffer_unmap_gtt_cb)(cl_buffer);
extern cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt;
@@ -289,5 +279,35 @@ extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
typedef int (cl_driver_get_device_id_cb)(void);
extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
+/**************************************************************************
+ * cl_khr_gl_sharing.
+ **************************************************************************/
+typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
+ int level, int texture, void*user_data);
+extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
+
+typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
+ int level, int texture);
+extern cl_gl_release_texture_cb *cl_gl_release_texture;
+
+typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
+ int bufobj, void* user_data);
+extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
+
+typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
+extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
+
+typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
+ int rb, void* user_data);
+extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
+
+typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
+extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
+
+#ifndef DEFAULT_DRIVER_DIR
+/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
+#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
+#endif
+
#endif /* __CL_DRIVER_H__ */
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index e7412de..ac4ff7a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -30,12 +30,14 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
/* Buffer */
LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
-LOCAL cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage = NULL;
+LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
+LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
LOCAL cl_buffer_map_cb *cl_buffer_map = NULL;
LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
LOCAL cl_buffer_map_gtt_cb *cl_buffer_map_gtt = NULL;
+LOCAL cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync = NULL;
LOCAL cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt = NULL;
LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL;
LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
@@ -44,6 +46,13 @@ LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
+/* cl_khr_gl_sharing */
+LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
+LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
+LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
+LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
+LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
+LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
/* GPGPU */
LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
@@ -53,8 +62,9 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
-LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
diff --git a/src/cl_driver_type.h b/src/cl_driver_type.h
new file mode 100644
index 0000000..891a33c
--- /dev/null
+++ b/src/cl_driver_type.h
@@ -0,0 +1,24 @@
+/**************************************************************************
+ * cl_driver:
+ * Hides buffer allocation / deallocation (and related operations) behind
+ * callbacks. This makes it easier to use a software performance simulator
+ * and minimizes the code that is specific to the HW or to the simulator.
+ **************************************************************************/
+
+/* Encapsulates command buffer / data buffer / kernels */
+typedef struct _cl_buffer *cl_buffer;
+
+/* Encapsulates buffer manager */
+typedef struct _cl_buffer_mgr *cl_buffer_mgr;
+
+/* Encapsulates the driver backend functionalities */
+typedef struct _cl_driver *cl_driver;
+
+/* Encapsulates the gpgpu stream of commands */
+typedef struct _cl_gpgpu *cl_gpgpu;
+
+/* Encapsulates the event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
+typedef struct _cl_context_prop *cl_context_prop; /* pointer to the context property record (struct _cl_context_prop) */
+typedef struct _cl_sampler *cl_sampler; /* pointer to a sampler object; struct _cl_sampler is defined elsewhere */
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index a112cc4..0330691 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -45,6 +45,53 @@ error:
return err;
}
+/*
+ * Copy a rectangular region out of a mapped buffer object into host memory.
+ *
+ * data->origin / row_pitch / slice_pitch describe the region inside the
+ * buffer, data->host_origin / host_row_pitch / host_slice_pitch describe it
+ * inside data->ptr, and data->region gives its extent (region[0] is a byte
+ * count per row). When both layouts are tightly packed and identical the
+ * region is moved with one memcpy, otherwise it is copied row by row.
+ *
+ * Returns CL_MAP_FAILURE when the buffer cannot be mapped, otherwise the
+ * result of unmapping it.
+ */
+cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+{
+  const size_t* buf_origin = data->origin;
+  const size_t* host_origin = data->host_origin;
+  const size_t* region = data->region;
+
+  void* mapped = cl_mem_map_auto(data->mem_obj);
+  if (!mapped)
+    return CL_MAP_FAILURE;
+
+  /* Start of the region on each side */
+  const size_t buf_off = buf_origin[0] + data->row_pitch*buf_origin[1] + data->slice_pitch*buf_origin[2];
+  const size_t host_off = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+  const char* src_slice = (const char*)mapped + buf_off;
+  char* dst_slice = (char *)data->ptr + host_off;
+
+  /* Rows (and, for 3D regions, slices) have no padding and match on both sides */
+  const int rows_packed = data->row_pitch == region[0] &&
+                          data->row_pitch == data->host_row_pitch;
+  const int slices_packed = region[2] == 1 ||
+                            (data->slice_pitch == region[0]*region[1] &&
+                             data->slice_pitch == data->host_slice_pitch);
+
+  if (rows_packed && slices_packed) {
+    memcpy(dst_slice, src_slice,
+           region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  } else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++,
+         src_slice += data->slice_pitch, dst_slice += data->host_slice_pitch) {
+      const char* src_row = src_slice;
+      char* dst_row = dst_slice;
+      for (y = 0; y < region[1]; y++,
+           src_row += data->row_pitch, dst_row += data->host_row_pitch) {
+        memcpy(dst_row, src_row, region[0]);
+      }
+    }
+  }
+
+  return cl_mem_unmap_auto(data->mem_obj);
+}
+
cl_int cl_enqueue_write_buffer(enqueue_data *data)
{
cl_int err = CL_SUCCESS;
@@ -63,186 +110,166 @@ error:
return err;
}
-cl_int cl_enqueue_read_image(enqueue_data *data)
+cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
{
cl_int err = CL_SUCCESS;
void* src_ptr;
+ void* dst_ptr;
- cl_mem image = data->mem_obj;
const size_t* origin = data->origin;
+ const size_t* host_origin = data->host_origin;
const size_t* region = data->region;
- if (!(src_ptr = cl_mem_map_auto(image))) {
+ if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
err = CL_MAP_FAILURE;
goto error;
}
- size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- src_ptr = (char*)src_ptr + offset;
+ size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+ dst_ptr = (char *)dst_ptr + offset;
- if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
- (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+ offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+ src_ptr = (char*)data->const_ptr + offset;
+
+ if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+ (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
{
- memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
}
else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
const char* src = src_ptr;
- char* dst = data->ptr;
+ char* dst = dst_ptr;
for (y = 0; y < region[1]; y++) {
- memcpy(dst, src, image->bpp*region[0]);
- src += image->row_pitch;
+ memcpy(dst, src, region[0]);
+ src += data->host_row_pitch;
dst += data->row_pitch;
}
- src_ptr = (char*)src_ptr + image->slice_pitch;
- data->ptr = (char*)data->ptr + data->slice_pitch;
+ src_ptr = (char*)src_ptr + data->host_slice_pitch;
+ dst_ptr = (char*)dst_ptr + data->slice_pitch;
}
}
- err = cl_mem_unmap_auto(image);
+ err = cl_mem_unmap_auto(data->mem_obj);
error:
return err;
-
}
-cl_int cl_enqueue_write_image(enqueue_data *data)
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
{
cl_int err = CL_SUCCESS;
- void* dst_ptr;
+ void* src_ptr;
- cl_mem image = data->mem_obj;
- const size_t *origin = data->origin;
- const size_t *region = data->region;
+ cl_mem mem = data->mem_obj;
+ CHECK_IMAGE(mem, image);
+ const size_t* origin = data->origin;
+ const size_t* region = data->region;
- if (!(dst_ptr = cl_mem_map_auto(image))) {
+ if (!(src_ptr = cl_mem_map_auto(mem))) {
err = CL_MAP_FAILURE;
goto error;
}
size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- dst_ptr = (char*)dst_ptr + offset;
+ src_ptr = (char*)src_ptr + offset;
if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
(region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
{
- memcpy(dst_ptr, data->ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+ memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
}
else {
cl_uint y, z;
for (z = 0; z < region[2]; z++) {
- const char* src = data->const_ptr;
- char* dst = dst_ptr;
+ const char* src = src_ptr;
+ char* dst = data->ptr;
for (y = 0; y < region[1]; y++) {
memcpy(dst, src, image->bpp*region[0]);
- src += data->row_pitch;
- dst += image->row_pitch;
+ src += image->row_pitch;
+ dst += data->row_pitch;
}
+ src_ptr = (char*)src_ptr + image->slice_pitch;
data->ptr = (char*)data->ptr + data->slice_pitch;
- dst_ptr = (char*)dst_ptr + image->slice_pitch;
}
}
- err = cl_mem_unmap_auto(image);
+ err = cl_mem_unmap_auto(mem);
error:
return err;
}
-cl_int cl_enqueue_map_buffer(enqueue_data *data)
+cl_int cl_enqueue_write_image(enqueue_data *data)
{
+ cl_int err = CL_SUCCESS;
+ void* dst_ptr;
+
+ cl_mem mem = data->mem_obj;
+ CHECK_IMAGE(mem, image);
+
+ if (!(dst_ptr = cl_mem_map_auto(mem))) {
+ err = CL_MAP_FAILURE;
+ goto error;
+ }
+
+ cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
+ image->row_pitch, image->slice_pitch,
+ data->const_ptr, data->row_pitch,
+ data->slice_pitch, image);
+ err = cl_mem_unmap_auto(mem);
+
+error:
+ return err;
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
void *ptr = NULL;
cl_int err = CL_SUCCESS;
- void *mem_ptr = NULL;
- cl_int slot = -1;
cl_mem buffer = data->mem_obj;
-
- if (!(ptr = cl_mem_map_auto(buffer))) {
+ //clEnqueueMapBuffer uses an unsynchronized map, so force the use of map_gtt here
+ if (!(ptr = cl_mem_map_gtt(buffer))) {
err = CL_MAP_FAILURE;
+ goto error;
}
ptr = (char*)ptr + data->offset;
+ assert(data->ptr == ptr);
if(buffer->flags & CL_MEM_USE_HOST_PTR) {
assert(buffer->host_ptr);
memcpy(buffer->host_ptr + data->offset, ptr, data->size);
- mem_ptr = buffer->host_ptr + data->offset;
- } else {
- mem_ptr = ptr;
- }
-
- /* Record the mapped address. */
- if (!buffer->mapped_ptr_sz) {
- buffer->mapped_ptr_sz = 16;
- buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
- if (!buffer->mapped_ptr) {
- cl_mem_unmap_auto (buffer);
- err = CL_OUT_OF_HOST_MEMORY;
- ptr = NULL;
- goto error;
- }
-
- memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = 0;
- } else {
- int i = 0;
- for (; i < buffer->mapped_ptr_sz; i++) {
- if (buffer->mapped_ptr[i].ptr == NULL) {
- slot = i;
- break;
- }
- }
-
- if (i == buffer->mapped_ptr_sz) {
- cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
- sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
- if (!new_ptr) {
- cl_mem_unmap_auto (buffer);
- err = CL_OUT_OF_HOST_MEMORY;
- ptr = NULL;
- goto error;
- }
- memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- memcpy(new_ptr, buffer->mapped_ptr,
- buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
- slot = buffer->mapped_ptr_sz;
- buffer->mapped_ptr_sz *= 2;
- free(buffer->mapped_ptr);
- buffer->mapped_ptr = new_ptr;
- }
}
- assert(slot != -1);
- buffer->mapped_ptr[slot].ptr = mem_ptr;
- buffer->mapped_ptr[slot].v_ptr = ptr;
- buffer->mapped_ptr[slot].size = data->size;
- buffer->map_ref++;
-
- data->ptr = mem_ptr;
-
error:
return err;
}
cl_int cl_enqueue_map_image(enqueue_data *data)
{
- void *ptr = NULL;
cl_int err = CL_SUCCESS;
+ cl_mem mem = data->mem_obj;
+ void *ptr = NULL;
+ CHECK_IMAGE(mem, image);
- cl_mem image = data->mem_obj;
- const size_t *origin = data->origin;
-
- if (!(ptr = cl_mem_map_auto(image))) {
+ if (!(ptr = cl_mem_map_gtt(mem))) {
err = CL_MAP_FAILURE;
goto error;
}
- size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
- data->ptr = (char*)ptr + offset;
+ assert(data->ptr == (char*)ptr + data->offset);
+
+ if(mem->flags & CL_MEM_USE_HOST_PTR) {
+ assert(mem->host_ptr);
+ cl_mem_copy_image_region(data->origin, data->region,
+ mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+ data->ptr, data->row_pitch, data->slice_pitch, image);
+ }
error:
return err;
@@ -282,7 +309,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
assert(v_ptr == mapped_ptr);
}
- cl_mem_unmap_auto(memobj);
+ cl_mem_unmap_gtt(memobj);
/* shrink the mapped slot. */
if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
@@ -311,13 +338,43 @@ error:
return err;
}
+/* Run a host-side native kernel.
+ *
+ * data->offset carries the number of mem objects, data->mem_list the objects,
+ * data->const_ptr the locations that must receive each object's mapped
+ * address, data->ptr the argument block and data->user_func the callback.
+ *
+ * The argument block is released on every path, and every buffer that was
+ * mapped is unmapped again even when validation or mapping fails part-way.
+ * Returns CL_SUCCESS, the error set by CHECK_MEM, or CL_MAP_FAILURE.
+ */
+cl_int cl_enqueue_native_kernel(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_uint num_mem_objects = (cl_uint)data->offset;
+  const cl_mem *mem_list = data->mem_list;
+  const void **args_mem_loc = (const void **)data->const_ptr;
+  cl_uint mapped = 0;  /* leading entries of mem_list that are currently mapped */
+  cl_uint i;
+
+  for (i=0; i<num_mem_objects; ++i)
+  {
+    const cl_mem buffer = mem_list[i];
+    void *addr;
+    /* NOTE(review): CHECK_MEM is expected to set err and goto error. */
+    CHECK_MEM(buffer);
+
+    /* Never hand a NULL mapping to the user function. */
+    if (!(addr = cl_mem_map_auto(buffer))) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
+    *((void **)args_mem_loc[i]) = addr;
+    mapped++;
+  }
+  data->user_func(data->ptr);
+
+error:
+  /* Shared exit path: undo the mappings done so far and free the arguments. */
+  for (i=0; i<mapped; ++i)
+    cl_mem_unmap_auto(mem_list[i]);
+
+  free(data->ptr);
+  return err;
+}
cl_int cl_enqueue_handle(enqueue_data* data)
{
switch(data->type) {
case EnqueueReadBuffer:
return cl_enqueue_read_buffer(data);
+ case EnqueueReadBufferRect:
+ return cl_enqueue_read_buffer_rect(data);
case EnqueueWriteBuffer:
return cl_enqueue_write_buffer(data);
+ case EnqueueWriteBufferRect:
+ return cl_enqueue_write_buffer_rect(data);
case EnqueueReadImage:
return cl_enqueue_read_image(data);
case EnqueueWriteImage:
@@ -328,8 +385,15 @@ cl_int cl_enqueue_handle(enqueue_data* data)
return cl_enqueue_map_image(data);
case EnqueueUnmapMemObject:
return cl_enqueue_unmap_mem_object(data);
+ case EnqueueCopyBufferRect:
+ case EnqueueCopyImage:
+ case EnqueueCopyBufferToImage:
+ case EnqueueCopyImageToBuffer:
case EnqueueNDRangeKernel:
- cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); //goto default
+ cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
+ return CL_SUCCESS;
+ case EnqueueNativeKernel:
+ return cl_enqueue_native_kernel(data);
default:
return CL_SUCCESS;
}
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 7dc8ceb..b412d58 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -19,9 +19,8 @@
#ifndef __CL_ENQUEUE_H__
#define __CL_ENQUEUE_H__
-#include "cl_mem.h"
-#include "cl_command_queue.h"
#include "cl_internals.h"
+#include "cl_driver.h"
#include "CL/cl.h"
typedef enum {
@@ -40,22 +39,28 @@ typedef enum {
EnqueueMapImage,
EnqueueUnmapMemObject,
EnqueueNDRangeKernel,
+ EnqueueNativeKernel,
+ EnqueueMarker,
EnqueueInvalid
} enqueue_type;
typedef struct _enqueue_data {
- enqueue_type type; /* Command type */
- cl_mem mem_obj; /* Enqueue's cl_mem */
- cl_command_queue queue; /* Command queue */
- size_t offset; /* Mem object's offset */
- size_t size; /* Size */
- size_t origin[3]; /* Origin */
- size_t region[3]; /* Region */
- size_t row_pitch; /* Row pitch */
- size_t slice_pitch; /* Slice pitch */
- cl_map_flags map_flags; /* Map flags */
- const void * const_ptr; /* Const ptr for memory read */
- void * ptr; /* ptr for write and return value */
+ enqueue_type type; /* Command type */
+ cl_mem mem_obj; /* Enqueue's cl_mem */
+ cl_command_queue queue; /* Command queue */
+ size_t offset; /* Mem object's offset */
+ size_t size; /* Size */
+ size_t origin[3]; /* Origin */
+ size_t host_origin[3]; /* Origin */
+ size_t region[3]; /* Region */
+ size_t row_pitch; /* Row pitch */
+ size_t slice_pitch; /* Slice pitch */
+ size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */
+ size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
+ const void * const_ptr; /* Const ptr for memory read */
+ void * ptr; /* Ptr for write and return value */
+ const cl_mem* mem_list; /* mem_list of clEnqueueNativeKernel */
+ void (*user_func)(void *); /* pointer to a host-callable user function */
} enqueue_data;
/* Do real enqueue commands */
diff --git a/src/cl_event.c b/src/cl_event.c
index e882c7c..918e245 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -23,10 +23,28 @@
#include "cl_alloc.h"
#include "cl_khr_icd.h"
#include "cl_kernel.h"
+#include "cl_command_queue.h"
#include <assert.h>
#include <stdio.h>
+/* Tell whether a command type is one of the copy / task / NDRange kernel
+ * commands listed below. Returns CL_TRUE for those, CL_FALSE for every
+ * other command type. */
+inline cl_bool
+cl_event_is_gpu_command_type(cl_command_type type)
+{
+ switch(type) {
+ case CL_COMMAND_COPY_BUFFER:
+ case CL_COMMAND_COPY_IMAGE:
+ case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+ case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+ case CL_COMMAND_COPY_BUFFER_RECT:
+ case CL_COMMAND_TASK:
+ case CL_COMMAND_NDRANGE_KERNEL:
+ return CL_TRUE;
+ default:
+ return CL_FALSE;
+ }
+}
+
cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
{
cl_event event = NULL;
@@ -56,13 +74,16 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
}
else {
event->status = CL_QUEUED;
- event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+ if(cl_event_is_gpu_command_type(event->type))
+ event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
}
cl_event_add_ref(event); //dec when complete
event->user_cb = NULL;
event->enqueue_cb = NULL;
event->waits_head = NULL;
event->emplict = emplict;
+ if(queue && event->gpgpu_event)
+ queue->last_event = event;
exit:
return event;
@@ -77,9 +98,14 @@ void cl_event_delete(cl_event event)
if (UNLIKELY(event == NULL))
return;
+ cl_event_update_status(event);
+
if (atomic_dec(&event->ref_n) > 1)
return;
+ if(event->queue && event->queue->last_event == event)
+ event->queue->last_event = NULL;
+
/* Call all user's callback if haven't execute */
user_callback *cb = event->user_cb;
while(event->user_cb) {
@@ -153,7 +179,7 @@ cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
/* check the event_wait_list and num_events_in_wait_list */
if((event_wait_list == NULL) &&
(num_events_in_wait_list > 0))
- goto exit;
+ goto error;
if ((event_wait_list != NULL) &&
(num_events_in_wait_list == 0)){
@@ -180,10 +206,11 @@ error:
goto exit;
}
-cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list)
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+ cl_command_queue queue)
{
cl_int i, j;
+
/* Check whether wait user events */
for(i=0; i<num_events_in_wait_list; i++) {
if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -199,6 +226,10 @@ cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
}
}
+ if(queue && queue->barrier_index > 0) {
+ return CL_ENQUEUE_EXECUTE_DEFER;
+ }
+
/* Non user events or all user event finished, wait all enqueue events finish */
for(i=0; i<num_events_in_wait_list; i++) {
if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -207,7 +238,8 @@ cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
//enqueue callback haven't finish, in another thread, wait
if(event_wait_list[i]->enqueue_cb != NULL)
return CL_ENQUEUE_EXECUTE_DEFER;
- cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+ if(event_wait_list[i]->gpgpu_event)
+ cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
cl_event_set_status(event_wait_list[i], CL_COMPLETE); //Execute user's callback
}
return CL_ENQUEUE_EXECUTE_IMM;
@@ -220,16 +252,40 @@ void cl_event_new_enqueue_callback(cl_event event,
{
enqueue_callback *cb, *node;
user_event *user_events, *u_ev;
+ cl_command_queue queue = event->queue;
cl_int i;
/* Allocate and inialize the structure itself */
TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
cb->num_events = num_events_in_wait_list;
- cb->wait_list = event_wait_list;
+ TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
+ for(i=0; i<num_events_in_wait_list; i++)
+ cb->wait_list[i] = event_wait_list[i];
cb->event = event;
cb->next = NULL;
cb->wait_user_events = NULL;
+ if(queue && queue->barrier_index > 0) {
+ for(i=0; i<queue->barrier_index; i++) {
+ /* Insert the enqueue_callback to user event list */
+ node = queue->wait_events[i]->waits_head;
+ if(node == NULL)
+ queue->wait_events[i]->waits_head = cb;
+ else
+ while((node != cb) && node->next)
+ node = node->next;
+ if(node == cb) //wait on dup user event
+ continue;
+ node->next = cb;
+
+ /* Insert the user event to enqueue_callback's wait_user_events */
+ TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+ u_ev->event = queue->wait_events[i];
+ u_ev->next = cb->wait_user_events;
+ cb->wait_user_events = u_ev;
+ }
+ }
+
/* Find out all user events that events in event_wait_list wait */
for(i=0; i<num_events_in_wait_list; i++) {
if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -252,6 +308,7 @@ void cl_event_new_enqueue_callback(cl_event event,
u_ev->event = event_wait_list[i];
u_ev->next = cb->wait_user_events;
cb->wait_user_events = u_ev;
+ cl_command_queue_insert_event(event->queue, event_wait_list[i]);
} else if(event_wait_list[i]->enqueue_cb != NULL) {
user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
while(user_events != NULL) {
@@ -271,11 +328,11 @@ void cl_event_new_enqueue_callback(cl_event event,
u_ev->next = cb->wait_user_events;
cb->wait_user_events = u_ev;
user_events = user_events->next;
+ cl_command_queue_insert_event(event->queue, event_wait_list[i]);
}
}
}
- if(data->queue != NULL) {
- assert(event->gpgpu_event);
+ if(data->queue != NULL && event->gpgpu_event != NULL) {
cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
data->ptr = (void *)event->gpgpu_event;
}
@@ -291,6 +348,8 @@ error:
cb->wait_user_events = cb->wait_user_events->next;
cl_free(u_ev);
}
+ if(cb->wait_list)
+ cl_free(cb->wait_list);
cl_free(cb);
}
goto exit;
@@ -317,6 +376,8 @@ void cl_event_set_status(cl_event event, cl_int status)
if(status <= CL_COMPLETE) {
if(event->enqueue_cb) {
cl_enqueue_handle(&event->enqueue_cb->data);
+ if(event->gpgpu_event)
+ cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine
event->status = status; //Change the event status after enqueue and befor unlock
pthread_mutex_unlock(&event->ctx->event_lock);
@@ -324,6 +385,8 @@ void cl_event_set_status(cl_event event, cl_int status)
cl_event_delete(event->enqueue_cb->wait_list[i]);
pthread_mutex_lock(&event->ctx->event_lock);
+ if(event->enqueue_cb->wait_list)
+ cl_free(event->enqueue_cb->wait_list);
cl_free(event->enqueue_cb);
event->enqueue_cb = NULL;
}
@@ -375,8 +438,12 @@ void cl_event_set_status(cl_event event, cl_int status)
continue;
}
+ //remove the user event from the enqueue_cb event's command queue
+ cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+
/* All user events complete, now wait enqueue events */
- ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list);
+ ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
+ enqueue_cb->event->queue);
assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
cb = enqueue_cb;
@@ -385,7 +452,7 @@ void cl_event_set_status(cl_event event, cl_int status)
/* Call the pending operation */
evt = cb->event;
cl_event_set_status(cb->event, CL_COMPLETE);
- if(cb->event->emplict == CL_FALSE) {
+ if(evt->emplict == CL_FALSE) {
cl_event_delete(evt);
}
}
@@ -400,3 +467,26 @@ void cl_event_update_status(cl_event event)
(cl_gpgpu_event_update_status(event->gpgpu_event, 0) == command_complete))
cl_event_set_status(event, CL_COMPLETE);
}
+
+/* Create a marker event on `queue` and return it through `event`.
+ *
+ * When the queue still has pending wait events (wait_events_num > 0) the
+ * marker is deferred behind them through an enqueue callback. Otherwise the
+ * status of the queue's last GPU event is updated (wait flag set) and the
+ * marker is completed immediately.
+ * Returns CL_OUT_OF_HOST_MEMORY when the event cannot be created,
+ * CL_SUCCESS otherwise.
+ */
+cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
+{
+  /* Partial initializer: type is set and every other field is zeroed. The
+   * callback setup reads more than the type (e.g. data->queue), so the
+   * structure must not be left indeterminate. */
+  enqueue_data data = { EnqueueMarker };
+
+  *event = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
+  /* Test the new event, not the out pointer that was just dereferenced. */
+  if(*event == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  //if wait_events_num>0, the marker event needs to wait for queue->wait_events
+  if(queue->wait_events_num > 0) {
+    cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+    return CL_SUCCESS;
+  }
+
+  if(queue->last_event && queue->last_event->gpgpu_event) {
+    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+  }
+
+  cl_event_set_status(*event, CL_COMPLETE);
+  return CL_SUCCESS;
+}
diff --git a/src/cl_event.h b/src/cl_event.h
index c921cb2..7dde24b 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -22,9 +22,9 @@
#include <semaphore.h>
-#include "cl_enqueue.h"
#include "cl_internals.h"
#include "cl_driver.h"
+#include "cl_enqueue.h"
#include "CL/cl.h"
#define CL_ENQUEUE_EXECUTE_IMM 0
@@ -39,7 +39,7 @@ typedef struct _enqueue_callback {
cl_event event; /* The event relative this enqueue callback */
enqueue_data data; /* Hold all enqueue callback's infomation */
cl_uint num_events; /* num events in wait list */
- const cl_event* wait_list; /* All event wait list this callback wait on */
+ cl_event* wait_list; /* All event wait list this callback wait on */
user_event* wait_user_events; /* The head of user event list the callback wait on */
struct _enqueue_callback* next; /* The next enqueue callback in wait list */
} enqueue_callback;
@@ -81,12 +81,14 @@ cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
/* Check events wait list for enqueue commonds */
cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
/* Wait the all events in wait list complete */
-cl_int cl_event_wait_events(cl_uint, const cl_event *);
+cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
/* New a enqueue suspend task */
void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
/* Set the event status and call all callbacks */
void cl_event_set_status(cl_event, cl_int);
/* Check and update event status */
void cl_event_update_status(cl_event);
+/* Create the marker event */
+cl_int cl_event_marker(cl_command_queue, cl_event*);
#endif /* __CL_EVENT_H__ */
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 1ff81c1..d07a525 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -26,6 +26,7 @@ void check_basic_extension(cl_extensions_t *extensions)
{
int id;
for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++)
+ if (id != EXT_ID(khr_fp64))
extensions->extensions[id].base.ext_enabled = 1;
}
@@ -39,26 +40,12 @@ void check_opt1_extension(cl_extensions_t *extensions)
void
check_gl_extension(cl_extensions_t *extensions) {
-#ifdef HAS_EGL
-static struct cl_gl_ext_deps egl_funcs;
+#if defined(HAS_EGL)
int id;
-#if defined(EGL_KHR_image) && defined(EGL_KHR_gl_texture_2D_image) && defined(HAS_GBM)
- egl_funcs.eglCreateImageKHR_func = (PFNEGLCREATEIMAGEKHRPROC) eglGetProcAddress("eglCreateImageKHR");
- egl_funcs.eglDestroyImageKHR_func = (PFNEGLDESTROYIMAGEKHRPROC) eglGetProcAddress("eglDestroyImageKHR");
-#else
- egl_funcs.eglCreateImageKHR_func = NULL;
- egl_funcs.eglDestroyImageKHR_func = NULL;
-#endif
-
- if (egl_funcs.eglCreateImageKHR_func != NULL
- && egl_funcs.eglDestroyImageKHR_func != NULL) {
/* For now, we only support cl_khr_gl_sharing. */
- for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
- if (id == EXT_ID(khr_gl_sharing)) {
- extensions->extensions[id].base.ext_enabled = 1;
- extensions->extensions[id].EXT_STRUCT_NAME(khr_gl_sharing).gl_ext_deps = &egl_funcs;
- }
- }
+ for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
+ if (id == EXT_ID(khr_gl_sharing))
+ extensions->extensions[id].base.ext_enabled = 1;
#endif
}
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 51eb8e0..52ee0a4 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -76,29 +76,6 @@ struct EXT_STRUCT_NAME(name) { \
DECL_BASE_EXTENSIONS
DECL_OPT1_EXTENSIONS
DECL_D3D_EXTENSIONS
-#undef DECL_EXT
-
-#define DECL_EXT(name) \
-struct EXT_STRUCT_NAME(name) { \
- struct cl_extension_base base; \
- struct cl_gl_ext_deps *gl_ext_deps; \
-};
-
-struct cl_gl_ext_deps {
-#ifdef HAS_EGL
-#ifndef EGL_KHR_image
-#define PFNEGLCREATEIMAGEKHRPROC void*
-#define PFNEGLDESTROYIMAGEKHRPROC void*
-#endif
- PFNEGLCREATEIMAGEKHRPROC eglCreateImageKHR_func;
- PFNEGLDESTROYIMAGEKHRPROC eglDestroyImageKHR_func;
-#ifndef EGL_KHR_image
-#undef PFNEGLCREATEIMAGEKHRPROC
-#undef PFNEGLDESTROYIMAGEKHRPROC
-#endif
-#endif
-};
-
DECL_GL_EXTENSIONS
#undef DECL_EXT
@@ -117,8 +94,6 @@ typedef struct cl_extensions {
struct _cl_platform_id;
typedef struct _cl_platform_id * cl_platform_id;
-#define CL_EXTENSION_GET_FUNCS(ctx, name, funcs) \
- ctx->device->platform->internal_extensions->extensions[EXT_ID(name)].EXT_STRUCT_NAME(name).funcs
extern void
cl_intel_platform_extension_init(cl_platform_id intel_platform);
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f58e1fd..6bfc453 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -45,13 +45,13 @@
.image3d_max_width = 8192,
.image3d_max_height = 8192,
.image3d_max_depth = 8192,
-.max_samplers = 0,
+.max_samplers = 8,
.mem_base_addr_align = sizeof(cl_uint) * 8,
.min_data_type_align_size = sizeof(cl_uint),
.single_fp_config = 0, /* XXX */
.global_mem_cache_type = CL_READ_WRITE_CACHE,
-.global_mem_size = 4,
-.max_constant_buffer_size = 64 << 10,
+.global_mem_size = 128 * 1024 * 1024,
+.max_constant_buffer_size = 512 << 10,
.max_constant_args = 8,
.error_correction_support = CL_FALSE,
.host_unified_memory = CL_FALSE,
@@ -59,9 +59,11 @@
.endian_little = CL_TRUE,
.available = CL_TRUE,
.compiler_available = CL_FALSE, /* XXX */
-.execution_capabilities = CL_EXEC_KERNEL,
+.execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
.queue_properties = CL_QUEUE_PROFILING_ENABLE,
.platform = NULL, /* == intel_platform (set when requested) */
+/* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
+.single_fp_config = CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
#define DECL_INFO_STRING(FIELD, STRING) \
.FIELD = STRING, \
diff --git a/src/cl_image.c b/src/cl_image.c
index 6ea104b..f89bcae 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -61,11 +61,11 @@ cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
};
switch (order) {
+ case CL_Rx: break;
case CL_R: break;
case CL_A: break;
case CL_RA: *bpp *= 2; break;
case CL_RG: *bpp *= 2; break;
- case CL_Rx: *bpp *= 2; break;
case CL_INTENSITY:
case CL_LUMINANCE:
if (type != CL_UNORM_INT8 && type != CL_UNORM_INT16 &&
@@ -101,14 +101,26 @@ cl_image_get_intel_format(const cl_image_format *fmt)
const uint32_t order = fmt->image_channel_order;
switch (order) {
case CL_R:
+#if 0
+ case CL_Rx:
case CL_A:
case CL_INTENSITY:
case CL_LUMINANCE:
+ if ((order == CL_INTENSITY || order == CL_LUMINANCE)
+ && (type != CL_UNORM_INT8 && type != CL_UNORM_INT16
+ && type != CL_SNORM_INT8 && type != CL_SNORM_INT16
+ && type != CL_HALF_FLOAT && type != CL_FLOAT))
+ return INTEL_UNSUPPORTED_FORMAT;
+#endif
+
+/* XXX it seems we have some accuracy compatibility issue with snorm_int8/16,
+ * so those formats have to be disabled for now. */
+
switch (type) {
case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16_FLOAT;
case CL_FLOAT: return I965_SURFACEFORMAT_R32_FLOAT;
- case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16_SNORM;
- case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8_SNORM;
+// case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16_SNORM;
+// case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8_SNORM;
case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8_UNORM;
case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16_UNORM;
case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8_SINT;
@@ -119,9 +131,9 @@ cl_image_get_intel_format(const cl_image_format *fmt)
case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32_UINT;
default: return INTEL_UNSUPPORTED_FORMAT;
};
+#if 0
case CL_RG:
case CL_RA:
- case CL_Rx:
switch (type) {
case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16G16_FLOAT;
case CL_FLOAT: return I965_SURFACEFORMAT_R32G32_FLOAT;
@@ -145,12 +157,13 @@ cl_image_get_intel_format(const cl_image_format *fmt)
case CL_UNORM_SHORT_555:
default: return INTEL_UNSUPPORTED_FORMAT;
};
+#endif
case CL_RGBA:
switch (type) {
case CL_HALF_FLOAT: return I965_SURFACEFORMAT_R16G16B16A16_FLOAT;
case CL_FLOAT: return I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
- case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
- case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
+// case CL_SNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
+// case CL_SNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
case CL_UNORM_INT8: return I965_SURFACEFORMAT_R8G8B8A8_UNORM;
case CL_UNORM_INT16: return I965_SURFACEFORMAT_R16G16B16A16_UNORM;
case CL_SIGNED_INT8: return I965_SURFACEFORMAT_R8G8B8A8_SINT;
@@ -195,7 +208,6 @@ cl_image_get_supported_fmt(cl_context ctx,
cl_uint *num_image_formats)
{
size_t i, j, n = 0;
- assert(image_formats);
for (i = 0; i < cl_image_order_n; ++i)
for (j = 0; j < cl_image_type_n; ++j) {
const cl_image_format fmt = {
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 41e6a8a..4ba1c11 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -133,8 +133,8 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
return CL_INVALID_MEM_OBJECT;
- if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
- || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
+ if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
+ || (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
return CL_INVALID_ARG_VALUE;
}
}
@@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
mem = *(cl_mem*) value;
- if(arg_type == GBE_ARG_CONSTANT_PTR) {
- int32_t cbOffset;
- cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
- //constant ptr's curbe offset changed, update it
- if(cbOffset >= 0) {
- offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
- *((uint32_t *)(k->curbe + offset)) = cbOffset; //cb offset in curbe
- }
- }
-
cl_mem_add_ref(mem);
if (k->args[index].mem)
cl_mem_delete(k->args[index].mem);
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
index 6c8b9f4..1e206b4 100644
--- a/src/cl_khr_icd.h
+++ b/src/cl_khr_icd.h
@@ -14,6 +14,8 @@
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*/
+#ifndef __CL_KHR_ICD_H__
+#define __CL_KHR_ICD_H__
#ifdef HAS_OCLIcd
@@ -28,3 +30,5 @@ extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
#define INIT_ICD(member)
#define DEFINE_ICD(member)
#endif
+
+#endif
diff --git a/src/cl_mem.c b/src/cl_mem.c
index f794ce7..68753f1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
@@ -25,6 +25,8 @@
#include "cl_device_id.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
@@ -42,6 +44,31 @@
return CL_INVALID_VALUE; \
break;
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+
+/* Translate the driver-internal mem->type into the public
+ * cl_mem_object_type value reported for CL_MEM_TYPE.
+ * Buffers (and any unknown internal type) map to CL_MEM_OBJECT_BUFFER;
+ * images report the object type they were created with. */
+static cl_mem_object_type
+cl_get_mem_object_type(cl_mem mem)
+{
+  switch (mem->type) {
+    case CL_MEM_IMAGE_TYPE:
+    case CL_MEM_GL_IMAGE_TYPE:
+    {
+      /* `depth` is the number of slices, not the dimensionality, so it
+       * cannot select the type: a 2D image has depth 1 and a 3D image may
+       * have any depth (it used to fall through and be reported as a buffer).
+       * NOTE(review): assumes cl_mem_image_init() stores its image_type
+       * argument in image->image_type - confirm against cl_mem.h. */
+      struct _cl_mem_image *image = cl_mem_image(mem);
+      return image->image_type;
+    }
+    case CL_MEM_BUFFER_TYPE:
+    default:
+      return CL_MEM_OBJECT_BUFFER;
+  }
+}
+
LOCAL cl_int
cl_get_mem_object_info(cl_mem mem,
cl_mem_info param_name,
@@ -67,7 +94,7 @@ cl_get_mem_object_info(cl_mem mem,
switch(param_name)
{
case CL_MEM_TYPE:
- *((cl_mem_object_type *)param_value) = mem->type;
+ *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
break;
case CL_MEM_FLAGS:
*((cl_mem_flags *)param_value) = mem->flags;
@@ -106,8 +133,8 @@ cl_get_image_info(cl_mem mem,
void *param_value,
size_t *param_value_size_ret)
{
- if(!mem || !mem->is_image)
- return CL_INVALID_MEM_OBJECT;
+ int err;
+ CHECK_IMAGE(mem, image);
switch(param_name)
{
@@ -125,35 +152,39 @@ cl_get_image_info(cl_mem mem,
switch(param_name)
{
case CL_IMAGE_FORMAT:
- *(cl_image_format *)param_value = mem->fmt;
+ *(cl_image_format *)param_value = image->fmt;
break;
case CL_IMAGE_ELEMENT_SIZE:
- *(size_t *)param_value = mem->bpp;
+ *(size_t *)param_value = image->bpp;
break;
case CL_IMAGE_ROW_PITCH:
- *(size_t *)param_value = mem->row_pitch;
+ *(size_t *)param_value = image->row_pitch;
break;
case CL_IMAGE_SLICE_PITCH:
- *(size_t *)param_value = mem->slice_pitch;
+ *(size_t *)param_value = image->slice_pitch;
break;
case CL_IMAGE_WIDTH:
- *(size_t *)param_value = mem->w;
+ *(size_t *)param_value = image->w;
break;
case CL_IMAGE_HEIGHT:
- *(size_t *)param_value = mem->h;
+ *(size_t *)param_value = image->h;
break;
case CL_IMAGE_DEPTH:
- *(size_t *)param_value = mem->depth;
+ *(size_t *)param_value = image->depth;
break;
}
return CL_SUCCESS;
+
+error:
+ return err;
}
#undef FIELD_SIZE
-static cl_mem
-cl_mem_allocate(cl_context ctx,
+LOCAL cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+ cl_context ctx,
cl_mem_flags flags,
size_t sz,
cl_int is_tiled,
@@ -174,41 +205,56 @@ cl_mem_allocate(cl_context ctx,
NULL)) != CL_SUCCESS) {
goto error;
}
- if (UNLIKELY(sz == 0 || sz > max_mem_size)) {
+ if (UNLIKELY(sz > max_mem_size)) {
err = CL_INVALID_BUFFER_SIZE;
goto error;
}
/* Allocate and inialize the structure itself */
- TRY_ALLOC (mem, CALLOC(struct _cl_mem));
+ if (type == CL_MEM_IMAGE_TYPE) {
+ struct _cl_mem_image *image = NULL;
+ TRY_ALLOC (image, CALLOC(struct _cl_mem_image));
+ mem = &image->base;
+ } else if (type == CL_MEM_GL_IMAGE_TYPE ) {
+ struct _cl_mem_gl_image *gl_image = NULL;
+ TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image));
+ mem = &gl_image->base.base;
+ } else {
+ struct _cl_mem_buffer *buffer = NULL;
+ TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
+ mem = &buffer->base;
+ }
+ mem->type = type;
SET_ICD(mem->dispatch)
mem->ref_n = 1;
mem->magic = CL_MAGIC_MEM_HEADER;
mem->flags = flags;
- /* Pinning will require stricter alignment rules */
- if ((flags & CL_MEM_PINNABLE) || is_tiled)
- alignment = 4096;
-
- /* Allocate space in memory */
- bufmgr = cl_context_get_bufmgr(ctx);
- assert(bufmgr);
- mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
- if (UNLIKELY(mem->bo == NULL)) {
- err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
- goto error;
+ if (sz != 0) {
+ /* Pinning will require stricter alignment rules */
+ if ((flags & CL_MEM_PINNABLE) || is_tiled)
+ alignment = 4096;
+
+ /* Allocate space in memory */
+ bufmgr = cl_context_get_bufmgr(ctx);
+ assert(bufmgr);
+ mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+ if (UNLIKELY(mem->bo == NULL)) {
+ err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+ goto error;
+ }
+ mem->size = sz;
}
- mem->size = sz;
- /* Append the buffer in the context buffer list */
+ cl_context_add_ref(ctx);
+ mem->ctx = ctx;
+ /* Append the buffer in the context buffer list */
pthread_mutex_lock(&ctx->buffer_lock);
- mem->next = ctx->buffers;
- if (ctx->buffers != NULL)
- ctx->buffers->prev = mem;
- ctx->buffers = mem;
+ mem->next = ctx->buffers;
+ if (ctx->buffers != NULL)
+ ctx->buffers->prev = mem;
+ ctx->buffers = mem;
pthread_mutex_unlock(&ctx->buffer_lock);
- mem->ctx = ctx;
- cl_context_add_ref(ctx);
exit:
if (errcode)
@@ -222,11 +268,11 @@ error:
}
LOCAL cl_mem
-cl_mem_new(cl_context ctx,
- cl_mem_flags flags,
- size_t sz,
- void *data,
- cl_int *errcode_ret)
+cl_mem_new_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ size_t sz,
+ void *data,
+ cl_int *errcode_ret)
{
/* Possible mem type combination:
CL_MEM_ALLOC_HOST_PTR
@@ -262,12 +308,10 @@ cl_mem_new(cl_context ctx,
}
/* Create the buffer in video memory */
- mem = cl_mem_allocate(ctx, flags, sz, CL_FALSE, &err);
+ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
if (mem == NULL || err != CL_SUCCESS)
goto error;
- mem->type = CL_MEM_OBJECT_BUFFER;
-
/* Copy the data if required */
if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
cl_buffer_subdata(mem->bo, 0, sz, data);
@@ -285,35 +329,48 @@ error:
goto exit;
}
-static void
-cl_mem_copy_image(cl_mem image,
- size_t row_pitch,
- size_t slice_pitch,
- void* host_ptr)
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+ void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+ const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+ const struct _cl_mem_image *image)
{
- char* dst_ptr = cl_mem_map_auto(image);
-
- if (row_pitch == image->row_pitch &&
- (image->depth == 1 || slice_pitch == image->slice_pitch))
+ size_t offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
+ dst = (char*)dst + offset;
+ if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch &&
+ (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch)))
{
- memcpy(dst_ptr, host_ptr, image->depth == 1 ? row_pitch*image->h : slice_pitch*image->depth);
+ memcpy(dst, src, region[2] == 1 ? src_row_pitch*region[1] : src_slice_pitch*region[2]);
}
else {
- size_t y, z;
- for (z = 0; z < image->depth; z++) {
- const char* src = host_ptr;
- char* dst = dst_ptr;
- for (y = 0; y < image->h; y++) {
- memcpy(dst, src, image->bpp*image->w);
- src += row_pitch;
- dst += image->row_pitch;
+ cl_uint y, z;
+ for (z = 0; z < region[2]; z++) {
+ const char* src_ptr = src;
+ char* dst_ptr = dst;
+ for (y = 0; y < region[1]; y++) {
+ memcpy(dst_ptr, src_ptr, image->bpp*region[0]);
+ src_ptr += src_row_pitch;
+ dst_ptr += dst_row_pitch;
}
- host_ptr = (char*)host_ptr + slice_pitch;
- dst_ptr = (char*)dst_ptr + image->slice_pitch;
+ src = (char*)src + src_slice_pitch;
+ dst = (char*)dst + dst_slice_pitch;
}
}
+}
- cl_mem_unmap_auto(image);
+/* Upload a whole host image into `image`: map the image, copy the full
+ * {w, h, depth} region starting at origin {0, 0, 0} from host_ptr (laid out
+ * with row_pitch / slice_pitch) using the image's own pitches as destination
+ * layout, then unmap it. */
+static void
+cl_mem_copy_image(struct _cl_mem_image *image,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void* host_ptr)
+{
+ /* NOTE(review): the mapping result is not checked before it is written to. */
+ char* dst_ptr = cl_mem_map_auto((cl_mem)image);
+ size_t origin[3] = {0, 0, 0};
+ size_t region[3] = {image->w, image->h, image->depth};
+
+ cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch,
+ host_ptr, row_pitch, slice_pitch, image);
+ cl_mem_unmap_auto((cl_mem)image);
}
static const uint32_t tile_sz = 4096; /* 4KB per tile */
@@ -338,11 +395,11 @@ _cl_mem_new_image(cl_context ctx,
cl_int err = CL_SUCCESS;
cl_mem mem = NULL;
uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
- size_t sz = 0, aligned_pitch = 0, aligned_h;
+ size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h;
cl_image_tiling_t tiling = CL_NO_TILE;
/* Check flags consistency */
- if (UNLIKELY((flags & CL_MEM_COPY_HOST_PTR) && data == NULL)) {
+ if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) {
err = CL_INVALID_HOST_PTR;
goto error;
}
@@ -416,27 +473,27 @@ _cl_mem_new_image(cl_context ctx,
}
sz = aligned_pitch * aligned_h * depth;
- mem = cl_mem_allocate(ctx, flags, sz, tiling != CL_NO_TILE, &err);
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
if (mem == NULL || err != CL_SUCCESS)
goto error;
- mem->w = w;
- mem->h = h;
- mem->depth = depth;
- mem->fmt = *fmt;
- mem->intel_fmt = intel_fmt;
- mem->bpp = bpp;
- mem->is_image = 1;
- mem->row_pitch = aligned_pitch;
- mem->slice_pitch = image_type == CL_MEM_OBJECT_IMAGE1D || image_type == CL_MEM_OBJECT_IMAGE2D ? 0 : aligned_pitch*aligned_h;
- mem->tiling = tiling;
- mem->type = image_type;
-
cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+ aligned_slice_pitch = (image_type == CL_MEM_OBJECT_IMAGE1D
+ || image_type == CL_MEM_OBJECT_IMAGE2D) ? 0 : aligned_pitch * ALIGN(h, 2);
+
+ cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+ intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+ 0, 0, 0);
/* Copy the data if required */
- if (flags & CL_MEM_COPY_HOST_PTR)
- cl_mem_copy_image(mem, pitch, slice_pitch, data);
+ if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+ cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
+ if (flags & CL_MEM_USE_HOST_PTR) {
+ mem->host_ptr = data;
+ cl_mem_image(mem)->host_row_pitch = pitch;
+ cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+ }
+ }
exit:
if (errcode_ret)
@@ -479,17 +536,18 @@ cl_mem_new_image(cl_context context,
LOCAL void
cl_mem_delete(cl_mem mem)
{
+ cl_int i;
if (UNLIKELY(mem == NULL))
return;
if (atomic_dec(&mem->ref_n) > 1)
return;
- if (LIKELY(mem->bo != NULL))
- cl_buffer_unreference(mem->bo);
#ifdef HAS_EGL
- if (UNLIKELY(mem->egl_image != NULL)) {
- cl_mem_gl_delete(mem);
+ if (UNLIKELY(IS_GL_IMAGE(mem))) {
+ cl_mem_gl_delete(cl_mem_gl_image(mem));
}
#endif
+ if (LIKELY(mem->bo != NULL))
+ cl_buffer_unreference(mem->bo);
/* Remove it from the list */
assert(mem->ctx);
@@ -503,8 +561,17 @@ cl_mem_delete(cl_mem mem)
pthread_mutex_unlock(&mem->ctx->buffer_lock);
cl_context_delete(mem->ctx);
- /* Someone still mapped? */
- assert(!mem->map_ref);
+ /* Someone still mapped, unmap */
+ if(mem->map_ref > 0) {
+ assert(mem->mapped_ptr);
+ for(i=0; i<mem->mapped_ptr_sz; i++) {
+ if(mem->mapped_ptr[i].ptr != NULL) {
+ mem->map_ref--;
+ cl_mem_unmap_gtt(mem);
+ }
+ }
+ assert(mem->map_ref == 0);
+ }
if (mem->mapped_ptr)
free(mem->mapped_ptr);
@@ -529,6 +596,390 @@ cl_mem_add_ref(cl_mem mem)
atomic_inc(&mem->ref_n);
}
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+ size_t src_row_pitch, size_t src_slice_pitch,
+ size_t dst_row_pitch, size_t dst_slice_pitch) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+ cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
+ cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+ cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+ static const char *str_kernel =
+ "kernel void __cl_cpy_buffer_rect ( \n"
+ " global char* src, global char* dst, \n"
+ " unsigned int region0, unsigned int region1, unsigned int region2, \n"
+ " unsigned int src_offset, unsigned int dst_offset, \n"
+ " unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+ " unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+ " int i = get_global_id(0); \n"
+ " int j = get_global_id(1); \n"
+ " int k = get_global_id(2); \n"
+ " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+ " return; \n"
+ " src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+ " dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+ " dst[dst_offset] = src[src_offset]; \n"
+ "}";
+
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+
+ /* setup the kernel and run. */
+ ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ cl_int index = CL_ENQUEUE_COPY_IMAGE_0;
+ char option[40] = "";
+ uint32_t fixupDataType;
+ uint32_t savedIntelFmt;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ strcat(option, "-D SRC_IMAGE_3D");
+ index += 1;
+ }
+ if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ strcat(option, " -D DST_IMAGE_3D");
+ index += 2;
+ }
+
+ switch (src_image->fmt.image_channel_data_type) {
+ case CL_SNORM_INT8:
+ case CL_UNORM_INT8: fixupDataType = CL_UNSIGNED_INT8; break;
+ case CL_HALF_FLOAT:
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
+ case CL_FLOAT: fixupDataType = CL_UNSIGNED_INT32; break;
+ default:
+ fixupDataType = 0;
+ }
+
+ if (fixupDataType) {
+ cl_image_format fmt;
+ if (src_image->fmt.image_channel_order != CL_BGRA)
+ fmt.image_channel_order = src_image->fmt.image_channel_order;
+ else
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = fixupDataType;
+ savedIntelFmt = src_image->intel_fmt;
+ src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+ dst_image->intel_fmt = src_image->intel_fmt;
+ }
+ static const char *str_kernel =
+ "#ifdef SRC_IMAGE_3D \n"
+ " #define SRC_IMAGE_TYPE image3d_t \n"
+ " #define SRC_COORD_TYPE int4 \n"
+ "#else \n"
+ " #define SRC_IMAGE_TYPE image2d_t \n"
+ " #define SRC_COORD_TYPE int2 \n"
+ "#endif \n"
+ "#ifdef DST_IMAGE_3D \n"
+ " #define DST_IMAGE_TYPE image3d_t \n"
+ " #define DST_COORD_TYPE int4 \n"
+ "#else \n"
+ " #define DST_IMAGE_TYPE image2d_t \n"
+ " #define DST_COORD_TYPE int2 \n"
+ "#endif \n"
+ "kernel void __cl_copy_image ( \n"
+ " __read_only SRC_IMAGE_TYPE src_image, __write_only DST_IMAGE_TYPE dst_image, \n"
+ " unsigned int region0, unsigned int region1, unsigned int region2, \n"
+ " unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
+ " unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2) { \n"
+ " int i = get_global_id(0); \n"
+ " int j = get_global_id(1); \n"
+ " int k = get_global_id(2); \n"
+ " int4 color; \n"
+ " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
+ " SRC_COORD_TYPE src_coord; \n"
+ " DST_COORD_TYPE dst_coord; \n"
+ " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+ " return; \n"
+ " src_coord.x = src_origin0 + i; \n"
+ " src_coord.y = src_origin1 + j; \n"
+ "#ifdef SRC_IMAGE_3D \n"
+ " src_coord.z = src_origin2 + k; \n"
+ "#endif \n"
+ " dst_coord.x = dst_origin0 + i; \n"
+ " dst_coord.y = dst_origin1 + j; \n"
+ "#ifdef DST_IMAGE_3D \n"
+ " dst_coord.z = dst_origin2 + k; \n"
+ "#endif \n"
+ " color = read_imagei(src_image, sampler, src_coord); \n"
+ " write_imagei(dst_image, dst_coord, color); \n"
+ "}";
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_image->base.ctx == dst_image->base.ctx);
+
+ /* setup the kernel and run. */
+ ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+ if (fixupDataType) {
+ src_image->intel_fmt = savedIntelFmt;
+ dst_image->intel_fmt = savedIntelFmt;
+ }
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+ const size_t *src_origin, const size_t dst_offset, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0;
+ char option[40] = "";
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ strcat(option, "-D IMAGE_3D");
+ index += 1;
+ }
+
+ static const char *str_kernel =
+ "#ifdef IMAGE_3D \n"
+ " #define IMAGE_TYPE image3d_t \n"
+ " #define COORD_TYPE int4 \n"
+ "#else \n"
+ " #define IMAGE_TYPE image2d_t \n"
+ " #define COORD_TYPE int2 \n"
+ "#endif \n"
+ "kernel void __cl_copy_image_to_buffer ( \n"
+ " __read_only IMAGE_TYPE image, global uchar* buffer, \n"
+ " unsigned int region0, unsigned int region1, unsigned int region2, \n"
+ " unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
+ " unsigned int dst_offset) { \n"
+ " int i = get_global_id(0); \n"
+ " int j = get_global_id(1); \n"
+ " int k = get_global_id(2); \n"
+ " uint4 color; \n"
+ " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
+ " COORD_TYPE src_coord; \n"
+ " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+ " return; \n"
+ " src_coord.x = src_origin0 + i; \n"
+ " src_coord.y = src_origin1 + j; \n"
+ "#ifdef IMAGE_3D \n"
+ " src_coord.z = src_origin2 + k; \n"
+ "#endif \n"
+ " color = read_imageui(image, sampler, src_coord); \n"
+ " dst_offset += (k * region1 + j) * region0 + i; \n"
+ " buffer[dst_offset] = color.x; \n"
+ "}";
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = image->w * image->bpp;
+ image->bpp = 1;
+ region0 = region[0] * bpp;
+ origin0 = src_origin[0] * bpp;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = image->w / bpp;
+
+ return ret;
+}
+
+
+LOCAL cl_int
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+ const size_t src_offset, const size_t *dst_origin, const size_t *region) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+ cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0;
+ char option[40] = "";
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ strcat(option, "-D IMAGE_3D");
+ index += 1;
+ }
+
+ static const char *str_kernel =
+ "#ifdef IMAGE_3D \n"
+ " #define IMAGE_TYPE image3d_t \n"
+ " #define COORD_TYPE int4 \n"
+ "#else \n"
+ " #define IMAGE_TYPE image2d_t \n"
+ " #define COORD_TYPE int2 \n"
+ "#endif \n"
+ "kernel void __cl_copy_image_to_buffer ( \n"
+ " __read_only IMAGE_TYPE image, global uchar* buffer, \n"
+ " unsigned int region0, unsigned int region1, unsigned int region2, \n"
+ " unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, \n"
+ " unsigned int src_offset) { \n"
+ " int i = get_global_id(0); \n"
+ " int j = get_global_id(1); \n"
+ " int k = get_global_id(2); \n"
+ " uint4 color = (uint4)(0); \n"
+ " COORD_TYPE dst_coord; \n"
+ " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+ " return; \n"
+ " dst_coord.x = dst_origin0 + i; \n"
+ " dst_coord.y = dst_origin1 + j; \n"
+ "#ifdef IMAGE_3D \n"
+ " dst_coord.z = dst_origin2 + k; \n"
+ "#endif \n"
+ " src_offset += (k * region1 + j) * region0 + i; \n"
+ " color.x = buffer[src_offset]; \n"
+ " write_imageui(image, dst_coord, color); \n"
+ "}";
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = image->w * image->bpp;
+ image->bpp = 1;
+ region0 = region[0] * bpp;
+ origin0 = dst_origin[0] * bpp;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = image->w / bpp;
+
+ return ret;
+}
+
+
LOCAL void*
cl_mem_map(cl_mem mem)
{
@@ -552,6 +1003,14 @@ cl_mem_map_gtt(cl_mem mem)
return cl_buffer_get_virtual(mem->bo);
}
+LOCAL void *
+cl_mem_map_gtt_unsync(cl_mem mem)
+{
+ cl_buffer_map_gtt_unsync(mem->bo);
+ assert(cl_buffer_get_virtual(mem->bo));
+ return cl_buffer_get_virtual(mem->bo);
+}
+
LOCAL cl_int
cl_mem_unmap_gtt(cl_mem mem)
{
@@ -562,7 +1021,7 @@ cl_mem_unmap_gtt(cl_mem mem)
LOCAL void*
cl_mem_map_auto(cl_mem mem)
{
- if (mem->is_image && mem->tiling != CL_NO_TILE)
+ if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
return cl_mem_map_gtt(mem);
else
return cl_mem_map(mem);
@@ -571,7 +1030,7 @@ cl_mem_map_auto(cl_mem mem)
LOCAL cl_int
cl_mem_unmap_auto(cl_mem mem)
{
- if (mem->is_image && mem->tiling != CL_NO_TILE)
+ if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
cl_buffer_unmap_gtt(mem->bo);
else
cl_buffer_unmap(mem->bo);
@@ -597,4 +1056,3 @@ cl_mem_unpin(cl_mem mem)
cl_buffer_unpin(mem->bo);
return CL_SUCCESS;
}
-
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 1b1709a..ca601f9 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -21,8 +21,10 @@
#define __CL_MEM_H__
#include "cl_internals.h"
-#include "cl_driver.h"
+#include "cl_driver_type.h"
#include "CL/cl.h"
+#include "cl_khr_icd.h"
+#include <assert.h>
#ifndef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
@@ -62,31 +64,103 @@ typedef struct _cl_mem_dstr_cb {
}cl_mem_dstr_cb;
/* Used for buffers and images */
-struct _cl_mem {
- DEFINE_ICD(dispatch)
+enum cl_mem_type {
+ CL_MEM_BUFFER_TYPE,
+ CL_MEM_IMAGE_TYPE,
+ CL_MEM_GL_IMAGE_TYPE,
+};
+#define IS_IMAGE(mem) (mem->type >= CL_MEM_IMAGE_TYPE)
+#define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
+
+typedef struct _cl_mem {
uint64_t magic; /* To identify it as a memory object */
+ DEFINE_ICD(dispatch)
+ cl_mem prev, next; /* We chain the memory buffers together */
+ enum cl_mem_type type;
volatile int ref_n; /* This object is reference counted */
cl_buffer bo; /* Data in GPU memory */
- void *egl_image; /* created from external egl image*/
size_t size; /* original request size, not alignment size, used in constant buffer */
- cl_mem prev, next; /* We chain the memory buffers together */
cl_context ctx; /* Context it belongs to */
cl_mem_flags flags; /* Flags specified at the creation time */
- uint32_t is_image; /* Indicate if this is an image or not */
- cl_image_format fmt; /* only for images */
- cl_mem_object_type type; /* only for images 1D/2D...*/
- size_t w,h,depth; /* only for images (depth is only for 3D images) */
- size_t row_pitch,slice_pitch;
- uint32_t intel_fmt; /* format to provide in the surface state */
- uint32_t bpp; /* number of bytes per pixel */
- cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
void * host_ptr; /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR */
cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
int mapped_ptr_sz; /* The array size of mapped_ptr. */
int map_ref; /* The mapped count. */
cl_mem_dstr_cb *dstr_cb; /* The destroy callback. */
+} _cl_mem;
+
+struct _cl_mem_image {
+ _cl_mem base;
+ cl_image_format fmt; /* only for images */
+ uint32_t intel_fmt; /* format to provide in the surface state */
+ uint32_t bpp; /* number of bytes per pixel */
+ cl_mem_object_type image_type; /* only for images 1D/2D...*/
+ size_t w, h, depth; /* only for images (depth is only for 3D images) */
+ size_t row_pitch, slice_pitch;
+ size_t host_row_pitch, host_slice_pitch;
+ cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
+ size_t tile_x, tile_y; /* tile offset, used for mipmap images. */
+ size_t offset;
+};
+
+struct _cl_mem_gl_image {
+ struct _cl_mem_image base;
+ uint32_t target;
+ int miplevel;
+ uint32_t texture;
+};
+
+inline static void
+cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
+ cl_mem_object_type image_type,
+ size_t depth, cl_image_format fmt,
+ uint32_t intel_fmt, uint32_t bpp,
+ size_t row_pitch, size_t slice_pitch,
+ cl_image_tiling_t tiling,
+ size_t tile_x, size_t tile_y,
+ size_t offset)
+{
+ image->w = w;
+ image->h = h;
+ image->image_type = image_type;
+ image->depth = depth;
+ image->fmt = fmt;
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->row_pitch = row_pitch;
+ image->slice_pitch = slice_pitch;
+ image->tiling = tiling;
+ image->tile_x = tile_x;
+ image->tile_y = tile_y;
+ image->offset = offset;
+}
+
+struct _cl_mem_buffer {
+ _cl_mem base;
+ size_t offset;
};
+inline static struct _cl_mem_image *
+cl_mem_image(cl_mem mem)
+{
+ assert(IS_IMAGE(mem));
+ return (struct _cl_mem_image *)mem;
+}
+
+inline static struct _cl_mem_gl_image *
+cl_mem_gl_image(cl_mem mem)
+{
+ assert(IS_GL_IMAGE(mem));
+ return (struct _cl_mem_gl_image*)mem;
+}
+
+inline static struct _cl_mem_buffer *
+cl_mem_buffer(cl_mem mem)
+{
+ assert(!IS_IMAGE(mem));
+ return (struct _cl_mem_buffer *)mem;
+}
+
/* Query information about a memory object */
extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
@@ -94,7 +168,7 @@ extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t
extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
/* Create a new memory object and initialize it with possible user data */
-extern cl_mem cl_mem_new(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
/* Idem but this is an image */
extern cl_mem
@@ -109,11 +183,28 @@ cl_mem_new_image(cl_context context,
extern void cl_mem_delete(cl_mem);
/* Destroy egl image. */
-extern void cl_mem_gl_delete(cl_mem);
+extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
/* Add one more reference to this object */
extern void cl_mem_add_ref(cl_mem);
+/* api clEnqueueCopyBufferRect help function */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+ const size_t *, const size_t *, const size_t *,
+ size_t, size_t, size_t, size_t);
+
+/* api clEnqueueCopyImage help function */
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
+ const size_t *, const size_t *, const size_t *);
+
+/* api clEnqueueCopyImageToBuffer help function */
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+ const size_t *, const size_t, const size_t *);
+
+/* api clEnqueueCopyBufferToImage help function */
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+ const size_t, const size_t *, const size_t *);
+
/* Directly map a memory object */
extern void *cl_mem_map(cl_mem);
@@ -123,6 +214,9 @@ extern cl_int cl_mem_unmap(cl_mem);
/* Directly map a memory object in GTT mode */
extern void *cl_mem_map_gtt(cl_mem);
+/* Directly map a memory object in GTT mode, with out waiting gpu idle */
+extern void *cl_mem_map_gtt_unsync(cl_mem);
+
/* Unmap a memory object in GTT mode */
extern cl_int cl_mem_unmap_gtt(cl_mem);
@@ -136,5 +230,19 @@ extern cl_int cl_mem_unmap_auto(cl_mem);
extern cl_int cl_mem_pin(cl_mem);
extern cl_int cl_mem_unpin(cl_mem);
+extern cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+ cl_context ctx,
+ cl_mem_flags flags,
+ size_t sz,
+ cl_int is_tiled,
+ cl_int *errcode);
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+ void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+ const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+ const struct _cl_mem_image *image);
+
#endif /* __CL_MEM_H__ */
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index f247171..28d2ac6 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -37,191 +37,46 @@
#include "CL/cl_intel.h"
#include "CL/cl_gl.h"
-#ifndef CL_VERSION_1_2
-#define CL_INVALID_IMAGE_DESCRIPTOR -65
-#endif
-static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
-{
- cl_int ret = CL_SUCCESS;
-
- switch (tex_format) {
- case GL_RGBA8:
- case GL_RGBA:
- case GL_RGBA16:
- case GL_RGBA8I:
- case GL_RGBA16I:
- case GL_RGBA32I:
- case GL_RGBA8UI:
- case GL_RGBA16UI:
- case GL_RGBA32UI:
- case GL_RGBA16F:
- case GL_RGBA32F:
- cl_format->image_channel_order = CL_RGBA;
- break;
- case GL_BGRA:
- cl_format->image_channel_order = CL_BGRA;
- break;
- default:
- ret = CL_INVALID_IMAGE_DESCRIPTOR;
- goto error;
- }
-
- switch (tex_format) {
- case GL_RGBA8:
- case GL_RGBA:
- case GL_BGRA:
- cl_format->image_channel_data_type = CL_UNORM_INT8;
- break;
- case GL_RGBA16:
- cl_format->image_channel_data_type = CL_UNORM_INT16;
- break;
- case GL_RGBA8I:
- cl_format->image_channel_data_type = CL_SIGNED_INT8;
- break;
- case GL_RGBA16I:
- cl_format->image_channel_data_type = CL_SIGNED_INT16;
- break;
- case GL_RGBA32I:
- cl_format->image_channel_data_type = CL_SIGNED_INT32;
- break;
- case GL_RGBA8UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
- break;
- case GL_RGBA16UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
- break;
- case GL_RGBA32UI:
- cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
- break;
- case GL_RGBA16F:
- cl_format->image_channel_data_type = CL_HALF_FLOAT;
- break;
- case GL_RGBA32F:
- cl_format->image_channel_order = CL_FLOAT;
- break;
- default:
- ret = CL_INVALID_IMAGE_DESCRIPTOR;
- goto error;
- }
-
-error:
- return ret;
-}
-
-static cl_mem_object_type
-get_mem_type_from_target(GLenum texture_target)
-{
- switch(texture_target) {
- case GL_TEXTURE_1D: return CL_MEM_OBJECT_IMAGE1D;
- case GL_TEXTURE_2D: return CL_MEM_OBJECT_IMAGE2D;
- case GL_TEXTURE_3D: return CL_MEM_OBJECT_IMAGE3D;
- case GL_TEXTURE_1D_ARRAY: return CL_MEM_OBJECT_IMAGE1D_ARRAY;
- case GL_TEXTURE_2D_ARRAY: return CL_MEM_OBJECT_IMAGE2D_ARRAY;
- default:
- assert(0);
- }
- return 0;
-}
-
-LOCAL cl_mem cl_mem_new_gl_buffer(cl_context ctx,
- cl_mem_flags flags,
- GLuint buf_obj,
- cl_int *errcode_ret)
+LOCAL cl_mem
+cl_mem_new_gl_buffer(cl_context ctx,
+ cl_mem_flags flags,
+ GLuint buf_obj,
+ cl_int *errcode_ret)
{
NOT_IMPLEMENTED;
}
-EGLImageKHR cl_create_textured_egl_image(cl_context ctx,
- GLenum texture_target,
- GLint miplevel,
- GLuint texture)
-{
- struct cl_gl_ext_deps *egl_funcs;
- EGLDisplay egl_display;
- EGLContext egl_context;
- EGLint egl_attribs[] = { EGL_GL_TEXTURE_LEVEL_KHR, miplevel, EGL_NONE};
-
- assert(ctx->props.gl_type == CL_GL_EGL_DISPLAY);
- egl_funcs = CL_EXTENSION_GET_FUNCS(ctx, khr_gl_sharing, gl_ext_deps);
- assert(egl_funcs != NULL);
- egl_display = (EGLDisplay)ctx->props.egl_display;
- egl_context = (EGLDisplay)ctx->props.gl_context;
- return egl_funcs->eglCreateImageKHR_func(egl_display, egl_context,
- EGL_GL_TEXTURE_2D_KHR,
- (EGLClientBuffer)(uintptr_t)texture,
- &egl_attribs[0]);
-}
-
-LOCAL cl_mem cl_mem_new_gl_texture(cl_context ctx,
- cl_mem_flags flags,
- GLenum texture_target,
- GLint miplevel,
- GLuint texture,
- cl_int *errcode_ret)
+LOCAL cl_mem
+cl_mem_new_gl_texture(cl_context ctx,
+ cl_mem_flags flags,
+ GLenum texture_target,
+ GLint miplevel,
+ GLuint texture,
+ cl_int *errcode_ret)
{
cl_int err = CL_SUCCESS;
cl_mem mem = NULL;
- EGLImageKHR egl_image;
- int w, h, pitch, tiling;
- unsigned int bpp, intel_fmt;
- cl_image_format cl_format;
- unsigned int gl_format;
/* Check flags consistency */
if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR)) {
err = CL_INVALID_ARG_VALUE;
goto error;
}
- TRY_ALLOC (mem, CALLOC(struct _cl_mem));
- mem->ctx = ctx;
- cl_context_add_ref(ctx);
-
- egl_image = cl_create_textured_egl_image(ctx, texture_target, miplevel, texture);
-
- if (egl_image == NULL) {
- err = CL_INVALID_GL_OBJECT;
+ mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+ if (mem == NULL || err != CL_SUCCESS)
goto error;
- }
- mem->egl_image = egl_image;
- mem->bo = cl_buffer_alloc_from_eglimage(ctx, (void*)egl_image, &gl_format, &w, &h, &pitch, &tiling);
+
+ mem->bo = cl_buffer_alloc_from_texture(ctx, texture_target, miplevel,
+ texture, cl_mem_image(mem));
if (UNLIKELY(mem->bo == NULL)) {
err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
goto error;
}
- cl_get_clformat_from_texture(gl_format, &cl_format);
-
- /* XXX Maybe we'd better to check the hw format in driver? */
- intel_fmt = cl_image_get_intel_format(&cl_format);
-
- if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
- err = CL_INVALID_IMAGE_DESCRIPTOR;
- goto error;
- }
- cl_image_byte_per_pixel(&cl_format, &bpp);
-
- mem->type = get_mem_type_from_target(texture_target);
- mem->w = w;
- mem->h = h;
- mem->depth = 1;
- mem->fmt = cl_format;
- mem->intel_fmt = intel_fmt;
- mem->bpp = bpp;
- mem->is_image = 1;
- mem->row_pitch = pitch;
- mem->slice_pitch = 0;
- mem->tiling = tiling;
- mem->ref_n = 1;
- mem->magic = CL_MAGIC_MEM_HEADER;
- mem->flags = flags;
- /* Append the buffer in the context buffer list */
- pthread_mutex_lock(&ctx->buffer_lock);
- mem->next = ctx->buffers;
- if (ctx->buffers != NULL)
- ctx->buffers->prev = mem;
- ctx->buffers = mem;
- pthread_mutex_unlock(&ctx->buffer_lock);
+ cl_mem_gl_image(mem)->target = texture_target;
+ cl_mem_gl_image(mem)->miplevel = miplevel;
+ cl_mem_gl_image(mem)->texture = texture;
exit:
if (errcode_ret)
@@ -234,10 +89,9 @@ error:
}
-LOCAL void cl_mem_gl_delete(cl_mem mem)
+LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
{
- struct cl_gl_ext_deps *egl_funcs;
- EGLDisplay egl_display = (EGLDisplay)mem->ctx->props.egl_display;
- egl_funcs = CL_EXTENSION_GET_FUNCS(mem->ctx, khr_gl_sharing, gl_ext_deps);
- egl_funcs->eglDestroyImageKHR_func(egl_display, mem->egl_image);
+ if (gl_image->base.base.bo != NULL)
+ cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
+ gl_image->miplevel, gl_image->texture);
}
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index 33915ce..fdf0d78 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -28,7 +28,7 @@
#define DECL_INFO_STRING(FIELD, STRING) \
.FIELD = STRING, \
- .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
+ .JOIN(FIELD,_sz) = sizeof(STRING),
static struct _cl_platform_id intel_platform_data = {
INIT_ICD(dispatch)
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index b8f7d61..6b70aee 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -62,9 +62,10 @@ extern cl_int cl_get_platform_info(cl_platform_id platform,
#define _STR(x) #x
#define _JOINT(x, y) _STR(x) "." _STR(y)
-#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
-#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
+
#define LIBCL_DRIVER_VERSION_STRING _JOINT(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR)
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
#endif /* __CL_PLATFORM_ID_H__ */
diff --git a/src/cl_program.c b/src/cl_program.c
index 7870514..a0e0104 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -42,6 +42,15 @@ cl_program_release_sources(cl_program p)
}
}
+/* Free the binary image stored in the program, if there is one, and
+ * clear the pointer so that a second call finds nothing to free. */
+static void
+cl_program_release_binary(cl_program p)
+{
+  if (p->binary == NULL)
+    return;
+
+  cl_free(p->binary);
+  p->binary = NULL;
+}
+
LOCAL void
cl_program_delete(cl_program p)
{
@@ -53,8 +62,9 @@ cl_program_delete(cl_program p)
/* We are not done with it yet */
if ((ref = atomic_dec(&p->ref_n)) > 1) return;
- /* Destroy the sources if still allocated */
+ /* Destroy the sources and binary if still allocated */
cl_program_release_sources(p);
+ cl_program_release_binary(p);
/* Release the build options. */
if (p->build_opts) {
@@ -149,7 +159,6 @@ cl_program_create_from_binary(cl_context ctx,
cl_int * binary_status,
cl_int * errcode_ret)
{
-#if 0
cl_program program = NULL;
cl_int err = CL_SUCCESS;
@@ -174,7 +183,16 @@ cl_program_create_from_binary(cl_context ctx,
goto error;
}
- // TRY_ALLOC (program, cl_program_new(ctx, (const char *) binaries[0], lengths[0]));
+ program = cl_program_new(ctx);
+
+ // TODO: Need to check the binary format here to return CL_INVALID_BINARY.
+ TRY_ALLOC(program->binary, cl_calloc(lengths[0], sizeof(char)));
+ memcpy(program->binary, binaries[0], lengths[0]);
+ program->binary_sz = lengths[0];
+ program->source_type = FROM_BINARY;
+
+ if (binary_status)
+ binary_status[0] = CL_SUCCESS;
exit:
if (errcode_ret)
@@ -184,8 +202,7 @@ error:
cl_program_delete(program);
program = NULL;
goto exit;
-#endif
- NOT_IMPLEMENTED;
+
return CL_SUCCESS;
}
@@ -303,6 +320,16 @@ cl_program_build(cl_program p, const char *options)
/* Create all the kernels */
TRY (cl_program_load_gen_program, p);
p->source_type = FROM_LLVM;
+ } else if (p->source_type == FROM_BINARY) {
+ p->opaque = gbe_program_new_from_binary(p->binary, p->binary_sz);
+ if (UNLIKELY(p->opaque == NULL)) {
+ err = CL_INVALID_PROGRAM;
+ goto error;
+ }
+
+ /* Create all the kernels */
+ TRY (cl_program_load_gen_program, p);
+ p->source_type = FROM_LLVM;
}
for (i = 0; i < p->ker_n; i ++) {
diff --git a/src/cl_program.h b/src/cl_program.h
index 996a496..de82fd5 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -48,6 +48,8 @@ struct _cl_program {
char *bin; /* The program copied verbatim */
size_t bin_sz; /* Its size in memory */
char *source; /* Program sources */
+ char *binary; /* Program binary. */
+ size_t binary_sz; /* The binary size. */
uint32_t ker_n; /* Number of declared kernels */
uint32_t source_type:2; /* Built from binary, source or LLVM */
uint32_t is_built:1; /* Did we call clBuildProgram on it? */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index 7e0b7b0..b3f7045 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -49,7 +49,7 @@ uint32_t cl_to_clk(cl_bool normalized_coords,
}
return (clk_address << __CLK_ADDRESS_BASE)
| (normalized_coords << __CLK_NORMALIZED_BASE)
- | (clk_filter << __CLK_FILTER_BASE);
+ | (clk_filter);
}
#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
diff --git a/src/cl_utils.h b/src/cl_utils.h
index bfe418d..fa900a7 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -138,19 +138,21 @@ do { \
} \
} while (0)
-#define CHECK_IMAGE(IMAGE) \
-CHECK_MEM(image); \
+#define CHECK_IMAGE(MEM, IMAGE) \
+CHECK_MEM(MEM); \
do { \
- if (UNLIKELY(!IMAGE->is_image)) { \
+ if (UNLIKELY(!IS_IMAGE(MEM))) { \
err = CL_INVALID_MEM_OBJECT; \
goto error; \
} \
-} while (0)
+} while (0); \
+struct _cl_mem_image *IMAGE; \
+IMAGE = cl_mem_image(MEM); \
#define CHECK_EVENT(EVENT) \
do { \
if (UNLIKELY(EVENT == NULL)) { \
- err = CL_INVALID_EVENT; \
+ err = CL_INVALID_EVENT; \
goto error; \
} \
if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) { \
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
new file mode 100644
index 0000000..b31844e
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#define HAVE_PTHREAD 1
+#include <errno.h>
+#include <time.h>
+#include "main/context.h"
+#include "main/renderbuffer.h"
+#include "main/texobj.h"
+#include <stdbool.h>
+#include <string.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <GL/internal/dri_interface.h>
+#include "intel_mipmap_tree.h"
+#include "intel_regions.h"
+#include "intel_context.h"
+
+#include "intel_dri_resource_sharing.h"
+#include "intel_dri_resource_sharing_int.h"
+
+#include <dlfcn.h>
+/**
+ * Describe one level/slice of a miptree in a shared image region.
+ *
+ * Fills in width, height, tiling, row pitch, the intra-tile x/y offsets
+ * and the tile-aligned byte offset of the image at (level, zoffset),
+ * stores the region's flink name in region->name and takes one extra
+ * reference on mt->region. depth is hard coded to 1. The pitch,
+ * slice_pitch and gl_format fields are not written here, and the driver
+ * and intel arguments are not used.
+ *
+ * Returns false, without taking the extra reference, when no flink name
+ * could be obtained; returns true otherwise.
+ */
+static bool
+intel_setup_cl_region_from_mipmap_tree(void *driver,
+                                       struct intel_context *intel,
+                                       struct intel_mipmap_tree *mt,
+                                       GLuint level, GLuint zoffset,
+                                       struct _intel_dri_share_image_region *region)
+{
+   unsigned int draw_x, draw_y;
+   uint32_t mask_x, mask_y;
+   /* Scratch handle: referencing mt->region into it bumps the refcount. */
+   struct intel_region *null_region = (struct intel_region *)NULL;
+
+   intel_miptree_check_level_layer(mt, level, zoffset);
+
+   _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
+   _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
+
+   region->w = mt->level[level].width;
+   region->h = mt->level[level].height;
+   /* The masked bits are the position inside a tile ... */
+   region->tile_x = draw_x & mask_x;
+   region->tile_y = draw_y & mask_y;
+   region->tiling = mt->region->tiling;
+   /* XXX hard code to 1 right now. */
+   region->depth = 1;
+   region->row_pitch = mt->region->pitch;
+
+   /* ... and the remaining bits select the tile-aligned start offset. */
+   region->offset = _intel_region_get_aligned_offset(mt->region,
+                                                     draw_x & ~mask_x,
+                                                     draw_y & ~mask_y,
+                                                     false);
+   if (!_intel_region_flink(mt->region, &region->name))
+      return false;
+   _intel_region_reference(&null_region, mt->region);
+   return true;
+}
+
+typedef void
+_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
+ struct gl_texture_object *t );
+_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
+
+typedef struct gl_texture_object *
+_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
+_mesa_lookup_texture_t *__mesa_lookup_texture;
+
+/**
+ * Look up a GL texture object by name and check that it can be shared.
+ *
+ * The Mesa entry points are resolved with dlsym() from the driver handle
+ * on every call and cached in the file-level function pointers.
+ *
+ * Returns NULL when a Mesa symbol cannot be resolved, when the texture
+ * does not exist or has a different target, when it is not base complete
+ * (or not mipmap complete for level > 0), or when level lies outside
+ * [BaseLevel, _MaxLevel]. The face argument is not used.
+ */
+static struct gl_texture_object *
+intel_get_gl_obj_from_texture(void *driver,
+                              struct intel_context *intel,
+                              GLenum target, GLint level,
+                              GLuint texture, GLuint face)
+{
+   struct gl_texture_object *obj;
+
+   __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
+   /* dlsym() yields NULL for a missing symbol; never call through it. */
+   if (__mesa_lookup_texture == NULL)
+      return NULL;
+
+   obj = __mesa_lookup_texture(&intel->ctx, texture);
+   if (!obj || obj->Target != target) {
+      return NULL;
+   }
+
+   __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
+   if (__mesa_test_texobj_completeness == NULL)
+      return NULL;
+
+   __mesa_test_texobj_completeness(&intel->ctx, obj);
+   if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete)) {
+      return NULL;
+   }
+
+   if (level < obj->BaseLevel || level > obj->_MaxLevel) {
+      return NULL;
+   }
+
+   return obj;
+}
+
+/* Map a Mesa texture format to the GL format reported for sharing.
+ * Only MESA_FORMAT_RGBA8888 yields GL_RGBA; every other format,
+ * MESA_FORMAT_ARGB8888 included, is reported as GL_BGRA. */
+static GLenum
+get_cl_gl_format(gl_format format)
+{
+   if (format == MESA_FORMAT_RGBA8888)
+      return GL_RGBA;
+
+   return GL_BGRA;
+}
+
+/* Acquire callback for textures: validates the texture, then fills the
+ * shared image region passed in user_data with its GL format and layout.
+ * Returns false when the texture is rejected or the region cannot be
+ * set up. */
+static bool
+intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture, void *user_data)
+{
+   /* XXX Always be face 0? */
+   const GLuint face = 0;
+   struct intel_context *intel = context->driverPrivate;
+   struct _intel_dri_share_image_region *shared =
+      intel_dri_share_image_region(user_data);
+   struct gl_texture_object *tex_obj =
+      intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+
+   if (tex_obj == NULL)
+      return false;
+
+   shared->gl_format = get_cl_gl_format(tex_obj->Image[face][level]->TexFormat);
+   return intel_setup_cl_region_from_mipmap_tree(driver, intel,
+                                                 intel_texture_object(tex_obj)->mt,
+                                                 level, 0, shared);
+}
+
+/* Release callback for textures: drops one reference on the region
+ * behind the texture's miptree. Returns false when the texture is
+ * rejected by the lookup, true otherwise. */
+static bool
+intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture)
+{
+   struct intel_context *intel = context->driverPrivate;
+   struct gl_texture_object *obj;
+   struct intel_texture_object *iobj;
+   struct intel_region *region;
+   /* XXX Always be face 0? */
+   GLuint face = 0;
+
+   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+   if (obj == NULL)
+      return false;
+
+   iobj = intel_texture_object(obj);
+   /* Release through a local handle: _intel_region_release() clears the
+    * handle it is given, and the miptree must keep its own pointer to a
+    * region that is still referenced. */
+   region = iobj->mt->region;
+   _intel_region_release(&region);
+   return true;
+}
+
+/* Placeholder acquire callback for GL buffer objects: performs no work
+ * and always returns false. */
+static bool
+intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
+ GLuint bufobj, void *user_data)
+{
+ return false;
+}
+
+/* Placeholder release callback for GL buffer objects: performs no work
+ * and always returns false. */
+static bool
+intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+ return false;
+}
+
+/* Placeholder acquire callback for render buffers: performs no work and
+ * always returns false. */
+static bool
+intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
+ GLuint bufobj, void *user_data)
+{
+ return false;
+}
+
+/* Placeholder release callback for render buffers: performs no work and
+ * always returns false. */
+static bool
+intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+ return false;
+}
+
+#include "cl_driver.h"
+/* Install this file's acquire/release functions into the CL/GL sharing
+ * callback pointers. */
+void
+intel_set_cl_gl_callbacks(void)
+{
+  /* Acquire side. */
+  cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
+  cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
+  cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
+
+  /* Release side. */
+  cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
+  cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
+  cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
+}
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
new file mode 100644
index 0000000..6d2ce4d
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.h
@@ -0,0 +1,39 @@
+#ifndef __INTEL_DRI_RESOURCE_SHARING_H__
+#define __INTEL_DRI_RESOURCE_SHARING_H__
+
+/* Description of a GL image handed from the DRI driver to OpenCL. */
+struct _intel_dri_share_image_region {
+ unsigned int name; /* flink name of the region backing the image */
+ size_t w; /* width of the selected mip level */
+ size_t h; /* height of the selected mip level */
+ size_t depth; /* currently always set to 1 by the texture path */
+ size_t pitch; /* NOTE(review): not written by the texture acquire path */
+ int tiling; /* tiling mode copied from the region */
+ size_t offset; /* tile-aligned offset of the image inside the region */
+ size_t tile_x; /* x offset of the image inside its tile */
+ size_t tile_y; /* y offset of the image inside its tile */
+ unsigned int gl_format; /* GL format enum (GL_RGBA or GL_BGRA on the texture path) */
+ size_t row_pitch, slice_pitch; /* row_pitch is the region pitch; slice_pitch is not written by the texture acquire path */
+};
+
+/* Description of a GL buffer object to be shared.
+ * NOTE(review): field meanings are inferred from their names; the
+ * buffer-object callbacks do not fill this structure yet. */
+struct _intel_dri_share_buffer_object {
+ unsigned int name; /* presumably the flink name of the buffer - confirm */
+ size_t sz; /* presumably the size of the buffer - confirm */
+ size_t offset; /* presumably an offset into the buffer - confirm */
+};
+
+/* View an opaque user_data pointer as a shared image region. */
+inline static struct _intel_dri_share_image_region *
+intel_dri_share_image_region(void *user_data)
+{
+  struct _intel_dri_share_image_region *image_region =
+    (struct _intel_dri_share_image_region *)user_data;
+
+  return image_region;
+}
+
+/* View an opaque user_data pointer as a shared buffer object. */
+inline static struct _intel_dri_share_buffer_object *
+intel_dri_share_buffer_object(void *user_data)
+{
+  struct _intel_dri_share_buffer_object *buffer_object =
+    (struct _intel_dri_share_buffer_object *)user_data;
+
+  return buffer_object;
+}
+
+extern void intel_set_cl_gl_callbacks(void);
+
+
+#endif
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
new file mode 100644
index 0000000..c7b283a
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing_int.h
@@ -0,0 +1,143 @@
+/*****************************************************************
+ * The following functions are copied from i965 driver, commit
+ * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. Need to keep consistent
+ * with the dri driver installed on current system.
+ *****************************************************************/
+/**
+ * Fetch the global (flink) name of a region's buffer object.
+ *
+ * The name is requested from libdrm only while region->name is still 0
+ * and is stored in the region, so later calls reuse it.
+ *
+ * Returns false when drm_intel_bo_flink() reports an error (*name is
+ * then left untouched); otherwise stores the name in *name and returns
+ * true.
+ */
+static bool
+_intel_region_flink(struct intel_region *region, uint32_t *name)
+{
+   if (region->name == 0) {
+      if (drm_intel_bo_flink(region->bo, &region->name))
+         return false;
+   }
+
+   *name = region->name;
+
+   return true;
+}
+
+#define _DBG(...)
+/**
+ * Drop one reference on the region behind *region_handle.
+ *
+ * A NULL region is ignored. When the count reaches zero the buffer
+ * object is unreferenced and the region is freed. In every non-NULL
+ * case the caller's handle is reset to NULL, also when other references
+ * remain.
+ */
+static void
+_intel_region_release(struct intel_region **region_handle)
+{
+ struct intel_region *region = *region_handle;
+
+ if (region == NULL) {
+ _DBG("%s NULL\n", __FUNCTION__);
+ return;
+ }
+
+ _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+
+ ASSERT(region->refcount > 0);
+ region->refcount--;
+
+ if (region->refcount == 0) {
+ drm_intel_bo_unreference(region->bo);
+
+ free(region);
+ }
+ /* The handle is cleared whether or not the region was freed. */
+ *region_handle = NULL;
+}
+
+/**
+ * Make *dst refer to src, adjusting reference counts.
+ *
+ * Nothing happens when *dst already equals src. Otherwise the region
+ * previously held in *dst (if any) is released, src gains one reference
+ * when it is non-NULL, and *dst is set to src.
+ */
+static void
+_intel_region_reference(struct intel_region **dst, struct intel_region *src)
+{
+ _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
+ *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
+
+ if (src != *dst) {
+ if (*dst)
+ _intel_region_release(dst);
+
+ if (src)
+ src->refcount++;
+ *dst = src;
+ }
+}
+
+/**
+ * Compute the masks that select, from an X and a Y coordinate, the bits
+ * giving the position inside a tile. Both masks are 0 for an untiled
+ * region. When map_stencil_as_y_tiled is set the region is treated as
+ * Y tiled regardless of its own tiling mode.
+ */
+static void
+_intel_region_get_tile_masks(struct intel_region *region,
+                             uint32_t *mask_x, uint32_t *mask_y,
+                             bool map_stencil_as_y_tiled)
+{
+   const int bytes_per_pixel = region->cpp;
+   uint32_t mode = region->tiling;
+
+   if (map_stencil_as_y_tiled)
+      mode = I915_TILING_Y;
+
+   if (mode == I915_TILING_X) {
+      *mask_x = 512 / bytes_per_pixel - 1;
+      *mask_y = 7;
+   } else if (mode == I915_TILING_Y) {
+      *mask_x = 128 / bytes_per_pixel - 1;
+      *mask_y = 31;
+   } else {
+      /* Unknown modes trip the assertion and are otherwise handled
+       * like an untiled region. */
+      if (mode != I915_TILING_NONE)
+         assert(false);
+      *mask_x = *mask_y = 0;
+   }
+}
+
+/**
+ * Compute the offset (in bytes) from the start of the region to the given x
+ * and y coordinate. For tiled regions, caller must ensure that x and y are
+ * multiples of the tile size.
+ *
+ * When map_stencil_as_y_tiled is set the region is treated as Y tiled and
+ * its pitch is doubled before the offset is computed. An unknown tiling
+ * mode trips an assertion and is otherwise handled like an untiled region.
+ */
+static uint32_t
+_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+ uint32_t y, bool map_stencil_as_y_tiled)
+{
+ int cpp = region->cpp;
+ uint32_t pitch = region->pitch;
+ uint32_t tiling = region->tiling;
+
+ if (map_stencil_as_y_tiled) {
+ tiling = I915_TILING_Y;
+
+ /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
+ * gets transformed into a 32-high Y-tile. Accordingly, the pitch of
+ * the resulting region is twice the pitch of the original region, since
+ * each row in the Y-tiled view corresponds to two rows in the actual
+ * W-tiled surface. So we need to correct the pitch before computing
+ * the offsets.
+ */
+ pitch *= 2;
+ }
+
+ switch (tiling) {
+ default:
+ assert(false);
+ /* deliberate fall through to the untiled case */
+ case I915_TILING_NONE:
+ return y * pitch + x * cpp;
+ case I915_TILING_X:
+ assert((x % (512 / cpp)) == 0);
+ assert((y % 8) == 0);
+ /* x counts pixels: 512 / cpp pixels per tile row, 4096 bytes per tile */
+ return y * pitch + x / (512 / cpp) * 4096;
+ case I915_TILING_Y:
+ assert((x % (128 / cpp)) == 0);
+ assert((y % 32) == 0);
+ /* x counts pixels: 128 / cpp pixels per tile row, 4096 bytes per tile */
+ return y * pitch + x / (128 / cpp) * 4096;
+ }
+}
+
+/**
+ * Read the x/y offset recorded in the miptree for the given slice of the
+ * given level into *x and *y. The slice index must be smaller than the
+ * depth of that level (asserted).
+ */
+static void
+_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
+ GLuint level, GLuint slice,
+ GLuint *x, GLuint *y)
+{
+ assert(slice < mt->level[level].depth);
+
+ *x = mt->level[level].slice[slice].x_offset;
+ *y = mt->level[level].slice[slice].y_offset;
+}
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 6c6b9fb..cc33914 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -45,6 +45,13 @@
* Zou Nan hai <nanhai.zou at intel.com>
*
*/
+
+#if defined(HAS_EGL)
+#include "GL/gl.h"
+#include "EGL/egl.h"
+#include "x11/mesa_egl_extension.h"
+#endif
+
#include "intel_driver.h"
#include "intel_gpgpu.h"
#include "intel_batchbuffer.h"
@@ -65,6 +72,8 @@
#include "cl_alloc.h"
#include "cl_context.h"
#include "cl_driver.h"
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
#define SET_BLOCKED_SIGSET(DRIVER) do { \
sigset_t bl_mask; \
@@ -169,6 +178,7 @@ static void
intel_driver_open(intel_driver_t *intel, cl_context_prop props)
{
int cardi;
+ char *driver_name;
if (props != NULL
&& props->gl_type != CL_GL_NOSHARE
&& props->gl_type != CL_GL_GLX_DISPLAY
@@ -182,7 +192,7 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
if(intel->x11_display) {
if((intel->dri_ctx = getDRI2State(intel->x11_display,
DefaultScreen(intel->x11_display),
- NULL)))
+ &driver_name)))
intel_driver_init_shared(intel, intel->dri_ctx);
else
printf("X server found. dri2 connection failed! \n");
@@ -206,15 +216,9 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
exit(-1);
}
-#if defined(HAS_GBM) && defined(HAS_EGL)
+#ifdef HAS_EGL
if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
assert(props->egl_display);
- intel->gbm = gbm_create_device(intel->fd);
- if (intel->gbm == NULL) {
- printf("GBM device create failed.\n");
- exit(-1);
- }
- cl_gbm_set_image_extension(intel->gbm, (void*)props->egl_display);
}
#endif
}
@@ -222,9 +226,6 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
static void
intel_driver_close(intel_driver_t *intel)
{
-#ifdef HAS_GBM
- if(intel->gbm) gbm_device_destroy(intel->gbm);
-#endif
if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
if(intel->x11_display) XCloseDisplay(intel->x11_display);
if(intel->fd) close(intel->fd);
@@ -325,11 +326,11 @@ intel_driver_unlock_hardware(intel_driver_t *driver)
}
+/* Open the buffer object with the given global name through the driver's
+ * buffer manager; sname is passed on as the name given to the new handle.
+ * The driver must not be in master mode (asserted). */
LOCAL dri_bo*
-intel_driver_share_buffer(intel_driver_t *driver, uint32_t name)
+intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
{
assert(!driver->master);
dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
- "rendering buffer",
+ sname,
name);
return bo;
}
@@ -380,7 +381,7 @@ cl_intel_driver_new(cl_context_prop props)
/* We use the first 2 slots(0,1) for all the bufs.
* Notify the gbe this base index, thus gbe can avoid conflicts
* when it allocates slots for images*/
- gbe_set_image_base_index(2);
+ gbe_set_image_base_index(3);
exit:
return driver;
error:
@@ -404,11 +405,9 @@ intel_driver_get_ver(struct intel_driver *drv)
static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
-#if defined(HAS_EGL) && defined(HAS_GBM)
-#include "gbm.h"
-#include "GL/gl.h"
-#include "EGL/egl.h"
-#include "EGL/eglext.h"
+#if defined(HAS_EGL)
+#include "intel_dri_resource_sharing.h"
+#include "cl_image.h"
static int get_cl_tiling(uint32_t drm_tiling)
{
switch(drm_tiling) {
@@ -421,50 +420,166 @@ static int get_cl_tiling(uint32_t drm_tiling)
return CL_NO_TILE;
}
-static unsigned int get_gl_format(uint32_t gbm_format)
+/* Translate a GL texture format into an OpenCL image format.
+ * The first switch selects the channel order, the second the channel
+ * data type. Returns CL_SUCCESS, or -1 for a format that either switch
+ * does not list. */
+static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
{
- switch(gbm_format) {
- case GBM_FORMAT_ARGB8888: return GL_BGRA;
- case GBM_FORMAT_ABGR8888: return GL_RGBA;
+ cl_int ret = CL_SUCCESS;
+
+ switch (tex_format) {
+ case GL_RGBA8:
+ case GL_RGBA:
+ case GL_RGBA16:
+ case GL_RGBA8I:
+ case GL_RGBA16I:
+ case GL_RGBA32I:
+ case GL_RGBA8UI:
+ case GL_RGBA16UI:
+ case GL_RGBA32UI:
+ case GL_RGBA16F:
+ case GL_RGBA32F:
+ cl_format->image_channel_order = CL_RGBA;
+ break;
+ case GL_BGRA:
+ cl_format->image_channel_order = CL_BGRA;
+ break;
default:
- NOT_IMPLEMENTED;
+ ret = -1;
+ goto error;
}
- return 0;
+
+ switch (tex_format) {
+ case GL_RGBA8:
+ case GL_RGBA:
+ case GL_BGRA:
+ cl_format->image_channel_data_type = CL_UNORM_INT8;
+ break;
+ case GL_RGBA16:
+ cl_format->image_channel_data_type = CL_UNORM_INT16;
+ break;
+ case GL_RGBA8I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT8;
+ break;
+ case GL_RGBA16I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT16;
+ break;
+ case GL_RGBA32I:
+ cl_format->image_channel_data_type = CL_SIGNED_INT32;
+ break;
+ case GL_RGBA8UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+ break;
+ case GL_RGBA16UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+ break;
+ case GL_RGBA32UI:
+ cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+ break;
+ case GL_RGBA16F:
+ cl_format->image_channel_data_type = CL_HALF_FLOAT;
+ break;
+ case GL_RGBA32F:
+ /* CL_FLOAT is a channel data type; the order was set to CL_RGBA above. */
+ cl_format->image_channel_data_type = CL_FLOAT;
+ break;
+ default:
+ ret = -1;
+ goto error;
+ }
+
+error:
+ return ret;
}
-cl_buffer intel_alloc_buffer_from_eglimage(cl_context ctx,
- void* image,
- unsigned int *gl_format,
- int *w, int *h, int *pitch,
- int *tiling)
+/* Translate a GL texture target into the matching OpenCL memory object
+ * type, stored in *type. Returns CL_SUCCESS, or -1 (leaving *type
+ * untouched) for a target that is not listed. */
+static int
+get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
{
- struct gbm_bo *bo;
- uint32_t gbm_format;
- drm_intel_bo *intel_bo;
- int32_t name;
- uint32_t drm_tiling, swizzle;
- EGLImageKHR egl_image = (EGLImageKHR)image;
- intel_driver_t *intel = (intel_driver_t*)ctx->drv;
-
- bo = gbm_bo_import(intel->gbm, GBM_BO_IMPORT_EGL_IMAGE, (void*)egl_image, 0);
-
- *w = gbm_bo_get_width(bo);
- *h = gbm_bo_get_height(bo);
- *pitch = gbm_bo_get_stride(bo);
- gbm_format = gbm_bo_get_format(bo);
- *gl_format = get_gl_format(gbm_format);
- name = cl_gbm_bo_get_name(bo);
-
- intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, name);
-
- if (drm_intel_bo_get_tiling(intel_bo, &drm_tiling, &swizzle)!= 0)
- assert(0);
- *tiling = get_cl_tiling(drm_tiling);
+ switch(texture_target) {
+ case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+ case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+ case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+ case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
+ case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
+ default:
+ return -1;
+ }
+ return CL_SUCCESS;
+}
- gbm_bo_destroy(bo);
+/* Acquire a GL texture level through EGL and wrap it as a CL buffer.
+ * On success the image description is initialised from the region
+ * reported by the acquire call and the shared buffer is returned.
+ * Returns NULL when the acquire fails, the buffer cannot be opened, or
+ * the texture format or target has no CL equivalent; in the last two
+ * cases the acquired resource is released again. */
+static cl_buffer
+intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
+ int miplevel, unsigned int texture,
+ struct _cl_mem_image *image)
+{
+ cl_buffer bo = (cl_buffer) NULL;
+ /* Zero the region first: the acquire callback does not write every
+ * field (slice_pitch for one), and all of them are read below. */
+ struct _intel_dri_share_image_region region = {0};
+ unsigned int bpp, intel_fmt;
+ cl_image_format cl_format;
+ EGLBoolean ret;
+ EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+ EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+ EGL_GL_TEXTURE_TARGET_MESA, target,
+ EGL_NONE};
+ ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
+ EGL_GL_TEXTURE_MESA,
+ &attrib_list[0], &region);
+ if (!ret)
+ goto out;
+
+ bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
+
+ if (bo == NULL) {
+ eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+ goto out;
+ }
+ region.tiling = get_cl_tiling(region.tiling);
+ if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
+ goto error;
+ intel_fmt = cl_image_get_intel_format(&cl_format);
+ if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+ goto error;
+ cl_image_byte_per_pixel(&cl_format, &bpp);
+ cl_mem_object_type image_type;
+ if (get_mem_type_from_target(target, &image_type) != 0)
+ goto error;
+
+ cl_mem_image_init(image, region.w, region.h,
+ image_type, region.depth, cl_format,
+ intel_fmt, bpp, region.row_pitch,
+ region.slice_pitch, region.tiling,
+ region.tile_x, region.tile_y, region.offset);
+out:
+ return bo;
- return (cl_buffer)intel_bo;
+error:
+ cl_buffer_unreference(bo);
+ eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+ return NULL;
+}
+/* Create a CL buffer from a GL texture. Only EGL contexts are handled;
+ * any other context yields NULL. */
+static cl_buffer
+intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
+                                int miplevel, unsigned int texture,
+                                struct _cl_mem_image *image)
+{
+  if (!IS_EGL_CONTEXT(ctx))
+    return NULL;
+
+  return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+}
+
+/* Give a GL texture level acquired for CL back to EGL. Returns
+ * CL_SUCCESS for an EGL context and -1 for any other context; the
+ * result of the EGL release call itself is not checked. */
+static int
+intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
+                                  int miplevel, unsigned int texture)
+{
+  if (!IS_EGL_CONTEXT(ctx))
+    return -1;
+
+  EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+                           EGL_GL_TEXTURE_TARGET_MESA, target,
+                           EGL_NONE};
+
+  eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+  return CL_SUCCESS;
}
#endif
@@ -510,8 +625,10 @@ intel_setup_callbacks(void)
cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
-#ifdef HAS_EGL
- cl_buffer_alloc_from_eglimage = (cl_buffer_alloc_from_eglimage_cb *) intel_alloc_buffer_from_eglimage;
+#if defined(HAS_EGL)
+ cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
+ cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
+ intel_set_cl_gl_callbacks();
#endif
cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
@@ -519,6 +636,7 @@ intel_setup_callbacks(void)
cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap;
cl_buffer_map_gtt = (cl_buffer_map_gtt_cb *) drm_intel_gem_bo_map_gtt;
cl_buffer_unmap_gtt = (cl_buffer_unmap_gtt_cb *) drm_intel_gem_bo_unmap_gtt;
+ cl_buffer_map_gtt_unsync = (cl_buffer_map_gtt_unsync_cb *) drm_intel_gem_bo_map_unsynchronized;
cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual;
cl_buffer_get_size = (cl_buffer_get_size_cb *) drm_intel_bo_get_size;
cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
@@ -527,4 +645,3 @@ intel_setup_callbacks(void)
cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
intel_set_gpgpu_callbacks();
}
-
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index f70f96a..a01d881 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -54,9 +54,6 @@
#include <drm.h>
#include <i915_drm.h>
#include <intel_bufmgr.h>
-#ifdef HAS_GBM
-#include <gbm.h>
-#endif
#define CMD_MI (0x0 << 29)
#define CMD_2D (0x2 << 29)
@@ -90,9 +87,6 @@ typedef struct intel_driver
int master;
Display *x11_display;
struct dri_state *dri_ctx;
-#ifdef HAS_GBM
- struct gbm_device *gbm;
-#endif
} intel_driver_t;
/* device control */
@@ -100,7 +94,7 @@ extern void intel_driver_lock_hardware(intel_driver_t*);
extern void intel_driver_unlock_hardware(intel_driver_t*);
/* methods working in shared mode */
-extern dri_bo* intel_driver_share_buffer(intel_driver_t*, uint32_t name);
+extern dri_bo* intel_driver_share_buffer(intel_driver_t*, const char *sname, uint32_t name);
extern uint32_t intel_driver_shared_name(intel_driver_t*, dri_bo*);
/* init driver shared with X using dri state, acquired from X Display */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 1301b66..5d93a67 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -79,7 +79,7 @@ struct intel_gpgpu
intel_batchbuffer_t *batch;
cl_gpgpu_kernel *ker;
drm_intel_bo *binded_buf[max_buf_n]; /* all buffers binded for the call */
- uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
+ uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer */
uint32_t binded_n; /* number of buffers binded */
unsigned long img_bitmap; /* image usage bitmap. */
@@ -94,8 +94,10 @@ struct intel_gpgpu
struct { drm_intel_bo *bo; } vfe_state_b;
struct { drm_intel_bo *bo; } curbe_b;
struct { drm_intel_bo *bo; } sampler_state_b;
+ struct { drm_intel_bo *bo; } sampler_border_color_state_b;
struct { drm_intel_bo *bo; } perf_b;
struct { drm_intel_bo *bo; } scratch_b;
+ struct { drm_intel_bo *bo; } constant_b;
uint32_t per_thread_scratch;
struct {
@@ -131,6 +133,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
drm_intel_bo_unreference(gpgpu->curbe_b.bo);
if (gpgpu->sampler_state_b.bo)
drm_intel_bo_unreference(gpgpu->sampler_state_b.bo);
+ if (gpgpu->sampler_border_color_state_b.bo)
+ drm_intel_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
if (gpgpu->perf_b.bo)
drm_intel_bo_unreference(gpgpu->perf_b.bo);
if (gpgpu->stack_b.bo)
@@ -138,6 +142,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
if (gpgpu->scratch_b.bo)
drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+ if(gpgpu->constant_b.bo)
+ drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
intel_batchbuffer_delete(gpgpu->batch);
cl_free(gpgpu);
}
@@ -197,7 +204,10 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
#else
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
- OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+ * to a valid bound value, otherwise, the border color pointer may be rejected and you
+ * may get incorrect border color. This is a known hardware bug. */
+ OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
#endif /* USE_FULSIM */
@@ -231,7 +241,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
}
static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
{
BEGIN_BATCH(gpgpu->batch, 4);
OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
@@ -284,6 +294,7 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
pc->dw1.render_target_cache_flush_enable = 1;
+ pc->dw1.texture_cache_invalidation_enable = 1;
pc->dw1.cs_stall = 1;
pc->dw1.dc_flush_enable = 1;
ADVANCE_BATCH(gpgpu->batch);
@@ -319,7 +330,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
intel_gpgpu_select_pipeline(gpgpu);
intel_gpgpu_set_base_address(gpgpu);
intel_gpgpu_load_vfe_state(gpgpu);
- intel_gpgpu_load_constant_buffer(gpgpu);
+ intel_gpgpu_load_curbe_buffer(gpgpu);
intel_gpgpu_load_idrt(gpgpu);
if (gpgpu->perf_b.bo) {
@@ -372,6 +383,7 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
for (i = 0; i < gpgpu->binded_n; ++i)
assert(gpgpu->binded_buf[i]->offset != 0);
}
+
static void
intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
{
@@ -391,7 +403,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
/* Binded buffers */
gpgpu->binded_n = 0;
gpgpu->img_bitmap = 0;
- gpgpu->img_index_base = 2;
+ gpgpu->img_index_base = 3;
gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
/* URB */
@@ -399,12 +411,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
gpgpu->urb.size_cs_entry = size_cs_entry;
gpgpu->max_threads = max_threads;
- /* Constant buffer */
+ /* Constant URB buffer */
if(gpgpu->curbe_b.bo)
dri_bo_unreference(gpgpu->curbe_b.bo);
uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
size_cb = ALIGN(size_cb, 4096);
- bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
assert(bo);
gpgpu->curbe_b.bo = bo;
@@ -447,6 +459,18 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
memset(bo->virtual, 0, sizeof(gen6_sampler_state_t) * GEN_MAX_SAMPLERS);
gpgpu->sampler_state_b.bo = bo;
+ /* sampler border color state */
+ if (gpgpu->sampler_border_color_state_b.bo)
+ dri_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr,
+ "SAMPLER_BORDER_COLOR_STATE",
+ sizeof(gen7_sampler_border_color_t),
+ 32);
+ assert(bo);
+ dri_bo_map(bo, 1);
+ memset(bo->virtual, 0, sizeof(gen7_sampler_border_color_t));
+ gpgpu->sampler_border_color_state_b.bo = bo;
+
/* stack */
if (gpgpu->stack_b.bo)
dri_bo_unreference(gpgpu->stack_b.bo);
@@ -468,6 +492,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
obj_bo);
}
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+{
+ uint32_t s = size - 1;
+ assert(size != 0);
+
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+ gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+ memset(ss2, 0, sizeof(gen7_surface_state_t));
+ ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */
+ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+ ss2->ss5.cache_control = cc_llc_l3;
+ heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+
+ if(gpgpu->constant_b.bo)
+ dri_bo_unreference(gpgpu->constant_b.bo);
+ gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+ assert(gpgpu->constant_b.bo);
+ ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+ dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ 0,
+ heap->binding_table[2] +
+ offsetof(gen7_surface_state_t, ss1),
+ gpgpu->constant_b.bo);
+ return gpgpu->constant_b.bo;
+}
+
+
/* Map address space with two 2GB surfaces. One surface for untyped message and
* one surface for byte scatters / gathers. Actually the HW does not require two
* surfaces but Fulsim complains
@@ -517,6 +574,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
cl_mem_object_type type,
int32_t w,
int32_t h,
+ int32_t depth,
int32_t pitch,
int32_t tiling)
{
@@ -530,6 +588,9 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
ss->ss1.base_addr = obj_bo->offset;
ss->ss2.width = w - 1;
ss->ss2.height = h - 1;
+ ss->ss3.depth = depth - 1;
+ ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+ ss->ss4.not_str_buf.min_array_element = 0;
ss->ss3.pitch = pitch - 1;
ss->ss5.cache_control = cc_llc_l3;
if (tiling == GPGPU_TILE_X) {
@@ -586,10 +647,11 @@ intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
cl_mem_object_type type,
int32_t w,
int32_t h,
+ int32_t depth,
int32_t pitch,
cl_gpgpu_tiling tiling)
{
- intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, pitch, tiling);
+ intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, depth, pitch, tiling);
assert(index < GEN_MAX_SURFACES);
}
@@ -608,10 +670,12 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
desc->desc1.single_program_flow = 1;
+ desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
+ desc->desc5.rounding_mode = 0; /* round to nearest even */
desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
- desc->desc4.curbe_read_len = kernel->cst_sz / 32;
+ desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
desc->desc4.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
@@ -642,7 +706,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
ker_bo);
dri_bo_emit_reloc(bo,
- I915_GEM_DOMAIN_INSTRUCTION, 0,
+ I915_GEM_DOMAIN_SAMPLER, 0,
0,
offsetof(gen6_interface_descriptor_t, desc2),
gpgpu->sampler_state_b.bo);
@@ -650,7 +714,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
}
static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
{
unsigned char *curbe = NULL;
cl_gpgpu_kernel *k = gpgpu->ker;
@@ -665,9 +729,9 @@ intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t si
/* Now put all the relocations for our flat address space */
for (i = 0; i < k->thread_n; ++i)
for (j = 0; j < gpgpu->binded_n; ++j) {
- *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
+ *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
- gpgpu->binded_offset[j]+i*k->cst_sz,
+ gpgpu->binded_offset[j]+i*k->curbe_sz,
gpgpu->binded_buf[j],
0,
I915_GEM_DOMAIN_RENDER,
@@ -692,19 +756,7 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
case CLK_ADDRESS_REPEAT:
return GEN_TEXCOORDMODE_WRAP;
case CLK_ADDRESS_CLAMP:
- /* GL_CLAMP is the weird mode where coordinates are clamped to
- * [0.0, 1.0], so linear filtering of coordinates outside of
- * [0.0, 1.0] give you half edge texel value and half border
- * color. The fragment shader will clamp the coordinates, and
- * we set clamp_border here, which gets the result desired. We
- * just use clamp(_to_edge) for nearest, because for nearest
- * clamping to 1.0 gives border color instead of the desired
- * edge texels.
- */
- if (using_nearest)
- return GEN_TEXCOORDMODE_CLAMP;
- else
- return GEN_TEXCOORDMODE_CLAMP_BORDER;
+ return GEN_TEXCOORDMODE_CLAMP_BORDER;
case CLK_ADDRESS_CLAMP_TO_EDGE:
return GEN_TEXCOORDMODE_CLAMP;
case CLK_ADDRESS_MIRRORED_REPEAT:
@@ -721,7 +773,9 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
uint32_t wrap_mode;
gen7_sampler_state_t *sampler;
- sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index;
+ sampler = (gen7_sampler_state_t *)(gpgpu->sampler_state_b.bo->virtual) + index;
+ memset(sampler, 0, sizeof(*sampler));
+ sampler->ss2.default_color_pointer = (gpgpu->sampler_border_color_state_b.bo->offset) >> 5;
if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
sampler->ss3.non_normalized_coord = 1;
else
@@ -742,9 +796,11 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
}
wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
- sampler->ss3.r_wrap_mode = wrap_mode;
sampler->ss3.s_wrap_mode = wrap_mode;
+ /* XXX mesa i965 driver code point out that if the surface is a 1D surface, we may need
+ * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
sampler->ss3.t_wrap_mode = wrap_mode;
+ sampler->ss3.r_wrap_mode = wrap_mode;
sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
@@ -762,15 +818,36 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+ dri_bo_emit_reloc(gpgpu->sampler_state_b.bo,
+ I915_GEM_DOMAIN_SAMPLER, 0,
+ 0,
+ index * sizeof(gen7_sampler_state_t) +
+ offsetof(gen7_sampler_state_t, ss2),
+ gpgpu->sampler_border_color_state_b.bo);
+
}
static void
intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
{
int index;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
+#else
assert(sampler_sz <= GEN_MAX_SAMPLERS);
- for(index = 0; index < sampler_sz; index++)
- intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
+#endif
+ for(index = 0; index < sampler_sz; index++) {
+ intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+ /* Duplicate the sampler to 8 + index and fixup the address mode
+ * to repeat.*/
+ if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
+ intel_gpgpu_insert_sampler(gpgpu, index + 8,
+ (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_CLAMP_TO_EDGE);
+ }
+#endif
+ }
}
static void
@@ -781,6 +858,7 @@ intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
intel_gpgpu_map_address_space(gpgpu);
dri_bo_unmap(gpgpu->surface_heap_b.bo);
dri_bo_unmap(gpgpu->sampler_state_b.bo);
+ dri_bo_unmap(gpgpu->sampler_border_color_state_b.bo);
}
static void
@@ -821,11 +899,11 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
else
OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
- OUT_BATCH(gpgpu->batch, global_wk_off[0]);
+ OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
- OUT_BATCH(gpgpu->batch, global_wk_off[1]);
+ OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
- OUT_BATCH(gpgpu->batch, global_wk_off[2]);
+ OUT_BATCH(gpgpu->batch, 0);
OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
OUT_BATCH(gpgpu->batch, right_mask);
OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/
@@ -925,7 +1003,8 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
- cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+ cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+ cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index ff339c5..36b5971 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -209,7 +209,17 @@ typedef struct gen7_surface_state
uint32_t depth:11;
} ss3;
- uint32_t ss4;
+ union {
+ struct {
+ uint32_t mulsample_pal_idx:3;
+ uint32_t numer_mulsample:3;
+ uint32_t mss_fmt:1;
+ uint32_t rt_view_extent:11;
+ uint32_t min_array_element:11;
+ uint32_t rt_rotate:2;
+ uint32_t pad0:1;
+ } not_str_buf;
+ } ss4;
struct {
uint32_t mip_count:4;
@@ -380,6 +390,10 @@ typedef struct gen6_sampler_state
} ss3;
} gen6_sampler_state_t;
+typedef struct gen7_sampler_border_color {
+ float r,g,b,a;
+} gen7_sampler_border_color_t;
+
typedef struct gen7_sampler_state
{
struct {
diff --git a/src/x11/dricommon.h b/src/x11/dricommon.h
index 08e66a5..5a950b4 100644
--- a/src/x11/dricommon.h
+++ b/src/x11/dricommon.h
@@ -94,11 +94,6 @@ void dri_state_release(dri_state_t*);
// Create a dri2 state from dpy and screen
dri_state_t *getDRI2State(Display* dpy, int screen, char **driver_name);
-#ifdef HAS_GBM
-#include<gbm.h>
-void cl_gbm_set_image_extension(struct gbm_device *gbm, void *display);
-int cl_gbm_bo_get_name(struct gbm_bo *bo);
-#endif
#endif /* _VA_DRICOMMON_H_ */
diff --git a/src/x11/gbm_deps/backend.h b/src/x11/gbm_deps/backend.h
deleted file mode 100644
index 4a64375..0000000
--- a/src/x11/gbm_deps/backend.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef MODULE_H_
-#define MODULE_H_
-
-#include "gbmint.h"
-
-struct gbm_device *
-_gbm_create_device(int fd);
-
-#endif
diff --git a/src/x11/gbm_deps/common.h b/src/x11/gbm_deps/common.h
deleted file mode 100644
index 1fcdfca..0000000
--- a/src/x11/gbm_deps/common.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#include <libudev.h>
-
-struct udev_device *
-_gbm_udev_device_new_from_fd(struct udev *udev, int fd);
-
-char *
-_gbm_fd_get_device_name(int fd);
-
-void
-_gbm_log(const char *fmt_str, ...);
-
-#endif
diff --git a/src/x11/gbm_deps/common_drm.h b/src/x11/gbm_deps/common_drm.h
deleted file mode 100644
index d28c3f0..0000000
--- a/src/x11/gbm_deps/common_drm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _COMMON_DRM_H_
-#define _COMMON_DRM_H_
-
-#include "gbmint.h"
-
-enum gbm_drm_driver_type {
- GBM_DRM_DRIVER_TYPE_DRI,
- GBM_DRM_DRIVER_TYPE_GALLIUM,
-};
-
-struct gbm_drm_device {
- struct gbm_device base;
- enum gbm_drm_driver_type type;
- char *driver_name;
-};
-
-struct gbm_drm_bo {
- struct gbm_bo base;
-};
-
-#endif
diff --git a/src/x11/gbm_deps/gbm.h b/src/x11/gbm_deps/gbm.h
deleted file mode 100644
index e516df2..0000000
--- a/src/x11/gbm_deps/gbm.h
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _GBM_H_
-#define _GBM_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#define __GBM__ 1
-
-#include <stdint.h>
-
-/**
- * \file gbm.h
- * \brief Generic Buffer Manager
- */
-
-struct gbm_device;
-struct gbm_bo;
-struct gbm_surface;
-
-/**
- * \mainpage The Generic Buffer Manager
- *
- * This module provides an abstraction that the caller can use to request a
- * buffer from the underlying memory management system for the platform.
- *
- * This allows the creation of portable code whilst still allowing access to
- * the underlying memory manager.
- */
-
-/**
- * Abstraction representing the handle to a buffer allocated by the
- * manager
- */
-union gbm_bo_handle {
- void *ptr;
- int32_t s32;
- uint32_t u32;
- int64_t s64;
- uint64_t u64;
-};
-
-/** Format of the allocated buffer */
-enum gbm_bo_format {
- /** RGB with 8 bits per channel in a 32 bit value */
- GBM_BO_FORMAT_XRGB8888,
- /** ARGB with 8 bits per channel in a 32 bit value */
- GBM_BO_FORMAT_ARGB8888
-};
-
-#define __gbm_fourcc_code(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
- ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
-
-#define GBM_FORMAT_BIG_ENDIAN (1<<31) /* format is big endian instead of little endian */
-
-/* color index */
-#define GBM_FORMAT_C8 __gbm_fourcc_code('C', '8', ' ', ' ') /* [7:0] C */
-
-/* 8 bpp RGB */
-#define GBM_FORMAT_RGB332 __gbm_fourcc_code('R', 'G', 'B', '8') /* [7:0] R:G:B 3:3:2 */
-#define GBM_FORMAT_BGR233 __gbm_fourcc_code('B', 'G', 'R', '8') /* [7:0] B:G:R 2:3:3 */
-
-/* 16 bpp RGB */
-#define GBM_FORMAT_XRGB4444 __gbm_fourcc_code('X', 'R', '1', '2') /* [15:0] x:R:G:B 4:4:4:4 little endian */
-#define GBM_FORMAT_XBGR4444 __gbm_fourcc_code('X', 'B', '1', '2') /* [15:0] x:B:G:R 4:4:4:4 little endian */
-#define GBM_FORMAT_RGBX4444 __gbm_fourcc_code('R', 'X', '1', '2') /* [15:0] R:G:B:x 4:4:4:4 little endian */
-#define GBM_FORMAT_BGRX4444 __gbm_fourcc_code('B', 'X', '1', '2') /* [15:0] B:G:R:x 4:4:4:4 little endian */
-
-#define GBM_FORMAT_ARGB4444 __gbm_fourcc_code('A', 'R', '1', '2') /* [15:0] A:R:G:B 4:4:4:4 little endian */
-#define GBM_FORMAT_ABGR4444 __gbm_fourcc_code('A', 'B', '1', '2') /* [15:0] A:B:G:R 4:4:4:4 little endian */
-#define GBM_FORMAT_RGBA4444 __gbm_fourcc_code('R', 'A', '1', '2') /* [15:0] R:G:B:A 4:4:4:4 little endian */
-#define GBM_FORMAT_BGRA4444 __gbm_fourcc_code('B', 'A', '1', '2') /* [15:0] B:G:R:A 4:4:4:4 little endian */
-
-#define GBM_FORMAT_XRGB1555 __gbm_fourcc_code('X', 'R', '1', '5') /* [15:0] x:R:G:B 1:5:5:5 little endian */
-#define GBM_FORMAT_XBGR1555 __gbm_fourcc_code('X', 'B', '1', '5') /* [15:0] x:B:G:R 1:5:5:5 little endian */
-#define GBM_FORMAT_RGBX5551 __gbm_fourcc_code('R', 'X', '1', '5') /* [15:0] R:G:B:x 5:5:5:1 little endian */
-#define GBM_FORMAT_BGRX5551 __gbm_fourcc_code('B', 'X', '1', '5') /* [15:0] B:G:R:x 5:5:5:1 little endian */
-
-#define GBM_FORMAT_ARGB1555 __gbm_fourcc_code('A', 'R', '1', '5') /* [15:0] A:R:G:B 1:5:5:5 little endian */
-#define GBM_FORMAT_ABGR1555 __gbm_fourcc_code('A', 'B', '1', '5') /* [15:0] A:B:G:R 1:5:5:5 little endian */
-#define GBM_FORMAT_RGBA5551 __gbm_fourcc_code('R', 'A', '1', '5') /* [15:0] R:G:B:A 5:5:5:1 little endian */
-#define GBM_FORMAT_BGRA5551 __gbm_fourcc_code('B', 'A', '1', '5') /* [15:0] B:G:R:A 5:5:5:1 little endian */
-
-#define GBM_FORMAT_RGB565 __gbm_fourcc_code('R', 'G', '1', '6') /* [15:0] R:G:B 5:6:5 little endian */
-#define GBM_FORMAT_BGR565 __gbm_fourcc_code('B', 'G', '1', '6') /* [15:0] B:G:R 5:6:5 little endian */
-
-/* 24 bpp RGB */
-#define GBM_FORMAT_RGB888 __gbm_fourcc_code('R', 'G', '2', '4') /* [23:0] R:G:B little endian */
-#define GBM_FORMAT_BGR888 __gbm_fourcc_code('B', 'G', '2', '4') /* [23:0] B:G:R little endian */
-
-/* 32 bpp RGB */
-#define GBM_FORMAT_XRGB8888 __gbm_fourcc_code('X', 'R', '2', '4') /* [31:0] x:R:G:B 8:8:8:8 little endian */
-#define GBM_FORMAT_XBGR8888 __gbm_fourcc_code('X', 'B', '2', '4') /* [31:0] x:B:G:R 8:8:8:8 little endian */
-#define GBM_FORMAT_RGBX8888 __gbm_fourcc_code('R', 'X', '2', '4') /* [31:0] R:G:B:x 8:8:8:8 little endian */
-#define GBM_FORMAT_BGRX8888 __gbm_fourcc_code('B', 'X', '2', '4') /* [31:0] B:G:R:x 8:8:8:8 little endian */
-
-#define GBM_FORMAT_ARGB8888 __gbm_fourcc_code('A', 'R', '2', '4') /* [31:0] A:R:G:B 8:8:8:8 little endian */
-#define GBM_FORMAT_ABGR8888 __gbm_fourcc_code('A', 'B', '2', '4') /* [31:0] A:B:G:R 8:8:8:8 little endian */
-#define GBM_FORMAT_RGBA8888 __gbm_fourcc_code('R', 'A', '2', '4') /* [31:0] R:G:B:A 8:8:8:8 little endian */
-#define GBM_FORMAT_BGRA8888 __gbm_fourcc_code('B', 'A', '2', '4') /* [31:0] B:G:R:A 8:8:8:8 little endian */
-
-#define GBM_FORMAT_XRGB2101010 __gbm_fourcc_code('X', 'R', '3', '0') /* [31:0] x:R:G:B 2:10:10:10 little endian */
-#define GBM_FORMAT_XBGR2101010 __gbm_fourcc_code('X', 'B', '3', '0') /* [31:0] x:B:G:R 2:10:10:10 little endian */
-#define GBM_FORMAT_RGBX1010102 __gbm_fourcc_code('R', 'X', '3', '0') /* [31:0] R:G:B:x 10:10:10:2 little endian */
-#define GBM_FORMAT_BGRX1010102 __gbm_fourcc_code('B', 'X', '3', '0') /* [31:0] B:G:R:x 10:10:10:2 little endian */
-
-#define GBM_FORMAT_ARGB2101010 __gbm_fourcc_code('A', 'R', '3', '0') /* [31:0] A:R:G:B 2:10:10:10 little endian */
-#define GBM_FORMAT_ABGR2101010 __gbm_fourcc_code('A', 'B', '3', '0') /* [31:0] A:B:G:R 2:10:10:10 little endian */
-#define GBM_FORMAT_RGBA1010102 __gbm_fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */
-#define GBM_FORMAT_BGRA1010102 __gbm_fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */
-
-/* packed YCbCr */
-#define GBM_FORMAT_YUYV __gbm_fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
-#define GBM_FORMAT_YVYU __gbm_fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */
-#define GBM_FORMAT_UYVY __gbm_fourcc_code('U', 'Y', 'V', 'Y') /* [31:0] Y1:Cr0:Y0:Cb0 8:8:8:8 little endian */
-#define GBM_FORMAT_VYUY __gbm_fourcc_code('V', 'Y', 'U', 'Y') /* [31:0] Y1:Cb0:Y0:Cr0 8:8:8:8 little endian */
-
-#define GBM_FORMAT_AYUV __gbm_fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
-
-/*
- * 2 plane YCbCr
- * index 0 = Y plane, [7:0] Y
- * index 1 = Cr:Cb plane, [15:0] Cr:Cb little endian
- * or
- * index 1 = Cb:Cr plane, [15:0] Cb:Cr little endian
- */
-#define GBM_FORMAT_NV12 __gbm_fourcc_code('N', 'V', '1', '2') /* 2x2 subsampled Cr:Cb plane */
-#define GBM_FORMAT_NV21 __gbm_fourcc_code('N', 'V', '2', '1') /* 2x2 subsampled Cb:Cr plane */
-#define GBM_FORMAT_NV16 __gbm_fourcc_code('N', 'V', '1', '6') /* 2x1 subsampled Cr:Cb plane */
-#define GBM_FORMAT_NV61 __gbm_fourcc_code('N', 'V', '6', '1') /* 2x1 subsampled Cb:Cr plane */
-
-/*
- * 3 plane YCbCr
- * index 0: Y plane, [7:0] Y
- * index 1: Cb plane, [7:0] Cb
- * index 2: Cr plane, [7:0] Cr
- * or
- * index 1: Cr plane, [7:0] Cr
- * index 2: Cb plane, [7:0] Cb
- */
-#define GBM_FORMAT_YUV410 __gbm_fourcc_code('Y', 'U', 'V', '9') /* 4x4 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU410 __gbm_fourcc_code('Y', 'V', 'U', '9') /* 4x4 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV411 __gbm_fourcc_code('Y', 'U', '1', '1') /* 4x1 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU411 __gbm_fourcc_code('Y', 'V', '1', '1') /* 4x1 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV420 __gbm_fourcc_code('Y', 'U', '1', '2') /* 2x2 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU420 __gbm_fourcc_code('Y', 'V', '1', '2') /* 2x2 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV422 __gbm_fourcc_code('Y', 'U', '1', '6') /* 2x1 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU422 __gbm_fourcc_code('Y', 'V', '1', '6') /* 2x1 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV444 __gbm_fourcc_code('Y', 'U', '2', '4') /* non-subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU444 __gbm_fourcc_code('Y', 'V', '2', '4') /* non-subsampled Cr (1) and Cb (2) planes */
-
-
-/**
- * Flags to indicate the intended use for the buffer - these are passed into
- * gbm_bo_create(). The caller must set the union of all the flags that are
- * appropriate
- *
- * \sa Use gbm_device_is_format_supported() to check if the combination of format
- * and use flags are supported
- */
-enum gbm_bo_flags {
- /**
- * Buffer is going to be presented to the screen using an API such as KMS
- */
- GBM_BO_USE_SCANOUT = (1 << 0),
- /**
- * Buffer is going to be used as cursor - the dimensions for the buffer
- * must be 64x64 if this flag is passed.
- */
- GBM_BO_USE_CURSOR_64X64 = (1 << 1),
- /**
- * Buffer is to be used for rendering - for example it is going to be used
- * as the storage for a color buffer
- */
- GBM_BO_USE_RENDERING = (1 << 2),
- /**
- * Buffer can be used for gbm_bo_write. This is guaranteed to work
- * with GBM_BO_USE_CURSOR_64X64. but may not work for other
- * combinations.
- */
- GBM_BO_USE_WRITE = (1 << 3),
-};
-
-int
-gbm_device_get_fd(struct gbm_device *gbm);
-
-const char *
-gbm_device_get_backend_name(struct gbm_device *gbm);
-
-int
-gbm_device_is_format_supported(struct gbm_device *gbm,
- uint32_t format, uint32_t usage);
-
-void
-gbm_device_destroy(struct gbm_device *gbm);
-
-struct gbm_device *
-gbm_create_device(int fd);
-
-struct gbm_bo *
-gbm_bo_create(struct gbm_device *gbm,
- uint32_t width, uint32_t height,
- uint32_t format, uint32_t flags);
-
-#define GBM_BO_IMPORT_WL_BUFFER 0x5501
-#define GBM_BO_IMPORT_EGL_IMAGE 0x5502
-
-struct gbm_bo *
-gbm_bo_import(struct gbm_device *gbm, uint32_t type,
- void *buffer, uint32_t usage);
-
-uint32_t
-gbm_bo_get_width(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_height(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_stride(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_format(struct gbm_bo *bo);
-
-struct gbm_device *
-gbm_bo_get_device(struct gbm_bo *bo);
-
-union gbm_bo_handle
-gbm_bo_get_handle(struct gbm_bo *bo);
-
-int
-gbm_bo_write(struct gbm_bo *bo, const void *buf, size_t count);
-
-void
-gbm_bo_set_user_data(struct gbm_bo *bo, void *data,
- void (*destroy_user_data)(struct gbm_bo *, void *));
-
-void *
-gbm_bo_get_user_data(struct gbm_bo *bo);
-
-void
-gbm_bo_destroy(struct gbm_bo *bo);
-
-struct gbm_surface *
-gbm_surface_create(struct gbm_device *gbm,
- uint32_t width, uint32_t height,
- uint32_t format, uint32_t flags);
-
-struct gbm_bo *
-gbm_surface_lock_front_buffer(struct gbm_surface *surface);
-
-void
-gbm_surface_release_buffer(struct gbm_surface *surface, struct gbm_bo *bo);
-
-int
-gbm_surface_has_free_buffers(struct gbm_surface *surface);
-
-void
-gbm_surface_destroy(struct gbm_surface *surface);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/x11/gbm_deps/gbm_driint.h b/src/x11/gbm_deps/gbm_driint.h
deleted file mode 100644
index 18fc3c0..0000000
--- a/src/x11/gbm_deps/gbm_driint.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _GBM_DRI_INTERNAL_H_
-#define _GBM_DRI_INTERNAL_H_
-
-#include "gbmint.h"
-
-#include "common.h"
-#include "common_drm.h"
-
-#include <GL/gl.h> /* dri_interface needs GL types */
-#include "GL/internal/dri_interface.h"
-
-struct gbm_dri_surface;
-
-struct gbm_dri_device {
- struct gbm_drm_device base;
-
- void *driver;
-
- __DRIscreen *screen;
-
- __DRIcoreExtension *core;
- __DRIdri2Extension *dri2;
- __DRIimageExtension *image;
- __DRI2flushExtension *flush;
- __DRIdri2LoaderExtension *loader;
-
- const __DRIconfig **driver_configs;
- const __DRIextension *extensions[4];
-
- __DRIimage *(*lookup_image)(__DRIscreen *screen, void *image, void *data);
- void *lookup_user_data;
-
- __DRIbuffer *(*get_buffers)(__DRIdrawable * driDrawable,
- int *width, int *height,
- unsigned int *attachments, int count,
- int *out_count, void *data);
- void (*flush_front_buffer)(__DRIdrawable * driDrawable, void *data);
- __DRIbuffer *(*get_buffers_with_format)(__DRIdrawable * driDrawable,
- int *width, int *height,
- unsigned int *attachments, int count,
- int *out_count, void *data);
-};
-
-struct gbm_dri_bo {
- struct gbm_drm_bo base;
-
- __DRIimage *image;
-
- /* Only used for cursors */
- uint32_t handle, size;
- void *map;
-};
-
-struct gbm_dri_surface {
- struct gbm_surface base;
-
- void *dri_private;
-};
-
-static inline struct gbm_dri_device *
-gbm_dri_device(struct gbm_device *gbm)
-{
- return (struct gbm_dri_device *) gbm;
-}
-
-static inline struct gbm_dri_bo *
-gbm_dri_bo(struct gbm_bo *bo)
-{
- return (struct gbm_dri_bo *) bo;
-}
-
-static inline struct gbm_dri_surface *
-gbm_dri_surface(struct gbm_surface *surface)
-{
- return (struct gbm_dri_surface *) surface;
-}
-
-char *
-dri_fd_get_driver_name(int fd);
-
-#endif
diff --git a/src/x11/gbm_deps/gbmint.h b/src/x11/gbm_deps/gbmint.h
deleted file mode 100644
index a467bea..0000000
--- a/src/x11/gbm_deps/gbmint.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- * Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef INTERNAL_H_
-#define INTERNAL_H_
-
-#include "gbm.h"
-#include <sys/stat.h>
-
-/* GCC visibility */
-#if defined(__GNUC__) && __GNUC__ >= 4
-#define GBM_EXPORT __attribute__ ((visibility("default")))
-#else
-#define GBM_EXPORT
-#endif
-
-/**
- * \file gbmint.h
- * \brief Internal implementation details of gbm
- */
-
-/**
- * The device used for the memory allocation.
- *
- * The members of this structure should be not accessed directly
- */
-struct gbm_device {
- /* Hack to make a gbm_device detectable by its first element. */
- struct gbm_device *(*dummy)(int);
-
- int fd;
- const char *name;
- unsigned int refcount;
- struct stat stat;
-
- void (*destroy)(struct gbm_device *gbm);
- int (*is_format_supported)(struct gbm_device *gbm,
- uint32_t format,
- uint32_t usage);
-
- struct gbm_bo *(*bo_create)(struct gbm_device *gbm,
- uint32_t width, uint32_t height,
- uint32_t format,
- uint32_t usage);
- struct gbm_bo *(*bo_import)(struct gbm_device *gbm, uint32_t type,
- void *buffer, uint32_t usage);
- int (*bo_write)(struct gbm_bo *bo, const void *buf, size_t data);
- void (*bo_destroy)(struct gbm_bo *bo);
-
- struct gbm_surface *(*surface_create)(struct gbm_device *gbm,
- uint32_t width, uint32_t height,
- uint32_t format, uint32_t flags);
- struct gbm_bo *(*surface_lock_front_buffer)(struct gbm_surface *surface);
- void (*surface_release_buffer)(struct gbm_surface *surface,
- struct gbm_bo *bo);
- int (*surface_has_free_buffers)(struct gbm_surface *surface);
- void (*surface_destroy)(struct gbm_surface *surface);
-};
-
-/**
- * The allocated buffer object.
- *
- * The members in this structure should not be accessed directly.
- */
-struct gbm_bo {
- struct gbm_device *gbm;
- uint32_t width;
- uint32_t height;
- uint32_t stride;
- uint32_t format;
- union gbm_bo_handle handle;
- void *user_data;
- void (*destroy_user_data)(struct gbm_bo *, void *);
-};
-
-struct gbm_surface {
- struct gbm_device *gbm;
- uint32_t width;
- uint32_t height;
- uint32_t format;
- uint32_t flags;
-};
-
-struct gbm_backend {
- const char *backend_name;
- struct gbm_device *(*create_device)(int fd);
-};
-
-GBM_EXPORT struct gbm_device *
-_gbm_mesa_get_device(int fd);
-
-#endif
diff --git a/src/x11/gbm_dri2_x11_platform.c b/src/x11/gbm_dri2_x11_platform.c
deleted file mode 100644
index 481f407..0000000
--- a/src/x11/gbm_dri2_x11_platform.c
+++ /dev/null
@@ -1,126 +0,0 @@
-#include <string.h>
-#include "GL/gl.h" /* dri_interface need gl types definitions. */
-#include "GL/internal/dri_interface.h"
-#include "gbm_deps/gbm_driint.h"
-#include "gbm_deps/gbmint.h"
-#include "dricommon.h"
-
-typedef struct EGLDisplay _EGLDisplay;
-typedef struct EGLDriver _EGLDriver;
-/* XXX should check whether we support pthread.*/
-typedef pthread_mutex_t _EGLMutex;
-
-enum _egl_platform_type {
- _EGL_PLATFORM_WINDOWS,
- _EGL_PLATFORM_X11,
- _EGL_PLATFORM_WAYLAND,
- _EGL_PLATFORM_DRM,
- _EGL_PLATFORM_FBDEV,
- _EGL_PLATFORM_NULL,
- _EGL_PLATFORM_ANDROID,
-
- _EGL_NUM_PLATFORMS,
- _EGL_INVALID_PLATFORM = -1
-};
-typedef enum _egl_platform_type _EGLPlatformType;
-typedef unsigned int EGLBoolean;
-typedef int32_t EGLint;
-
-struct _hack_egl_display
-{
- /* used to link displays */
- _EGLDisplay *Next;
-
- _EGLMutex Mutex;
-
- _EGLPlatformType Platform; /**< The type of the platform display */
- void *PlatformDisplay; /**< A pointer to the platform display */
-
- _EGLDriver *Driver; /**< Matched driver of the display */
-
- EGLBoolean Initialized; /**< True if the display is initialized */
-
- /* options that affect how the driver initializes the display */
- struct {
- EGLBoolean TestOnly; /**< Driver should not set fields when true */
- EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
- } Options;
-
- /* these fields are set by the driver during init */
- void *DriverData; /**< Driver private data */
- EGLint VersionMajor; /**< EGL major version */
- EGLint VersionMinor; /**< EGL minor version */
- EGLint ClientAPIs; /**< Bitmask of APIs supported (EGL_xxx_BIT) */
-};
-
-struct _hack_dri2_egl_display
-{
- int dri2_major;
- int dri2_minor;
- __DRIscreen *dri_screen;
- int own_dri_screen;
- const __DRIconfig **driver_configs;
- void *driver;
- __DRIcoreExtension *core;
- __DRIdri2Extension *dri2;
- __DRIswrastExtension *swrast;
- __DRI2flushExtension *flush;
- __DRItexBufferExtension *tex_buffer;
- __DRIimageExtension *image;
- __DRIrobustnessExtension *robustness;
- __DRI2configQueryExtension *config;
- int fd;
-
- int own_device;
- int swap_available;
- int invalidate_available;
- int min_swap_interval;
- int max_swap_interval;
- int default_swap_interval;
- struct gbm_dri_device *gbm_dri;
-
- char *device_name;
- char *driver_name;
-
- __DRIdri2LoaderExtension dri2_loader_extension;
- __DRIswrastLoaderExtension swrast_loader_extension;
- const __DRIextension *extensions[4];
-};
-
-static __DRIimageLookupExtension *image_lookup_extension;
-
-/* We are use DRI2 x11 platform, and by default, gbm doesn't register
- * a valid image extension, and actually, it doesn't know how to register
- * it based on current interface. We have to hack it here. */
-void cl_gbm_set_image_extension(struct gbm_device *gbm, void *display)
-{
- struct gbm_dri_device *gbm_dri = gbm_dri_device(gbm);
- struct _hack_egl_display *egl_dpy = (struct _hack_egl_display*)display;
- struct _hack_dri2_egl_display *dri2_dpy = (struct _hack_dri2_egl_display*)egl_dpy->DriverData;
- int i;
-
- if (gbm_dri->lookup_image == NULL
- && egl_dpy->Platform == _EGL_PLATFORM_X11) {
- for(i = 0; i < 4; i++)
- if (dri2_dpy->extensions[i]
- && ((strncmp(dri2_dpy->extensions[i]->name,
- __DRI_IMAGE_LOOKUP,
- sizeof(__DRI_IMAGE_LOOKUP))) == 0))
- break;
- if (i >= 4) return;
- image_lookup_extension = (__DRIimageLookupExtension*)dri2_dpy->extensions[i];
- gbm_dri->lookup_image = image_lookup_extension->lookupEGLImage;
- gbm_dri->lookup_user_data = display;
- }
-}
-
-int cl_gbm_bo_get_name(struct gbm_bo *bo)
-{
- int name;
- struct gbm_dri_device *gbm_dri = gbm_dri_device(bo->gbm);
- struct gbm_dri_bo *bo_dri = gbm_dri_bo(bo);
-
- gbm_dri->image->queryImage(bo_dri->image, __DRI_IMAGE_ATTRIB_NAME,
- &name);
- return name;
-}
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
new file mode 100644
index 0000000..a7fc8cb
--- /dev/null
+++ b/src/x11/mesa_egl_extension.c
@@ -0,0 +1,307 @@
+#include <stdio.h>
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+#include "src/cl_driver.h"
+
+struct _egl_display;
+struct _egl_resource;
+struct _egl_thread_info;
+struct _egl_config;
+struct _egl_surface;
+struct _egl_driver;
+
+typedef struct _egl_display _EGLDisplay;
+typedef struct _egl_resource _EGLResource;
+typedef struct _egl_thread_info _EGLThreadInfo;
+typedef struct _egl_config _EGLConfig;
+typedef struct _egl_surface _EGLSurface;
+typedef struct _egl_driver _EGLDriver;
+
+/**
+ * A resource of a display.
+ */
+struct _egl_resource
+{
+ /* which display the resource belongs to */
+ _EGLDisplay *Display;
+ EGLBoolean IsLinked;
+ EGLint RefCount;
+
+ /* used to link resources of the same type */
+ _EGLResource *Next;
+};
+
+/**
+ * "Base" class for device driver contexts.
+ */
+struct _egl_context
+{
+ /* A context is a display resource */
+ _EGLResource Resource;
+
+ /* The bound status of the context */
+ _EGLThreadInfo *Binding;
+ _EGLSurface *DrawSurface;
+ _EGLSurface *ReadSurface;
+
+ _EGLConfig *Config;
+
+ EGLint ClientAPI; /**< EGL_OPENGL_ES_API, EGL_OPENGL_API, EGL_OPENVG_API */
+ EGLint ClientMajorVersion;
+ EGLint ClientMinorVersion;
+ EGLint Flags;
+ EGLint Profile;
+ EGLint ResetNotificationStrategy;
+
+ /* The real render buffer when a window surface is bound */
+ EGLint WindowRenderBuffer;
+};
+
+typedef struct _egl_context _EGLContext;
+
+struct dri2_egl_display
+{
+ int dri2_major;
+ int dri2_minor;
+ __DRIscreen *dri_screen;
+ int own_dri_screen;
+ const __DRIconfig **driver_configs;
+ void *driver;
+};
+
+enum _egl_platform_type {
+ _EGL_PLATFORM_WINDOWS,
+ _EGL_PLATFORM_X11,
+ _EGL_PLATFORM_WAYLAND,
+ _EGL_PLATFORM_DRM,
+ _EGL_PLATFORM_FBDEV,
+ _EGL_PLATFORM_NULL,
+ _EGL_PLATFORM_ANDROID,
+
+ _EGL_NUM_PLATFORMS,
+ _EGL_INVALID_PLATFORM = -1
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+
+typedef pthread_mutex_t _EGLMutex;
+
+struct _egl_display
+{
+ /* used to link displays */
+ _EGLDisplay *Next;
+
+ _EGLMutex Mutex;
+
+ _EGLPlatformType Platform; /**< The type of the platform display */
+ void *PlatformDisplay; /**< A pointer to the platform display */
+
+ _EGLDriver *Driver; /**< Matched driver of the display */
+ EGLBoolean Initialized; /**< True if the display is initialized */
+
+ /* options that affect how the driver initializes the display */
+ struct {
+ EGLBoolean TestOnly; /**< Driver should not set fields when true */
+ EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+ } Options;
+
+ /* these fields are set by the driver during init */
+ void *DriverData; /**< Driver private data */
+};
+
+static struct dri2_egl_display *
+dri2_egl_display(_EGLDisplay *dpy)
+{
+ return (struct dri2_egl_display *)dpy->DriverData;
+}
+
+static _EGLDisplay *
+_eglLockDisplay(EGLDisplay dpy)
+{
+ return (_EGLDisplay *)dpy;
+}
+
+static _EGLContext *
+_eglLookupContext(EGLContext ctx, EGLDisplay disp)
+{
+ disp = disp;
+ return (_EGLContext *) ctx;
+}
+
+struct dri2_egl_context
+{
+ _EGLContext base;
+ __DRIcontext *dri_context;
+};
+
+static struct dri2_egl_context *
+dri2_egl_context(_EGLContext *ctx)
+{
+ return (struct dri2_egl_context *)ctx;
+}
+
+static EGLBoolean
+dri2_acquire_texture(_EGLDisplay *disp,
+ _EGLContext *ctx,
+ const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint texture = 0;
+ GLenum gl_target = 0;
+ GLint level = 0;
+ GLboolean ret;
+
+ if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_texture(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ gl_target, level, texture,
+ user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint texture = 0;
+ GLenum gl_target = 0;
+ GLint level = 0;
+ GLboolean ret;
+
+ if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
+ gl_target, level, texture);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint bufobj = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ bufobj, user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint bufobj = 0;
+ GLboolean ret;
+
+ if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_buffer_object(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ bufobj);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_render_buffer(_EGLDisplay *disp,
+ _EGLContext *ctx,
+ const EGLint *attr_list,
+ void *user_data)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint rb = 0;
+ GLboolean ret;
+ /* The render buffer id arrives as EGL_GL_RENDER_BUFFER_ID_MESA; only the render-buffer parser reads it. */
+ if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ rb, user_data);
+ return ret;
+}
+
+static EGLBoolean
+dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+ struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+ struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+ GLuint rb = 0;
+ GLboolean ret;
+ /* The render buffer id arrives as EGL_GL_RENDER_BUFFER_ID_MESA; only the render-buffer parser reads it. */
+ if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+ return EGL_FALSE;
+
+ ret = cl_gl_release_render_buffer(dri2_dpy->driver,
+ dri2_ctx->dri_context,
+ rb);
+ return ret;
+}
+
+static EGLBoolean
+dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+ const EGLint *attrib_list, void *user_data)
+{
+ switch (target) { /* dispatch on the resource kind being acquired */
+ case EGL_GL_TEXTURE_MESA:
+ return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
+ case EGL_GL_BUFFER_OBJECT_MESA:
+ return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
+ case EGL_GL_RENDER_BUFFER_MESA:
+ return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
+ default: /* unknown target: report it, then return EGL_FALSE below */
+ fprintf(stderr, "bad resource target value 0x%04x\n",
+ target);
+ }
+ return EGL_FALSE;
+}
+
+static EGLBoolean
+dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+ const EGLint *attrib_list)
+{
+ switch (target) { /* dispatch on the resource kind being released */
+ case EGL_GL_TEXTURE_MESA:
+ return dri2_release_texture(disp, ctx, attrib_list);
+ case EGL_GL_BUFFER_OBJECT_MESA:
+ return dri2_release_buffer_object(disp, ctx, attrib_list);
+ case EGL_GL_RENDER_BUFFER_MESA:
+ return dri2_release_render_buffer(disp, ctx, attrib_list);
+ default: /* unknown target: report it, then return EGL_FALSE below */
+ fprintf(stderr, "bad resource target value 0x%04x\n",
+ target);
+ }
+ return EGL_FALSE;
+}
+
+EGLBoolean
+eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
+{
+ _EGLDisplay *disp = _eglLockDisplay(dpy);
+ _EGLContext *context = _eglLookupContext(ctx, disp);
+
+ return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
+}
+
+EGLBoolean
+eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
+{
+ _EGLDisplay *disp = _eglLockDisplay(dpy);
+ _EGLContext *context = _eglLookupContext(ctx, disp);
+
+ return dri2_release_resource_mesa(disp, context, target, attrib_list);
+}
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
new file mode 100644
index 0000000..39ea134
--- /dev/null
+++ b/src/x11/mesa_egl_extension.h
@@ -0,0 +1,20 @@
+#ifndef __MESA_EGL_EXTENSION_H__
+#define __MESA_EGL_EXTENSION_H__
+
+#include <EGL/egl.h>
+#include <GL/gl.h>
+#include <GL/internal/dri_interface.h>
+
+#define EGL_GL_TEXTURE_MESA 0x3300 /* eglAcquireResourceMESA target */
+#define EGL_GL_BUFFER_OBJECT_MESA 0x3301 /* eglAcquireResourceMESA target */
+#define EGL_GL_RENDER_BUFFER_MESA 0x3302 /* eglAcquireResourceMESA target */
+#define EGL_GL_TEXTURE_ID_MESA 0x3303 /* eglAcquireResourceMESA attribute */
+#define EGL_GL_TEXTURE_LEVEL_MESA 0x3304 /* eglAcquireResourceMESA attribute */
+#define EGL_GL_TEXTURE_TARGET_MESA 0x3305 /* eglAcquireResourceMESA attribute */
+#define EGL_GL_BUFFER_OBJECT_ID_MESA 0x3306 /* eglAcquireResourceMESA attribute */
+#define EGL_GL_RENDER_BUFFER_ID_MESA 0x3307 /* eglAcquireResourceMESA attribute */
+
+EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
+EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
+
+#endif
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
new file mode 100644
index 0000000..93e9454
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.c
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <assert.h>
+#include <string.h>
+
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
+ const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+
+ *texture = 0;
+ *gl_target = 0;
+ *level = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_TEXTURE_LEVEL_MESA:
+ *level = val;
+ break;
+ case EGL_GL_TEXTURE_ID_MESA:
+ *texture = val;
+ break;
+ case EGL_GL_TEXTURE_TARGET_MESA:
+ *gl_target = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+
+ return err;
+}
+
+/**
+ * Parse a buffer-object attribute list into *bufobj; EGL_BAD_ATTRIBUTE if the list is NULL or yields no non-zero id.
+ */
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+ *bufobj = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_BUFFER_OBJECT_ID_MESA:
+ *bufobj = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+ if (*bufobj == 0)
+ err = EGL_BAD_ATTRIBUTE;
+
+ return err;
+}
+
+/**
+ * Parse a render-buffer attribute list into *rb; EGL_BAD_ATTRIBUTE if the list is NULL or yields no non-zero id.
+ */
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
+{
+ EGLint i, err = EGL_SUCCESS;
+ *rb = 0;
+
+ if (!attrib_list)
+ return EGL_BAD_ATTRIBUTE;
+
+ for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+ EGLint attr = attrib_list[i++];
+ EGLint val = attrib_list[i];
+
+ switch (attr) {
+ case EGL_GL_RENDER_BUFFER_ID_MESA:
+ *rb = val;
+ break;
+ default:
+ /* unknown attrs are ignored */
+ break;
+ }
+ }
+ if (*rb == 0)
+ err = EGL_BAD_ATTRIBUTE;
+
+ return err;
+}
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
new file mode 100644
index 0000000..43e746e
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.h
@@ -0,0 +1,44 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef EGLRESSHARE_INCLUDED
+#define EGLRESSHARE_INCLUDED
+
+#include <EGL/egl.h>
+
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
+ EGLint *level, const EGLint *attrib_list);
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj,
+ const EGLint *attrib_list);
+
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
+#endif
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 97b7519..f18bd46 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -90,7 +90,7 @@ set (utests_sources
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
compiler_insn_selection_masked_min_max.cpp
- compiler_load_bool_imm.cpp
+ compiler_load_bool_imm.cpp
compiler_global_memory_barrier.cpp
compiler_local_memory_two_ptr.cpp
compiler_local_memory_barrier.cpp
@@ -102,8 +102,10 @@ set (utests_sources
compiler_get_image_info.cpp
compiler_vect_compare.cpp
compiler_vector_load_store.cpp
+ compiler_vector_inc.cpp
compiler_cl_finish.cpp
get_cl_info.cpp
+ builtin_atan2.cpp
builtin_bitselect.cpp
builtin_frexp.cpp
builtin_mad_sat.cpp
@@ -113,6 +115,10 @@ set (utests_sources
builtin_shuffle.cpp
builtin_shuffle2.cpp
builtin_sign.cpp
+ builtin_sinpi.cpp
+ builtin_lgamma.cpp
+ builtin_lgamma_r.cpp
+ builtin_tgamma.cpp
buildin_work_dim.cpp
builtin_global_size.cpp
builtin_local_size.cpp
@@ -122,7 +128,7 @@ set (utests_sources
builtin_acos_asin.cpp
runtime_createcontext.cpp
runtime_null_kernel_arg.cpp
- runtime_event.cpp
+ runtime_event.cpp
compiler_double.cpp
compiler_double_2.cpp
compiler_double_3.cpp
@@ -135,16 +141,28 @@ set (utests_sources
compiler_long_asr.cpp
compiler_long_mult.cpp
compiler_long_cmp.cpp
+ compiler_bool_cross_basic_block.cpp
+ load_program_from_bin.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
utest_helper.cpp)
-if (EGL_FOUND)
+SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
+ADD_CUSTOM_COMMAND(
+ OUTPUT ${kernel_bin}.bin
+ COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl -o${kernel_bin}.bin
+ DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl
+ )
+
+ADD_CUSTOM_TARGET(kernel_bin.bin
+ DEPENDS ${kernel_bin}.bin)
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
-endif (EGL_FOUND)
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
ADD_LIBRARY(utests SHARED ${utests_sources})
@@ -152,7 +170,7 @@ TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
ADD_EXECUTABLE(utest_run utest_run.cpp)
TARGET_LINK_LIBRARIES(utest_run utests)
+ADD_DEPENDENCIES (utest_run kernel_bin.bin)
ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
TARGET_LINK_LIBRARIES(flat_address_space utests)
-
diff --git a/utests/builtin_atan2.cpp b/utests/builtin_atan2.cpp
new file mode 100644
index 0000000..29dd7b4
--- /dev/null
+++ b/utests/builtin_atan2.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_atan2(void) {
+ const int n = 1024;
+ float y[n], x[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_atan2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (int i = 0; i < n; ++i) {
+ y[i] = ((float*) buf_data[0])[i] = (rand()&255) * 0.01f;
+ x[i] = ((float*) buf_data[1])[i] = (rand()&255) * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(2);
+ float *dst = (float*) buf_data[2];
+ for (int i = 0; i < n; ++i) {
+ float cpu = atan2f(y[i], x[i]);
+ float gpu = dst[i];
+ if (fabsf(cpu - gpu) >= 1e-2) {
+ printf("%f %f %f %f\n", y[i], x[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_atan2);
diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
new file mode 100644
index 0000000..876699a
--- /dev/null
+++ b/utests/builtin_lgamma.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_lgamma");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*) buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = lgamma(src[i]);
+ float gpu = dst[i];
+ if (fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma);
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
new file mode 100644
index 0000000..b6e5d0e
--- /dev/null
+++ b/utests/builtin_lgamma_r.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma_r(void) {
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_lgamma_r");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ float *dst = (float*) buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ int cpu_signp;
+ float cpu = lgamma_r(src[i], &cpu_signp);
+ int gpu_signp = ((int*)buf_data[2])[i];
+ float gpu = dst[i];
+ if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, gpu);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma_r);
diff --git a/utests/builtin_sinpi.cpp b/utests/builtin_sinpi.cpp
new file mode 100644
index 0000000..0e11a0d
--- /dev/null
+++ b/utests/builtin_sinpi.cpp
@@ -0,0 +1,104 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static int as_int(float x) {
+ union {float f; int i;} u;
+ u.f = x;
+ return u.i;
+}
+
+static float sinpi(float x) { /* CPU reference value for sin(pi * x) */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ float y, z;
+ int n = 0, ix;
+ const float pi = 3.1415927410e+00f;
+
+ ix = as_int(x) & 0x7fffffff;
+
+ if (ix < 0x3e800000)
+ return sinf(pi * x);
+ y = -x;
+ z = floorf(y);
+ if (z != y) {
+ y *= 0.5f;
+ y = 2.f * (y - floorf(y));
+ n = y * 4.f;
+ } else {
+ if (ix >= 0x4b800000) {
+ y = 0;
+ n = 0;
+ } else {
+ if (ix < 0x4b000000)
+ z = y + 8.3886080000e+06f;
+ n = as_int(z); /* assign the outer n (previously shadowed) so the switch below sees the parity */
+ n &= 1;
+ y = n;
+ n <<= 2;
+ }
+ }
+ switch (n) {
+ case 0:
+ y = sinf(pi * y);
+ break;
+ case 1:
+ case 2:
+ y = cosf(pi * ((float) 0.5 - y));
+ break;
+ case 3:
+ case 4:
+ y = sinf(pi * (1.f - y));
+ break;
+ case 5:
+ case 6:
+ y = -cosf(pi * (y - (float) 1.5));
+ break;
+ default:
+ y = sinf(pi * (y - (float) 2.0));
+ break;
+ }
+ return -y;
+}
+
+void builtin_sinpi(void)
+{
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_sinpi");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1000; j ++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (j*n + i) * 0.01f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = sinpi(src[i]);
+ OCL_ASSERT (fabsf(cpu - dst[i]) < 1e-4);
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sinpi);
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
new file mode 100644
index 0000000..4c824d0
--- /dev/null
+++ b/utests/builtin_tgamma.cpp
@@ -0,0 +1,42 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_tgamma(void)
+{
+ const int n = 1024;
+ float src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("builtin_tgamma");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ for (int j = 0; j < 1024; j ++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+ src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+ float cpu = tgammaf(src[i]); /* true gamma; legacy gammaf() may compute log-gamma instead */
+ if (isinf(cpu)) {
+ OCL_ASSERT(isinf(dst[i]));
+ } else if (fabsf(cpu - dst[i]) >= 1e-3) {
+ printf("%f %f %f\n", src[i], cpu, dst[i]);
+ OCL_ASSERT(0);
+ }
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
index 384a654..71881b1 100644
--- a/utests/compiler_abs_diff.cpp
+++ b/utests/compiler_abs_diff.cpp
@@ -182,25 +182,30 @@ template <typename T, typename U> static void compiler_abs_diff_with_type(void)
}
}
-#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) \
- static void compiler_abs_diff_##TYPE (void) \
+
+#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE) \
+ static void compiler_abs_diff_##CLTYPE (void) \
{ \
- OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#TYPE, SOURCE, NULL); \
+ OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#CLTYPE, SOURCE, NULL); \
compiler_abs_diff_with_type<TYPE, UTYPE>(); \
} \
- MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##TYPE);
+ MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##CLTYPE);
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE)
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
+typedef uint64_t ulong64;
ABS_TEST_DIFF_TYPE(int, uint)
+ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64)
ABS_TEST_DIFF_TYPE(short, ushort)
ABS_TEST_DIFF_TYPE(char, uchar)
ABS_TEST_DIFF_TYPE(uint, uint)
+ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64)
ABS_TEST_DIFF_TYPE(ushort, ushort)
ABS_TEST_DIFF_TYPE(uchar, uchar)
-
typedef cl_vec<int, 2> int2;
typedef cl_vec<int, 3> int3;
typedef cl_vec<int, 4> int4;
@@ -222,6 +227,26 @@ ABS_TEST_DIFF_TYPE(uint4, uint4)
ABS_TEST_DIFF_TYPE(uint8, uint8)
ABS_TEST_DIFF_TYPE(uint16, uint16)
+typedef cl_vec<int64_t, 2> long2;
+typedef cl_vec<int64_t, 3> long3;
+typedef cl_vec<int64_t, 4> long4;
+typedef cl_vec<int64_t, 8> long8;
+typedef cl_vec<int64_t, 16> long16;
+typedef cl_vec<uint64_t, 2> ulong2;
+typedef cl_vec<uint64_t, 3> ulong3;
+typedef cl_vec<uint64_t, 4> ulong4;
+typedef cl_vec<uint64_t, 8> ulong8;
+typedef cl_vec<uint64_t, 16> ulong16;
+ABS_TEST_DIFF_TYPE(long2, ulong2)
+ABS_TEST_DIFF_TYPE(long3, ulong3)
+ABS_TEST_DIFF_TYPE(long4, ulong4)
+ABS_TEST_DIFF_TYPE(long8, ulong8)
+ABS_TEST_DIFF_TYPE(long16, ulong16)
+ABS_TEST_DIFF_TYPE(ulong2, ulong2)
+ABS_TEST_DIFF_TYPE(ulong3, ulong3)
+ABS_TEST_DIFF_TYPE(ulong4, ulong4)
+ABS_TEST_DIFF_TYPE(ulong8, ulong8)
+ABS_TEST_DIFF_TYPE(ulong16, ulong16)
typedef cl_vec<char, 2> char2;
typedef cl_vec<char, 3> char3;
diff --git a/utests/compiler_bool_cross_basic_block.cpp b/utests/compiler_bool_cross_basic_block.cpp
new file mode 100644
index 0000000..4dd5bc7
--- /dev/null
+++ b/utests/compiler_bool_cross_basic_block.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int scale) {
+ bool isRedRow = false;
+ bool isRed;
+ int val = src[global_id];
+ for (int i=0; i<scale; i++, isRedRow = !isRedRow) {
+ if (isRedRow) {
+ isRed= false;
+ for (int j=0; j < scale; j++, isRed=!isRed) {
+ if (isRed) {
+ val++;
+ }
+ }
+ }
+ }
+ dst[global_id] = val;
+}
+
+void compiler_bool_cross_basic_block(void){
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+ int scale = 4;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_bool_cross_basic_block");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(int), &scale);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int*)buf_data[0])[i] = i;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu(i, cpu_src, cpu_dst, scale);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_bool_cross_basic_block)
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
index 5290090..ff493e7 100644
--- a/utests/compiler_copy_image_3d.cpp
+++ b/utests/compiler_copy_image_3d.cpp
@@ -1,10 +1,11 @@
#include "utest_helper.hpp"
+#include "string.h"
static void compiler_copy_image_3d(void)
{
const size_t w = 512;
const size_t h = 512;
- const size_t depth = 1;
+ const size_t depth = 4;
cl_image_format format;
cl_sampler sampler;
@@ -14,12 +15,14 @@ static void compiler_copy_image_3d(void)
for (uint32_t k = 0; k < depth; k++)
for (uint32_t j = 0; j < h; j++)
for (uint32_t i = 0; i < w; i++)
- ((uint32_t*)buf_data[0])[k*w*h + j*w + i] = k*w*h + j*w + i;
+ ((float*)buf_data[0])[k*w*h + j*w + i] = (k << 10) + (j << 10) + i;
format.image_channel_order = CL_RGBA;
- format.image_channel_data_type = CL_UNSIGNED_INT8;
- OCL_CREATE_IMAGE3D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, depth, 0, 0, buf_data[0]);
+ format.image_channel_data_type = CL_UNORM_INT8;
+ OCL_CREATE_IMAGE3D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, depth, w*4, w*h*4, buf_data[0]);
OCL_CREATE_IMAGE3D(buf[1], 0, &format, w, h, depth, 0, 0, NULL);
+ for(uint32_t i = 0; i < depth; i++)
+ OCL_CREATE_IMAGE2D(buf[2 + i], 0, &format, w, h, 0, NULL);
OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
free(buf_data[0]);
buf_data[0] = NULL;
@@ -28,21 +31,28 @@ static void compiler_copy_image_3d(void)
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
OCL_SET_ARG(2, sizeof(sampler), &sampler);
+ for(uint32_t i = 0; i < depth; i++)
+ OCL_SET_ARG(3 + i, sizeof(cl_mem), &buf[2 + i]);
globals[0] = w;
globals[1] = h;
- locals[0] = 16;
- locals[1] = 16;
- OCL_NDRANGE(2);
+ globals[2] = depth;
+ locals[0] = 64;
+ locals[1] = 1;
+ locals[2] = 1;
+ OCL_NDRANGE(3);
// Check result
- OCL_MAP_BUFFER(0);
- OCL_MAP_BUFFER(1);
+ for(uint32_t i = 0; i < depth + 2; i++)
+ OCL_MAP_BUFFER_GTT(i);
for (uint32_t k = 0; k < depth; k++)
for (uint32_t j = 0; j < h; ++j)
- for (uint32_t i = 0; i < w; i++)
- OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == ((uint32_t*)buf_data[1])[k*w*h + j*w + i]);
- OCL_UNMAP_BUFFER(0);
- OCL_UNMAP_BUFFER(1);
+ for (uint32_t i = 0; i < w; i++) {
+ OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[1])[k*w*((h+1)&-2LL) + j*w + i]);
+ OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[k + 2])[j * w + i]);
+ }
+
+ for(uint32_t i = 0; i < depth + 2; i++)
+ OCL_UNMAP_BUFFER_GTT(i);
}
MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_3d);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
index 4b3d4e3..6a679fb 100644
--- a/utests/compiler_fill_image_3d.cpp
+++ b/utests/compiler_fill_image_3d.cpp
@@ -4,7 +4,7 @@ static void compiler_fill_image_3d(void)
{
const size_t w = 512;
const size_t h = 512;
- const size_t depth = 1;
+ const size_t depth = 5;
uint32_t color = 0x12345678;
cl_image_format format;
@@ -21,9 +21,11 @@ static void compiler_fill_image_3d(void)
OCL_SET_ARG(1, sizeof(color), &color);
globals[0] = w;
globals[1] = h;
+ globals[2] = depth;
locals[0] = 16;
locals[1] = 16;
- OCL_NDRANGE(2);
+ locals[2] = 1;
+ OCL_NDRANGE(3);
// Check result
OCL_MAP_BUFFER(0);
diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp
index 8ecc3e3..f5ff792 100644
--- a/utests/compiler_fill_image_3d_2.cpp
+++ b/utests/compiler_fill_image_3d_2.cpp
@@ -4,7 +4,7 @@ static void compiler_fill_image_3d_2(void)
{
const size_t w = 512;
const size_t h = 512;
- const size_t depth = 1;
+ const size_t depth = 5;
cl_image_format format;
format.image_channel_order = CL_RGBA;
@@ -19,17 +19,19 @@ static void compiler_fill_image_3d_2(void)
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
globals[0] = w;
globals[1] = h;
+ globals[2] = depth;
locals[0] = 16;
locals[1] = 16;
- OCL_NDRANGE(2);
+ locals[2] = 1;
+ OCL_NDRANGE(3);
// Check result
- OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER_GTT(0);
for (uint32_t k = 0; k < depth; k++)
for (uint32_t j = 0; j < h; ++j)
for (uint32_t i = 0; i < w; i++)
OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
- OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER_GTT(0);
}
MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d_2);
diff --git a/utests/compiler_function_constant0.cpp b/utests/compiler_function_constant0.cpp
index c0a8a9d..6fbbd30 100644
--- a/utests/compiler_function_constant0.cpp
+++ b/utests/compiler_function_constant0.cpp
@@ -7,7 +7,7 @@ void compiler_function_constant0(void)
// Setup kernel and buffers
OCL_CREATE_KERNEL("compiler_function_constant0");
- OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(int32_t), NULL);
OCL_CREATE_BUFFER(buf[1], 0, 1 * sizeof(char), NULL);
OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -17,7 +17,7 @@ void compiler_function_constant0(void)
OCL_MAP_BUFFER(0);
for(uint32_t i = 0; i < 69; ++i)
- ((short *)buf_data[0])[i] = i;
+ ((int32_t *)buf_data[0])[i] = i;
OCL_UNMAP_BUFFER(0);
OCL_MAP_BUFFER(1);
diff --git a/utests/compiler_global_constant.cpp b/utests/compiler_global_constant.cpp
index 1547020..a2d0172 100644
--- a/utests/compiler_global_constant.cpp
+++ b/utests/compiler_global_constant.cpp
@@ -22,8 +22,83 @@ void compiler_global_constant(void)
// Check results
OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], m[i%3] + e + r);
OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + e + r);
OCL_UNMAP_BUFFER(0);
}
+void compiler_global_constant1(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant1");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint32_t data1[] = {1, 4, 7};
+ uint32_t data2[]= {3, 7, 11};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + data2[i%3]);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + data2[i%3]);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant2(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], 6);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == 6);
+ OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant3(void)
+{
+ const size_t n = 32;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant3");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint32_t data1[] = {3, 6, 9};
+ char data2[]= {'c', 'f', 'j'};
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + (int)data2[i%3]);
+ OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + (uint32_t)data2[i%3]);
+ OCL_UNMAP_BUFFER(0);
+}
+
MAKE_UTEST_FROM_FUNCTION(compiler_global_constant);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant1);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant3);
diff --git a/utests/compiler_global_constant_2.cpp b/utests/compiler_global_constant_2.cpp
index 56fccb5..cbe63ae 100644
--- a/utests/compiler_global_constant_2.cpp
+++ b/utests/compiler_global_constant_2.cpp
@@ -23,8 +23,37 @@ void compiler_global_constant_2(void)
// Check results
OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < n; ++i)
+// std::cout << ((uint32_t *)buf_data[0])[i] << std::endl;
OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + t[i%5] + e + r);
OCL_UNMAP_BUFFER(0);
}
+void compiler_global_constant_2_long(void)
+{
+ const size_t n = 2048;
+ const uint32_t e = 34, r = 77;
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant_2", "compiler_global_constant_2_long");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(uint32_t), &e);
+ OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+ // Run the kernel
+ globals[0] = n;
+ locals[0] = 16;
+ OCL_NDRANGE(1);
+
+ uint64_t m[3] = {0x15b,0x25b,0xFFFFFFFFF};
+
+ // Check results
+ OCL_MAP_BUFFER(0);
+ for (uint32_t i = 0; i < n; ++i)
+// std::cout << ((uint64_t *)buf_data[0])[i] << std::endl;
+ OCL_ASSERT(((uint64_t *)buf_data[0])[i] == m[i%3] + e + r);
+ OCL_UNMAP_BUFFER(0);
+}
+
MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2_long);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
index 6d59aed..0c8881c 100644
--- a/utests/compiler_group_size.cpp
+++ b/utests/compiler_group_size.cpp
@@ -1,4 +1,11 @@
#include "utest_helper.hpp"
+#include <string.h>
+
+struct xyz{
+ unsigned short b;
+ unsigned short e;
+ unsigned int o;
+};
void compiler_group_size1(void)
{
@@ -80,7 +87,55 @@ void compiler_group_size3(void)
OCL_UNMAP_BUFFER(0);
}
}
+
+void compiler_group_size4(void)
+{
+ const size_t n = 16;
+ uint32_t color = 2;
+ uint32_t num = 1;
+ int group_size[] = {1};
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_group_size", "compiler_group_size4");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(struct xyz), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+
+ for(uint32_t i = 0; i < num; i++) {
+ // Run the kernel
+ OCL_MAP_BUFFER(0);
+ ((struct xyz*)buf_data[0])[0].b = 0;
+ ((struct xyz*)buf_data[0])[0].e = 2;
+ ((struct xyz*)buf_data[0])[0].o = 0;
+ OCL_UNMAP_BUFFER(0);
+
+ OCL_MAP_BUFFER(1);
+ memset(((uint32_t*)buf_data[1]), 0x0, sizeof(uint32_t)*n);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_int), &group_size[i]);
+ OCL_SET_ARG(3, sizeof(cl_int), &color);
+
+ globals[0] = group_size[i];
+ locals[0] = group_size[i];
+ OCL_NDRANGE(1);
+ OCL_MAP_BUFFER(1);
+
+ // Check results
+ for (uint32_t j = 0; j < n; ++j) {
+// std::cout <<((uint32_t*)buf_data[1])[j] << " ";
+ if(j >= i && j <= i+2) {
+ OCL_ASSERT(((uint32_t*)buf_data[1])[j] == color);
+ } else {
+ OCL_ASSERT(((uint32_t*)buf_data[1])[j] == 0);
+ }
+
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
+}
MAKE_UTEST_FROM_FUNCTION(compiler_group_size1);
MAKE_UTEST_FROM_FUNCTION(compiler_group_size2);
MAKE_UTEST_FROM_FUNCTION(compiler_group_size3);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size4);
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
index fad2744..d7e1517 100644
--- a/utests/compiler_long.cpp
+++ b/utests/compiler_long.cpp
@@ -30,8 +30,8 @@ void compiler_long(void)
src1[7] = -2L, src2[7] = -1L;
src1[8] = 0, src2[8] = 0x8000000000000000UL;
for (int32_t i = 9; i < (int32_t) n; ++i) {
- src1[i] = ((long)rand() << 32) + rand();
- src2[i] = ((long)rand() << 32) + rand();
+ src1[i] = ((int64_t)rand() << 32) + rand();
+ src2[i] = ((int64_t)rand() << 32) + rand();
}
OCL_MAP_BUFFER(0);
OCL_MAP_BUFFER(1);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
index e3c6640..6c5da4b 100644
--- a/utests/compiler_long_2.cpp
+++ b/utests/compiler_long_2.cpp
@@ -21,8 +21,8 @@ void compiler_long_2(void)
// Run random tests
for (int32_t i = 0; i < (int32_t) n; ++i) {
- src1[i] = ((long)rand() << 32) + rand();
- src2[i] = ((long)rand() << 32) + rand();
+ src1[i] = ((int64_t)rand() << 32) + rand();
+ src2[i] = ((int64_t)rand() << 32) + rand();
}
src1[4] = 1;
OCL_MAP_BUFFER(0);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
index 18e13ee..827a45b 100644
--- a/utests/compiler_long_convert.cpp
+++ b/utests/compiler_long_convert.cpp
@@ -3,6 +3,7 @@
#include <iostream>
#include "utest_helper.hpp"
+// convert shorter integer to 64-bit integer
void compiler_long_convert(void)
{
const size_t n = 16;
@@ -65,3 +66,93 @@ void compiler_long_convert(void)
}
MAKE_UTEST_FROM_FUNCTION(compiler_long_convert);
+
+// convert 64-bit integer to shorter integer
+void compiler_long_convert_2(void)
+{
+ const size_t n = 16;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_2");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = -i;
+ }
+ OCL_MAP_BUFFER(3);
+ memcpy(buf_data[3], src, sizeof(src));
+ OCL_UNMAP_BUFFER(3);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ char *dst1 = ((char *)buf_data[0]);
+ short *dst2 = ((short *)buf_data[1]);
+ int *dst3 = ((int *)buf_data[2]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%x %x %x\n", dst1[i], dst2[i], dst3[i]);
+ OCL_ASSERT(dst1[i] == -i);
+ OCL_ASSERT(dst2[i] == -i);
+ OCL_ASSERT(dst3[i] == -i);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_2);
+
+// convert 64-bit integer to 32-bit float
+void compiler_long_convert_to_float(void)
+{
+ const size_t n = 16;
+ int64_t src[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_to_float");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src[i] = -(int64_t)i;
+ }
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[1], src, sizeof(src));
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ float *dst = ((float *)buf_data[0]);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%f\n", dst[i]);
+ OCL_ASSERT(dst[i] == src[i]);
+ }
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_to_float);
diff --git a/utests/compiler_vector_inc.cpp b/utests/compiler_vector_inc.cpp
new file mode 100644
index 0000000..abc5408
--- /dev/null
+++ b/utests/compiler_vector_inc.cpp
@@ -0,0 +1,46 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_vector_inc(void)
+{
+ const int n = 64;
+ char dst[n];
+ char src[n];
+
+ OCL_CREATE_KERNEL("compiler_vector_inc");
+ OCL_CREATE_BUFFER(buf[0], 0, n, NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n, NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = n / 2;
+ locals[0] = 16;
+
+ for (int i = 0; i < n; ++i) {
+ dst[i] = i;
+ src[i] = (i / 2) % 4;
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], dst, n);
+ memcpy(buf_data[1], src, n);
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_NDRANGE(1);
+
+ OCL_MAP_BUFFER(0);
+ char *dest = ((char *)buf_data[0]);
+ for (int i=0; i<n; ++i) {
+ char wish;
+ if (src[i/2] < 2)
+ wish = dst[i] + 1;
+ else
+ wish = dst[i] - 1;
+ OCL_ASSERT(dest[i] == wish);
+ }
+ OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_vector_inc);
diff --git a/utests/load_program_from_bin.cpp b/utests/load_program_from_bin.cpp
new file mode 100644
index 0000000..d45c2bd
--- /dev/null
+++ b/utests/load_program_from_bin.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+ dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_bin(void)
+{
+ const size_t n = 16;
+ float cpu_dst[16], cpu_src[16];
+ cl_int status;
+ cl_int binary_status;
+ char *ker_path = NULL;
+
+ cl_file_map_t *fm = cl_file_map_new();
+ ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
+ OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+ const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+
+ program = clCreateProgramWithBinary(ctx, 1,
+ &device, &sz, &src, &binary_status, &status);
+
+ OCL_ASSERT(program && status == CL_SUCCESS);
+
+ /* OpenCL requires the program to be built even when it is created from a binary */
+ OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+ kernel = clCreateKernel(program, "compiler_ceil", &status);
+ OCL_ASSERT(status == CL_SUCCESS);
+
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+
+#if 0
+ printf("#### GPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", ((float *)buf_data[1])[i]);
+ printf("\n#### CPU:\n");
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ printf(" %f", cpu_dst[i]);
+ printf("\n");
+#endif
+
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index 1ec8692..b974f6a 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -33,6 +33,8 @@ void runtime_event(void)
OCL_ASSERT(status >= CL_SUBMITTED);
}
+ buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
@@ -45,13 +47,10 @@ void runtime_event(void)
OCL_ASSERT(status <= CL_COMPLETE);
}
- // Check results
- OCL_MAP_BUFFER(0);
-
for (uint32_t i = 0; i < n; ++i) {
OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
}
- OCL_UNMAP_BUFFER(0);
+ clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
clReleaseEvent(ev[i]);
diff --git a/utests/utest.cpp b/utests/utest.cpp
index fc3467e..18d10e8 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -32,7 +32,7 @@ using namespace std;
vector<UTest> *UTest::utestList = NULL;
void releaseUTestList(void) { delete UTest::utestList; }
-UTest::UTest(Function fn, const char *name) : fn(fn), name(name) {
+UTest::UTest(Function fn, const char *name, bool haveIssue) : fn(fn), name(name), haveIssue(haveIssue) {
if (utestList == NULL) {
utestList = new vector<UTest>;
atexit(releaseUTestList);
@@ -40,7 +40,7 @@ UTest::UTest(Function fn, const char *name) : fn(fn), name(name) {
utestList->push_back(*this);
}
-UTest::UTest(void) : fn(NULL), name(NULL) {}
+UTest::UTest(void) : fn(NULL), name(NULL), haveIssue(false) {}
static bool strequal(const char *s1, const char *s2) {
if (strcmp(s1, s2) == 0) return true;
@@ -52,7 +52,7 @@ void UTest::run(const char *name) {
if (utestList == NULL) return;
for (size_t i = 0; i < utestList->size(); ++i) {
const UTest &utest = (*utestList)[i];
- if (utest.name == NULL || utest.fn == NULL) continue;
+ if (utest.name == NULL || utest.fn == NULL ) continue;
if (strequal(utest.name, name)) {
std::cout << utest.name << ":" << std::endl;
(utest.fn)();
@@ -76,3 +76,25 @@ void UTest::runAll(void) {
}
}
+void UTest::runAllNoIssue(void) {
+ if (utestList == NULL) return;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL || utest.haveIssue) continue;
+ std::cout << utest.name << ":" << std::endl;
+ (utest.fn)();
+ std::cout << std::endl;
+ cl_kernel_destroy();
+ cl_buffer_destroy();
+ }
+}
+
+void UTest::listAllCases()
+{
+ if (utestList == NULL) return;
+ for (size_t i = 0; i < utestList->size(); ++i) {
+ const UTest &utest = (*utestList)[i];
+ if (utest.fn == NULL) continue;
+ std::cout << utest.name << std::endl;
+ }
+}
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 338a4dc..d3a6a6f 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -39,17 +39,23 @@ struct UTest
/*! Empty test */
UTest(void);
/*! Build a new unit test and append it to the unit test list */
- UTest(Function fn, const char *name);
+ UTest(Function fn, const char *name, bool haveIssue = false);
/*! Function to execute */
Function fn;
/*! Name of the test */
const char *name;
+ /*! Indicates whether the current test case has a known issue that is still to be fixed */
+ bool haveIssue;
/*! The tests that are registered */
static std::vector<UTest> *utestList;
/*! Run the test with the given name */
static void run(const char *name);
+ /*! Run all the tests without known issue*/
+ static void runAllNoIssue(void);
/*! Run all the tests */
static void runAll(void);
+ /*! List all test cases */
+ static void listAllCases(void);
};
/*! Register a new unit test */
@@ -60,6 +66,12 @@ struct UTest
static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
static const UTest __##FN##__(__ANON__##FN##__, #FN);
+/*! Register a test case which has issue to be fixed */
+#define MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(FN) \
+ static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+ static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+
+
/*! No assert is expected */
#define UTEST_EXPECT_SUCCESS(EXPR) \
do { \
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 9069db2..8089799 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -205,8 +205,8 @@ clpanic(const char *msg, int rval)
exit(-1);
}
-static char*
-do_kiss_path(const char *file, cl_device_id device)
+char*
+cl_do_kiss_path(const char *file, cl_device_id device)
{
cl_int ver;
const char *sub_path = NULL;
@@ -239,7 +239,7 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
cl_int status = CL_SUCCESS;
/* Load the program and build it */
- ker_path = do_kiss_path(file_name, device);
+ ker_path = cl_do_kiss_path(file_name, device);
if (format == LLVM)
program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
else if (format == SOURCE) {
@@ -294,10 +294,10 @@ error:
#include <cstring>
#define GET_DEVICE_STR_INFO(LOWER_NAME, NAME) \
std::string LOWER_NAME ##Str; \
- OCL_CALL (clGetDeviceInfo, device, NAME, 0, 0, &param_value_size); \
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, 0, 0, &param_value_size); \
{ \
std::vector<char> param_value(param_value_size); \
- OCL_CALL (clGetDeviceInfo, device, NAME, \
+ OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, \
param_value_size, param_value.empty() ? NULL : &param_value.front(), \
&param_value_size); \
if (!param_value.empty()) \
@@ -311,7 +311,9 @@ cl_ocl_init(void)
cl_int status = CL_SUCCESS;
cl_uint platform_n;
size_t i;
+#ifdef HAS_EGL
bool hasGLExt = false;
+#endif
cl_context_properties *props = NULL;
/* Get the platform number */
@@ -331,16 +333,17 @@ cl_ocl_init(void)
OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
{
size_t param_value_size;
- GET_DEVICE_STR_INFO(profile, CL_DEVICE_PROFILE);
- GET_DEVICE_STR_INFO(name, CL_DEVICE_NAME);
- GET_DEVICE_STR_INFO(vendor, CL_DEVICE_VENDOR);
- GET_DEVICE_STR_INFO(version, CL_DEVICE_VERSION);
- GET_DEVICE_STR_INFO(opencl_c_version, CL_DEVICE_OPENCL_C_VERSION);
- GET_DEVICE_STR_INFO(driver_version, CL_DRIVER_VERSION);
- GET_DEVICE_STR_INFO(extensions, CL_DEVICE_EXTENSIONS);
+ GET_DEVICE_STR_INFO(profile, PROFILE);
+ GET_DEVICE_STR_INFO(name, NAME);
+ GET_DEVICE_STR_INFO(vendor, VENDOR);
+ GET_DEVICE_STR_INFO(version, VERSION);
+ GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+ GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
+#ifdef HAS_EGL
if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
hasGLExt = true;
}
+#endif
}
#ifdef HAS_EGL
@@ -510,7 +513,7 @@ struct bmphdr {
int *cl_read_bmp(const char *filename, int *width, int *height)
{
struct bmphdr hdr;
- char *bmppath = do_kiss_path(filename, device);
+ char *bmppath = cl_do_kiss_path(filename, device);
FILE *fp = fopen(bmppath, "rb");
assert(fp);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index e7f43fc..29a21d5 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -50,7 +50,7 @@ extern EGLSurface eglSurface;
#define OCL_THROW_ERROR(FN, STATUS) \
do { \
char msg[2048]; \
- sprintf(msg, "error calling %s with error%s \n", #FN, err_msg[-STATUS]); \
+ sprintf(msg, "error calling %s with error %s \n", #FN, err_msg[-STATUS]); \
OCL_ASSERTM(false, msg); \
} while (0)
@@ -186,6 +186,9 @@ extern int cl_ocl_init(void);
extern int cl_kernel_init(const char *file_name,
const char *kernel_name, int format, const char * build_opt);
+/* Get the file path */
+extern char* cl_do_kiss_path(const char *file, cl_device_id device);
+
/* init the bunch of global varaibles here */
extern int cl_test_init(const char *file_name, const char *kernel_name, int format);
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
index e577b7b..94fbbee 100644
--- a/utests/utest_run.cpp
+++ b/utests/utest_run.cpp
@@ -26,19 +26,93 @@
#include "utest_helper.hpp"
#include "utest_exception.hpp"
#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+ std::cout << "\
+Usage:\n\
+ ./utest_run <option>\n\
+\n\
+ option:\n\
+ -c <casename>: run sub-case named 'casename'\n\
+ -l : list all the available case name\n\
+ -a : run all test cases\n\
+ -n : run all test cases without known issue (default option)\n\
+ -h : display this usage\n\
+\
+ "<< std::endl;
+}
int main(int argc, char *argv[])
{
- try {
- cl_ocl_init();
- if (argc >= 2)
- for (int i = 1; i < argc; ++i)
- UTest::run(argv[i]);
- else
- UTest::runAll();
- cl_ocl_destroy();
- } catch (Exception e) {
- std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+
+ int c = 0;
+ cl_ocl_init();
+
+ c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+ if (argc == 1)
+ c = 'n';
+ if (argc == 2 && c < 1 ){
+ c = 'c';
+ optarg = argv[1];
}
+
+ {
+ switch (c)
+ {
+ case 'c':
+ try {
+ UTest::run(optarg);
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'l':
+ UTest::listAllCases();
+ break;
+
+ case 'a':
+ try {
+ UTest::runAll();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'n':
+ try {
+ UTest::runAllNoIssue();
+ }
+ catch (Exception e){
+ std::cout << " " << e.what() << " [SUCCESS]" << std::endl;
+ }
+
+ break;
+
+ case 'h':
+ default:
+ usage();
+ exit(1);
+ }
+ } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
+
+ cl_ocl_destroy();
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git
More information about the Pkg-opencl-devel
mailing list