[Pkg-opencl-devel] [oclgrind] 01/01: Imported Upstream version 15.5
James Price
jprice-guest@moszumanska.debian.org
Tue Aug 11 12:52:21 UTC 2015
This is an automated email from the git hooks/post-receive script.
jprice-guest pushed a commit to branch upstream
in repository oclgrind.
commit 0ae7b0acd988dc66f11a1e6e8528575369ad5090
Author: James Price <j.price@bristol.ac.uk>
Date: Tue Aug 11 13:31:35 2015 +0100
Imported Upstream version 15.5
---
.gitignore | 54 +
CMakeLists.txt | 299 ++
LICENSE | 29 +
Makefile.am | 147 +
NEWS | 58 +
README | 138 +
cmake_config.h.in | 5 +
configure.ac | 134 +
m4/m4_ax_check_compile_flag.m4 | 74 +
src/CL/cl.h | 1214 +++++
src/CL/cl_d3d10.h | 126 +
src/CL/cl_d3d11.h | 126 +
src/CL/cl_dx9_media_sharing.h | 127 +
src/CL/cl_egl.h | 131 +
src/CL/cl_ext.h | 310 ++
src/CL/cl_gl.h | 162 +
src/CL/cl_gl_ext.h | 69 +
src/CL/cl_platform.h | 1278 +++++
src/CL/opencl.h | 54 +
src/core/Context.cpp | 547 ++
src/core/Context.h | 115 +
src/core/Kernel.cpp | 534 ++
src/core/Kernel.h | 72 +
src/core/KernelInvocation.cpp | 355 ++
src/core/KernelInvocation.h | 64 +
src/core/Memory.cpp | 464 ++
src/core/Memory.h | 68 +
src/core/Plugin.cpp | 25 +
src/core/Plugin.h | 69 +
src/core/Program.cpp | 728 +++
src/core/Program.h | 79 +
src/core/Queue.cpp | 260 +
src/core/Queue.h | 183 +
src/core/WorkGroup.cpp | 428 ++
src/core/WorkGroup.h | 100 +
src/core/WorkItem.cpp | 1660 ++++++
src/core/WorkItem.h | 213 +
src/core/WorkItemBuiltins.cpp | 3561 +++++++++++++
src/core/clc.h | 1035 ++++
src/core/common.cpp | 712 +++
src/core/common.h | 203 +
src/core/gen_clc_h.cmake | 11 +
src/core/gen_clc_h.sh | 18 +
src/core/half.h | 160 +
src/install/INSTALL.darwin | 17 +
src/install/INSTALL.linux | 20 +
src/install/INSTALL.windows | 8 +
src/install/install.bat | 23 +
src/install/oclgrind-icd.reg | Bin 0 -> 1042 bytes
src/install/uninstall.bat | 1 +
src/kernel/Simulation.cpp | 764 +++
src/kernel/Simulation.h | 82 +
src/kernel/oclgrind-kernel.cpp | 233 +
src/plugins/InstructionCounter.cpp | 184 +
src/plugins/InstructionCounter.h | 38 +
src/plugins/InteractiveDebugger.cpp | 1024 ++++
src/plugins/InteractiveDebugger.h | 72 +
src/plugins/Logger.cpp | 81 +
src/plugins/Logger.h | 27 +
src/plugins/MemCheck.cpp | 107 +
src/plugins/MemCheck.h | 43 +
src/plugins/RaceDetector.cpp | 336 ++
src/plugins/RaceDetector.h | 94 +
src/runtime/async_queue.cpp | 136 +
src/runtime/async_queue.h | 21 +
src/runtime/icd.def | 5 +
src/runtime/icd.h | 235 +
src/runtime/oclgrind | 145 +
src/runtime/runtime.cpp | 5594 ++++++++++++++++++++
src/runtime/runtime.def | 119 +
tests/apps/CMakeLists.txt | 33 +
tests/apps/vecadd/vecadd.c | 190 +
tests/kernels/TESTS | 56 +
tests/kernels/alignment/packed.cl | 10 +
tests/kernels/alignment/packed.ref | 4 +
tests/kernels/alignment/packed.sim | 10 +
tests/kernels/alignment/unaligned.cl | 6 +
tests/kernels/alignment/unaligned.ref | 5 +
tests/kernels/alignment/unaligned.sim | 7 +
tests/kernels/async_copy/async_copy.cl | 8 +
tests/kernels/async_copy/async_copy.ref | 7 +
tests/kernels/async_copy/async_copy.sim | 7 +
tests/kernels/async_copy/async_copy_divergent.cl | 14 +
tests/kernels/async_copy/async_copy_divergent.ref | 8 +
tests/kernels/async_copy/async_copy_divergent.sim | 7 +
tests/kernels/async_copy/async_copy_global_race.cl | 11 +
.../kernels/async_copy/async_copy_global_race.ref | 8 +
.../kernels/async_copy/async_copy_global_race.sim | 7 +
tests/kernels/async_copy/async_copy_local_race.cl | 10 +
tests/kernels/async_copy/async_copy_local_race.ref | 8 +
tests/kernels/async_copy/async_copy_local_race.sim | 7 +
tests/kernels/async_copy/async_copy_loop.cl | 14 +
tests/kernels/async_copy/async_copy_loop.ref | 7 +
tests/kernels/async_copy/async_copy_loop.sim | 7 +
.../async_copy/async_copy_loop_divergent.cl | 19 +
.../async_copy/async_copy_loop_divergent.ref | 8 +
.../async_copy/async_copy_loop_divergent.sim | 7 +
tests/kernels/async_copy/async_copy_single_wi.cl | 13 +
tests/kernels/async_copy/async_copy_single_wi.ref | 8 +
tests/kernels/async_copy/async_copy_single_wi.sim | 7 +
tests/kernels/async_copy/async_copy_unwaited.cl | 7 +
tests/kernels/async_copy/async_copy_unwaited.ref | 8 +
tests/kernels/async_copy/async_copy_unwaited.sim | 7 +
tests/kernels/atomics/atomic_cmpxchg_false_race.cl | 36 +
.../kernels/atomics/atomic_cmpxchg_false_race.ref | 8 +
.../kernels/atomics/atomic_cmpxchg_false_race.sim | 7 +
tests/kernels/atomics/atomic_cmpxchg_read_race.cl | 12 +
tests/kernels/atomics/atomic_cmpxchg_read_race.ref | 5 +
tests/kernels/atomics/atomic_cmpxchg_read_race.sim | 7 +
tests/kernels/atomics/atomic_cmpxchg_write_race.cl | 9 +
.../kernels/atomics/atomic_cmpxchg_write_race.ref | 5 +
.../kernels/atomics/atomic_cmpxchg_write_race.sim | 7 +
tests/kernels/atomics/atomic_global_fence.cl | 17 +
tests/kernels/atomics/atomic_global_fence.ref | 5 +
tests/kernels/atomics/atomic_global_fence.sim | 7 +
tests/kernels/atomics/atomic_global_fence_race.cl | 12 +
tests/kernels/atomics/atomic_global_fence_race.ref | 6 +
tests/kernels/atomics/atomic_global_fence_race.sim | 7 +
tests/kernels/atomics/atomic_increment.cl | 4 +
tests/kernels/atomics/atomic_increment.ref | 4 +
tests/kernels/atomics/atomic_increment.sim | 6 +
tests/kernels/atomics/atomic_intergroup_race.cl | 10 +
tests/kernels/atomics/atomic_intergroup_race.ref | 5 +
tests/kernels/atomics/atomic_intergroup_race.sim | 6 +
tests/kernels/atomics/atomic_local_fence.cl | 17 +
tests/kernels/atomics/atomic_local_fence.ref | 5 +
tests/kernels/atomics/atomic_local_fence.sim | 7 +
tests/kernels/atomics/atomic_race_after.cl | 8 +
tests/kernels/atomics/atomic_race_after.ref | 5 +
tests/kernels/atomics/atomic_race_after.sim | 6 +
tests/kernels/atomics/atomic_race_before.cl | 8 +
tests/kernels/atomics/atomic_race_before.ref | 5 +
tests/kernels/atomics/atomic_race_before.sim | 6 +
tests/kernels/atomics/atomic_same_workitem.cl | 14 +
tests/kernels/atomics/atomic_same_workitem.ref | 7 +
tests/kernels/atomics/atomic_same_workitem.sim | 6 +
.../barrier/barrier_different_instructions.cl | 14 +
.../barrier/barrier_different_instructions.ref | 8 +
.../barrier/barrier_different_instructions.sim | 6 +
tests/kernels/barrier/barrier_divergence.cl | 9 +
tests/kernels/barrier/barrier_divergence.ref | 8 +
tests/kernels/barrier/barrier_divergence.sim | 6 +
tests/kernels/bugs/gvn_arbitrary_integers.cl | 8 +
tests/kernels/bugs/gvn_arbitrary_integers.ref | 6 +
tests/kernels/bugs/gvn_arbitrary_integers.sim | 7 +
tests/kernels/bugs/kernel_struct_argument.cl | 11 +
tests/kernels/bugs/kernel_struct_argument.ref | 4 +
tests/kernels/bugs/kernel_struct_argument.sim | 11 +
tests/kernels/bugs/many_alloca.cl | 21 +
tests/kernels/bugs/many_alloca.ref | 4 +
tests/kernels/bugs/many_alloca.sim | 9 +
tests/kernels/bugs/multidim_array_in_struct.cl | 40 +
tests/kernels/bugs/multidim_array_in_struct.ref | 4 +
tests/kernels/bugs/multidim_array_in_struct.sim | 13 +
tests/kernels/bugs/null_argument.cl | 9 +
tests/kernels/bugs/null_argument.ref | 4 +
tests/kernels/bugs/null_argument.sim | 6 +
tests/kernels/bugs/sroa_addrspace_cast.cl | 12 +
tests/kernels/bugs/sroa_addrspace_cast.ref | 4 +
tests/kernels/bugs/sroa_addrspace_cast.sim | 7 +
tests/kernels/data-race/broadcast.cl | 5 +
tests/kernels/data-race/broadcast.ref | 7 +
tests/kernels/data-race/broadcast.sim | 9 +
tests/kernels/data-race/global_fence.cl | 16 +
tests/kernels/data-race/global_fence.ref | 7 +
tests/kernels/data-race/global_fence.sim | 7 +
tests/kernels/data-race/global_only_fence.cl | 16 +
tests/kernels/data-race/global_only_fence.ref | 8 +
tests/kernels/data-race/global_only_fence.sim | 7 +
tests/kernels/data-race/global_read_write_race.cl | 8 +
tests/kernels/data-race/global_read_write_race.ref | 8 +
tests/kernels/data-race/global_read_write_race.sim | 6 +
tests/kernels/data-race/global_write_write_race.cl | 4 +
.../kernels/data-race/global_write_write_race.ref | 5 +
.../kernels/data-race/global_write_write_race.sim | 6 +
tests/kernels/data-race/increment.cl | 5 +
tests/kernels/data-race/increment.ref | 7 +
tests/kernels/data-race/increment.sim | 6 +
tests/kernels/data-race/intergroup_hidden_race.cl | 9 +
tests/kernels/data-race/intergroup_hidden_race.ref | 6 +
tests/kernels/data-race/intergroup_hidden_race.sim | 7 +
tests/kernels/data-race/intergroup_race.cl | 19 +
tests/kernels/data-race/intergroup_race.ref | 8 +
tests/kernels/data-race/intergroup_race.sim | 6 +
tests/kernels/data-race/intragroup_hidden_race.cl | 10 +
tests/kernels/data-race/intragroup_hidden_race.ref | 6 +
tests/kernels/data-race/intragroup_hidden_race.sim | 7 +
tests/kernels/data-race/local_only_fence.cl | 16 +
tests/kernels/data-race/local_only_fence.ref | 8 +
tests/kernels/data-race/local_only_fence.sim | 7 +
tests/kernels/data-race/local_read_write_race.cl | 14 +
tests/kernels/data-race/local_read_write_race.ref | 5 +
tests/kernels/data-race/local_read_write_race.sim | 7 +
tests/kernels/data-race/local_write_write_race.cl | 7 +
tests/kernels/data-race/local_write_write_race.ref | 8 +
tests/kernels/data-race/local_write_write_race.sim | 7 +
tests/kernels/data-race/uniform_write_race.cl | 4 +
tests/kernels/data-race/uniform_write_race.ref | 4 +
tests/kernels/data-race/uniform_write_race.sim | 6 +
tests/kernels/memcheck/async_copy_out_of_bounds.cl | 8 +
.../kernels/memcheck/async_copy_out_of_bounds.ref | 8 +
.../kernels/memcheck/async_copy_out_of_bounds.sim | 7 +
tests/kernels/memcheck/atomic_out_of_bounds.cl | 5 +
tests/kernels/memcheck/atomic_out_of_bounds.ref | 8 +
tests/kernels/memcheck/atomic_out_of_bounds.sim | 6 +
tests/kernels/memcheck/dereference_null.cl | 4 +
tests/kernels/memcheck/dereference_null.ref | 5 +
tests/kernels/memcheck/dereference_null.sim | 7 +
tests/kernels/memcheck/read_out_of_bounds.cl | 12 +
tests/kernels/memcheck/read_out_of_bounds.ref | 9 +
tests/kernels/memcheck/read_out_of_bounds.sim | 8 +
tests/kernels/memcheck/read_write_only_memory.cl | 5 +
tests/kernels/memcheck/read_write_only_memory.ref | 8 +
tests/kernels/memcheck/read_write_only_memory.sim | 7 +
tests/kernels/memcheck/write_out_of_bounds.cl | 5 +
tests/kernels/memcheck/write_out_of_bounds.ref | 8 +
tests/kernels/memcheck/write_out_of_bounds.sim | 8 +
tests/kernels/memcheck/write_read_only_memory.cl | 5 +
tests/kernels/memcheck/write_read_only_memory.ref | 8 +
tests/kernels/memcheck/write_read_only_memory.sim | 7 +
tests/kernels/misc/array.cl | 10 +
tests/kernels/misc/array.ref | 131 +
tests/kernels/misc/array.sim | 6 +
tests/kernels/misc/reduce.cl | 28 +
tests/kernels/misc/reduce.ref | 4 +
tests/kernels/misc/reduce.sim | 11 +
tests/kernels/misc/vecadd.cl | 5 +
tests/kernels/misc/vecadd.ref | 1027 ++++
tests/kernels/misc/vecadd.sim | 8 +
tests/kernels/run_kernel_test.py | 93 +
tests/kernels/wait_event/wait_event_chained.cl | 13 +
tests/kernels/wait_event/wait_event_chained.ref | 7 +
tests/kernels/wait_event/wait_event_chained.sim | 7 +
tests/kernels/wait_event/wait_event_divergent.cl | 11 +
tests/kernels/wait_event/wait_event_divergent.ref | 6 +
tests/kernels/wait_event/wait_event_divergent.sim | 7 +
tests/kernels/wait_event/wait_event_duplicates.cl | 13 +
tests/kernels/wait_event/wait_event_duplicates.ref | 7 +
tests/kernels/wait_event/wait_event_duplicates.sim | 7 +
tests/kernels/wait_event/wait_event_invalid.cl | 5 +
tests/kernels/wait_event/wait_event_invalid.ref | 8 +
tests/kernels/wait_event/wait_event_invalid.sim | 6 +
242 files changed, 28529 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14830ae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,54 @@
+# Autotools generated files
+aclocal.m4
+autom4te.cache
+compile
+config.guess
+config.h
+config.h.in
+config.log
+config.status
+config.sub
+configure
+depcomp
+.deps
+*.dirstamp
+install-sh
+ltmain.sh
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+Makefile
+Makefile.in
+missing
+stamp-h1
+test-driver
+
+# Compiler output
+*.o
+*.lo
+.libs
+liboclgrind.la
+liboclgrind-rt.la
+liboclgrind-rt-icd.la
+libtool
+/oclgrind
+oclgrind.icd
+oclgrind-kernel
+src/core/clc_h.cpp
+
+# Test output
+test-suite.log
+*.trs
+*.log
+*.out
+*.diff
+tests/apps/vecadd/vecadd
+
+# Misc
+oclgrind-*.tar.gz
+oclgrind-*.zip
+.DS_Store
+*.kdev4
+*.sublime-*
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a35af1e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,299 @@
+# CMakeLists.txt (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+cmake_minimum_required(VERSION 2.8.12)
+project(Oclgrind)
+set(Oclgrind_VERSION_MAJOR 15)
+set(Oclgrind_VERSION_MINOR 5)
+
+include(CheckIncludeFiles)
+include(CheckLibraryExists)
+
+# Enable C99 for GCC (required for tests)
+if (CMAKE_COMPILER_IS_GNUCC)
+ set(CMAKE_C_FLAGS "-std=c99")
+endif()
+
+# Enable rpath on OS X
+set(CMAKE_MACOSX_RPATH 1)
+
+# Enable C++11 for Clang/GCC
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ set(CMAKE_CXX_FLAGS "-std=c++11")
+endif()
+
+# Disable min/max macros on Windows
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ add_definitions(-DNOMINMAX)
+endif()
+
+# Suppress warnings from OpenCL runtime API headers
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability")
+endif()
+
+
+# Find LLVM
+find_package(LLVM REQUIRED CONFIG NO_CMAKE_BUILDS_PATH)
+message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
+message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
+
+# Check LLVM version
+if (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.6")
+ message(FATAL_ERROR "LLVM version must be >= 3.6")
+endif()
+set(LLVM_VERSION ${LLVM_VERSION_MAJOR}${LLVM_VERSION_MINOR})
+
+# Add flags for LLVM
+add_definitions(${LLVM_DEFINITIONS})
+include_directories(${LLVM_INCLUDE_DIRS})
+link_directories(${LLVM_LIBRARY_DIRS})
+set(CLANG ${LLVM_TOOLS_BINARY_DIR}/clang)
+
+# Get LLVM libraries for linking
+llvm_map_components_to_libnames(LLVM_LIBS
+ bitreader bitwriter core instrumentation ipo irreader
+ linker mcparser objcarcopts option)
+
+
+# Check for GNU readline library
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ set(READLINE_DIR "" CACHE PATH "Location of GNU readline library")
+
+ set(CMAKE_REQUIRED_INCLUDES ${READLINE_DIR}/include)
+ include_directories(${READLINE_DIR}/include)
+ link_directories(${READLINE_DIR}/lib)
+
+ message(STATUS ${CMAKE_REQUIRED_LIBRARIES})
+
+ check_include_files("stdio.h;readline/readline.h" HAVE_READLINE_H)
+ check_include_files("stdio.h;readline/history.h" HAVE_HISTORY_H)
+ check_library_exists(readline readline "${READLINE_DIR}/lib" HAVE_READLINE_LIB)
+ check_library_exists(readline add_history "${READLINE_DIR}/lib" HAVE_HISTORY_LIB)
+ if (HAVE_READLINE_H AND HAVE_HISTORY_H AND
+ HAVE_READLINE_LIB AND HAVE_HISTORY_LIB)
+ set(HAVE_READLINE 1)
+ list(APPEND CORE_EXTRA_LIBS readline)
+ else()
+ set(HAVE_READLINE 0)
+ message(WARNING "GNU readline library not found (set READLINE_DIR)\n"
+ "The interactive debugger will not have a command history.")
+ endif()
+else()
+ set(HAVE_READLINE 0)
+endif()
+
+# Generate stringified clc.h
+add_custom_command(
+ OUTPUT src/core/clc_h.cpp
+ COMMAND ${CMAKE_COMMAND} -DSOURCE_FILE=${CMAKE_SOURCE_DIR}/src/core/clc.h
+ -P ${CMAKE_SOURCE_DIR}/src/core/gen_clc_h.cmake
+ DEPENDS src/core/clc.h src/core/gen_clc_h.cmake
+)
+
+include_directories("src/" "${PROJECT_BINARY_DIR}")
+
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ set(CORE_LIB_TYPE "SHARED")
+endif()
+
+set(CORE_HEADERS
+ src/core/common.h
+ src/core/Context.h
+ src/core/half.h
+ src/core/Kernel.h
+ src/core/KernelInvocation.h
+ src/core/Memory.h
+ src/core/Plugin.h
+ src/core/Program.h
+ src/core/Queue.h
+ src/core/WorkItem.h
+ src/core/WorkGroup.h)
+
+add_library(oclgrind ${CORE_LIB_TYPE}
+ ${CORE_HEADERS}
+ src/core/clc_h.cpp
+ src/core/common.cpp
+ src/core/Context.cpp
+ src/core/Kernel.cpp
+ src/core/KernelInvocation.cpp
+ src/core/Memory.cpp
+ src/core/Plugin.cpp
+ src/core/Program.cpp
+ src/core/Queue.cpp
+ src/core/WorkItem.cpp
+ src/core/WorkItemBuiltins.cpp
+ src/core/WorkGroup.cpp
+ src/plugins/InstructionCounter.h
+ src/plugins/InstructionCounter.cpp
+ src/plugins/InteractiveDebugger.h
+ src/plugins/InteractiveDebugger.cpp
+ src/plugins/Logger.h
+ src/plugins/Logger.cpp
+ src/plugins/MemCheck.h
+ src/plugins/MemCheck.cpp
+ src/plugins/RaceDetector.h
+ src/plugins/RaceDetector.cpp)
+target_link_libraries(oclgrind ${CORE_EXTRA_LIBS}
+ clangAnalysis clangAST clangBasic clangCodeGen clangDriver clangEdit
+ clangFrontend clangLex clangParse clangSema clangSerialization
+ ${LLVM_LIBS})
+
+# Sources for OpenCL runtime API frontend
+set(RUNTIME_SOURCES
+ src/runtime/async_queue.h
+ src/runtime/async_queue.cpp
+ src/runtime/icd.h
+ src/runtime/runtime.cpp)
+
+# Add ICD exports on Windows
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ list(APPEND RUNTIME_SOURCES src/runtime/icd.def)
+endif()
+
+add_library(oclgrind-rt-icd SHARED ${RUNTIME_SOURCES})
+set_target_properties(oclgrind-rt-icd PROPERTIES COMPILE_FLAGS -DOCLGRIND_ICD)
+target_link_libraries(oclgrind-rt-icd ${CMAKE_DL_LIBS} oclgrind)
+
+# Add runtime exports on Windows
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ list(APPEND RUNTIME_SOURCES src/runtime/runtime.def)
+endif()
+
+add_library(oclgrind-rt SHARED ${RUNTIME_SOURCES})
+target_link_libraries(oclgrind-rt ${CMAKE_DL_LIBS} oclgrind)
+
+add_executable(oclgrind-kernel
+ src/kernel/oclgrind-kernel.cpp
+ src/kernel/Simulation.h
+ src/kernel/Simulation.cpp)
+target_link_libraries(oclgrind-kernel oclgrind)
+
+set(CLC_HEADERS
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc32.pch
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc64.pch
+)
+
+add_custom_target(CLC_HEADERS ALL DEPENDS ${CLC_HEADERS})
+
+add_custom_command(
+ OUTPUT include/oclgrind/clc.h
+ POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E
+ copy ${CMAKE_SOURCE_DIR}/src/core/clc.h include/oclgrind/clc.h
+ DEPENDS src/core/clc.h)
+
+# Generate precompiled headers for clc.h
+add_custom_command(
+ OUTPUT include/oclgrind/clc32.pch
+ POST_BUILD
+ COMMAND
+ ${CLANG}
+ -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+ -emit-pch -triple spir-unknown-unknown
+ -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+ -o include/oclgrind/clc32.pch
+ DEPENDS include/oclgrind/clc.h
+)
+add_custom_command(
+ OUTPUT include/oclgrind/clc64.pch
+ POST_BUILD
+ COMMAND
+ ${CLANG}
+ -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+ -emit-pch -triple spir64-unknown-unknown
+ -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+ -o include/oclgrind/clc64.pch
+ DEPENDS include/oclgrind/clc.h
+)
+
+
+# Generate config.h
+configure_file("cmake_config.h.in" "config.h")
+
+
+# Install oclgrind script if not on Windows
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ file(READ src/runtime/oclgrind OCLGRIND_SCRIPT)
+ string(REGEX REPLACE
+ "__VERSION__" "${Oclgrind_VERSION_MAJOR}.${Oclgrind_VERSION_MINOR}"
+ OCLGRIND_SCRIPT "${OCLGRIND_SCRIPT}")
+ file(WRITE ${CMAKE_BINARY_DIR}/oclgrind "${OCLGRIND_SCRIPT}")
+
+ # Generate ICD loader
+ get_property(OCLGRIND_RT_FILENAME TARGET oclgrind-rt-icd PROPERTY LOCATION)
+ file(WRITE ${CMAKE_BINARY_DIR}/oclgrind.icd "${OCLGRIND_RT_FILENAME}\n")
+
+ install(PROGRAMS
+ ${CMAKE_BINARY_DIR}/oclgrind
+ DESTINATION bin)
+endif()
+
+install(TARGETS
+ oclgrind-kernel
+ DESTINATION bin)
+install(TARGETS
+ oclgrind oclgrind-rt oclgrind-rt-icd
+ DESTINATION lib)
+install(FILES
+ ${CORE_HEADERS} ${CMAKE_BINARY_DIR}/config.h ${CLC_HEADERS} LICENSE
+ DESTINATION include/oclgrind)
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ install(FILES
+ src/CL/cl.h
+ src/CL/cl_d3d10.h
+ src/CL/cl_d3d11.h
+ src/CL/cl_dx9_media_sharing.h
+ src/CL/cl_egl.h
+ src/CL/cl_ext.h
+ src/CL/cl_gl.h
+ src/CL/cl_gl_ext.h
+ src/CL/cl_platform.h
+ src/CL/opencl.h
+ DESTINATION include/CL)
+endif()
+
+
+# Tests
+enable_testing()
+
+# Check for Python
+find_package(PythonInterp)
+if (PYTHONINTERP_FOUND)
+
+ # Add kernel tests
+ file(READ tests/kernels/TESTS KERNEL_TESTS)
+ string(REPLACE "\n" ";" KERNEL_TESTS ${KERNEL_TESTS})
+ foreach(test ${KERNEL_TESTS})
+ add_test(
+ NAME ${test}
+ COMMAND
+ ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/kernels/run_kernel_test.py
+ $<TARGET_FILE:oclgrind-kernel>
+ ${CMAKE_SOURCE_DIR}/tests/kernels/${test}.sim)
+ endforeach(${test})
+
+ # Set PCH directory
+ set_tests_properties(${KERNEL_TESTS} PROPERTIES
+ ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+
+ # Expected failures
+ set_tests_properties(
+ atomics/atomic_intergroup_race
+ data-race/intragroup_hidden_race
+ PROPERTIES WILL_FAIL TRUE)
+
+else()
+ message(WARNING "Kernel tests will not be run (Python required)")
+endif()
+
+# Add app tests
+add_subdirectory(tests/apps)
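
As a quick reference, an out-of-tree configure/build/test cycle against the
CMakeLists.txt above might look like the sketch below (the LLVM install path
and job count are illustrative; ctest is available because the file calls
enable_testing()):

    mkdir build && cd build
    # LLVM_DIR must point at the directory containing LLVMConfig.cmake (LLVM >= 3.6)
    cmake .. -DLLVM_DIR=/path/to/llvm/share/llvm/cmake \
             -DCMAKE_BUILD_TYPE=RelWithDebInfo
    make -j4
    ctest --output-on-failure    # runs the kernel and app tests registered above
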
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f91a2f2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+University of Bristol. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..8fcd00f
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,147 @@
+# Makefile.am (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+AUTOMAKE_OPTIONS = subdir-objects
+ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4
+
+AM_CFLAGS = -std=c99
+AM_CPPFLAGS = -I$(top_srcdir)/src/ -Wall
+
+# Suppress warnings from OpenCL runtime API headers
+if USING_CLANG
+AM_CPPFLAGS += -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability
+endif USING_CLANG
+
+lib_LTLIBRARIES = liboclgrind.la liboclgrind-rt.la liboclgrind-rt-icd.la
+
+LLVM_LIBS = `$(llvm_config) --system-libs --libs bitreader bitwriter \
+ core instrumentation ipo irreader linker mcparser objcarcopts option`
+
+liboclgrind_la_SOURCES = src/core/common.h src/core/common.cpp \
+ src/core/Context.h src/core/Context.cpp src/core/half.h \
+ src/core/Kernel.h src/core/Kernel.cpp src/core/KernelInvocation.h \
+ src/core/KernelInvocation.cpp src/core/Memory.h src/core/Memory.cpp \
+ src/core/Plugin.h src/core/Plugin.cpp src/core/Program.h \
+ src/core/Program.cpp src/core/Queue.h src/core/Queue.cpp \
+ src/core/WorkItem.h src/core/WorkItem.cpp \
+ src/core/WorkItemBuiltins.cpp src/core/WorkGroup.h \
+ src/core/WorkGroup.cpp src/plugins/InstructionCounter.h \
+ src/plugins/InstructionCounter.cpp src/plugins/InteractiveDebugger.h \
+ src/plugins/InteractiveDebugger.cpp src/plugins/Logger.h \
+ src/plugins/Logger.cpp src/plugins/MemCheck.h \
+ src/plugins/MemCheck.cpp src/plugins/RaceDetector.h \
+ src/plugins/RaceDetector.cpp
+nodist_liboclgrind_la_SOURCES = src/core/clc_h.cpp config.h
+liboclgrind_la_LDFLAGS = -lclangFrontend -lclangDriver \
+-lclangSerialization -lclangCodeGen -lclangParse -lclangSema \
+-lclangAnalysis -lclangEdit -lclangAST -lclangLex -lclangBasic \
+${LLVM_LIBS} $(oclgrind_extra_libs) -shared
+oclgrind_includedir = $(includedir)/oclgrind
+oclgrind_include_HEADERS = src/core/common.h src/core/Context.h \
+ src/core/half.h src/core/Kernel.h src/core/KernelInvocation.h \
+ src/core/Memory.h src/core/Plugin.h src/core/Program.h \
+ src/core/Queue.h src/core/WorkItem.h src/core/WorkGroup.h config.h LICENSE
+src/core/clc_h.cpp: src/core/gen_clc_h.sh src/core/clc.h
+ $(top_srcdir)/src/core/gen_clc_h.sh $(top_srcdir)/src/core/clc.h $@
+
+install-data-hook:
+ cp -p src/include/oclgrind/clc.h $(DESTDIR)$(includedir)/oclgrind/
+ cp -p src/include/oclgrind/clc32.pch $(DESTDIR)$(includedir)/oclgrind/
+ cp -p src/include/oclgrind/clc64.pch $(DESTDIR)$(includedir)/oclgrind/
+
+uninstall-hook:
+ rm -rf $(DESTDIR)$(includedir)/oclgrind/clc.h
+ rm -rf $(DESTDIR)$(includedir)/oclgrind/clc32.pch
+ rm -rf $(DESTDIR)$(includedir)/oclgrind/clc64.pch
+
+RUNTIME_SOURCES = src/runtime/async_queue.h \
+ src/runtime/async_queue.cpp src/runtime/icd.h src/runtime/runtime.cpp
+
+liboclgrind_rt_la_SOURCES = $(RUNTIME_SOURCES)
+liboclgrind_rt_la_LIBADD = liboclgrind.la
+liboclgrind_rt_la_LDFLAGS = -shared
+
+liboclgrind_rt_icd_la_CPPFLAGS = -DOCLGRIND_ICD $(AM_CPPFLAGS)
+liboclgrind_rt_icd_la_SOURCES = $(RUNTIME_SOURCES)
+liboclgrind_rt_icd_la_LIBADD = liboclgrind.la
+liboclgrind_rt_icd_la_LDFLAGS = -shared
+
+bin_PROGRAMS = oclgrind-kernel
+oclgrind_kernel_SOURCES = src/kernel/oclgrind-kernel.cpp \
+ src/kernel/Simulation.h src/kernel/Simulation.cpp
+oclgrind_kernel_LDADD = liboclgrind.la
+
+bin_SCRIPTS = oclgrind
+oclgrind: $(top_srcdir)/src/runtime/oclgrind
+ cat $(top_srcdir)/src/runtime/oclgrind \
+ | $(SED) 's|__VERSION__|'$(VERSION)'|g' \
+ >$@
+noinst_SCRIPTS = oclgrind.icd \
+ src/include/oclgrind/clc.h \
+ src/include/oclgrind/clc32.pch \
+ src/include/oclgrind/clc64.pch
+oclgrind.icd: liboclgrind-rt-icd.la
+ printf $(libdir)/ >$@
+ $(GREP) dlname $< | $(AWK) -F "'" '{print $$2}' >>$@
+
+src/include/oclgrind/clc.h: $(top_srcdir)/src/core/clc.h
+ mkdir -p src/include/oclgrind
+ cp $< $@
+
+src/include/oclgrind/clc32.pch: src/include/oclgrind/clc.h
+ $(clang) \
+ -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+ -emit-pch -triple spir-unknown-unknown \
+ -relocatable-pch \
+ -isysroot $(abs_builddir)/src/include/oclgrind \
+ $< -o $@
+src/include/oclgrind/clc64.pch: src/include/oclgrind/clc.h
+ $(clang) \
+ -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+ -emit-pch -triple spir64-unknown-unknown \
+ -relocatable-pch \
+ -isysroot $(abs_builddir)/src/include/oclgrind \
+ $< -o $@
+
+check_PROGRAMS = tests/apps/vecadd/vecadd
+tests_apps_vecadd_vecadd_LDADD = liboclgrind-rt.la
+TESTS = $(check_PROGRAMS)
+
+TEST_EXTENSIONS = .sim
+SIM_LOG_COMPILER = $(PYTHON) \
+ $(top_srcdir)/tests/kernels/run_kernel_test.py \
+ ${abs_top_builddir}/oclgrind-kernel
+AM_TESTS_ENVIRONMENT = \
+ export AM_TESTS=1; \
+ export OCLGRIND_PCH_DIR=$(abs_builddir)/src/include/oclgrind;
+
+if HAVE_PYTHON
+TESTS += $(KERNEL_TESTS)
+XFAIL_TESTS = \
+ tests/kernels/atomics/atomic_intergroup_race.sim \
+ tests/kernels/data-race/intragroup_hidden_race.sim
+else
+check-local:
+ @echo
+ @echo "WARNING: Kernel tests skipped (Python required)."
+ @echo
+endif
+
+EXTRA_DIST = NEWS src/core/gen_clc_h.sh src/core/clc.h \
+ src/runtime/oclgrind src/CL/cl.h src/CL/cl_gl.h src/CL/cl_platform.h \
+ src/CL/cl_ext.h src/CL/cl_gl_ext.h src/CL/cl_egl.h src/CL/cl_d3d10.h \
+ src/CL/cl_d3d11.h src/CL/cl_dx9_media_sharing.h src/CL/opencl.h \
+ CMakeLists.txt tests/apps/CMakeLists.txt cmake_config.h.in \
+ src/core/gen_clc_h.cmake src/runtime/icd.def src/runtime/runtime.def \
+ src/install/INSTALL.darwin src/install/INSTALL.linux \
+ src/install/INSTALL.windows src/install/install.bat \
+ src/install/uninstall.bat src/install/oclgrind-icd.reg \
+ tests/kernels/run_kernel_test.py tests/kernels/TESTS \
+ $(KERNEL_TEST_INPUTS)
+CLEANFILES = src/core/clc_h.cpp $(bin_SCRIPTS) $(noinst_SCRIPTS) \
+ $(KERNEL_TEST_OUTPUTS)
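
With the automake test harness above (TEST_EXTENSIONS/SIM_LOG_COMPILER),
individual kernel tests can in principle be selected by overriding TESTS on
the make command line; a minimal sketch, using a test name taken from the
diffstat:

    make check TESTS='tests/kernels/misc/vecadd.sim'
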
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..16766ab
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,58 @@
+For more information, please visit the Oclgrind Wiki:
+https://github.com/jrprice/Oclgrind/wiki
+
+Oclgrind 15.5
+=============
+This release updates to LLVM 3.6, which improves the OpenCL C compiler
+and provides some additional performance enhancements. See README for
+revised instructions on how to build Oclgrind from source.
+
+- Fixed race conditions in atomic operations
+- Interactive debugger breaks on Ctrl+C
+- Various other minor bug fixes
+
+
+Oclgrind 15.2
+=============
+This release significantly improves simulation performance, and fixes
+several bugs impacting on usage and stability.
+
+- Added detection for violations of read-only/write-only attributes
+- Added --build-options argument to append additional compiler flags
+- Added hostMemoryLoad and hostMemoryStore callbacks
+- Added workGroupBegin and workItemBegin callbacks
+- Split atomic callbacks into separate load and store
+- Multi-threaded simulation to improve performance
+- Various other performance improvements
+- Several general bug fixes and stability improvements
+
+
+Oclgrind 14.12
+==============
+This release incorporates a new plugin system, to allow third-party
+developers to build tools that utilise Oclgrind. More information can
+be found on the Wiki:
+https://github.com/jrprice/Oclgrind/wiki/Creating-Plugins
+
+In addition, this release contains the following changes:
+- Interactive debugger now has a command history
+- Detection for unaligned memory accesses
+- Limit the number of error messages printed to avoid flooding output
+- Various other bug fixes and improvements
+
+
+Oclgrind 14.5
+=============
+Initial release (beta).
+
+Implements a SPIR 1.2 interpreter which can be targeted either via an
+OpenCL 1.2 runtime API implementation or using a standalone kernel
+interface.
+
+Provides the following utilities:
+- Memory access error detection
+- Work-group divergence detection (barriers, async-copies)
+- Data-race detection (--data-races)
+- Simple interactive debugger (--interactive)
+- Instruction histograms (--inst-counts)
+- OpenCL runtime API error reporting (--check-api)
diff --git a/README b/README
new file mode 100644
index 0000000..6a65f57
--- /dev/null
+++ b/README
@@ -0,0 +1,138 @@
+========
+Oclgrind
+========
+
+About
+-----
+This project implements a virtual OpenCL device simulator, including
+an OpenCL runtime with ICD support. The goal is to provide a platform
+for creating tools to aid OpenCL development. In particular, this
+project currently implements utilities for debugging memory access
+errors, detecting data-races and barrier divergence, collecting
+instruction histograms, and interactive OpenCL kernel debugging.
+The simulator is built on an interpreter for LLVM IR. This project is
+being developed by James Price and Simon McIntosh-Smith at the
+University of Bristol.
+
+Binary releases can be found on the GitHub releases page:
+
+ https://github.com/jrprice/Oclgrind/releases
+
+
+Building
+--------
+To build this project, you will require the LLVM and Clang 3.6
+development libraries and headers. With some modifications, it may
+also be possible to use other (recent) versions of LLVM. If building
+LLVM from source, it is recommended to enable optimizations to improve
+the performance of Oclgrind (configure with --enable-optimized, or set
+CMAKE_BUILD_TYPE to RelWithDebInfo).
+
+You will also need to use a compiler that supports C++11.
+
+
+Building on Linux and OS X
+--------------------------
+If you are building directly from the GitHub repository, you need to
+run 'autoreconf -i' to generate the necessary build files. This is not
+required if you are using a released source package.
+
+Run ./configure to generate the Makefile, optionally using
+--prefix=PATH to specify the target installation directory. If you
+don't have the LLVM/Clang includes and libraries on your search path,
+you can specify the location of your LLVM installation using the
+--with-llvm=PATH option. For example:
+
+./configure --prefix=$PWD/build/ --with-llvm=PATH/TO/LLVM/INSTALL
+
+This path should be the directory in which LLVM is installed (e.g. the
+path specified to --prefix or CMAKE_INSTALL_PATH when LLVM was built).
+
+Next, build and install with make:
+
+make
+make check
+make install
+
+If installing to a non-default location, you should add the bin/
+directory to the PATH environment variable in order to make use of the
+oclgrind command. If you wish to use Oclgrind via the OpenCL ICD
+(optional), then you should create an ICD loading point by copying the
+oclgrind.icd file from the build directory to /etc/OpenCL/vendors/.
+
+Building on Windows
+-------------------
+A CMake build system is provided for building Oclgrind on Windows. At
+present, this only works with Visual Studio 2013 (or newer), and
+Windows 7.
+
+When configuring the CMake build, you may be prompted to supply a
+value for the LLVM_DIR parameter. This should be set to the directory
+containing your LLVM installations's LLVMConfig.cmake file, (for
+example C:\Program Files\LLVM\share\llvm\cmake\).
+
+If you wish to use Oclgrind via the OpenCL ICD (optional), then you
+should also create an ICD loading point. To do this, you should add a
+REG_DWORD value to the Windows Registry under one or both of the
+registry keys below, with the name set to the absolute path of the
+oclgrind-rt-icd.dll library and the value set to 0.
+
+Key for 32-bit machines or 64-bit apps on a 64-bit machine:
+HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors
+
+Key for 32-bit apps on a 64-bit machine:
+HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Khronos\OpenCL\Vendors
+
+
+Usage
+-----
+The recommended method of running an application with Oclgrind is to
+use the oclgrind command, for example:
+
+oclgrind ./application
+
+This command will make it such that the only OpenCL platform and device
+available to your application is Oclgrind. If you need more control
+over platform selection then installing an ICD loading point for
+Oclgrind will cause it to appear when an application calls
+clGetPlatformIDs(), alongside any other OpenCL platforms installed on
+your system.
+
+If it encounters any invalid memory accesses, Oclgrind will
+report the details to stderr, for example:
+
+> Invalid write of size 4 at global memory address 0x1000000000040
+> Kernel: vecadd
+> Entity: Global(16,0,0) Local(0,0,0) Group(16,0,0)
+> store i32 %tmp9, i32 addrspace(1)* %tmp15, align 4
+> At line 4 of input.cl
+> c[i] = a[i] + b[i]
+
+Since it is interpreting an abstract intermediate representation and
+bounds-checking each memory access, Oclgrind will run quite slowly
+(typically a couple of orders of magnitude slower than a regular CPU
+implementation). Therefore, it is recommended to run your application
+with a small problem if possible.
+
+To enable an interactive, GDB-style debugging session, supply the -i
+flag to the oclgrind command, or export the environment variable
+OCLGRIND_INTERACTIVE=1. This will cause Oclgrind to automatically
+break at the beginning of each kernel invocation, and upon
+encountering an invalid memory access. Type 'help' for details of
+available commands.
+
+For more detailed information about using Oclgrind please visit the
+GitHub Wiki:
+
+ https://github.com/jrprice/Oclgrind/wiki/
+
+
+Contact
+-------
+If you encounter any issues or have any questions, please use the
+GitHub issues page:
+
+ https://github.com/jrprice/Oclgrind/issues
+
+You can also contact the primary developer via email:
+James Price <j.price@bristol.ac.uk>
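
Putting the Linux/OS X build and usage instructions above together, a complete
session might look like the following sketch (the install prefix and LLVM path
are illustrative):

    autoreconf -i                              # only needed when building from git
    ./configure --prefix=$PWD/build --with-llvm=/path/to/llvm/install
    make && make check && make install
    export PATH=$PWD/build/bin:$PATH
    sudo cp oclgrind.icd /etc/OpenCL/vendors/  # optional: register the ICD loading point
    oclgrind ./application                     # run an application on the simulated device
    oclgrind -i ./application                  # same, with the interactive debugger
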
diff --git a/cmake_config.h.in b/cmake_config.h.in
new file mode 100644
index 0000000..3794dc8
--- /dev/null
+++ b/cmake_config.h.in
@@ -0,0 +1,5 @@
+#define PACKAGE_VERSION "@Oclgrind_VERSION_MAJOR@.@Oclgrind_VERSION_MINOR@"
+
+#define HAVE_READLINE @HAVE_READLINE@
+
+#define LLVM_VERSION @LLVM_VERSION@
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..4b4c793
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,134 @@
+# configure.ac (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+AC_INIT([Oclgrind], [15.5], , [oclgrind], [https://github.com/jrprice/Oclgrind])
+AC_PREREQ([2.63])
+AC_CONFIG_SRCDIR([src/])
+AM_INIT_AUTOMAKE([foreign 1.12])
+AC_LANG(C++)
+AC_PROG_CXX
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([Makefile])
+
+LT_INIT
+
+# Check if we're compiling with Clang
+AS_CASE([`$CC --version`], [*clang*], [using_clang=yes])
+AM_CONDITIONAL([USING_CLANG], [test "$using_clang" == "yes"])
+
+oclgrind_extra_libs=
+
+# Check for C++11
+AX_CHECK_COMPILE_FLAG([-std=c++11], [],
+ [AC_MSG_ERROR([C++11 support is required])])
+CXXFLAGS="$CXXFLAGS -std=c++11"
+CPPFLAGS="$CPPFLAGS -std=c++11"
+
+# --with-llvm option to specify root of LLVM/Clang installation
+AC_ARG_WITH(
+ llvm,
+ [AS_HELP_STRING([--with-llvm],
+ [directory containing LLVM/Clang installation])],
+ [AC_SUBST(clang, $withval/bin/clang)
+ AC_SUBST(llvm_config, $withval/bin/llvm-config)])
+
+# Find LLVM/Clang binaries (assume on PATH if --with-llvm not used)
+AC_CHECK_PROG(clang, [clang], `which clang`)
+AC_CHECK_PROG(llvm_config, [llvm-config], `which llvm-config`)
+if test -z $llvm_config; then
+ AC_MSG_ERROR([llvm-config not found (use --with-llvm=)])
+fi
+
+# Check version of LLVM
+AC_MSG_CHECKING([llvm version])
+llvm_full_version=`$llvm_config --version`
+llvm_version=`echo $llvm_full_version | cut -b 1,3`
+AC_MSG_RESULT($llvm_full_version)
+if test $llvm_version -lt 36; then
+ AC_MSG_ERROR([LLVM version must be >= 3.6])
+fi
+AC_DEFINE_UNQUOTED([LLVM_VERSION],
+ [$llvm_version],
+ [Version of LLVM we are building against])
+
+# Add flags for LLVM
+CPPFLAGS="$CPPFLAGS `$llvm_config --cppflags`"
+LDFLAGS="$LDFLAGS `$llvm_config --ldflags`"
+
+# Check for LLVM/Clang headers/libraries
+AC_CHECK_HEADERS(
+ [llvm/IR/Instruction.h clang/CodeGen/CodeGenAction.h],
+ [:],
+ [AC_MSG_ERROR([LLVM/Clang includes not found (use --with-llvm=)])])
+
+AC_CHECK_LIB(
+ [clangFrontend],
+ [main],
+ [:],
+ [AC_MSG_ERROR([Clang library not found (use --with-llvm)])])
+
+# GNU readline library (for interactive debugger)
+AC_ARG_WITH(
+ [readline],
+ AS_HELP_STRING([--with-readline],
+ [location of GNU readline library]),
+ [CPPFLAGS="$CPPFLAGS -I$withval/include";
+ LDFLAGS="$LDFLAGS -L$withval/lib"])
+
+have_readline=true
+AC_CHECK_HEADER(
+ [readline/readline.h],
+ [:],
+ [have_readline=false])
+AC_CHECK_HEADER(
+ [readline/history.h],
+ [:],
+ [have_readline=false])
+AC_CHECK_LIB(
+ [readline],
+ [readline],
+ [:],
+ [have_readline=false])
+AC_CHECK_LIB(
+ [readline],
+ [add_history],
+ [:],
+ [have_readline=false])
+if test $have_readline = true; then
+ AC_DEFINE([HAVE_READLINE], [1], [Define to 1 if GNU readline found])
+ oclgrind_extra_libs="$oclgrind_extra_libs -lreadline"
+else
+ AC_MSG_WARN([GNU readline library not found (use --with-readline)])
+fi
+
+
+AC_SUBST([oclgrind_extra_libs], [$oclgrind_extra_libs])
+
+
+# Check if Python is available (required to run tests)
+AM_PATH_PYTHON(,,[:])
+AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
+
+# Kernel tests
+KERNEL_TESTS=""
+KERNEL_TEST_INPUTS=""
+KERNEL_TEST_OUTPUTS=""
+m4_foreach([name], m4_split(m4_include(tests/kernels/TESTS), m4_newline),
+[
+ KERNEL_TESTS="$KERNEL_TESTS tests/kernels/"name".sim"
+ KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".sim"
+ KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".cl"
+ KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".ref"
+ KERNEL_TEST_OUTPUTS="$KERNEL_TEST_OUTPUTS tests/kernels/"name".out"
+])
+AC_SUBST(KERNEL_TESTS, $KERNEL_TESTS)
+AC_SUBST(KERNEL_TEST_INPUTS, $KERNEL_TEST_INPUTS)
+AC_SUBST(KERNEL_TEST_OUTPUTS, $KERNEL_TEST_OUTPUTS)
+
+AC_OUTPUT
diff --git a/m4/m4_ax_check_compile_flag.m4 b/m4/m4_ax_check_compile_flag.m4
new file mode 100644
index 0000000..ca36397
--- /dev/null
+++ b/m4/m4_ax_check_compile_flag.m4
@@ -0,0 +1,74 @@
+# ===========================================================================
+# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT])
+#
+# DESCRIPTION
+#
+# Check whether the given FLAG works with the current language's compiler
+# or gives an error. (Warnings, however, are ignored)
+#
+# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+# success/failure.
+#
+# If EXTRA-FLAGS is defined, it is added to the current language's default
+# flags (e.g. CFLAGS) when the check is done. The check is thus made with
+# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
+# force the compiler to issue an error when a bad flag is given.
+#
+# INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#
+# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 4
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+ _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+ AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+ [AS_VAR_SET(CACHEVAR,[yes])],
+ [AS_VAR_SET(CACHEVAR,[no])])
+ _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_VAR_IF(CACHEVAR,yes,
+ [m4_default([$2], :)],
+ [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl
+])dnl AX_CHECK_COMPILE_FLAGS
diff --git a/src/CL/cl.h b/src/CL/cl.h
new file mode 100644
index 0000000..203c659
--- /dev/null
+++ b/src/CL/cl.h
@@ -0,0 +1,1214 @@
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id * cl_platform_id;
+typedef struct _cl_device_id * cl_device_id;
+typedef struct _cl_context * cl_context;
+typedef struct _cl_command_queue * cl_command_queue;
+typedef struct _cl_mem * cl_mem;
+typedef struct _cl_program * cl_program;
+typedef struct _cl_kernel * cl_kernel;
+typedef struct _cl_event * cl_event;
+typedef struct _cl_sampler * cl_sampler;
+
+typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong cl_bitfield;
+typedef cl_bitfield cl_device_type;
+typedef cl_uint cl_platform_info;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_fp_config;
+typedef cl_uint cl_device_mem_cache_type;
+typedef cl_uint cl_device_local_mem_type;
+typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_command_queue_properties;
+typedef intptr_t cl_device_partition_property;
+typedef cl_bitfield cl_device_affinity_domain;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_command_queue_info;
+typedef cl_uint cl_channel_order;
+typedef cl_uint cl_channel_type;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_mem_object_type;
+typedef cl_uint cl_mem_info;
+typedef cl_bitfield cl_mem_migration_flags;
+typedef cl_uint cl_image_info;
+typedef cl_uint cl_buffer_create_type;
+typedef cl_uint cl_addressing_mode;
+typedef cl_uint cl_filter_mode;
+typedef cl_uint cl_sampler_info;
+typedef cl_bitfield cl_map_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_uint cl_program_binary_type;
+typedef cl_int cl_build_status;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_arg_info;
+typedef cl_uint cl_kernel_arg_address_qualifier;
+typedef cl_uint cl_kernel_arg_access_qualifier;
+typedef cl_bitfield cl_kernel_arg_type_qualifier;
+typedef cl_uint cl_kernel_work_group_info;
+typedef cl_uint cl_event_info;
+typedef cl_uint cl_command_type;
+typedef cl_uint cl_profiling_info;
+
+
+typedef struct _cl_image_format {
+ cl_channel_order image_channel_order;
+ cl_channel_type image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+ cl_mem_object_type image_type;
+ size_t image_width;
+ size_t image_height;
+ size_t image_depth;
+ size_t image_array_size;
+ size_t image_row_pitch;
+ size_t image_slice_pitch;
+ cl_uint num_mip_levels;
+ cl_uint num_samples;
+ cl_mem buffer;
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+ size_t origin;
+ size_t size;
+} cl_buffer_region;
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS 0
+#define CL_DEVICE_NOT_FOUND -1
+#define CL_DEVICE_NOT_AVAILABLE -2
+#define CL_COMPILER_NOT_AVAILABLE -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
+#define CL_OUT_OF_RESOURCES -5
+#define CL_OUT_OF_HOST_MEMORY -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE -7
+#define CL_MEM_COPY_OVERLAP -8
+#define CL_IMAGE_FORMAT_MISMATCH -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
+#define CL_BUILD_PROGRAM_FAILURE -11
+#define CL_MAP_FAILURE -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE -15
+#define CL_LINKER_NOT_AVAILABLE -16
+#define CL_LINK_PROGRAM_FAILURE -17
+#define CL_DEVICE_PARTITION_FAILED -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
+
+#define CL_INVALID_VALUE -30
+#define CL_INVALID_DEVICE_TYPE -31
+#define CL_INVALID_PLATFORM -32
+#define CL_INVALID_DEVICE -33
+#define CL_INVALID_CONTEXT -34
+#define CL_INVALID_QUEUE_PROPERTIES -35
+#define CL_INVALID_COMMAND_QUEUE -36
+#define CL_INVALID_HOST_PTR -37
+#define CL_INVALID_MEM_OBJECT -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
+#define CL_INVALID_IMAGE_SIZE -40
+#define CL_INVALID_SAMPLER -41
+#define CL_INVALID_BINARY -42
+#define CL_INVALID_BUILD_OPTIONS -43
+#define CL_INVALID_PROGRAM -44
+#define CL_INVALID_PROGRAM_EXECUTABLE -45
+#define CL_INVALID_KERNEL_NAME -46
+#define CL_INVALID_KERNEL_DEFINITION -47
+#define CL_INVALID_KERNEL -48
+#define CL_INVALID_ARG_INDEX -49
+#define CL_INVALID_ARG_VALUE -50
+#define CL_INVALID_ARG_SIZE -51
+#define CL_INVALID_KERNEL_ARGS -52
+#define CL_INVALID_WORK_DIMENSION -53
+#define CL_INVALID_WORK_GROUP_SIZE -54
+#define CL_INVALID_WORK_ITEM_SIZE -55
+#define CL_INVALID_GLOBAL_OFFSET -56
+#define CL_INVALID_EVENT_WAIT_LIST -57
+#define CL_INVALID_EVENT -58
+#define CL_INVALID_OPERATION -59
+#define CL_INVALID_GL_OBJECT -60
+#define CL_INVALID_BUFFER_SIZE -61
+#define CL_INVALID_MIP_LEVEL -62
+#define CL_INVALID_GLOBAL_WORK_SIZE -63
+#define CL_INVALID_PROPERTY -64
+#define CL_INVALID_IMAGE_DESCRIPTOR -65
+#define CL_INVALID_COMPILER_OPTIONS -66
+#define CL_INVALID_LINKER_OPTIONS -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT -68
+
+/* OpenCL Version */
+#define CL_VERSION_1_0 1
+#define CL_VERSION_1_1 1
+#define CL_VERSION_1_2 1
+
+/* cl_bool */
+#define CL_FALSE 0
+#define CL_TRUE 1
+#define CL_BLOCKING CL_TRUE
+#define CL_NON_BLOCKING CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE 0x0900
+#define CL_PLATFORM_VERSION 0x0901
+#define CL_PLATFORM_NAME 0x0902
+#define CL_PLATFORM_VENDOR 0x0903
+#define CL_PLATFORM_EXTENSIONS 0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
+#define CL_DEVICE_TYPE_CPU (1 << 1)
+#define CL_DEVICE_TYPE_GPU (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
+#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+#define CL_DEVICE_LINKER_AVAILABLE 0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
+#define CL_DEVICE_PARENT_DEVICE 0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
+#define CL_DEVICE_PARTITION_TYPE 0x1046
+#define CL_DEVICE_REFERENCE_COUNT 0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM (1 << 0)
+#define CL_FP_INF_NAN (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST (1 << 2)
+#define CL_FP_ROUND_TO_ZERO (1 << 3)
+#define CL_FP_ROUND_TO_INF (1 << 4)
+#define CL_FP_FMA (1 << 5)
+#define CL_FP_SOFT_FLOAT (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE 0x0
+#define CL_READ_ONLY_CACHE 0x1
+#define CL_READ_WRITE_CACHE 0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL 0x1
+#define CL_GLOBAL 0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT 0x1080
+#define CL_CONTEXT_DEVICES 0x1081
+#define CL_CONTEXT_PROPERTIES 0x1082
+#define CL_CONTEXT_NUM_DEVICES 0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM 0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
+
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
+
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT 0x1090
+#define CL_QUEUE_DEVICE 0x1091
+#define CL_QUEUE_REFERENCE_COUNT 0x1092
+#define CL_QUEUE_PROPERTIES 0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE (1 << 0)
+#define CL_MEM_WRITE_ONLY (1 << 1)
+#define CL_MEM_READ_ONLY (1 << 2)
+#define CL_MEM_USE_HOST_PTR (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
+#define CL_MEM_COPY_HOST_PTR (1 << 5)
+// reserved (1 << 6)
+#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
+#define CL_MEM_HOST_READ_ONLY (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+
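For reference, a minimal host-side sketch (not part of the imported header; the helper name and variables are illustrative only) of how these cl_mem_flags bits are typically OR-ed together when creating a buffer:

    #include <CL/cl.h>

    /* Illustrative helper: device-read-only buffer initialised from host memory. */
    static cl_mem make_input_buffer(cl_context ctx, const float *host_data,
                                    size_t n, cl_int *err)
    {
        return clCreateBuffer(ctx,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              n * sizeof(float),
                              (void *)host_data,
                              err);
    }
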
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
+
+/* cl_channel_order */
+#define CL_R 0x10B0
+#define CL_A 0x10B1
+#define CL_RG 0x10B2
+#define CL_RA 0x10B3
+#define CL_RGB 0x10B4
+#define CL_RGBA 0x10B5
+#define CL_BGRA 0x10B6
+#define CL_ARGB 0x10B7
+#define CL_INTENSITY 0x10B8
+#define CL_LUMINANCE 0x10B9
+#define CL_Rx 0x10BA
+#define CL_RGx 0x10BB
+#define CL_RGBx 0x10BC
+#define CL_DEPTH 0x10BD
+#define CL_DEPTH_STENCIL 0x10BE
+
+/* cl_channel_type */
+#define CL_SNORM_INT8 0x10D0
+#define CL_SNORM_INT16 0x10D1
+#define CL_UNORM_INT8 0x10D2
+#define CL_UNORM_INT16 0x10D3
+#define CL_UNORM_SHORT_565 0x10D4
+#define CL_UNORM_SHORT_555 0x10D5
+#define CL_UNORM_INT_101010 0x10D6
+#define CL_SIGNED_INT8 0x10D7
+#define CL_SIGNED_INT16 0x10D8
+#define CL_SIGNED_INT32 0x10D9
+#define CL_UNSIGNED_INT8 0x10DA
+#define CL_UNSIGNED_INT16 0x10DB
+#define CL_UNSIGNED_INT32 0x10DC
+#define CL_HALF_FLOAT 0x10DD
+#define CL_FLOAT 0x10DE
+#define CL_UNORM_INT24 0x10DF
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+
+/* cl_mem_info */
+#define CL_MEM_TYPE 0x1100
+#define CL_MEM_FLAGS 0x1101
+#define CL_MEM_SIZE 0x1102
+#define CL_MEM_HOST_PTR 0x1103
+#define CL_MEM_MAP_COUNT 0x1104
+#define CL_MEM_REFERENCE_COUNT 0x1105
+#define CL_MEM_CONTEXT 0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
+#define CL_MEM_OFFSET 0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT 0x1110
+#define CL_IMAGE_ELEMENT_SIZE 0x1111
+#define CL_IMAGE_ROW_PITCH 0x1112
+#define CL_IMAGE_SLICE_PITCH 0x1113
+#define CL_IMAGE_WIDTH 0x1114
+#define CL_IMAGE_HEIGHT 0x1115
+#define CL_IMAGE_DEPTH 0x1116
+#define CL_IMAGE_ARRAY_SIZE 0x1117
+#define CL_IMAGE_BUFFER 0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
+#define CL_IMAGE_NUM_SAMPLES 0x111A
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE 0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
+#define CL_ADDRESS_CLAMP 0x1132
+#define CL_ADDRESS_REPEAT 0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST 0x1140
+#define CL_FILTER_LINEAR 0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT 0x1150
+#define CL_SAMPLER_CONTEXT 0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
+#define CL_SAMPLER_ADDRESSING_MODE 0x1153
+#define CL_SAMPLER_FILTER_MODE 0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ (1 << 0)
+#define CL_MAP_WRITE (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT 0x1160
+#define CL_PROGRAM_CONTEXT 0x1161
+#define CL_PROGRAM_NUM_DEVICES 0x1162
+#define CL_PROGRAM_DEVICES 0x1163
+#define CL_PROGRAM_SOURCE 0x1164
+#define CL_PROGRAM_BINARY_SIZES 0x1165
+#define CL_PROGRAM_BINARIES 0x1166
+#define CL_PROGRAM_NUM_KERNELS 0x1167
+#define CL_PROGRAM_KERNEL_NAMES 0x1168
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS 0x1181
+#define CL_PROGRAM_BUILD_OPTIONS 0x1182
+#define CL_PROGRAM_BUILD_LOG 0x1183
+#define CL_PROGRAM_BINARY_TYPE 0x1184
+
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS 0
+#define CL_BUILD_NONE -1
+#define CL_BUILD_ERROR -2
+#define CL_BUILD_IN_PROGRESS -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME 0x1190
+#define CL_KERNEL_NUM_ARGS 0x1191
+#define CL_KERNEL_REFERENCE_COUNT 0x1192
+#define CL_KERNEL_CONTEXT 0x1193
+#define CL_KERNEL_PROGRAM 0x1194
+#define CL_KERNEL_ATTRIBUTES 0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
+#define CL_KERNEL_ARG_TYPE_NAME 0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
+#define CL_KERNEL_ARG_NAME 0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
+
+/* cl_kernel_arg_type_qualifier */
+#define CL_KERNEL_ARG_TYPE_NONE 0
+#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE 0x11D0
+#define CL_EVENT_COMMAND_TYPE 0x11D1
+#define CL_EVENT_REFERENCE_COUNT 0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
+#define CL_EVENT_CONTEXT 0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
+#define CL_COMMAND_TASK 0x11F1
+#define CL_COMMAND_NATIVE_KERNEL 0x11F2
+#define CL_COMMAND_READ_BUFFER 0x11F3
+#define CL_COMMAND_WRITE_BUFFER 0x11F4
+#define CL_COMMAND_COPY_BUFFER 0x11F5
+#define CL_COMMAND_READ_IMAGE 0x11F6
+#define CL_COMMAND_WRITE_IMAGE 0x11F7
+#define CL_COMMAND_COPY_IMAGE 0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
+#define CL_COMMAND_MAP_BUFFER 0x11FB
+#define CL_COMMAND_MAP_IMAGE 0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
+#define CL_COMMAND_MARKER 0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
+#define CL_COMMAND_READ_BUFFER_RECT 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
+#define CL_COMMAND_USER 0x1204
+#define CL_COMMAND_BARRIER 0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
+#define CL_COMMAND_FILL_BUFFER 0x1207
+#define CL_COMMAND_FILL_IMAGE 0x1208
+
+/* command execution status */
+#define CL_COMPLETE 0x0
+#define CL_RUNNING 0x1
+#define CL_SUBMITTED 0x2
+#define CL_QUEUED 0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT 0x1281
+#define CL_PROFILING_COMMAND_START 0x1282
+#define CL_PROFILING_COMMAND_END 0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id /* platform */,
+ cl_platform_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id /* platform */,
+ cl_device_type /* device_type */,
+ cl_uint /* num_entries */,
+ cl_device_id * /* devices */,
+ cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id /* device */,
+ cl_device_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
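As a usage note (not part of the header), string queries such as CL_DEVICE_NAME follow the usual two-call pattern: ask for the size via param_value_size_ret first, then fetch the value. A minimal sketch, assuming a valid cl_device_id; the helper name is illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include <CL/cl.h>

    /* Illustrative helper: query and print CL_DEVICE_NAME. */
    static void print_device_name(cl_device_id dev)
    {
        size_t size = 0;
        if (clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &size) != CL_SUCCESS)
            return;
        char *name = malloc(size);
        if (name && clGetDeviceInfo(dev, CL_DEVICE_NAME, size, name, NULL) == CL_SUCCESS)
            printf("Device: %s\n", name);
        free(name);
    }
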
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id /* in_device */,
+ const cl_device_partition_property * /* properties */,
+ cl_uint /* num_devices */,
+ cl_device_id * /* out_devices */,
+ cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+ cl_device_type /* device_type */,
+ void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context /* context */,
+ cl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue /* command_queue */,
+ cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ size_t /* size */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem /* buffer */,
+ cl_mem_flags /* flags */,
+ cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ const cl_image_desc * /* image_desc */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */,
+ cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem /* memobj */,
+ cl_mem_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem /* image */,
+ cl_image_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback( cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler /* sampler */,
+ cl_sampler_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context /* context */,
+ cl_uint /* count */,
+ const char ** /* strings */,
+ const size_t * /* lengths */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const size_t * /* lengths */,
+ const unsigned char ** /* binaries */,
+ cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* kernel_names */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_headers */,
+ const cl_program * /* input_headers */,
+ const char ** /* header_include_names */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_programs */,
+ const cl_program * /* input_programs */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program /* program */,
+ cl_program_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program /* program */,
+ cl_device_id /* device */,
+ cl_program_build_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program /* program */,
+ const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program /* program */,
+ cl_uint /* num_kernels */,
+ cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ size_t /* arg_size */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel /* kernel */,
+ cl_kernel_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel /* kernel */,
+ cl_uint /* arg_indx */,
+ cl_kernel_arg_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
+ cl_device_id /* device */,
+ cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event /* event */,
+ cl_event_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context /* context */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event /* event */,
+ cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event /* event */,
+ cl_int /* command_exec_callback_type */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event /* event */,
+ cl_profiling_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ size_t /* offset */,
+ size_t /* size */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ size_t /* offset */,
+ size_t /* size */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ const void * /* pattern */,
+ size_t /* pattern_size */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ size_t /* src_offset */,
+ size_t /* dst_offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */,
+ const size_t * /* dst_origin */,
+ const size_t * /* region */,
+ size_t /* src_row_pitch */,
+ size_t /* src_slice_pitch */,
+ size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* row_pitch */,
+ size_t /* slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ const void * /* fill_color */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_image */,
+ size_t /* src_offset */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */,
+ size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+ cl_mem /* memobj */,
+ void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_mem_migration_flags /* flags */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */,
+ const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
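For context, a minimal sketch (not part of the header; the helper and its parameters are illustrative) of the usual launch sequence: bind the arguments with clSetKernelArg, then enqueue an NDRange:

    #include <CL/cl.h>

    /* Illustrative helper: launch a 1-D NDRange with a single buffer argument. */
    static cl_int launch_1d(cl_command_queue queue, cl_kernel kernel,
                            cl_mem buf, size_t global_size)
    {
        cl_int err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
        if (err != CL_SUCCESS)
            return err;
        /* 1-D range, no offset, let the runtime pick the work-group size. */
        return clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                      &global_size, NULL, 0, NULL, NULL);
    }
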
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue /* command_queue */,
+ void (CL_CALLBACK * /*user_func*/)(void *),
+ void * /* args */,
+ size_t /* cb_args */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */,
+ const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function can not be found. The client must
+ * check to make sure the address is not NULL, before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+ const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2;
+
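As an illustration (not part of the header; the helper name is illustrative), the runtime lookup works as described above: pass the platform and the extension function name, then check the returned address for NULL before calling through it. The sketch resolves clIcdGetPlatformIDsKHR, whose function-pointer typedef appears in CL/cl_ext.h later in this import:

    #include <CL/cl.h>
    #include <CL/cl_ext.h>

    /* Illustrative helper: resolve an ICD extension entry point at run time. */
    static clIcdGetPlatformIDsKHR_fn get_icd_entry(cl_platform_id platform)
    {
        void *addr = clGetExtensionFunctionAddressForPlatform(
            platform, "clIcdGetPlatformIDsKHR");
        /* May legitimately be NULL if the platform does not expose it. */
        return (clIcdGetPlatformIDsKHR_fn)addr;
    }
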
+
+// Deprecated OpenCL 1.1 APIs
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_depth */,
+ size_t /* image_row_pitch */,
+ size_t /* image_slice_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue /* command_queue */,
+ cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_H */
+
diff --git a/src/CL/cl_d3d10.h b/src/CL/cl_d3d10.h
new file mode 100644
index 0000000..81b0d37
--- /dev/null
+++ b/src/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D10_DEVICE_KHR -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
+
+// cl_d3d10_device_source_khr
+#define CL_D3D10_DEVICE_KHR 0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
+
+// cl_d3d10_device_set_khr
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
+
+// cl_context_info
+#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+// cl_mem_info
+#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
+
+// cl_image_info
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_D3D10_H
+
diff --git a/src/CL/cl_d3d11.h b/src/CL/cl_d3d11.h
new file mode 100644
index 0000000..d3c8bdc
--- /dev/null
+++ b/src/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D11_DEVICE_KHR -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
+
+// cl_d3d11_device_source
+#define CL_D3D11_DEVICE_KHR 0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
+
+// cl_d3d11_device_set
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
+
+// cl_context_info
+#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+// cl_mem_info
+#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
+
+// cl_image_info
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d11_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d11_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_D3D11_H
+
diff --git a/src/CL/cl_dx9_media_sharing.h b/src/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000..1ef543a
--- /dev/null
+++ b/src/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_dx9_media_sharing */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_media_adapter_type_khr;
+typedef cl_uint cl_dx9_media_adapter_set_khr;
+
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+ IDirect3DSurface9 *resource;
+ HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
+
+// cl_media_adapter_type_khr
+#define CL_ADAPTER_D3D9_KHR 0x2020
+#define CL_ADAPTER_D3D9EX_KHR 0x2021
+#define CL_ADAPTER_DXVA_KHR 0x2022
+
+// cl_media_adapter_set_khr
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
+
+// cl_context_info
+#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
+
+// cl_mem_info
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
+
+// cl_image_info
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+ cl_platform_id platform,
+ cl_uint num_media_adapters,
+ cl_dx9_media_adapter_type_khr * media_adapter_type,
+ void * media_adapters,
+ cl_dx9_media_adapter_set_khr media_adapter_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_dx9_media_adapter_type_khr adapter_type,
+ void * surface_info,
+ cl_uint plane,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_DX9_MEDIA_SHARING_H
+
diff --git a/src/CL/cl_egl.h b/src/CL/cl_egl.h
new file mode 100644
index 0000000..c1bd4f3
--- /dev/null
+++ b/src/CL/cl_egl.h
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context /* context */,
+ CLeglDisplayKHR /* egldisplay */,
+ CLeglImageKHR /* eglimage */,
+ cl_mem_flags /* flags */,
+ const cl_egl_image_properties_khr * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+ cl_context context,
+ CLeglDisplayKHR egldisplay,
+ CLeglImageKHR eglimage,
+ cl_mem_flags flags,
+ const cl_egl_image_properties_khr * properties,
+ cl_int * errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context /* context */,
+ EGLSyncKHR /* sync */,
+ EGLDisplay /* display */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+ cl_context context,
+ EGLSyncKHR sync,
+ EGLDisplay display,
+ cl_int * errcode_ret);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/src/CL/cl_ext.h b/src/CL/cl_ext.h
new file mode 100644
index 0000000..5ab2c13
--- /dev/null
+++ b/src/CL/cl_ext.h
@@ -0,0 +1,310 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #include <AvailabilityMacros.h>
+#else
+ #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+ void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
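To make the callback mechanism concrete, here is a minimal sketch (not part of the header; helper names are illustrative). It uses the core clSetMemObjectDestructorCallback declared in cl.h above; clSetMemObjectDestructorAPPLE follows the same pattern. The buffer is assumed to have been created with CL_MEM_USE_HOST_PTR over host_ptr:

    #include <stdlib.h>
    #include <CL/cl.h>

    /* Called once the cl_mem is deleted; the host storage may now be freed. */
    static void CL_CALLBACK free_host_ptr(cl_mem memobj, void *user_data)
    {
        (void)memobj;
        free(user_data);
    }

    /* Illustrative helper: register the cleanup callback for a buffer. */
    static cl_int register_cleanup(cl_mem buf, void *host_ptr)
    {
        return clSetMemObjectDestructorCallback(buf, free_host_ptr, host_ptr);
    }
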
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
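For context, a minimal sketch (not part of the header; the helper name is illustrative) of the intended use: hand one of these helpers to clCreateContext as its pfn_notify argument. It assumes the device reports cl_APPLE_ContextLoggingFunctions:

    #include <CL/cl.h>
    #include <CL/cl_ext.h>

    /* Illustrative helper: context whose error messages are logged to stdout. */
    static cl_context create_logging_context(cl_device_id device, cl_int *err)
    {
        return clCreateContext(NULL, 1, &device,
                               clLogMessagesToStdoutAPPLE, NULL, err);
    }
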
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+ cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from a buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch.
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+
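A minimal sketch of what the comment describes (not part of the header; the helper name and parameters are illustrative), assuming the device supports creating 2-D images from a buffer and that buf, width, height and row_pitch already satisfy the CL_DEVICE_IMAGE_PITCH_ALIGNMENT and CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT requirements above:

    #include <string.h>
    #include <CL/cl.h>

    /* Illustrative helper: wrap an existing buffer as a 2-D image (no copy). */
    static cl_mem image2d_from_buffer(cl_context ctx, cl_mem buf,
                                      size_t width, size_t height,
                                      size_t row_pitch, cl_int *err)
    {
        cl_image_format fmt = { CL_RGBA, CL_UNORM_INT8 };
        cl_image_desc desc;
        memset(&desc, 0, sizeof(desc));
        desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
        desc.image_width     = width;
        desc.image_height    = height;
        desc.image_row_pitch = row_pitch;   /* multiple of the pitch alignment */
        desc.buffer          = buf;         /* the image aliases the buffer storage */
        return clCreateImage(ctx, CL_MEM_READ_ONLY, &fmt, &desc, NULL, err);
    }
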
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
+#define CL_CONTEXT_TERMINATE_KHR 0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS 0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
+
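To sketch the intended flow (not part of the header; the helper name is illustrative, and the device is assumed to list cl_khr_spir in CL_DEVICE_EXTENSIONS): the SPIR module is handed to clCreateProgramWithBinary and then built with the "-x spir" option defined by the extension:

    #include <CL/cl.h>

    /* Illustrative helper: create and build a program from a SPIR binary. */
    static cl_program build_spir(cl_context ctx, cl_device_id device,
                                 const unsigned char *spir_bytes,
                                 size_t spir_size, cl_int *err)
    {
        cl_int status;
        cl_program prog = clCreateProgramWithBinary(ctx, 1, &device,
                                                    &spir_size, &spir_bytes,
                                                    &status, err);
        if (prog && *err == CL_SUCCESS)
            *err = clBuildProgram(prog, 1, &device, "-x spir", NULL, NULL);
        return prog;
    }
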
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
+#define CL_DEVICE_WARP_SIZE_NV 0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
+
+#ifdef CL_VERSION_1_1
+ /***********************************
+ * cl_ext_device_fission extension *
+ ***********************************/
+ #define cl_ext_device_fission 1
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef cl_ulong cl_device_partition_property_ext;
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ /* cl_device_partition_property_ext */
+ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
+ #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
+ #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
+ #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
+
+ /* clDeviceGetInfo selectors */
+ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
+ #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
+ #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
+ #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
+ #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
+
+ /* error codes */
+ #define CL_DEVICE_PARTITION_FAILED_EXT -1057
+ #define CL_INVALID_PARTITION_COUNT_EXT -1058
+ #define CL_INVALID_PARTITION_NAME_EXT -1059
+
+ /* CL_AFFINITY_DOMAINs */
+ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
+ #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
+ #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
+ #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
+ #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
+ #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
+
+ /* cl_device_partition_property_ext list terminators */
+ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
+
+typedef cl_uint cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id device,
+ size_t image_width,
+ size_t image_height,
+ const cl_image_format *image_format,
+ cl_image_pitch_info_qcom param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+ // Type of external memory allocation.
+ // Legal values will be defined in layered extensions.
+ cl_uint allocation_type;
+
+ // Host cache policy for this external memory allocation.
+ cl_uint host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+ // Type of external memory allocation.
+ // Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations.
+ cl_mem_ext_host_ptr ext_host_ptr;
+
+ // ION file descriptor
+ int ion_filedesc;
+
+ // Host pointer to the ION allocated memory
+ void* ion_hostptr;
+
+} cl_mem_ion_host_ptr;
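+
+/* Editor's note: illustrative sketch added by the editor, not part of the
+ * upstream header. An ION allocation is typically wrapped by filling in this
+ * descriptor and passing it as the host_ptr argument together with the
+ * CL_MEM_EXT_HOST_PTR_QCOM flag; `ctx`, `fd`, `hostptr` and `size` are
+ * assumptions.
+ *
+ *   cl_mem_ion_host_ptr ionmem;
+ *   ionmem.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
+ *   ionmem.ext_host_ptr.host_cache_policy = CL_MEM_HOST_UNCACHED_QCOM;
+ *   ionmem.ion_filedesc                   = fd;
+ *   ionmem.ion_hostptr                    = hostptr;
+ *
+ *   cl_int err;
+ *   cl_mem buf = clCreateBuffer(ctx,
+ *                               CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
+ *                               size, &ionmem, &err);
+ */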
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/src/CL/cl_gl.h b/src/CL/cl_gl.h
new file mode 100644
index 0000000..af2036c
--- /dev/null
+++ b/src/CL/cl_gl.h
@@ -0,0 +1,162 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
+#define CL_GL_OBJECT_BUFFER 0x2000
+#define CL_GL_OBJECT_TEXTURE2D 0x2001
+#define CL_GL_OBJECT_TEXTURE3D 0x2002
+#define CL_GL_OBJECT_RENDERBUFFER 0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
+#define CL_GL_OBJECT_TEXTURE1D 0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET 0x2004
+#define CL_GL_MIPMAP_LEVEL 0x2005
+#define CL_GL_NUM_SAMPLES 0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* bufobj */,
+ int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* renderbuffer */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+ cl_gl_object_type * /* gl_object_type */,
+ cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem /* memobj */,
+ cl_gl_texture_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+
+// Deprecated OpenCL 1.1 APIs
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+/* cl_khr_gl_sharing extension */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint cl_gl_context_info;
+
+/* Additional Error Codes */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
+
+/* cl_gl_context_info */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
+
+/* Additional cl_context_properties */
+#define CL_GL_CONTEXT_KHR 0x2008
+#define CL_EGL_DISPLAY_KHR 0x2009
+#define CL_GLX_DISPLAY_KHR 0x200A
+#define CL_WGL_HDC_KHR 0x200B
+#define CL_CGL_SHAREGROUP_KHR 0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+ cl_gl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_H */
diff --git a/src/CL/cl_gl_ext.h b/src/CL/cl_gl_ext.h
new file mode 100644
index 0000000..77d5353
--- /dev/null
+++ b/src/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
+/* OpenGL dependencies. */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl_gl.h>
+#else
+ #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ * cl_VEN_extname extension */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ * This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/*
+ * cl_khr_gl_event extension
+ * See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context /* context */,
+ cl_GLsync /* cl_GLsync */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_EXT_H */
diff --git a/src/CL/cl_platform.h b/src/CL/cl_platform.h
new file mode 100644
index 0000000..7f6f5e8
--- /dev/null
+++ b/src/CL/cl_platform.h
@@ -0,0 +1,1278 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+ #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+ #define CL_API_ENTRY
+ #define CL_API_CALL __stdcall
+ #define CL_CALLBACK __stdcall
+#else
+ #define CL_API_ENTRY
+ #define CL_API_CALL
+ #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+ #else
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #endif
+#else
+ #define CL_EXTENSION_WEAK_LINK
+ #define CL_API_SUFFIX__VERSION_1_0
+ #define CL_EXT_SUFFIX__VERSION_1_0
+ #define CL_API_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_1
+ #define CL_API_SUFFIX__VERSION_1_2
+ #define CL_EXT_SUFFIX__VERSION_1_2
+
+ #ifdef __GNUC__
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+ #elif _WIN32
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+ #endif
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short __attribute__((aligned(2)));
+typedef uint16_t cl_ushort __attribute__((aligned(2)));
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 0x1.fffffep127f
+#define CL_FLT_MIN 0x1.0p-126f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 0x1.fffffffffffffp1023
+#define CL_DBL_MIN 0x1.0p-1022
+#define CL_DBL_EPSILON 0x1.0p-52
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#if defined( __GNUC__ )
+ #define CL_HUGE_VALF __builtin_huge_valf()
+ #define CL_HUGE_VAL __builtin_huge_val()
+ #define CL_NAN __builtin_nanf( "" )
+#else
+ #define CL_HUGE_VALF ((cl_float) 1e50)
+ #define CL_HUGE_VAL ((cl_double) 1e500)
+ float nanf( const char * );
+ #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which GL headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
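+
+/* Editor's note: illustrative sketch added by the editor, not part of the
+ * upstream header. Because cl_float4 carries a 16-byte alignment qualifier,
+ * any struct containing one must itself start on a 16-byte boundary; the
+ * struct name below is hypothetical.
+ *
+ *   typedef struct
+ *   {
+ *     cl_float4 position;   // 16-byte aligned member
+ *     cl_float4 color;
+ *   } vertex_t;             // allocate instances on 16-byte boundaries
+ */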
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+ typedef vector unsigned char __cl_uchar16;
+ typedef vector signed char __cl_char16;
+ typedef vector unsigned short __cl_ushort8;
+ typedef vector signed short __cl_short8;
+ typedef vector unsigned int __cl_uint4;
+ typedef vector signed int __cl_int4;
+ typedef vector float __cl_float4;
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_INT4__ 1
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <xmmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef float __cl_float4 __attribute__((vector_size(16)));
+ #else
+ typedef __m128 __cl_float4;
+ #endif
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE2__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+ #else
+ typedef __m128i __cl_uchar16;
+ typedef __m128i __cl_char16;
+ typedef __m128i __cl_ushort8;
+ typedef __m128i __cl_short8;
+ typedef __m128i __cl_uint4;
+ typedef __m128i __cl_int4;
+ typedef __m128i __cl_ulong2;
+ typedef __m128i __cl_long2;
+ typedef __m128d __cl_double2;
+ #endif
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_INT4__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_ULONG2__ 1
+ #define __CL_LONG2__ 1
+ #define __CL_DOUBLE2__ 1
+#endif
+
+#if defined( __MMX__ )
+ #include <mmintrin.h>
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+ #else
+ typedef __m64 __cl_uchar8;
+ typedef __m64 __cl_char8;
+ typedef __m64 __cl_ushort4;
+ typedef __m64 __cl_short4;
+ typedef __m64 __cl_uint2;
+ typedef __m64 __cl_int2;
+ typedef __m64 __cl_ulong1;
+ typedef __m64 __cl_long1;
+ typedef __m64 __cl_float2;
+ #endif
+ #define __CL_UCHAR8__ 1
+ #define __CL_CHAR8__ 1
+ #define __CL_USHORT4__ 1
+ #define __CL_SHORT4__ 1
+ #define __CL_INT2__ 1
+ #define __CL_UINT2__ 1
+ #define __CL_ULONG1__ 1
+ #define __CL_LONG1__ 1
+ #define __CL_FLOAT2__ 1
+#endif
+
+#if defined( __AVX__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+ #else
+ typedef __m256 __cl_float8;
+ typedef __m256d __cl_double4;
+ #endif
+ #define __CL_FLOAT8__ 1
+ #define __CL_DOUBLE4__ 1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+ /* Microsoft Developer Studio 2008 supports anonymous structs, but
+ * complains by default. */
+#define __CL_HAS_ANON_STRUCT__ 1
+#define __CL_ANON_STRUCT__
+ /* Disable warning C4201: nonstandard extension used : nameless
+ * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define __CL_HAS_ANON_STRUCT__ 0
+#define __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+ /* #include <crtdefs.h> */
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+ #define CL_ALIGNED(_x)
+#else
+ #warning Need to implement some method to align data here
+ #define CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+ /* .xyzw and .s0123...{f|F} are supported */
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
+ /* .hi and .lo are supported */
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
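+
+/* Editor's note: illustrative sketch added by the editor, not part of the
+ * upstream header, showing the host-side accessors that the macros above
+ * advertise. Guard on the macros when portability matters, since the
+ * anonymous-struct fields are optional.
+ *
+ *   cl_float4 v;
+ *   v.s[0] = 1.0f;             // always available
+ * #ifdef CL_HAS_NAMED_VECTOR_FIELDS
+ *   v.x  = 1.0f;               // same element via the named field
+ *   v.s1 = 2.0f;
+ * #endif
+ * #ifdef CL_HAS_HI_LO_VECTOR_FIELDS
+ *   cl_float2 upper = v.hi;    // elements 2 and 3
+ * #endif
+ */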
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+ cl_char CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+ cl_char CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef cl_char4 cl_char3;
+
+typedef union
+{
+ cl_char CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+ cl_char CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+ __cl_char16 v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+ cl_uchar CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
+#endif
+#if defined( __cl_uchar2__)
+ __cl_uchar2 v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef cl_uchar4 cl_uchar3;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+ __cl_uchar16 v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+ cl_short CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+ cl_short CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef cl_short4 cl_short3;
+
+typedef union
+{
+ cl_short CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+ cl_short CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+ __cl_short16 v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+ cl_ushort CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef cl_ushort4 cl_ushort3;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+ __cl_ushort16 v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+ cl_int CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+ cl_int CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[2];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef cl_int4 cl_int3;
+
+typedef union
+{
+ cl_int CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[4];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[2];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+ cl_int CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[8];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[4];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8[2];
+#endif
+#if defined( __CL_INT16__ )
+ __cl_int16 v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+ cl_uint CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[2];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef cl_uint4 cl_uint3;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[4];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[8];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+ __cl_uint16 v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+ cl_long CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+ cl_long CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[2];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef cl_long4 cl_long3;
+
+typedef union
+{
+ cl_long CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[4];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+ cl_long CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[8];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+ __cl_long16 v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+ cl_ulong CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef cl_ulong4 cl_ulong3;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+ __cl_ulong16 v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+ cl_float CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+ cl_float CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef cl_float4 cl_float3;
+
+typedef union
+{
+ cl_float CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+ cl_float CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+ __cl_float16 v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+ cl_double CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+ __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+ cl_double CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
+ __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef cl_double4 cl_double3;
+
+typedef union
+{
+ cl_double CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
+ __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+ cl_double CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+ __cl_double16 v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ * // my comment \n\
+ * *b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/src/CL/opencl.h b/src/CL/opencl.h
new file mode 100644
index 0000000..3f00524
--- /dev/null
+++ b/src/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_H */
+
diff --git a/src/core/Context.cpp b/src/core/Context.cpp
new file mode 100644
index 0000000..6a8b4ff
--- /dev/null
+++ b/src/core/Context.cpp
@@ -0,0 +1,547 @@
+// Context.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <windows.h>
+#undef ERROR
+#else
+#include <dlfcn.h>
+#endif
+
+#include <mutex>
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Instruction.h"
+
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Program.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+#include "plugins/InstructionCounter.h"
+#include "plugins/InteractiveDebugger.h"
+#include "plugins/Logger.h"
+#include "plugins/MemCheck.h"
+#include "plugins/RaceDetector.h"
+
+using namespace oclgrind;
+using namespace std;
+
+Context::Context()
+{
+ m_globalMemory = new Memory(AddrSpaceGlobal, this);
+ m_kernelInvocation = NULL;
+
+ loadPlugins();
+}
+
+Context::~Context()
+{
+ delete m_globalMemory;
+
+ unloadPlugins();
+}
+
+bool Context::isThreadSafe() const
+{
+ for (const PluginEntry &p : m_plugins)
+ {
+ if (!p.first->isThreadSafe())
+ return false;
+ }
+ return true;
+}
+
+Memory* Context::getGlobalMemory() const
+{
+ return m_globalMemory;
+}
+
+void Context::loadPlugins()
+{
+ // Create core plugins
+ m_plugins.push_back(make_pair(new Logger(this), true));
+ m_plugins.push_back(make_pair(new MemCheck(this), true));
+
+ if (checkEnv("OCLGRIND_INST_COUNTS"))
+ m_plugins.push_back(make_pair(new InstructionCounter(this), true));
+
+ if (checkEnv("OCLGRIND_DATA_RACES"))
+ m_plugins.push_back(make_pair(new RaceDetector(this), true));
+
+ if (checkEnv("OCLGRIND_INTERACTIVE"))
+ m_plugins.push_back(make_pair(new InteractiveDebugger(this), true));
+
+
+ // Load dynamic plugins
+ const char *dynamicPlugins = getenv("OCLGRIND_PLUGINS");
+ if (dynamicPlugins)
+ {
+ std::istringstream ss(dynamicPlugins);
+ std::string libpath;
+ while(std::getline(ss, libpath, ':'))
+ {
+#if defined(_WIN32) && !defined(__MINGW32__)
+ HMODULE library = LoadLibrary(libpath.c_str());
+ if (!library)
+ {
+ cerr << "Loading Oclgrind plugin failed (LoadLibrary): "
+ << GetLastError() << endl;
+ continue;
+ }
+
+ void *initialize = GetProcAddress(library, "initializePlugins");
+ if (!initialize)
+ {
+ cerr << "Loading Oclgrind plugin failed (GetProcAddress): "
+ << GetLastError() << endl;
+ continue;
+ }
+#else
+ void *library = dlopen(libpath.c_str(), RTLD_NOW);
+ if (!library)
+ {
+ cerr << "Loading Oclgrind plugin failed (dlopen): "
+ << dlerror() << endl;
+ continue;
+ }
+
+ void *initialize = dlsym(library, "initializePlugins");
+ if (!initialize)
+ {
+ cerr << "Loading Oclgrind plugin failed (dlsym): "
+ << dlerror() << endl;
+ continue;
+ }
+#endif
+
+ ((void(*)(Context*))initialize)(this);
+ m_pluginLibraries.push_back(library);
+ }
+ }
+}
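+
+// Editor's note: the block below is an illustrative sketch added by the
+// editor, not part of Context.cpp. It outlines what a dynamic plugin library
+// loaded above might look like: an extern "C" initializePlugins/releasePlugins
+// pair that registers a plugin with the context. The MyPlugin class, its
+// constructor signature and the include paths are assumptions.
+//
+//   #include "core/Context.h"
+//   #include "core/Plugin.h"
+//
+//   class MyPlugin : public oclgrind::Plugin
+//   {
+//   public:
+//     MyPlugin(const oclgrind::Context *context) : Plugin(context) {}
+//   };
+//
+//   static MyPlugin *plugin = NULL;
+//
+//   extern "C" void initializePlugins(oclgrind::Context *context)
+//   {
+//     plugin = new MyPlugin(context);
+//     context->registerPlugin(plugin);
+//   }
+//
+//   extern "C" void releasePlugins(oclgrind::Context *context)
+//   {
+//     context->unregisterPlugin(plugin);
+//     delete plugin;
+//   }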
+
+void Context::unloadPlugins()
+{
+ // Release dynamic plugin libraries
+ list<void*>::iterator plibItr;
+ for (plibItr = m_pluginLibraries.begin();
+ plibItr != m_pluginLibraries.end(); plibItr++)
+ {
+#if defined(_WIN32) && !defined(__MINGW32__)
+ void *release = GetProcAddress((HMODULE)*plibItr, "releasePlugins");
+ if (release)
+ {
+ ((void(*)(Context*))release)(this);
+ }
+ FreeLibrary((HMODULE)*plibItr);
+#else
+ void *release = dlsym(*plibItr, "releasePlugins");
+ if (release)
+ {
+ ((void(*)(Context*))release)(this);
+ }
+ dlclose(*plibItr);
+#endif
+ }
+
+ // Destroy internal plugins
+ PluginList::iterator pItr;
+ for (pItr = m_plugins.begin(); pItr != m_plugins.end(); pItr++)
+ {
+ if (pItr->second)
+ delete pItr->first;
+ }
+
+ m_plugins.clear();
+}
+
+void Context::registerPlugin(Plugin *plugin)
+{
+ m_plugins.push_back(make_pair(plugin, false));
+}
+
+void Context::unregisterPlugin(Plugin *plugin)
+{
+ m_plugins.remove(make_pair(plugin, false));
+}
+
+void Context::logError(const char* error) const
+{
+ Message msg(ERROR, this);
+ msg << error << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Entity: " << msg.CURRENT_ENTITY << endl
+ << msg.CURRENT_LOCATION << endl;
+ msg.send();
+}
+
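+// Forward the named callback, with the given arguments, to every plugin
+// currently registered with this context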
+#define NOTIFY(function, ...) \
+{ \
+ PluginList::const_iterator pluginItr; \
+ for (pluginItr = m_plugins.begin(); \
+ pluginItr != m_plugins.end(); pluginItr++) \
+ { \
+ pluginItr->first->function(__VA_ARGS__); \
+ } \
+}
+
+void Context::notifyInstructionExecuted(const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ const TypedValue& result) const
+{
+ NOTIFY(instructionExecuted, workItem, instruction, result);
+}
+
+void Context::notifyKernelBegin(const KernelInvocation *kernelInvocation) const
+{
+ assert(m_kernelInvocation == NULL);
+ m_kernelInvocation = kernelInvocation;
+
+ NOTIFY(kernelBegin, kernelInvocation);
+}
+
+void Context::notifyKernelEnd(const KernelInvocation *kernelInvocation) const
+{
+ NOTIFY(kernelEnd, kernelInvocation);
+
+ assert(m_kernelInvocation == kernelInvocation);
+ m_kernelInvocation = NULL;
+}
+
+void Context::notifyMemoryAllocated(const Memory *memory, size_t address,
+ size_t size, cl_mem_flags flags) const
+{
+ NOTIFY(memoryAllocated, memory, address, size, flags);
+}
+
+void Context::notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
+ size_t address, size_t size) const
+{
+ if (m_kernelInvocation && m_kernelInvocation->getCurrentWorkItem())
+ {
+ NOTIFY(memoryAtomicLoad, memory, m_kernelInvocation->getCurrentWorkItem(),
+ op, address, size);
+ }
+}
+
+void Context::notifyMemoryAtomicStore(const Memory *memory, AtomicOp op,
+ size_t address, size_t size) const
+{
+ if (m_kernelInvocation && m_kernelInvocation->getCurrentWorkItem())
+ {
+ NOTIFY(memoryAtomicStore, memory, m_kernelInvocation->getCurrentWorkItem(),
+ op, address, size);
+ }
+}
+
+void Context::notifyMemoryDeallocated(const Memory *memory,
+ size_t address) const
+{
+ NOTIFY(memoryDeallocated, memory, address);
+}
+
+void Context::notifyMemoryLoad(const Memory *memory, size_t address,
+ size_t size) const
+{
+ if (m_kernelInvocation)
+ {
+ if (m_kernelInvocation->getCurrentWorkItem())
+ {
+ NOTIFY(memoryLoad, memory, m_kernelInvocation->getCurrentWorkItem(),
+ address, size);
+ }
+ else if (m_kernelInvocation->getCurrentWorkGroup())
+ {
+ NOTIFY(memoryLoad, memory, m_kernelInvocation->getCurrentWorkGroup(),
+ address, size);
+ }
+ }
+ else
+ {
+ NOTIFY(hostMemoryLoad, memory, address, size);
+ }
+}
+
+void Context::notifyMemoryStore(const Memory *memory, size_t address,
+ size_t size, const uint8_t *storeData) const
+{
+ if (m_kernelInvocation)
+ {
+ if (m_kernelInvocation->getCurrentWorkItem())
+ {
+ NOTIFY(memoryStore, memory, m_kernelInvocation->getCurrentWorkItem(),
+ address, size, storeData);
+ }
+ else if (m_kernelInvocation->getCurrentWorkGroup())
+ {
+ NOTIFY(memoryStore, memory, m_kernelInvocation->getCurrentWorkGroup(),
+ address, size, storeData);
+ }
+ }
+ else
+ {
+ NOTIFY(hostMemoryStore, memory, address, size, storeData);
+ }
+}
+
+void Context::notifyMessage(MessageType type, const char *message) const
+{
+ NOTIFY(log, type, message);
+}
+
+void Context::notifyWorkGroupBarrier(const WorkGroup *workGroup,
+ uint32_t flags) const
+{
+ NOTIFY(workGroupBarrier, workGroup, flags);
+}
+
+void Context::notifyWorkGroupBegin(const WorkGroup *workGroup) const
+{
+ NOTIFY(workGroupBegin, workGroup);
+}
+
+void Context::notifyWorkGroupComplete(const WorkGroup *workGroup) const
+{
+ NOTIFY(workGroupComplete, workGroup);
+}
+
+void Context::notifyWorkItemBegin(const WorkItem *workItem) const
+{
+ NOTIFY(workItemBegin, workItem);
+}
+
+void Context::notifyWorkItemComplete(const WorkItem *workItem) const
+{
+ NOTIFY(workItemComplete, workItem);
+}
+
+#undef NOTIFY
+
+
+Context::Message::Message(MessageType type, const Context *context)
+{
+ m_type = type;
+ m_context = context;
+ m_kernelInvocation = context->m_kernelInvocation;
+}
+
+Context::Message& Context::Message::operator<<(const special& id)
+{
+ switch (id)
+ {
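+ // Record the stream position at which the indentation level changes;
+ // a negated position marks an unindent (both are applied in send())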
+ case INDENT:
+ m_indentModifiers.push_back( m_stream.tellp());
+ break;
+ case UNINDENT:
+ m_indentModifiers.push_back(-m_stream.tellp());
+ break;
+ case CURRENT_KERNEL:
+ *this << m_kernelInvocation->getKernel()->getName();
+ break;
+ case CURRENT_WORK_ITEM_GLOBAL:
+ {
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (workItem)
+ {
+ *this << workItem->getGlobalID();
+ }
+ else
+ {
+ *this << "(none)";
+ }
+ break;
+ }
+ case CURRENT_WORK_ITEM_LOCAL:
+ {
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (workItem)
+ {
+ *this << workItem->getLocalID();
+ }
+ else
+ {
+ *this << "(none)";
+ }
+ break;
+ }
+ case CURRENT_WORK_GROUP:
+ {
+ const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+ if (workGroup)
+ {
+ *this << workGroup->getGroupID();
+ }
+ else
+ {
+ *this << "(none)";
+ }
+ break;
+ }
+ case CURRENT_ENTITY:
+ {
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+ if (workItem)
+ {
+ *this << "Global" << workItem->getGlobalID()
+ << " Local" << workItem->getLocalID() << " ";
+ }
+ if (workGroup)
+ {
+ *this << "Group" << workGroup->getGroupID();
+ }
+ if (!workItem && !workGroup)
+ {
+ *this << "(unknown)";
+ }
+ break;
+ }
+ case CURRENT_LOCATION:
+ {
+ const llvm::Instruction *instruction = NULL;
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+ if (workItem)
+ {
+ instruction = workItem->getCurrentInstruction();
+ }
+ else if (workGroup)
+ {
+ instruction = workGroup->getCurrentBarrier();
+ }
+
+ *this << instruction;
+ break;
+ }
+ }
+ return *this;
+}
+
+Context::Message& Context::Message::operator<<(
+ const llvm::Instruction *instruction)
+{
+ // Use a mutex, as parts of LLVM used by dumpInstruction() are not thread-safe
+ static std::mutex mtx;
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (instruction)
+ {
+ // Output instruction
+ dumpInstruction(m_stream, instruction);
+ *this << endl;
+
+ // Output debug information
+ llvm::MDNode *md = instruction->getMetadata("dbg");
+ if (!md)
+ {
+ *this << "Debugging information not available." << endl;
+ }
+ else
+ {
+#if LLVM_VERSION > 36
+ llvm::DILocation *loc = (llvm::DILocation*)md;
+ unsigned lineNumber = loc->getLine();
+ llvm::StringRef filename = loc->getFilename();
+#else
+ llvm::DILocation loc((llvm::MDLocation*)md);
+ unsigned lineNumber = loc.getLineNumber();
+ llvm::StringRef filename = loc.getFilename();
+#endif
+
+ *this << "At line " << dec << lineNumber
+ << " of " << filename.str() << ":" << endl;
+
+ // Get source line
+ const Program *program = m_kernelInvocation->getKernel()->getProgram();
+ const char *line = program->getSourceLine(lineNumber);
+ if (line)
+ {
+ while (isspace(line[0]))
+ line++;
+ *this << " " << line;
+ }
+ else
+ *this << " (source not available)";
+
+ }
+ }
+ else
+ {
+ *this << "(location unknown)";
+ }
+
+ return *this;
+}
+
+Context::Message& Context::Message::operator<<(
+ std::ostream& (*t)(std::ostream&))
+{
+ m_stream << t;
+ return *this;
+}
+
+Context::Message& Context::Message::operator<<(
+ std::ios& (*t)(std::ios&))
+{
+ m_stream << t;
+ return *this;
+}
+
+Context::Message& Context::Message::operator<<(
+ std::ios_base& (*t)(std::ios_base&))
+{
+ m_stream << t;
+ return *this;
+}
+
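+// Flatten the buffered message into a single string, applying the recorded
+// indent/unindent positions as tab indentation after each newline, then
+// deliver it to the plugins via notifyMessage()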
+void Context::Message::send() const
+{
+ string msg;
+
+ string line;
+ int currentIndent = 0;
+ list<int>::const_iterator itr = m_indentModifiers.begin();
+
+ m_stream.clear();
+ m_stream.seekg(0);
+ while (m_stream.good())
+ {
+ getline(m_stream, line);
+
+ // TODO: Wrap long lines
+ msg += line;
+
+ // Check for indentation modifiers
+ long pos = m_stream.tellg();
+ if (itr != m_indentModifiers.end() && pos >= abs(*itr))
+ {
+ if (*itr >= 0)
+ currentIndent++;
+ else
+ currentIndent--;
+ itr++;
+ }
+
+ if (!m_stream.eof())
+ {
+ // Add newline and indentation
+ msg += '\n';
+ for (int i = 0; i < currentIndent; i++)
+ msg += '\t';
+ }
+ }
+
+ m_context->notifyMessage(m_type, msg.c_str());
+}
diff --git a/src/core/Context.h b/src/core/Context.h
new file mode 100644
index 0000000..41be6c7
--- /dev/null
+++ b/src/core/Context.h
@@ -0,0 +1,115 @@
+// Context.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+namespace oclgrind
+{
+ class KernelInvocation;
+ class Memory;
+ class Plugin;
+ class WorkGroup;
+ class WorkItem;
+
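+ // The boolean flag records whether the Context created the plugin itself
+ // and is therefore responsible for deleting it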
+ typedef std::pair<Plugin*, bool> PluginEntry;
+ typedef std::list<PluginEntry> PluginList;
+
+ class Context
+ {
+ public:
+ Context();
+ virtual ~Context();
+
+ Memory* getGlobalMemory() const;
+ bool isThreadSafe() const;
+ void logError(const char* error) const;
+
+ // Simulation callbacks
+ void notifyInstructionExecuted(const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ const TypedValue& result) const;
+ void notifyKernelBegin(const KernelInvocation *kernelInvocation) const;
+ void notifyKernelEnd(const KernelInvocation *kernelInvocation) const;
+ void notifyMemoryAllocated(const Memory *memory, size_t address,
+ size_t size, cl_mem_flags flags) const;
+ void notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
+ size_t address, size_t size) const;
+ void notifyMemoryAtomicStore(const Memory *memory, AtomicOp op,
+ size_t address, size_t size) const;
+ void notifyMemoryDeallocated(const Memory *memory, size_t address) const;
+ void notifyMemoryLoad(const Memory *memory, size_t address,
+ size_t size) const;
+ void notifyMemoryStore(const Memory *memory, size_t address, size_t size,
+ const uint8_t *storeData) const;
+ void notifyMessage(MessageType type, const char *message) const;
+ void notifyWorkGroupBarrier(const WorkGroup *workGroup,
+ uint32_t flags) const;
+ void notifyWorkGroupBegin(const WorkGroup *workGroup) const;
+ void notifyWorkGroupComplete(const WorkGroup *workGroup) const;
+ void notifyWorkItemBegin(const WorkItem *workItem) const;
+ void notifyWorkItemComplete(const WorkItem *workItem) const;
+
+
+ // Plugins
+ void registerPlugin(Plugin *plugin);
+ void unregisterPlugin(Plugin *plugin);
+
+ private:
+ mutable const KernelInvocation *m_kernelInvocation;
+ Memory *m_globalMemory;
+
+ PluginList m_plugins;
+ std::list<void*> m_pluginLibraries;
+ void loadPlugins();
+ void unloadPlugins();
+
+ public:
+ class Message
+ {
+ public:
+ enum special
+ {
+ INDENT,
+ UNINDENT,
+ CURRENT_KERNEL,
+ CURRENT_WORK_ITEM_GLOBAL,
+ CURRENT_WORK_ITEM_LOCAL,
+ CURRENT_WORK_GROUP,
+ CURRENT_ENTITY,
+ CURRENT_LOCATION,
+ };
+
+ Message(MessageType type, const Context *context);
+
+ Message& operator<<(const special& id);
+ Message& operator<<(const llvm::Instruction *instruction);
+
+ template<typename T>
+ Message& operator<<(const T& t);
+ Message& operator<<(std::ostream& (*t)(std::ostream&));
+ Message& operator<<(std::ios& (*t)(std::ios&));
+ Message& operator<<(std::ios_base& (*t)(std::ios_base&));
+
+ void send() const;
+
+ private:
+ MessageType m_type;
+ const Context *m_context;
+ const KernelInvocation *m_kernelInvocation;
+ mutable std::stringstream m_stream;
+ std::list<int> m_indentModifiers;
+ };
+ };
+
+ template<typename T>
+ Context::Message& Context::Message::operator<<(const T& t)
+ {
+ m_stream << t;
+ return *this;
+ }
+}
diff --git a/src/core/Kernel.cpp b/src/core/Kernel.cpp
new file mode 100644
index 0000000..ab2741e
--- /dev/null
+++ b/src/core/Kernel.cpp
@@ -0,0 +1,534 @@
+// Kernel.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <sstream>
+
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_os_ostream.h"
+
+#include "Kernel.h"
+#include "Program.h"
+#include "Memory.h"
+
+using namespace oclgrind;
+using namespace std;
+
+Kernel::Kernel(const Program *program,
+ const llvm::Function *function, const llvm::Module *module)
+ : m_program(program), m_function(function), m_name(function->getName())
+{
+ m_localMemory = new Memory(AddrSpaceLocal, program->getContext());
+ m_privateMemory = new Memory(AddrSpacePrivate, program->getContext());
+
+ // Set-up global variables
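+ // Private and local variables are given buffers immediately; constant
+ // variables are recorded here and allocated by allocateConstants()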
+ llvm::Module::const_global_iterator itr;
+ for (itr = module->global_begin(); itr != module->global_end(); itr++)
+ {
+ llvm::PointerType *type = itr->getType();
+ switch (type->getPointerAddressSpace())
+ {
+ case AddrSpacePrivate:
+ {
+ const llvm::Constant *init = itr->getInitializer();
+
+ // Allocate private memory for variable
+ unsigned size = getTypeSize(init->getType());
+ size_t address = m_privateMemory->allocateBuffer(size);
+
+ // Initialize variable
+ void *ptr = m_privateMemory->getPointer(address);
+ getConstantData((unsigned char*)ptr, init);
+
+ TypedValue value =
+ {
+ sizeof(size_t),
+ 1,
+ new unsigned char[sizeof(size_t)]
+ };
+ value.setPointer(address);
+ m_arguments[itr] = value;
+
+ break;
+ }
+ case AddrSpaceConstant:
+ m_constants.push_back(itr);
+ break;
+ case AddrSpaceLocal:
+ {
+ // Allocate buffer
+ unsigned size = getTypeSize(itr->getInitializer()->getType());
+ TypedValue v = {
+ sizeof(size_t),
+ 1,
+ new unsigned char[sizeof(size_t)]
+ };
+ v.setPointer(m_localMemory->allocateBuffer(size));
+ m_arguments[itr] = v;
+
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported GlobalVariable address space: %d",
+ type->getPointerAddressSpace());
+ }
+ }
+
+ // Get metadata node containing kernel arg info
+ m_metadata = NULL;
+ llvm::NamedMDNode *md = module->getNamedMetadata("opencl.kernels");
+ if (md)
+ {
+ for (unsigned i = 0; i < md->getNumOperands(); i++)
+ {
+ llvm::MDNode *node = md->getOperand(i);
+
+ llvm::ConstantAsMetadata *cam =
+ llvm::dyn_cast<llvm::ConstantAsMetadata>(node->getOperand(0).get());
+ if (!cam)
+ continue;
+
+ llvm::Function *function = ((llvm::Function*)cam->getValue());
+ if (function->getName() == m_name)
+ {
+ m_metadata = node;
+ break;
+ }
+ }
+ }
+}
+
+Kernel::Kernel(const Kernel& kernel)
+ : m_program(kernel.m_program)
+{
+ m_function = kernel.m_function;
+ m_constants = kernel.m_constants;
+ m_constantBuffers = kernel.m_constantBuffers;
+ m_localMemory = kernel.m_localMemory->clone();
+ m_privateMemory = kernel.m_privateMemory->clone();
+ m_name = kernel.m_name;
+ m_metadata = kernel.m_metadata;
+
+ TypedValueMap::const_iterator itr;
+ for (itr = kernel.m_arguments.begin();
+ itr != kernel.m_arguments.end(); itr++)
+ {
+ m_arguments[itr->first] = itr->second.clone();
+ }
+}
+
+Kernel::~Kernel()
+{
+ delete m_localMemory;
+ delete m_privateMemory;
+
+ TypedValueMap::iterator itr;
+ for (itr = m_arguments.begin(); itr != m_arguments.end(); itr++)
+ {
+ delete[] itr->second.data;
+ }
+}
+
+bool Kernel::allArgumentsSet() const
+{
+ llvm::Function::const_arg_iterator itr;
+ for (itr = m_function->arg_begin(); itr != m_function->arg_end(); itr++)
+ {
+ if (!m_arguments.count(itr))
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+void Kernel::allocateConstants(Memory *memory)
+{
+ list<const llvm::GlobalVariable*>::const_iterator itr;
+ for (itr = m_constants.begin(); itr != m_constants.end(); itr++)
+ {
+ const llvm::Constant *initializer = (*itr)->getInitializer();
+ const llvm::Type *type = initializer->getType();
+
+ // Allocate buffer
+ unsigned size = getTypeSize(type);
+ TypedValue v = {
+ sizeof(size_t),
+ 1,
+ new unsigned char[sizeof(size_t)]
+ };
+ size_t address = memory->allocateBuffer(size);
+ v.setPointer(address);
+ m_constantBuffers.push_back(address);
+ m_arguments[*itr] = v;
+
+ // Initialise buffer contents
+ unsigned char *data = new unsigned char[size];
+ getConstantData(data, (const llvm::Constant*)initializer);
+ memory->store(data, address, size);
+ delete[] data;
+ }
+}
+
+void Kernel::deallocateConstants(Memory *memory)
+{
+ list<size_t>::const_iterator itr;
+ for (itr = m_constantBuffers.begin(); itr != m_constantBuffers.end(); itr++)
+ {
+ memory->deallocateBuffer(*itr);
+ }
+ m_constantBuffers.clear();
+}
+
+const llvm::Argument* Kernel::getArgument(unsigned int index) const
+{
+ assert(index < getNumArguments());
+
+ llvm::Function::const_arg_iterator argItr = m_function->arg_begin();
+ for (unsigned i = 0; i < index; i++)
+ {
+ argItr++;
+ }
+ return argItr;
+}
+
+unsigned int Kernel::getArgumentAccessQualifier(unsigned int index) const
+{
+ assert(index < getNumArguments());
+
+ // Get metadata node
+ const llvm::MDNode *node = getArgumentMetadata("kernel_arg_access_qual");
+ if (!node)
+ {
+ return -1;
+ }
+
+ // Get qualifier string
+ llvm::MDString *str
+ = llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+ string access = str->getString();
+ if (access == "read_only")
+ {
+ return CL_KERNEL_ARG_ACCESS_READ_ONLY;
+ }
+ else if (access == "write_only")
+ {
+ return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+ }
+ else if (access == "read_write")
+ {
+ return CL_KERNEL_ARG_ACCESS_READ_WRITE;
+ }
+ return CL_KERNEL_ARG_ACCESS_NONE;
+}
+
+unsigned int Kernel::getArgumentAddressQualifier(unsigned int index) const
+{
+ assert(index < getNumArguments());
+
+ // Get metadata node
+ const llvm::MDNode *node = getArgumentMetadata("kernel_arg_addr_space");
+ if (!node)
+ {
+ return -1;
+ }
+
+ // Get address space
+ switch(getMDOpAsConstInt(node->getOperand(index+1))->getZExtValue())
+ {
+ case AddrSpacePrivate:
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+ case AddrSpaceGlobal:
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ case AddrSpaceConstant:
+ return CL_KERNEL_ARG_ADDRESS_CONSTANT;
+ case AddrSpaceLocal:
+ return CL_KERNEL_ARG_ADDRESS_LOCAL;
+ default:
+ return -1;
+ }
+}
+
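+// Return the kernel metadata node whose first operand matches the given
+// name (e.g. "kernel_arg_type"); per-argument entries follow at index+1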
+const llvm::MDNode* Kernel::getArgumentMetadata(string name) const
+{
+ if (!m_metadata)
+ {
+ return NULL;
+ }
+
+ // Loop over all metadata nodes for this kernel
+ for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+ {
+ const llvm::MDOperand& op = m_metadata->getOperand(i);
+ if (llvm::MDNode *node = llvm::dyn_cast<llvm::MDNode>(op.get()))
+ {
+ // Check if node matches target name
+ if (node->getNumOperands() > 0 &&
+ ((llvm::MDString*)(node->getOperand(0).get()))->getString() == name)
+ {
+ return node;
+ }
+ }
+ }
+ return NULL;
+}
+
+const llvm::StringRef Kernel::getArgumentName(unsigned int index) const
+{
+ return getArgument(index)->getName();
+}
+
+const llvm::StringRef Kernel::getArgumentTypeName(unsigned int index) const
+{
+ assert(index < getNumArguments());
+
+ // Get metadata node
+ const llvm::MDNode *node = getArgumentMetadata("kernel_arg_type");
+ if (!node)
+ {
+ return "";
+ }
+
+ return llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1))->getString();
+}
+
+unsigned int Kernel::getArgumentTypeQualifier(unsigned int index) const
+{
+ assert(index < getNumArguments());
+
+ // Get metadata node
+ const llvm::MDNode *node = getArgumentMetadata("kernel_arg_type_qual");
+ if (!node)
+ {
+ return -1;
+ }
+
+ // Get qualifiers
+ llvm::MDString *str =
+ llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+ istringstream iss(str->getString().str());
+
+ unsigned int result = CL_KERNEL_ARG_TYPE_NONE;
+ while (!iss.eof())
+ {
+ string tok;
+ iss >> tok;
+ if (tok == "const")
+ {
+ result |= CL_KERNEL_ARG_TYPE_CONST;
+ }
+ else if (tok == "restrict")
+ {
+ result |= CL_KERNEL_ARG_TYPE_RESTRICT;
+ }
+ else if (tok == "volatile")
+ {
+ result |= CL_KERNEL_ARG_TYPE_VOLATILE;
+ }
+ }
+
+ return result;
+}
+
+size_t Kernel::getArgumentSize(unsigned int index) const
+{
+ const llvm::Argument *argument = getArgument(index);
+ const llvm::Type *type = argument->getType();
+
+ // Check if pointer argument
+ if (type->isPointerTy() && argument->hasByValAttr())
+ {
+ return getTypeSize(type->getPointerElementType());
+ }
+
+ return getTypeSize(type);
+}
+
+string Kernel::getAttributes() const
+{
+ ostringstream attributes("");
+ for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+ {
+ llvm::MDNode *op = llvm::dyn_cast<llvm::MDNode>(m_metadata->getOperand(i));
+ if (op)
+ {
+ llvm::MDNode *val = ((llvm::MDNode*)op);
+ llvm::MDString *str =
+ llvm::dyn_cast<llvm::MDString>(val->getOperand(0).get());
+ string name = str->getString().str();
+
+ if (name == "reqd_work_group_size" ||
+ name == "work_group_size_hint")
+ {
+ attributes << name << "("
+ << getMDOpAsConstInt(val->getOperand(1))->getZExtValue() << ","
+ << getMDOpAsConstInt(val->getOperand(2))->getZExtValue() << ","
+ << getMDOpAsConstInt(val->getOperand(3))->getZExtValue() << ") ";
+ }
+ else if (name == "vec_type_hint")
+ {
+ // Get type hint
+ size_t n = 1;
+ llvm::Metadata *md = val->getOperand(1).get();
+ llvm::ValueAsMetadata *vam = llvm::dyn_cast<llvm::ValueAsMetadata>(md);
+ const llvm::Type *type = vam->getType();
+ if (type->isVectorTy())
+ {
+ n = type->getVectorNumElements();
+ type = type->getVectorElementType();
+ }
+
+ // Generate attribute string
+ attributes << name << "(" << flush;
+ llvm::raw_os_ostream out(attributes);
+ type->print(out);
+ out.flush();
+ attributes << n << ") ";
+ }
+ }
+ }
+ return attributes.str();
+}
+
+const llvm::Function* Kernel::getFunction() const
+{
+ return m_function;
+}
+
+const Memory* Kernel::getLocalMemory() const
+{
+ return m_localMemory;
+}
+
+size_t Kernel::getLocalMemorySize() const
+{
+ return m_localMemory->getTotalAllocated();
+}
+
+const std::string& Kernel::getName() const
+{
+ return m_name;
+}
+
+unsigned int Kernel::getNumArguments() const
+{
+ return m_function->arg_size();
+}
+
+const Memory* Kernel::getPrivateMemory() const
+{
+ return m_privateMemory;
+}
+
+const Program* Kernel::getProgram() const
+{
+ return m_program;
+}
+
+void Kernel::getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const
+{
+ memset(reqdWorkGroupSize, 0, 3*sizeof(size_t));
+ for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+ {
+ const llvm::MDOperand& op = m_metadata->getOperand(i);
+ if (llvm::MDNode *val = llvm::dyn_cast<llvm::MDNode>(op.get()))
+ {
+ llvm::MDString *str =
+ llvm::dyn_cast<llvm::MDString>(val->getOperand(0).get());
+ if (str->getString() == "reqd_work_group_size")
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ reqdWorkGroupSize[j] =
+ getMDOpAsConstInt(val->getOperand(j+1))->getZExtValue();
+ }
+ }
+ }
+ }
+}
+
+void Kernel::setArgument(unsigned int index, TypedValue value)
+{
+ assert(index < m_function->arg_size());
+
+ const llvm::Value *argument = getArgument(index);
+ unsigned int type = getArgumentAddressQualifier(index);
+ if (type == CL_KERNEL_ARG_ADDRESS_LOCAL)
+ {
+ // Deallocate existing argument
+ if (m_arguments.count(argument))
+ {
+ m_localMemory->deallocateBuffer(m_arguments[argument].getPointer());
+ delete[] m_arguments[argument].data;
+ }
+
+ // Allocate local memory buffer
+ TypedValue v = {
+ sizeof(size_t),
+ 1,
+ new unsigned char[sizeof(size_t)]
+ };
+ v.setPointer(m_localMemory->allocateBuffer(value.size));
+ m_arguments[argument] = v;
+ }
+ else
+ {
+ if (((const llvm::Argument*)argument)->hasByValAttr())
+ {
+ // Deallocate existing argument
+ if (m_arguments.count(argument))
+ {
+ m_privateMemory->deallocateBuffer(m_arguments[argument].getPointer());
+ delete[] m_arguments[argument].data;
+ }
+
+ TypedValue address =
+ {
+ sizeof(size_t),
+ 1,
+ new unsigned char[sizeof(size_t)]
+ };
+ size_t size = value.size*value.num;
+ address.setPointer(m_privateMemory->allocateBuffer(size));
+ m_privateMemory->store(value.data, address.getPointer(), size);
+ m_arguments[argument] = address;
+ }
+ else
+ {
+ // Deallocate existing argument
+ if (m_arguments.count(argument))
+ {
+ delete[] m_arguments[argument].data;
+ }
+
+ const llvm::Type *type = argument->getType();
+ if (type->isVectorTy())
+ {
+ value.num = type->getVectorNumElements();
+ value.size = getTypeSize(type->getVectorElementType());
+ }
+ m_arguments[argument] = value.clone();
+ }
+ }
+}
+
+TypedValueMap::const_iterator Kernel::args_begin() const
+{
+ return m_arguments.begin();
+}
+
+TypedValueMap::const_iterator Kernel::args_end() const
+{
+ return m_arguments.end();
+}
diff --git a/src/core/Kernel.h b/src/core/Kernel.h
new file mode 100644
index 0000000..cf94e90
--- /dev/null
+++ b/src/core/Kernel.h
@@ -0,0 +1,72 @@
+// Kernel.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm
+{
+ class Argument;
+ class Constant;
+ class Function;
+ class GlobalVariable;
+ class MDNode;
+ class Module;
+}
+
+namespace oclgrind
+{
+ class Memory;
+ class Program;
+
+ class Kernel
+ {
+ public:
+ Kernel(const Program *program,
+ const llvm::Function *function, const llvm::Module *module);
+ Kernel(const Kernel& kernel);
+ virtual ~Kernel();
+
+ TypedValueMap::const_iterator args_begin() const;
+ TypedValueMap::const_iterator args_end() const;
+ bool allArgumentsSet() const;
+ void allocateConstants(Memory *memory);
+ void deallocateConstants(Memory *memory);
+ size_t getArgumentSize(unsigned int index) const;
+ unsigned int getArgumentAccessQualifier(unsigned int index) const;
+ unsigned int getArgumentAddressQualifier(unsigned int index) const;
+ const llvm::StringRef getArgumentName(unsigned int index) const;
+ const llvm::StringRef getArgumentTypeName(unsigned int index) const;
+ unsigned int getArgumentTypeQualifier(unsigned int index) const;
+ std::string getAttributes() const;
+ const llvm::Function* getFunction() const;
+ const Memory* getLocalMemory() const;
+ size_t getLocalMemorySize() const;
+ const std::string& getName() const;
+ unsigned int getNumArguments() const;
+ const Memory* getPrivateMemory() const;
+ const Program* getProgram() const;
+ void getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const;
+ void setArgument(unsigned int index, TypedValue value);
+
+ private:
+ const Program *m_program;
+ const llvm::Function *m_function;
+ TypedValueMap m_arguments;
+ std::list<const llvm::GlobalVariable*> m_constants;
+ std::list<size_t> m_constantBuffers;
+ Memory *m_localMemory;
+ const llvm::MDNode *m_metadata;
+ std::string m_name;
+ Memory *m_privateMemory;
+
+ const llvm::Argument* getArgument(unsigned int index) const;
+ const llvm::MDNode* getArgumentMetadata(std::string name) const;
+ };
+}
diff --git a/src/core/KernelInvocation.cpp b/src/core/KernelInvocation.cpp
new file mode 100644
index 0000000..3d50031
--- /dev/null
+++ b/src/core/KernelInvocation.cpp
@@ -0,0 +1,355 @@
+// KernelInvocation.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#include <atomic>
+#include <sstream>
+#include <thread>
+
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+// TODO: Remove this when thread_local is fixed on OS X
+#ifdef __APPLE__
+#define THREAD_LOCAL __thread
+#elif defined(_WIN32) && !defined(__MINGW32__)
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL thread_local
+#endif
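+// Per worker-thread state: the work-group and work-item currently being
+// executed by this thread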
+struct
+{
+ WorkGroup *workGroup;
+ WorkItem *workItem;
+} static THREAD_LOCAL workerState;
+
+static atomic<unsigned> nextGroupIndex;
+
+KernelInvocation::KernelInvocation(const Context *context, const Kernel *kernel,
+ unsigned int workDim,
+ Size3 globalOffset,
+ Size3 globalSize,
+ Size3 localSize)
+ : m_context(context), m_kernel(kernel)
+{
+ m_workDim = workDim;
+ m_globalOffset = globalOffset;
+ m_globalSize = globalSize;
+ m_localSize = localSize;
+
+ m_numGroups.x = m_globalSize.x/m_localSize.x;
+ m_numGroups.y = m_globalSize.y/m_localSize.y;
+ m_numGroups.z = m_globalSize.z/m_localSize.z;
+
+ // Check for user overriding number of threads
+ m_numWorkers = 0;
+ const char *numThreads = getenv("OCLGRIND_NUM_THREADS");
+ if (numThreads)
+ {
+ char *next;
+ m_numWorkers = strtoul(numThreads, &next, 10);
+ if (strlen(next))
+ {
+ cerr << "Oclgrind: Invalid value for OCLGRIND_NUM_THREADS" << endl;
+ }
+ }
+ else
+ {
+ m_numWorkers = thread::hardware_concurrency();
+ }
+ if (!m_numWorkers || !m_context->isThreadSafe())
+ m_numWorkers = 1;
+
+ // Check for quick-mode environment variable
+ if (checkEnv("OCLGRIND_QUICK"))
+ {
+ // Only run first and last work-groups in quick-mode
+ Size3 firstGroup(0, 0, 0);
+ Size3 lastGroup(m_numGroups.x-1, m_numGroups.y-1, m_numGroups.z-1);
+ m_workGroups.push_back(firstGroup);
+ m_workGroups.push_back(lastGroup);
+ }
+ else
+ {
+ for (size_t k = 0; k < m_numGroups.z; k++)
+ {
+ for (size_t j = 0; j < m_numGroups.y; j++)
+ {
+ for (size_t i = 0; i < m_numGroups.x; i++)
+ {
+ m_workGroups.push_back(Size3(i, j, k));
+ }
+ }
+ }
+ }
+}
+
+KernelInvocation::~KernelInvocation()
+{
+ // Destroy any remaining work-groups
+ while (!m_runningGroups.empty())
+ {
+ delete m_runningGroups.front();
+ m_runningGroups.pop_front();
+ }
+}
+
+const Context* KernelInvocation::getContext() const
+{
+ return m_context;
+}
+
+const WorkGroup* KernelInvocation::getCurrentWorkGroup() const
+{
+ return workerState.workGroup;
+}
+
+const WorkItem* KernelInvocation::getCurrentWorkItem() const
+{
+ return workerState.workItem;
+}
+
+Size3 KernelInvocation::getGlobalOffset() const
+{
+ return m_globalOffset;
+}
+
+Size3 KernelInvocation::getGlobalSize() const
+{
+ return m_globalSize;
+}
+
+const Kernel* KernelInvocation::getKernel() const
+{
+ return m_kernel;
+}
+
+Size3 KernelInvocation::getLocalSize() const
+{
+ return m_localSize;
+}
+
+Size3 KernelInvocation::getNumGroups() const
+{
+ return m_numGroups;
+}
+
+size_t KernelInvocation::getWorkDim() const
+{
+ return m_workDim;
+}
+
+void KernelInvocation::run(const Context *context, Kernel *kernel,
+ unsigned int workDim,
+ Size3 globalOffset,
+ Size3 globalSize,
+ Size3 localSize)
+{
+ try
+ {
+ // Allocate and initialise constant memory
+ kernel->allocateConstants(context->getGlobalMemory());
+ }
+ catch (FatalError& err)
+ {
+ ostringstream info;
+ info << endl << "OCLGRIND FATAL ERROR "
+ << "(" << err.getFile() << ":" << err.getLine() << ")"
+ << endl << err.what()
+ << endl << "When allocating kernel constants for '"
+ << kernel->getName() << "'";
+ context->logError(info.str().c_str());
+ return;
+ }
+
+ // Create kernel invocation
+ KernelInvocation *ki = new KernelInvocation(context, kernel, workDim,
+ globalOffset,
+ globalSize,
+ localSize);
+
+ // Run kernel
+ context->notifyKernelBegin(ki);
+ ki->run();
+ context->notifyKernelEnd(ki);
+
+ delete ki;
+
+ // Deallocate constant memory
+ kernel->deallocateConstants(context->getGlobalMemory());
+}
+
+void KernelInvocation::run()
+{
+ nextGroupIndex = 0;
+
+ // Create worker threads
+ // TODO: Run in main thread if only 1 worker
+ vector<thread> threads;
+ for (unsigned i = 0; i < m_numWorkers; i++)
+ {
+ threads.push_back(thread(&KernelInvocation::runWorker, this));
+ }
+
+ // Wait for workers to complete
+ for (unsigned i = 0; i < m_numWorkers; i++)
+ {
+ threads[i].join();
+ }
+}
+
+void KernelInvocation::runWorker()
+{
+ workerState.workGroup = NULL;
+ workerState.workItem = NULL;
+ try
+ {
+ while (true)
+ {
+ // Move to next work-group
+ if (!m_runningGroups.empty())
+ {
+ // Take next work-group from running pool
+ workerState.workGroup = m_runningGroups.front();
+ m_runningGroups.pop_front();
+ }
+ else
+ {
+ // Take next work-group from pending pool
+ unsigned index = nextGroupIndex++;
+ if (index >= m_workGroups.size())
+ // No more work to do
+ break;
+
+ workerState.workGroup = new WorkGroup(this, m_workGroups[index]);
+ m_context->notifyWorkGroupBegin(workerState.workGroup);
+ }
+
+ // Execute work-group
+ workerState.workItem = workerState.workGroup->getNextWorkItem();
+ while (workerState.workItem)
+ {
+ // Run work-item until complete or at barrier
+ while (workerState.workItem->getState() == WorkItem::READY)
+ {
+ workerState.workItem->step();
+ }
+
+ // Move to next work-item
+ workerState.workItem = workerState.workGroup->getNextWorkItem();
+ if (workerState.workItem)
+ continue;
+
+ // No more work-items in READY state
+ // Check if there are work-items at a barrier
+ if (workerState.workGroup->hasBarrier())
+ {
+ // Resume execution
+ workerState.workGroup->clearBarrier();
+ workerState.workItem = workerState.workGroup->getNextWorkItem();
+ }
+ }
+
+ // Work-group has finished
+ m_context->notifyWorkGroupComplete(workerState.workGroup);
+ delete workerState.workGroup;
+ workerState.workGroup = NULL;
+ }
+ }
+ catch (FatalError& err)
+ {
+ ostringstream info;
+ info << endl << "OCLGRIND FATAL ERROR "
+ << "(" << err.getFile() << ":" << err.getLine() << ")"
+ << endl << err.what();
+ m_context->logError(info.str().c_str());
+
+ if (workerState.workGroup)
+ delete workerState.workGroup;
+ }
+}
+
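+// Switch single-threaded execution to the work-item with the given global ID,
+// retrieving its work-group from the running or pending pools as necessary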
+bool KernelInvocation::switchWorkItem(const Size3 gid)
+{
+ assert(m_numWorkers == 1);
+
+ // Compute work-group ID
+ Size3 group(gid.x/m_localSize.x, gid.y/m_localSize.y, gid.z/m_localSize.z);
+
+ bool found = false;
+ WorkGroup *previousWorkGroup = workerState.workGroup;
+
+ // Check if we're already running the work-group
+ if (group == previousWorkGroup->getGroupID())
+ {
+ found = true;
+ }
+
+ // Check if work-group is in running pool
+ if (!found)
+ {
+ std::list<WorkGroup*>::iterator rItr;
+ for (rItr = m_runningGroups.begin(); rItr != m_runningGroups.end(); rItr++)
+ {
+ if (group == (*rItr)->getGroupID())
+ {
+ workerState.workGroup = *rItr;
+ m_runningGroups.erase(rItr);
+ found = true;
+ break;
+ }
+ }
+ }
+
+ // Check if work-group is in pending pool
+ if (!found)
+ {
+ std::vector<Size3>::iterator pItr;
+ for (pItr = m_workGroups.begin()+nextGroupIndex;
+ pItr != m_workGroups.end(); pItr++)
+ {
+ if (group == *pItr)
+ {
+ workerState.workGroup = new WorkGroup(this, group);
+ found = true;
+
+ // Re-order list of groups accordingly
+ // Safe since this is not in a multi-threaded context
+ m_workGroups.erase(pItr);
+ m_workGroups.insert(m_workGroups.begin()+nextGroupIndex, group);
+ nextGroupIndex++;
+
+ break;
+ }
+ }
+ }
+
+ if (!found)
+ {
+ return false;
+ }
+
+ if (previousWorkGroup != workerState.workGroup)
+ {
+ m_runningGroups.push_back(previousWorkGroup);
+ }
+
+ // Get work-item
+ Size3 lid(gid.x%m_localSize.x, gid.y%m_localSize.y, gid.z%m_localSize.z);
+ workerState.workItem = workerState.workGroup->getWorkItem(lid);
+
+ return true;
+}
diff --git a/src/core/KernelInvocation.h b/src/core/KernelInvocation.h
new file mode 100644
index 0000000..4f02447
--- /dev/null
+++ b/src/core/KernelInvocation.h
@@ -0,0 +1,64 @@
+// KernelInvocation.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+ class WorkGroup;
+ class WorkItem;
+
+ class KernelInvocation
+ {
+ public:
+ static void run(const Context *context, Kernel *kernel,
+ unsigned int workDim,
+ Size3 globalOffset,
+ Size3 globalSize,
+ Size3 localSize);
+
+ const Context* getContext() const;
+ const WorkGroup* getCurrentWorkGroup() const;
+ const WorkItem* getCurrentWorkItem() const;
+ Size3 getGlobalOffset() const;
+ Size3 getGlobalSize() const;
+ Size3 getLocalSize() const;
+ const Kernel* getKernel() const;
+ Size3 getNumGroups() const;
+ size_t getWorkDim() const;
+ bool switchWorkItem(const Size3 gid);
+
+ private:
+ KernelInvocation(const Context *context, const Kernel *kernel,
+ unsigned int workDim,
+ Size3 globalOffset,
+ Size3 globalSize,
+ Size3 localSize);
+ virtual ~KernelInvocation();
+ void run();
+
+ // Kernel launch parameters
+ const Context *m_context;
+ const Kernel *m_kernel;
+ size_t m_workDim;
+ Size3 m_globalOffset;
+ Size3 m_globalSize;
+ Size3 m_localSize;
+ Size3 m_numGroups;
+
+ // Current execution state
+ std::vector<Size3> m_workGroups;
+ std::list<WorkGroup*> m_runningGroups;
+
+ // Worker threads
+ void runWorker();
+ unsigned m_numWorkers;
+ };
+}
diff --git a/src/core/Memory.cpp b/src/core/Memory.cpp
new file mode 100644
index 0000000..cd33bc4
--- /dev/null
+++ b/src/core/Memory.cpp
@@ -0,0 +1,464 @@
+// Memory.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <mutex>
+
+#include "Context.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+// Multiple mutexes to mitigate risk of unnecessary synchronisation in atomics
+#define NUM_ATOMIC_MUTEXES 64 // Must be a power of two
+mutex atomicMutex[NUM_ATOMIC_MUTEXES];
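+// Map the 4-byte word containing the given buffer offset to one of the mutexes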
+#define ATOMIC_MUTEX(offset) \
+ atomicMutex[(((offset)>>2) & (NUM_ATOMIC_MUTEXES-1))]
+
+Memory::Memory(unsigned int addrSpace, const Context *context)
+{
+ m_context = context;
+ m_addressSpace = addrSpace;
+
+ clear();
+}
+
+Memory::~Memory()
+{
+ clear();
+}
+
+size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags)
+{
+ // Check requested size doesn't exceed maximum
+ if (size > MAX_BUFFER_SIZE)
+ {
+ return 0;
+ }
+
+ // Find first unallocated buffer slot
+ unsigned b = getNextBuffer();
+ if (b >= MAX_NUM_BUFFERS)
+ {
+ return 0;
+ }
+
+ // Create buffer
+ Buffer *buffer = new Buffer;
+ buffer->size = size;
+ buffer->flags = flags;
+ buffer->data = new unsigned char[size];
+
+ // Initialize contents to 0
+ memset(buffer->data, 0, size);
+
+ if (b >= m_memory.size())
+ {
+ m_memory.push_back(buffer);
+ }
+ else
+ {
+ m_memory[b] = buffer;
+ }
+
+ m_totalAllocated += size;
+
+ size_t address = ((size_t)b) << NUM_ADDRESS_BITS;
+
+ m_context->notifyMemoryAllocated(this, address, size, flags);
+
+ return address;
+}
+
+uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
+{
+ m_context->notifyMemoryAtomicLoad(this, op, address, 4);
+ m_context->notifyMemoryAtomicStore(this, op, address, 4);
+
+ // Bounds check
+ if (!isAddressValid(address, 4))
+ {
+ return 0;
+ }
+
+ // Get buffer
+ size_t offset = EXTRACT_OFFSET(address);
+ Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+ uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+
+ if (m_addressSpace == AddrSpaceGlobal)
+ ATOMIC_MUTEX(offset).lock();
+
+ uint32_t old = *ptr;
+ switch(op)
+ {
+ case AtomicAdd:
+ *ptr = old + value;
+ break;
+ case AtomicAnd:
+ *ptr = old & value;
+ break;
+ case AtomicCmpXchg:
+ FATAL_ERROR("AtomicCmpXchg in generic atomic handler");
+ break;
+ case AtomicDec:
+ *ptr = old - 1;
+ break;
+ case AtomicInc:
+ *ptr = old + 1;
+ break;
+ case AtomicMax:
+ *ptr = old > value ? old : value;
+ break;
+ case AtomicMin:
+ *ptr = old < value ? old : value;
+ break;
+ case AtomicOr:
+ *ptr = old | value;
+ break;
+ case AtomicSub:
+ *ptr = old - value;
+ break;
+ case AtomicXchg:
+ *ptr = value;
+ break;
+ case AtomicXor:
+ *ptr = old ^ value;
+ break;
+ }
+
+ if (m_addressSpace == AddrSpaceGlobal)
+ ATOMIC_MUTEX(offset).unlock();
+
+ return old;
+}
+
+uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value)
+{
+ m_context->notifyMemoryAtomicLoad(this, AtomicCmpXchg, address, 4);
+
+ // Bounds check
+ if (!isAddressValid(address, 4))
+ {
+ return 0;
+ }
+
+ // Get buffer
+ size_t offset = EXTRACT_OFFSET(address);
+ Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+ uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+
+ if (m_addressSpace == AddrSpaceGlobal)
+ ATOMIC_MUTEX(offset).lock();
+
+ // Perform cmpxchg
+ uint32_t old = *ptr;
+ if (old == cmp)
+ {
+ *ptr = value;
+
+ m_context->notifyMemoryAtomicStore(this, AtomicCmpXchg, address, 4);
+ }
+
+ if (m_addressSpace == AddrSpaceGlobal)
+ ATOMIC_MUTEX(offset).unlock();
+
+ return old;
+}
+
+void Memory::clear()
+{
+ vector<Buffer*>::iterator itr;
+ for (itr = m_memory.begin(); itr != m_memory.end(); itr++)
+ {
+ if (*itr)
+ {
+ if (!((*itr)->flags & CL_MEM_USE_HOST_PTR))
+ {
+ delete[] (*itr)->data;
+ }
+ delete *itr;
+
+ size_t address = (itr-m_memory.begin())<<NUM_ADDRESS_BITS;
+ m_context->notifyMemoryDeallocated(this, address);
+ }
+ }
+ m_memory.resize(1);
+ m_memory[0] = NULL;
+ m_freeBuffers = queue<unsigned>();
+ m_totalAllocated = 0;
+}
+
+Memory* Memory::clone() const
+{
+ Memory *mem = new Memory(m_addressSpace, m_context);
+
+ // Clone buffers
+ mem->m_memory.resize(m_memory.size());
+ mem->m_memory[0] = NULL;
+ for (unsigned i = 1; i < m_memory.size(); i++)
+ {
+ Buffer *src = m_memory[i];
+ Buffer *dst = new Buffer;
+ dst->size = src->size;
+ dst->flags = src->flags;
+ dst->data = (src->flags & CL_MEM_USE_HOST_PTR) ?
+ src->data : new unsigned char[src->size];
+ memcpy(dst->data, src->data, src->size);
+ mem->m_memory[i] = dst;
+ m_context->notifyMemoryAllocated(mem, ((size_t)i<<NUM_ADDRESS_BITS),
+ src->size, src->flags);
+ }
+
+ // Clone state
+ mem->m_freeBuffers = m_freeBuffers;
+ mem->m_totalAllocated = m_totalAllocated;
+
+ return mem;
+}
+
+size_t Memory::createHostBuffer(size_t size, void *ptr, cl_mem_flags flags)
+{
+ // Check requested size doesn't exceed maximum
+ if (size > MAX_BUFFER_SIZE)
+ {
+ return 0;
+ }
+
+ // Find first unallocated buffer slot
+ unsigned b = getNextBuffer();
+ if (b >= MAX_NUM_BUFFERS)
+ {
+ return 0;
+ }
+
+ // Create buffer
+ Buffer *buffer = new Buffer;
+ buffer->size = size;
+ buffer->flags = flags;
+ buffer->data = (unsigned char*)ptr;
+
+ if (b >= m_memory.size())
+ {
+ m_memory.push_back(buffer);
+ }
+ else
+ {
+ m_memory[b] = buffer;
+ }
+
+ m_totalAllocated += size;
+
+ size_t address = ((size_t)b) << NUM_ADDRESS_BITS;
+
+ m_context->notifyMemoryAllocated(this, address, size, flags);
+
+ return address;
+}
+
+bool Memory::copy(size_t dst, size_t src, size_t size)
+{
+ m_context->notifyMemoryLoad(this, src, size);
+
+ // Check source address
+ if (!isAddressValid(src, size))
+ {
+ return false;
+ }
+ size_t src_offset = EXTRACT_OFFSET(src);
+ Buffer *src_buffer = m_memory.at(EXTRACT_BUFFER(src));
+
+
+ m_context->notifyMemoryStore(this, dst, size, src_buffer->data + src_offset);
+
+ // Check destination address
+ if (!isAddressValid(dst, size))
+ {
+ return false;
+ }
+ size_t dst_offset = EXTRACT_OFFSET(dst);
+ Buffer *dst_buffer = m_memory.at(EXTRACT_BUFFER(dst));
+
+
+ // Copy data
+ memcpy(dst_buffer->data + dst_offset,
+ src_buffer->data + src_offset,
+ size);
+
+ return true;
+}
+
+void Memory::deallocateBuffer(size_t address)
+{
+ unsigned buffer = EXTRACT_BUFFER(address);
+ assert(buffer < m_memory.size() && m_memory[buffer]);
+
+ if (!(m_memory[buffer]->flags & CL_MEM_USE_HOST_PTR))
+ {
+ delete[] m_memory[buffer]->data;
+ }
+
+ m_totalAllocated -= m_memory[buffer]->size;
+ m_freeBuffers.push(buffer);
+
+ delete m_memory[buffer];
+ m_memory[buffer] = NULL;
+
+ m_context->notifyMemoryDeallocated(this, address);
+}
+
+void Memory::dump() const
+{
+ for (unsigned b = 1; b < m_memory.size(); b++)
+ {
+ if (!m_memory[b] || !m_memory[b]->data)
+ {
+ continue;
+ }
+
+ for (unsigned i = 0; i < m_memory[b]->size; i++)
+ {
+ if (i%4 == 0)
+ {
+ cout << endl << hex << uppercase
+ << setw(16) << setfill(' ') << right
+ << ((((size_t)b)<<NUM_ADDRESS_BITS) | i) << ":";
+ }
+ cout << " " << hex << uppercase << setw(2) << setfill('0')
+ << (int)m_memory[b]->data[i];
+ }
+ }
+ cout << endl;
+}
+
+unsigned int Memory::getAddressSpace() const
+{
+ return m_addressSpace;
+}
+
+const Memory::Buffer* Memory::getBuffer(size_t address) const
+{
+ size_t buf = EXTRACT_BUFFER(address);
+ if (buf == 0 || buf >= m_memory.size() || !m_memory[buf] || !m_memory[buf]->data)
+ {
+ return NULL;
+ }
+
+ return m_memory[buf];
+}
+
+size_t Memory::getMaxAllocSize()
+{
+ return MAX_BUFFER_SIZE;
+}
+
+unsigned Memory::getNextBuffer()
+{
+ if (m_freeBuffers.empty())
+ {
+ return m_memory.size();
+ }
+ else
+ {
+ unsigned b = m_freeBuffers.front();
+ m_freeBuffers.pop();
+ return b;
+ }
+}
+
+void* Memory::getPointer(size_t address) const
+{
+ size_t buffer = EXTRACT_BUFFER(address);
+
+ // Bounds check
+ if (!isAddressValid(address))
+ {
+ return NULL;
+ }
+
+ return m_memory[buffer]->data + EXTRACT_OFFSET(address);
+}
+
+size_t Memory::getTotalAllocated() const
+{
+ return m_totalAllocated;
+}
+
+bool Memory::isAddressValid(size_t address, size_t size) const
+{
+ size_t buffer = EXTRACT_BUFFER(address);
+ size_t offset = EXTRACT_OFFSET(address);
+ if (buffer == 0 ||
+ buffer >= m_memory.size() ||
+ !m_memory[buffer] ||
+ offset+size > m_memory[buffer]->size)
+ {
+ return false;
+ }
+ return true;
+}
+
+bool Memory::load(unsigned char *dest, size_t address, size_t size) const
+{
+ m_context->notifyMemoryLoad(this, address, size);
+
+ // Bounds check
+ if (!isAddressValid(address, size))
+ {
+ return false;
+ }
+
+ // Get buffer
+ size_t offset = EXTRACT_OFFSET(address);
+ Buffer *src = m_memory[EXTRACT_BUFFER(address)];
+
+ // Load data
+ memcpy(dest, src->data + offset, size);
+
+ return true;
+}
+
+unsigned char* Memory::mapBuffer(size_t address, size_t offset, size_t size)
+{
+ size_t buffer = EXTRACT_BUFFER(address);
+
+ // Bounds check
+ if (!isAddressValid(address, size))
+ {
+ return NULL;
+ }
+
+ return m_memory[buffer]->data + offset + EXTRACT_OFFSET(address);
+}
+
+bool Memory::store(const unsigned char *source, size_t address, size_t size)
+{
+ m_context->notifyMemoryStore(this, address, size, source);
+
+ // Bounds check
+ if (!isAddressValid(address, size))
+ {
+ return false;
+ }
+
+ // Get buffer
+ size_t offset = EXTRACT_OFFSET(address);
+ Buffer *dst = m_memory[EXTRACT_BUFFER(address)];
+
+ // Store data
+ memcpy(dst->data + offset, source, size);
+
+ return true;
+}
diff --git a/src/core/Memory.h b/src/core/Memory.h
new file mode 100644
index 0000000..42eb63a
--- /dev/null
+++ b/src/core/Memory.h
@@ -0,0 +1,68 @@
+// Memory.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
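+// A flat address encodes a buffer index in its top NUM_BUFFER_BITS bits and
+// the byte offset within that buffer in the remaining NUM_ADDRESS_BITS bits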
+#define NUM_BUFFER_BITS ( (sizeof(size_t)==4) ? 8 : 16)
+#define MAX_NUM_BUFFERS ((size_t)1 << NUM_BUFFER_BITS)
+#define NUM_ADDRESS_BITS ((sizeof(size_t)<<3) - NUM_BUFFER_BITS)
+#define MAX_BUFFER_SIZE ((size_t)1 << NUM_ADDRESS_BITS)
+
+#define EXTRACT_BUFFER(address) \
+ (address >> NUM_ADDRESS_BITS)
+#define EXTRACT_OFFSET(address) \
+ (address & (((size_t)-1) >> NUM_BUFFER_BITS))
+
+namespace oclgrind
+{
+ class Context;
+
+ class Memory
+ {
+ public:
+ typedef struct
+ {
+ size_t size;
+ cl_mem_flags flags;
+ unsigned char *data;
+ } Buffer;
+
+ public:
+ Memory(unsigned int addrSpace, const Context *context);
+ virtual ~Memory();
+
+ size_t allocateBuffer(size_t size, cl_mem_flags flags=0);
+ uint32_t atomic(AtomicOp op, size_t address, uint32_t value = 0);
+ uint32_t atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value);
+ void clear();
+ Memory *clone() const;
+ size_t createHostBuffer(size_t size, void *ptr, cl_mem_flags flags=0);
+ bool copy(size_t dest, size_t src, size_t size);
+ void deallocateBuffer(size_t address);
+ void dump() const;
+ unsigned int getAddressSpace() const;
+ const Buffer* getBuffer(size_t address) const;
+ void* getPointer(size_t address) const;
+ size_t getTotalAllocated() const;
+ bool isAddressValid(size_t address, size_t size=1) const;
+ bool load(unsigned char *dst, size_t address, size_t size=1) const;
+ unsigned char* mapBuffer(size_t address, size_t offset, size_t size);
+ bool store(const unsigned char *source, size_t address, size_t size=1);
+
+ static size_t getMaxAllocSize();
+
+ private:
+ const Context *m_context;
+ std::queue<unsigned> m_freeBuffers;
+ std::vector<Buffer*> m_memory;
+ unsigned int m_addressSpace;
+ size_t m_totalAllocated;
+
+ unsigned getNextBuffer();
+ };
+}
diff --git a/src/core/Plugin.cpp b/src/core/Plugin.cpp
new file mode 100644
index 0000000..8880f2d
--- /dev/null
+++ b/src/core/Plugin.cpp
@@ -0,0 +1,25 @@
+// Plugin.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "Plugin.h"
+
+using namespace oclgrind;
+
+Plugin::Plugin(const Context *context)
+ : m_context(context)
+{
+}
+
+Plugin::~Plugin()
+{
+}
+
+bool Plugin::isThreadSafe() const
+{
+ return true;
+}
diff --git a/src/core/Plugin.h b/src/core/Plugin.h
new file mode 100644
index 0000000..d4a8ea7
--- /dev/null
+++ b/src/core/Plugin.h
@@ -0,0 +1,69 @@
+// Plugin.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#pragma once
+
+#include "common.h"
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+ class KernelInvocation;
+ class Memory;
+ class WorkGroup;
+ class WorkItem;
+
+ class Plugin
+ {
+ public:
+ Plugin(const Context *context);
+ virtual ~Plugin();
+
+ virtual void hostMemoryLoad(const Memory *memory,
+ size_t address, size_t size){}
+ virtual void hostMemoryStore(const Memory *memory,
+ size_t address, size_t size,
+ const uint8_t *storeData){}
+ virtual void instructionExecuted(const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ const TypedValue& result){}
+ virtual void kernelBegin(const KernelInvocation *kernelInvocation){}
+ virtual void kernelEnd(const KernelInvocation *kernelInvocation){}
+ virtual void log(MessageType type, const char *message){}
+ virtual void memoryAllocated(const Memory *memory, size_t address,
+ size_t size, cl_mem_flags flags){}
+ virtual void memoryAtomicLoad(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size){}
+ virtual void memoryAtomicStore(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size){}
+ virtual void memoryDeallocated(const Memory *memory, size_t address){}
+ virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size){}
+ virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size){}
+ virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size,
+ const uint8_t *storeData){}
+ virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData){}
+ virtual void workGroupBarrier(const WorkGroup *workGroup, uint32_t flags){}
+ virtual void workGroupBegin(const WorkGroup *workGroup){}
+ virtual void workGroupComplete(const WorkGroup *workGroup){}
+ virtual void workItemBegin(const WorkItem *workItem){}
+ virtual void workItemComplete(const WorkItem *workItem){}
+
+ virtual bool isThreadSafe() const;
+
+ protected:
+ const Context *m_context;
+ };
+}
diff --git a/src/core/Program.cpp b/src/core/Program.cpp
new file mode 100644
index 0000000..31fdc5b
--- /dev/null
+++ b/src/core/Program.cpp
@@ -0,0 +1,728 @@
+// Program.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <fstream>
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "clang/CodeGen/CodeGenAction.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+
+#include "Kernel.h"
+#include "Program.h"
+#include "WorkItem.h"
+
+#define ENV_DUMP_SPIR "OCLGRIND_DUMP_SPIR"
+#define CL_DUMP_NAME "/tmp/oclgrind_%lX.cl"
+#define IR_DUMP_NAME "/tmp/oclgrind_%lX.s"
+#define BC_DUMP_NAME "/tmp/oclgrind_%lX.bc"
+
+#if defined(_WIN32)
+#define REMAP_DIR "Z:/remapped/"
+#else
+#define REMAP_DIR "/remapped/"
+#endif
+
+#define REMAP_INPUT "input.cl"
+#define CLC_H_PATH REMAP_DIR"clc.h"
+extern const char CLC_H_DATA[];
+
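+// Extensions defined as preprocessor macros when building programs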
+const char *EXTENSIONS[] =
+{
+ "cl_khr_fp64",
+ "cl_khr_3d_image_writes",
+ "cl_khr_global_int32_base_atomics",
+ "cl_khr_global_int32_extended_atomics",
+ "cl_khr_local_int32_base_atomics",
+ "cl_khr_local_int32_extended_atomics",
+ "cl_khr_byte_addressable_store",
+};
+
+using namespace oclgrind;
+using namespace std;
+
+Program::Program(const Context *context, llvm::Module *module)
+ : m_module(module), m_context(context)
+{
+ m_buildLog = "";
+ m_buildOptions = "";
+ m_buildStatus = CL_BUILD_SUCCESS;
+ m_uid = generateUID();
+}
+
+Program::Program(const Context *context, const string& source)
+ : m_context(context)
+{
+ m_source = source;
+ m_buildLog = "";
+ m_buildOptions = "";
+ m_buildStatus = CL_BUILD_NONE;
+ m_uid = 0;
+
+ // Split source into individual lines
+ m_sourceLines.clear();
+ if (!source.empty())
+ {
+ std::stringstream ss(source);
+ std::string line;
+ while(std::getline(ss, line, '\n'))
+ {
+ m_sourceLines.push_back(line);
+ }
+ }
+}
+
+Program::~Program()
+{
+ clearInterpreterCache();
+}
+
+bool Program::build(const char *options, list<Header> headers)
+{
+ m_buildStatus = CL_BUILD_IN_PROGRESS;
+ m_buildOptions = options ? options : "";
+
+ // Create build log
+ m_buildLog = "";
+ llvm::raw_string_ostream buildLog(m_buildLog);
+
+ // Do nothing if program was created with binary
+ if (m_source.empty() && m_module)
+ {
+ m_buildStatus = CL_BUILD_SUCCESS;
+ return true;
+ }
+
+ if (m_module)
+ {
+ clearInterpreterCache();
+ m_module.reset();
+ }
+
+ // Assign a new UID to this program
+ m_uid = generateUID();
+
+ // Set compiler arguments
+ vector<const char*> args;
+ args.push_back("-cl-std=CL1.2");
+ args.push_back("-cl-kernel-arg-info");
+ args.push_back("-fno-builtin");
+ args.push_back("-g");
+ args.push_back("-triple");
+ if (sizeof(size_t) == 4)
+ args.push_back("spir-unknown-unknown");
+ else
+ args.push_back("spir64-unknown-unknown");
+
+ // Define extensions
+ for (unsigned i = 0; i < sizeof(EXTENSIONS)/sizeof(const char*); i++)
+ {
+ args.push_back("-D");
+ args.push_back(EXTENSIONS[i]);
+ }
+
+ // Disable Clang's optimizations.
+ // We will manually run optimization passes and legalize the IR later.
+ args.push_back("-O0");
+
+ bool optimize = true;
+ bool cl12 = true;
+
+ // Add OpenCL build options
+ const char *mainOptions = options;
+ const char *extraOptions = getenv("OCLGRIND_BUILD_OPTIONS");
+ if (!mainOptions)
+ mainOptions = "";
+ if (!extraOptions)
+ extraOptions = "";
+ char *tmpOptions = new char[strlen(mainOptions) + strlen(extraOptions) + 2];
+ sprintf(tmpOptions, "%s %s", mainOptions, extraOptions);
+ for (char *opt = strtok(tmpOptions, " "); opt; opt = strtok(NULL, " "))
+ {
+ // Ignore options that break PCH
+ if (strcmp(opt, "-cl-fast-relaxed-math") != 0 &&
+ strcmp(opt, "-cl-finite-math-only") != 0 &&
+ strcmp(opt, "-cl-single-precision-constant") != 0)
+ {
+ // Check for optimization flags
+ if (strcmp(opt, "-O0") == 0 || strcmp(opt, "-cl-opt-disable") == 0)
+ {
+ optimize = false;
+ continue;
+ }
+ else if (strncmp(opt, "-O", 2) == 0)
+ {
+ optimize = true;
+ continue;
+ }
+
+ // Check for -cl-std flag
+ if (strncmp(opt, "-cl-std=", 8) == 0)
+ {
+ if (strcmp(opt+8, "CL1.2") != 0)
+ {
+ cl12 = false;
+ args.push_back(opt);
+ }
+ continue;
+ }
+
+ args.push_back(opt);
+ }
+ }
+
+ if (cl12)
+ {
+ args.push_back("-cl-std=CL1.2");
+ }
+
+ // Pre-compiled header
+ char *pchdir = NULL;
+ char *pch = NULL;
+ if (!checkEnv("OCLGRIND_DISABLE_PCH") && cl12)
+ {
+ const char *pchdirOverride = getenv("OCLGRIND_PCH_DIR");
+ if (pchdirOverride)
+ {
+ // Copy the override path (allocated with new[] to match the delete[] below)
+ pchdir = new char[strlen(pchdirOverride) + 1];
+ strcpy(pchdir, pchdirOverride);
+ }
+ else
+ {
+ // Get directory containing library
+#if defined(_WIN32) && !defined(__MINGW32__)
+ char libpath[4096];
+ HMODULE dll;
+ if (GetModuleHandleEx(
+ GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+ GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+ (LPCSTR)&Program::createFromBitcode, &dll) &&
+ GetModuleFileName(dll, libpath, sizeof(libpath)))
+ {
+#else
+ Dl_info dlinfo;
+ if (dladdr((const void*)Program::createFromBitcode, &dlinfo))
+ {
+ const char *libpath = dlinfo.dli_fname;
+#endif
+
+ // Construct path to PCH directory
+ const char *dirend;
+#if defined(_WIN32) && !defined(__MINGW32__)
+ if ((dirend = strrchr(libpath, '\\')))
+#else
+ if ((dirend = strrchr(libpath, '/')))
+#endif
+ {
+ const char *includes_relative = "/../include/oclgrind/";
+ size_t length = dirend - libpath;
+ pchdir = new char[length + strlen(includes_relative) + 1];
+ strncpy(pchdir, libpath, length);
+ strcpy(pchdir + length, includes_relative);
+ }
+ }
+ }
+
+ if (pchdir)
+ {
+ // Select precompiled header
+ pch = new char[strlen(pchdir) + 20];
+ sprintf(pch, "%s/clc%d.pch", pchdir, (sizeof(size_t) == 4 ? 32 : 64));
+
+ // Check if precompiled header exists
+ ifstream pchfile(pch);
+ if (!pchfile.good())
+ {
+ buildLog << "WARNING: Unable to find precompiled header:\n"
+ << pch << "\n";
+ delete[] pch;
+ pch = NULL;
+ }
+ pchfile.close();
+ }
+ else
+ {
+ buildLog << "WARNING: Unable to determine precompiled header path\n";
+ }
+ }
+
+ if (pch)
+ {
+ args.push_back("-isysroot");
+ args.push_back(pchdir);
+
+ args.push_back("-include-pch");
+ args.push_back(pch);
+ }
+ else
+ {
+ // Fall back to embedded clc.h
+ args.push_back("-include");
+ args.push_back(CLC_H_PATH);
+ }
+
+ // Append input file to arguments (remapped later)
+ args.push_back(REMAP_INPUT);
+
+ // Create diagnostics engine
+ clang::DiagnosticOptions *diagOpts = new clang::DiagnosticOptions();
+ llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID(
+ new clang::DiagnosticIDs());
+ clang::TextDiagnosticPrinter *diagConsumer =
+ new clang::TextDiagnosticPrinter(buildLog, diagOpts);
+ clang::DiagnosticsEngine diags(diagID, diagOpts, diagConsumer);
+
+ // Create compiler instance
+ clang::CompilerInstance compiler;
+ compiler.createDiagnostics(diagConsumer, false);
+
+ // Create compiler invocation
+ clang::CompilerInvocation *invocation = new clang::CompilerInvocation;
+ clang::CompilerInvocation::CreateFromArgs(*invocation,
+ &args[0], &args[0] + args.size(),
+ compiler.getDiagnostics());
+ compiler.setInvocation(invocation);
+
+ // Remap include files
+ std::unique_ptr<llvm::MemoryBuffer> buffer;
+ compiler.getHeaderSearchOpts().AddPath(REMAP_DIR, clang::frontend::Quoted,
+ false, true);
+ list<Header>::iterator itr;
+ for (itr = headers.begin(); itr != headers.end(); itr++)
+ {
+ buffer = llvm::MemoryBuffer::getMemBuffer(itr->second->m_source, "", false);
+ compiler.getPreprocessorOpts().addRemappedFile(REMAP_DIR + itr->first,
+ buffer.release());
+ }
+
+ // Remap clc.h
+ buffer = llvm::MemoryBuffer::getMemBuffer(CLC_H_DATA, "", false);
+ compiler.getPreprocessorOpts().addRemappedFile(CLC_H_PATH, buffer.release());
+
+ // Remap input file
+ buffer = llvm::MemoryBuffer::getMemBuffer(m_source, "", false);
+ compiler.getPreprocessorOpts().addRemappedFile(REMAP_INPUT, buffer.release());
+
+ // Compile
+ llvm::LLVMContext& context = llvm::getGlobalContext();
+ clang::EmitLLVMOnlyAction action(&context);
+ if (compiler.ExecuteAction(action))
+ {
+ // Retrieve module
+ m_module = action.takeModule();
+
+ // Strip debug intrinsics if not in interactive mode
+ if (!checkEnv("OCLGRIND_INTERACTIVE"))
+ {
+ stripDebugIntrinsics();
+ }
+
+ // Initialize pass managers
+ llvm::legacy::PassManager modulePasses;
+ llvm::legacy::FunctionPassManager functionPasses(m_module.get());
+#if LLVM_VERSION < 37
+ modulePasses.add(new llvm::DataLayoutPass());
+ functionPasses.add(new llvm::DataLayoutPass());
+#endif
+
+ // Run optimizations on module
+ if (optimize)
+ {
+ // Populate pass managers with -Oz
+ llvm::PassManagerBuilder builder;
+ builder.OptLevel = 2;
+ builder.SizeLevel = 2;
+ builder.populateModulePassManager(modulePasses);
+ builder.populateFunctionPassManager(functionPasses);
+ }
+
+ // Run passes
+ functionPasses.doInitialization();
+ llvm::Module::iterator fItr;
+ for (fItr = m_module->begin(); fItr != m_module->end(); fItr++)
+ functionPasses.run(*fItr);
+ functionPasses.doFinalization();
+ modulePasses.run(*m_module);
+
+ m_buildStatus = CL_BUILD_SUCCESS;
+ }
+ else
+ {
+ m_buildStatus = CL_BUILD_ERROR;
+ }
+
+ // Dump temps if required
+ if (checkEnv(ENV_DUMP_SPIR))
+ {
+ // Temporary directory
+#if defined(_WIN32)
+ const char *tmpdir = getenv("TEMP");
+#else
+ const char *tmpdir = "/tmp";
+#endif
+
+ // Construct unique output filenames
+ size_t sz = snprintf(NULL, 0, "%s/oclgrind_%lX.XX", tmpdir, m_uid) + 1;
+ char *tempCL = new char[sz];
+ char *tempIR = new char[sz];
+ char *tempBC = new char[sz];
+ sprintf(tempCL, "%s/oclgrind_%lX.cl", tmpdir, m_uid);
+ sprintf(tempIR, "%s/oclgrind_%lX.ll", tmpdir, m_uid);
+ sprintf(tempBC, "%s/oclgrind_%lX.bc", tmpdir, m_uid);
+
+ // Dump source
+ ofstream cl;
+ cl.open(tempCL);
+ cl << m_source;
+ cl.close();
+
+ if (m_buildStatus == CL_BUILD_SUCCESS)
+ {
+ // Dump IR
+ std::error_code err;
+ llvm::raw_fd_ostream ir(tempIR, err, llvm::sys::fs::F_None);
+ llvm::AssemblyAnnotationWriter asmWriter;
+ m_module->print(ir, &asmWriter);
+ ir.close();
+
+ // Dump bitcode
+ llvm::raw_fd_ostream bc(tempBC, err, llvm::sys::fs::F_None);
+ llvm::WriteBitcodeToFile(m_module.get(), bc);
+ bc.close();
+ }
+
+ delete[] tempCL;
+ delete[] tempIR;
+ delete[] tempBC;
+ }
+
+ delete[] tmpOptions;
+ delete[] pchdir;
+ delete[] pch;
+
+ return m_buildStatus == CL_BUILD_SUCCESS;
+}
+
+void Program::clearInterpreterCache()
+{
+ InterpreterCacheMap::iterator itr;
+ for (itr = m_interpreterCache.begin(); itr != m_interpreterCache.end(); itr++)
+ {
+ delete itr->second;
+ }
+ m_interpreterCache.clear();
+}
+
+Program* Program::createFromBitcode(const Context *context,
+ const unsigned char *bitcode,
+ size_t length)
+{
+ // Load bitcode from file
+ llvm::StringRef data((const char*)bitcode, length);
+ unique_ptr<llvm::MemoryBuffer> buffer =
+ llvm::MemoryBuffer::getMemBuffer(data, "", false);
+ if (!buffer)
+ {
+ return NULL;
+ }
+
+ // Parse bitcode into IR module
+ llvm::ErrorOr<llvm::Module*> module =
+ parseBitcodeFile(buffer->getMemBufferRef(), llvm::getGlobalContext());
+ if (!module)
+ {
+ return NULL;
+ }
+
+ return new Program(context, module.get());
+}
+
+Program* Program::createFromBitcodeFile(const Context *context,
+ const string filename)
+{
+ // Load bitcode from file
+ llvm::ErrorOr<unique_ptr<llvm::MemoryBuffer>> buffer =
+ llvm::MemoryBuffer::getFile(filename);
+ if (!buffer)
+ {
+ return NULL;
+ }
+
+ // Parse bitcode into IR module
+ llvm::ErrorOr<llvm::Module*> module =
+ parseBitcodeFile(buffer->get()->getMemBufferRef(),
+ llvm::getGlobalContext());
+ if (!module)
+ {
+ return NULL;
+ }
+
+ return new Program(context, module.get());
+}
+
+Program* Program::createFromPrograms(const Context *context,
+ list<const Program*> programs)
+{
+ llvm::Module *module = new llvm::Module("oclgrind_linked",
+ llvm::getGlobalContext());
+ llvm::Linker linker(module);
+
+ // Link modules
+ list<const Program*>::iterator itr;
+ for (itr = programs.begin(); itr != programs.end(); itr++)
+ {
+ if (linker.linkInModule(CloneModule((*itr)->m_module.get())))
+ {
+ return NULL;
+ }
+ }
+
+ return new Program(context, linker.getModule());
+}
+
+Kernel* Program::createKernel(const string name)
+{
+ if (!m_module)
+ return NULL;
+
+ // Iterate over functions in module to find kernel
+ llvm::Function *function = NULL;
+
+ // Query the SPIR kernel list
+ llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
+ // No kernels in module
+ if (!tuple)
+ return NULL;
+
+ for (unsigned i = 0; i < tuple->getNumOperands(); ++i)
+ {
+ llvm::MDNode* kernel = tuple->getOperand(i);
+
+ llvm::ConstantAsMetadata *cam =
+ llvm::dyn_cast<llvm::ConstantAsMetadata>(kernel->getOperand(0).get());
+ if (!cam)
+ continue;
+
+ llvm::Function *kernelFunction =
+ llvm::dyn_cast<llvm::Function>(cam->getValue());
+
+ // Shouldn't really happen - this would mean an invalid Module as input
+ if (!kernelFunction)
+ continue;
+
+ // Is this the kernel we want?
+ if (kernelFunction->getName() == name)
+ {
+ function = kernelFunction;
+ break;
+ }
+ }
+
+ if (function == NULL)
+ {
+ return NULL;
+ }
+
+ try
+ {
+ // Create cache if none already
+ InterpreterCacheMap::iterator itr = m_interpreterCache.find(function);
+ if (itr == m_interpreterCache.end())
+ {
+ m_interpreterCache[function] = new InterpreterCache(function);
+ }
+
+ return new Kernel(this, function, m_module.get());
+ }
+ catch (FatalError& err)
+ {
+ cerr << endl << "OCLGRIND FATAL ERROR "
+ << "(" << err.getFile() << ":" << err.getLine() << ")"
+ << endl << err.what()
+ << endl << "When creating kernel '" << name << "'"
+ << endl;
+ return NULL;
+ }
+}
+
+unsigned char* Program::getBinary() const
+{
+ if (!m_module)
+ {
+ return NULL;
+ }
+
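+ // Serialize the module to LLVM bitcode; the caller owns the returned buffer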
+ std::string str;
+ llvm::raw_string_ostream stream(str);
+ llvm::WriteBitcodeToFile(m_module.get(), stream);
+ stream.str();
+ unsigned char *bitcode = new unsigned char[str.length()];
+ memcpy(bitcode, str.c_str(), str.length());
+ return bitcode;
+}
+
+size_t Program::getBinarySize() const
+{
+ if (!m_module)
+ {
+ return 0;
+ }
+
+ std::string str;
+ llvm::raw_string_ostream stream(str);
+ llvm::WriteBitcodeToFile(m_module.get(), stream);
+ stream.str();
+ return str.length();
+}
+
+const string& Program::getBuildLog() const
+{
+ return m_buildLog;
+}
+
+const string& Program::getBuildOptions() const
+{
+ return m_buildOptions;
+}
+
+unsigned int Program::getBuildStatus() const
+{
+ return m_buildStatus;
+}
+
+const Context* Program::getContext() const
+{
+ return m_context;
+}
+
+unsigned long Program::generateUID() const
+{
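+ // Seed the C PRNG with the current time and take a single draw
+ // (pseudo-random, not guaranteed unique)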
+ srand(now());
+ return rand();
+}
+
+const InterpreterCache* Program::getInterpreterCache(
+ const llvm::Function *kernel) const
+{
+ return m_interpreterCache[kernel];
+}
+
+list<string> Program::getKernelNames() const
+{
+ list<string> names;
+
+ // Query the SPIR kernel list
+ llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
+
+ if (tuple)
+ {
+ for (unsigned i = 0; i < tuple->getNumOperands(); ++i)
+ {
+ llvm::MDNode* kernel = tuple->getOperand(i);
+
+ llvm::ConstantAsMetadata *cam =
+ llvm::dyn_cast<llvm::ConstantAsMetadata>(kernel->getOperand(0).get());
+ if (!cam)
+ continue;
+
+ llvm::Function *kernelFunction =
+ llvm::dyn_cast<llvm::Function>(cam->getValue());
+
+ // Shouldn't really happen - this would mean an invalid Module as input
+ if (!kernelFunction)
+ continue;
+
+ names.push_back(kernelFunction->getName());
+ }
+ }
+
+ return names;
+}
+
+unsigned int Program::getNumKernels() const
+{
+ assert(m_module);
+
+ // Extract kernels from metadata
+ llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
+
+ // No kernels in module
+ if (!tuple)
+ return 0;
+
+ return tuple->getNumOperands();
+}
+
+const string& Program::getSource() const
+{
+ return m_source;
+}
+
+const char* Program::getSourceLine(size_t lineNumber) const
+{
+ if (!lineNumber || (lineNumber-1) >= m_sourceLines.size())
+ return NULL;
+
+ return m_sourceLines[lineNumber-1].c_str();
+}
+
+size_t Program::getNumSourceLines() const
+{
+ return m_sourceLines.size();
+}
+
+unsigned long Program::getUID() const
+{
+ return m_uid;
+}
+
+void Program::stripDebugIntrinsics()
+{
+ // Get list of llvm.dbg intrinsics
+ set<llvm::Instruction*> intrinsics;
+ for (llvm::Module::iterator F = m_module->begin(); F != m_module->end(); F++)
+ {
+ for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; I++)
+ {
+ if (I->getOpcode() == llvm::Instruction::Call)
+ {
+ llvm::CallInst *call = (llvm::CallInst*)&*I;
+ llvm::Function *function =
+ (llvm::Function*)call->getCalledValue()->stripPointerCasts();
+ if (function->getName().startswith("llvm.dbg"))
+ {
+ intrinsics.insert(&*I);
+ }
+ }
+ }
+ }
+
+ // Remove instructions
+ set<llvm::Instruction*>::iterator itr;
+ for (itr = intrinsics.begin(); itr != intrinsics.end(); itr++)
+ {
+ (*itr)->removeFromParent();
+ delete *itr;
+ }
+}
diff --git a/src/core/Program.h b/src/core/Program.h
new file mode 100644
index 0000000..f888746
--- /dev/null
+++ b/src/core/Program.h
@@ -0,0 +1,79 @@
+// Program.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+namespace llvm
+{
+ class Function;
+ class Module;
+}
+
+namespace oclgrind
+{
+ class Context;
+ class InterpreterCache;
+ class Kernel;
+
+ class Program
+ {
+ public:
+ typedef std::pair<std::string, const Program*> Header;
+
+ public:
+ Program(const Context *context, const std::string& source);
+ virtual ~Program();
+
+ static Program* createFromBitcode(const Context *context,
+ const unsigned char *bitcode,
+ size_t length);
+ static Program* createFromBitcodeFile(const Context *context,
+ const std::string filename);
+ static Program* createFromPrograms(const Context *context,
+ std::list<const Program*>);
+
+ bool build(const char *options,
+ std::list<Header> headers = std::list<Header>());
+ Kernel* createKernel(const std::string name);
+ const std::string& getBuildLog() const;
+ const std::string& getBuildOptions() const;
+ unsigned char* getBinary() const;
+ size_t getBinarySize() const;
+ unsigned int getBuildStatus() const;
+ const Context *getContext() const;
+ const InterpreterCache* getInterpreterCache(
+ const llvm::Function *kernel) const;
+ std::list<std::string> getKernelNames() const;
+ unsigned int getNumKernels() const;
+ const std::string& getSource() const;
+ const char* getSourceLine(size_t lineNumber) const;
+ size_t getNumSourceLines() const;
+ unsigned long getUID() const;
+
+ private:
+ Program(const Context *context, llvm::Module *module);
+
+ std::unique_ptr<llvm::Module> m_module;
+ std::string m_source;
+ std::string m_buildLog;
+ std::string m_buildOptions;
+ unsigned int m_buildStatus;
+ const Context *m_context;
+ std::vector<std::string> m_sourceLines;
+
+ unsigned long m_uid;
+ unsigned long generateUID() const;
+
+ void stripDebugIntrinsics();
+
+ typedef std::map<const llvm::Function*, InterpreterCache*>
+ InterpreterCacheMap;
+ mutable InterpreterCacheMap m_interpreterCache;
+ void clearInterpreterCache();
+ };
+}
diff --git a/src/core/Queue.cpp b/src/core/Queue.cpp
new file mode 100644
index 0000000..e9e082f
--- /dev/null
+++ b/src/core/Queue.cpp
@@ -0,0 +1,260 @@
+// Queue.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <cassert>
+
+#include "Context.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Queue.h"
+
+using namespace oclgrind;
+using namespace std;
+
+Queue::Queue(const Context *context)
+ : m_context(context)
+{
+}
+
+Queue::~Queue()
+{
+}
+
+Event::Event()
+{
+ state = CL_QUEUED;
+ queueTime = now();
+ startTime = endTime = 0;
+}
+
+Event* Queue::enqueue(Command *cmd)
+{
+ Event *event = new Event();
+ cmd->event = event;
+ m_queue.push(cmd);
+ return event;
+}
+
+void Queue::executeCopyBuffer(CopyCommand *cmd)
+{
+ m_context->getGlobalMemory()->copy(cmd->dst, cmd->src, cmd->size);
+}
+
+void Queue::executeCopyBufferRect(CopyRectCommand *cmd)
+{
+ // Perform copy
+ Memory *memory = m_context->getGlobalMemory();
+ for (unsigned z = 0; z < cmd->region[2]; z++)
+ {
+ for (unsigned y = 0; y < cmd->region[1]; y++)
+ {
+ // Compute addresses
+ size_t src =
+ cmd->src +
+ cmd->src_offset[0] +
+ y * cmd->src_offset[1] +
+ z * cmd->src_offset[2];
+ size_t dst =
+ cmd->dst +
+ cmd->dst_offset[0] +
+ y * cmd->dst_offset[1] +
+ z * cmd->dst_offset[2];
+
+ // Copy data
+ memory->copy(dst, src, cmd->region[0]);
+ }
+ }
+}
+
+void Queue::executeFillBuffer(FillBufferCommand *cmd)
+{
+ Memory *memory = m_context->getGlobalMemory();
+ for (unsigned i = 0; i < cmd->size/cmd->pattern_size; i++)
+ {
+ memory->store(cmd->pattern,
+ cmd->address + i*cmd->pattern_size,
+ cmd->pattern_size);
+ }
+}
+
+void Queue::executeFillImage(FillImageCommand *cmd)
+{
+ Memory *memory = m_context->getGlobalMemory();
+
+ for (unsigned z = 0; z < cmd->region[2]; z++)
+ {
+ for (unsigned y = 0; y < cmd->region[1]; y++)
+ {
+ for (unsigned x = 0; x < cmd->region[0]; x++)
+ {
+ size_t address = cmd->base
+ + (cmd->origin[0] + x) * cmd->pixelSize
+ + (cmd->origin[1] + y) * cmd->rowPitch
+ + (cmd->origin[2] + z) * cmd->slicePitch;
+ memory->store(cmd->color, address, cmd->pixelSize);
+ }
+ }
+ }
+}
+
+void Queue::executeKernel(KernelCommand *cmd)
+{
+ // Run kernel
+ KernelInvocation::run(m_context,
+ cmd->kernel,
+ cmd->work_dim,
+ cmd->globalOffset,
+ cmd->globalSize,
+ cmd->localSize);
+}
+
+void Queue::executeNativeKernel(NativeKernelCommand *cmd)
+{
+ // Run kernel
+ cmd->func(cmd->args);
+}
+
+void Queue::executeReadBuffer(BufferCommand *cmd)
+{
+ m_context->getGlobalMemory()->load(cmd->ptr, cmd->address, cmd->size);
+}
+
+void Queue::executeReadBufferRect(BufferRectCommand *cmd)
+{
+ Memory *memory = m_context->getGlobalMemory();
+ for (unsigned z = 0; z < cmd->region[2]; z++)
+ {
+ for (unsigned y = 0; y < cmd->region[1]; y++)
+ {
+ unsigned char *host =
+ cmd->ptr +
+ cmd->host_offset[0] +
+ y * cmd->host_offset[1] +
+ z * cmd->host_offset[2];
+ size_t buff =
+ cmd->address +
+ cmd->buffer_offset[0] +
+ y * cmd->buffer_offset[1] +
+ z * cmd->buffer_offset[2];
+ memory->load(host, buff, cmd->region[0]);
+ }
+ }
+}
+
+void Queue::executeWriteBuffer(BufferCommand *cmd)
+{
+ m_context->getGlobalMemory()->store(cmd->ptr, cmd->address, cmd->size);
+}
+
+void Queue::executeWriteBufferRect(BufferRectCommand *cmd)
+{
+ // Perform write
+ Memory *memory = m_context->getGlobalMemory();
+ for (unsigned z = 0; z < cmd->region[2]; z++)
+ {
+ for (unsigned y = 0; y < cmd->region[1]; y++)
+ {
+ const unsigned char *host =
+ cmd->ptr +
+ cmd->host_offset[0] +
+ y * cmd->host_offset[1] +
+ z * cmd->host_offset[2];
+ size_t buff =
+ cmd->address +
+ cmd->buffer_offset[0] +
+ y * cmd->buffer_offset[1] +
+ z * cmd->buffer_offset[2];
+ memory->store(host, buff, cmd->region[0]);
+ }
+ }
+}
+
+bool Queue::isEmpty() const
+{
+ return m_queue.empty();
+}
+
+Queue::Command* Queue::update()
+{
+ if (m_queue.empty())
+ {
+ return NULL;
+ }
+
+ // Get next command
+ Command *cmd = m_queue.front();
+
+ // Check if all events in wait list have completed
+ while (!cmd->waitList.empty())
+ {
+ if (cmd->waitList.front()->state == CL_COMPLETE)
+ {
+ cmd->waitList.pop_front();
+ }
+ else if (cmd->waitList.front()->state < 0)
+ {
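+ // A wait-list event failed: propagate its (negative) error status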
+ cmd->event->state = cmd->waitList.front()->state;
+ m_queue.pop();
+ return cmd;
+ }
+ else
+ {
+ return NULL;
+ }
+ }
+
+ cmd->event->startTime = now();
+ cmd->event->state = CL_RUNNING;
+
+ // Dispatch command
+ switch (cmd->type)
+ {
+ case COPY:
+ executeCopyBuffer((CopyCommand*)cmd);
+ break;
+ case COPY_RECT:
+ executeCopyBufferRect((CopyRectCommand*)cmd);
+ break;
+ case EMPTY:
+ break;
+ case FILL_BUFFER:
+ executeFillBuffer((FillBufferCommand*)cmd);
+ break;
+ case FILL_IMAGE:
+ executeFillImage((FillImageCommand*)cmd);
+ break;
+ case READ:
+ executeReadBuffer((BufferCommand*)cmd);
+ break;
+ case READ_RECT:
+ executeReadBufferRect((BufferRectCommand*)cmd);
+ break;
+ case KERNEL:
+ executeKernel((KernelCommand*)cmd);
+ break;
+ case NATIVE_KERNEL:
+ executeNativeKernel((NativeKernelCommand*)cmd);
+ break;
+ case WRITE:
+ executeWriteBuffer((BufferCommand*)cmd);
+ break;
+ case WRITE_RECT:
+ executeWriteBufferRect((BufferRectCommand*)cmd);
+ break;
+ default:
+ assert(false && "Unhandled command type in queue.");
+ }
+
+ cmd->event->endTime = now();
+ cmd->event->state = CL_COMPLETE;
+
+ // Remove command from queue and delete
+ m_queue.pop();
+ return cmd;
+}
diff --git a/src/core/Queue.h b/src/core/Queue.h
new file mode 100644
index 0000000..7736d47
--- /dev/null
+++ b/src/core/Queue.h
@@ -0,0 +1,183 @@
+// Queue.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#pragma once
+#include "common.h"
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+
+ struct Event
+ {
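+ // Execution status: CL_QUEUED, CL_RUNNING, CL_COMPLETE, or a negative error code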
+ int state;
+ double queueTime, startTime, endTime;
+ Event();
+ };
+
+ class Queue
+ {
+ public:
+ enum CommandType {EMPTY, COPY, COPY_RECT, FILL_BUFFER, FILL_IMAGE, KERNEL,
+ NATIVE_KERNEL, READ, READ_RECT, WRITE, WRITE_RECT};
+ struct Command
+ {
+ CommandType type;
+ std::list<Event*> waitList;
+ Command()
+ {
+ type = EMPTY;
+ }
+ private:
+ Event *event;
+ friend class Queue;
+ };
+ struct BufferCommand : Command
+ {
+ unsigned char *ptr;
+ size_t address, size;
+ BufferCommand(CommandType t)
+ {
+ type = t;
+ }
+ };
+ struct BufferRectCommand : Command
+ {
+ unsigned char *ptr;
+ size_t address;
+ size_t region[3];
+ size_t host_offset[3];
+ size_t buffer_offset[3];
+ BufferRectCommand(CommandType t)
+ {
+ type = t;
+ }
+ };
+ struct CopyCommand : Command
+ {
+ size_t src, dst, size;
+ CopyCommand()
+ {
+ type = COPY;
+ }
+ };
+ struct CopyRectCommand : Command
+ {
+ size_t src, dst;
+ size_t region[3];
+ size_t src_offset[3];
+ size_t dst_offset[3];
+ CopyRectCommand()
+ {
+ type = COPY_RECT;
+ }
+ };
+ struct FillBufferCommand : Command
+ {
+ size_t address, size;
+ size_t pattern_size;
+ unsigned char *pattern;
+ FillBufferCommand(const unsigned char *p, size_t sz)
+ {
+ type = FILL_BUFFER;
+ pattern = new unsigned char[sz];
+ pattern_size = sz;
+ memcpy(pattern, p, sz);
+ }
+ ~FillBufferCommand()
+ {
+ delete[] pattern;
+ }
+ };
+ struct FillImageCommand : Command
+ {
+ size_t base;
+ size_t origin[3], region[3];
+ size_t rowPitch, slicePitch;
+ size_t pixelSize;
+ unsigned char color[16];
+ FillImageCommand(size_t b, const size_t o[3], const size_t r[3],
+ size_t rp, size_t sp,
+ size_t ps, const unsigned char *col)
+ {
+ type = FILL_IMAGE;
+ base = b;
+ memcpy(origin, o, sizeof(size_t)*3);
+ memcpy(region, r, sizeof(size_t)*3);
+ rowPitch = rp;
+ slicePitch = sp;
+ pixelSize = ps;
+ memcpy(color, col, 16);
+ }
+ };
+ struct KernelCommand : Command
+ {
+ Kernel *kernel;
+ unsigned int work_dim;
+ Size3 globalOffset;
+ Size3 globalSize;
+ Size3 localSize;
+ KernelCommand()
+ {
+ type = KERNEL;
+ }
+ };
+ struct NativeKernelCommand : Command
+ {
+ void (CL_CALLBACK *func)(void *);
+ void *args;
+ NativeKernelCommand(void (CL_CALLBACK *f)(void *),
+ void *a, size_t sz)
+ {
+ type = NATIVE_KERNEL;
+ func = f;
+ if (a)
+ {
+ args = malloc(sz);
+ memcpy(args, a, sz);
+ }
+ else
+ {
+ args = NULL;
+ }
+ }
+ ~NativeKernelCommand()
+ {
+ if (args)
+ {
+ free(args);
+ }
+ }
+ };
+
+ public:
+ Queue(const Context *context);
+ virtual ~Queue();
+
+ Event* enqueue(Command *command);
+
+ void executeCopyBuffer(CopyCommand *cmd);
+ void executeCopyBufferRect(CopyRectCommand *cmd);
+ void executeFillBuffer(FillBufferCommand *cmd);
+ void executeFillImage(FillImageCommand *cmd);
+ void executeKernel(KernelCommand *cmd);
+ void executeNativeKernel(NativeKernelCommand *cmd);
+ void executeReadBuffer(BufferCommand *cmd);
+ void executeReadBufferRect(BufferRectCommand *cmd);
+ void executeWriteBuffer(BufferCommand *cmd);
+ void executeWriteBufferRect(BufferRectCommand *cmd);
+
+ bool isEmpty() const;
+ Command* update();
+
+ private:
+ const Context *m_context;
+ std::queue<Command*> m_queue;
+ };
+}
diff --git a/src/core/WorkGroup.cpp b/src/core/WorkGroup.cpp
new file mode 100644
index 0000000..23daf9d
--- /dev/null
+++ b/src/core/WorkGroup.cpp
@@ -0,0 +1,428 @@
+// WorkGroup.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <sstream>
+
+#include "llvm/IR/Module.h"
+
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid)
+ : m_context(kernelInvocation->getContext())
+{
+ m_groupID = wgid;
+ m_groupSize = kernelInvocation->getLocalSize();
+
+ // Linearize the group ID: x + (y + z*numGroups.y) * numGroups.x
+ m_groupIndex = (m_groupID.x +
+ (m_groupID.y +
+ m_groupID.z*(kernelInvocation->getNumGroups().y)) *
+ kernelInvocation->getNumGroups().x);
+
+ // Allocate local memory
+ m_localMemory = kernelInvocation->getKernel()->getLocalMemory()->clone();
+
+ // Initialise work-items
+ for (size_t k = 0; k < m_groupSize.z; k++)
+ {
+ for (size_t j = 0; j < m_groupSize.y; j++)
+ {
+ for (size_t i = 0; i < m_groupSize.x; i++)
+ {
+ WorkItem *workItem = new WorkItem(kernelInvocation, this,
+ Size3(i, j, k));
+ m_workItems.push_back(workItem);
+ m_running.insert(workItem);
+ m_context->notifyWorkItemBegin(workItem);
+ }
+ }
+ }
+
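+ // Async-copy event handles start at 1 (0 means "create a new event")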
+ m_nextEvent = 1;
+ m_barrier = NULL;
+}
+
+WorkGroup::~WorkGroup()
+{
+ // Delete work-items
+ for (unsigned i = 0; i < m_workItems.size(); i++)
+ {
+ delete m_workItems[i];
+ }
+
+ delete m_localMemory;
+}
+
+size_t WorkGroup::async_copy(
+ const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ AsyncCopyType type,
+ size_t dest,
+ size_t src,
+ size_t size,
+ size_t num,
+ size_t srcStride,
+ size_t destStride,
+ size_t event)
+{
+ AsyncCopy copy =
+ {
+ instruction,
+ type,
+ dest,
+ src,
+ size,
+ num,
+ srcStride,
+ destStride,
+
+ event
+ };
+
+ // Check if copy has already been registered by another work-item
+ list< pair<AsyncCopy,set<const WorkItem*> > >::iterator itr;
+ for (itr = m_asyncCopies.begin(); itr != m_asyncCopies.end(); itr++)
+ {
+ if (itr->second.count(workItem))
+ {
+ continue;
+ }
+
+ // Check for divergence
+ if ((itr->first.instruction->getDebugLoc()
+ != copy.instruction->getDebugLoc()) ||
+ (itr->first.type != copy.type) ||
+ (itr->first.dest != copy.dest) ||
+ (itr->first.src != copy.src) ||
+ (itr->first.size != copy.size) ||
+ (itr->first.num != copy.num) ||
+ (itr->first.srcStride != copy.srcStride) ||
+ (itr->first.destStride != copy.destStride))
+ {
+ Context::Message msg(ERROR, m_context);
+ msg << "Work-group divergence detected (async copy)" << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+ << endl
+ << "Work-item: " << msg.CURRENT_ENTITY << endl
+ << msg.CURRENT_LOCATION << endl
+ << "dest=0x" << hex << copy.dest << ", "
+ << "src=0x" << hex << copy.src << endl
+ << "elem_size=" << dec << copy.size << ", "
+ << "num_elems=" << dec << copy.num << ", "
+ << "src_stride=" << dec << copy.srcStride << ", "
+ << "dest_stride=" << dec << copy.destStride << endl
+ << endl
+ << "Previous work-items executed:" << endl
+ << itr->first.instruction << endl
+ << "dest=0x" << hex << itr->first.dest << ", "
+ << "src=0x" << hex << itr->first.src << endl
+ << "elem_size=" << dec << itr->first.size << ", "
+ << "num_elems=" << dec << itr->first.num << ", "
+ << "src_stride=" << dec << itr->first.srcStride << ", "
+ << "dest_stride=" << dec << itr->first.destStride << endl;
+ msg.send();
+ }
+
+ itr->second.insert(workItem);
+ return itr->first.event;
+ }
+
+ // Create new event if necessary
+ if (copy.event == 0)
+ {
+ copy.event = m_nextEvent++;
+ }
+
+ // Register new copy and event
+ m_asyncCopies.push_back(make_pair(copy, set<const WorkItem*>()));
+ m_asyncCopies.back().second.insert(workItem);
+ if (!m_events.count(copy.event))
+ {
+ m_events[copy.event] = list<AsyncCopy>();
+ }
+ m_events[copy.event].push_back(copy);
+
+ return copy.event;
+}
+
+void WorkGroup::clearBarrier()
+{
+ assert(m_barrier);
+
+ // Check for divergence
+ if (m_barrier->workItems.size() != m_workItems.size())
+ {
+ Context::Message msg(ERROR, m_context);
+ msg << "Work-group divergence detected (barrier)" << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+ << "Only " << dec << m_barrier->workItems.size() << " out of "
+ << m_workItems.size() << " work-items executed barrier" << endl
+ << m_barrier->instruction << endl;
+ msg.send();
+ }
+
+ // Move work-items to running state
+ set<WorkItem*>::iterator itr;
+ for (itr = m_barrier->workItems.begin();
+ itr != m_barrier->workItems.end();
+ itr++)
+ {
+ (*itr)->clearBarrier();
+ m_running.insert(*itr);
+ }
+ m_barrier->workItems.clear();
+
+ // Deal with events
+ while (!m_barrier->events.empty())
+ {
+ size_t event = m_barrier->events.front();
+
+ // Perform copy
+ list<AsyncCopy> copies = m_events[event];
+ list<AsyncCopy>::iterator itr;
+ for (itr = copies.begin(); itr != copies.end(); itr++)
+ {
+ Memory *destMem, *srcMem;
+ if (itr->type == GLOBAL_TO_LOCAL)
+ {
+ destMem = m_localMemory;
+ srcMem = m_context->getGlobalMemory();
+ }
+ else
+ {
+ destMem = m_context->getGlobalMemory();
+ srcMem = m_localMemory;
+ }
+
+ size_t src = itr->src;
+ size_t dest = itr->dest;
+ unsigned char *buffer = new unsigned char[itr->size];
+ for (unsigned i = 0; i < itr->num; i++)
+ {
+ srcMem->load(buffer, src, itr->size);
+ destMem->store(buffer, dest, itr->size);
+ src += itr->srcStride * itr->size;
+ dest += itr->destStride * itr->size;
+ }
+ delete[] buffer;
+ }
+ m_events.erase(event);
+
+ // Remove copies from list for this event
+ list< pair<AsyncCopy,set<const WorkItem*> > >::iterator cItr;
+ for (cItr = m_asyncCopies.begin(); cItr != m_asyncCopies.end();)
+ {
+ if (cItr->first.event == event)
+ {
+ // Check that all work-items registered the copy
+ if (cItr->second.size() != m_workItems.size())
+ {
+ Context::Message msg(ERROR, m_context);
+ msg << "Work-group divergence detected (async copy)" << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+ << "Only " << dec << cItr->second.size() << " out of "
+ << m_workItems.size() << " work-items executed copy" << endl
+ << cItr->first.instruction << endl;
+ msg.send();
+ }
+
+ cItr = m_asyncCopies.erase(cItr);
+ }
+ else
+ {
+ cItr++;
+ }
+ }
+
+ m_barrier->events.remove(event);
+ }
+
+ m_context->notifyWorkGroupBarrier(this, m_barrier->fence);
+
+ delete m_barrier;
+ m_barrier = NULL;
+}
+
+const llvm::Instruction* WorkGroup::getCurrentBarrier() const
+{
+ return m_barrier ? m_barrier->instruction : NULL;
+}
+
+Size3 WorkGroup::getGroupID() const
+{
+ return m_groupID;
+}
+
+size_t WorkGroup::getGroupIndex() const
+{
+ return m_groupIndex;
+}
+
+Size3 WorkGroup::getGroupSize() const
+{
+ return m_groupSize;
+}
+
+Memory* WorkGroup::getLocalMemory() const
+{
+ return m_localMemory;
+}
+
+WorkItem* WorkGroup::getNextWorkItem() const
+{
+ if (m_running.empty())
+ {
+ return NULL;
+ }
+ return *m_running.begin();
+}
+
+WorkItem* WorkGroup::getWorkItem(Size3 localID) const
+{
+ return m_workItems[localID.x +
+ (localID.y + localID.z*m_groupSize.y)*m_groupSize.x];
+}
+
+bool WorkGroup::hasBarrier() const
+{
+ return m_barrier;
+}
+
+void WorkGroup::notifyBarrier(WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ uint64_t fence, list<size_t> events)
+{
+ if (!m_barrier)
+ {
+ // Create new barrier
+ m_barrier = new Barrier;
+ m_barrier->instruction = instruction;
+ m_barrier->fence = fence;
+
+ m_barrier->events = events;
+
+ // Check for invalid events
+ list<size_t>::iterator itr;
+ for (itr = events.begin(); itr != events.end(); itr++)
+ {
+ if (!m_events.count(*itr))
+ {
+ m_context->logError("Invalid wait event");
+ }
+ }
+ }
+ else
+ {
+ // Check for divergence
+ bool divergence = false;
+ if (instruction->getDebugLoc() != m_barrier->instruction->getDebugLoc() ||
+ fence != m_barrier->fence ||
+ events.size() != m_barrier->events.size())
+ {
+ divergence = true;
+ }
+
+ // Check events are all the same
+ int divergentEventIndex = -1;
+ size_t newEvent = -1;
+ size_t oldEvent = -1;
+ if (!divergence)
+ {
+ int i = 0;
+ list<size_t>::iterator cItr = events.begin();
+ list<size_t>::iterator pItr = m_barrier->events.begin();
+ for (; cItr != events.end(); cItr++, pItr++, i++)
+ {
+ if (*cItr != *pItr)
+ {
+ divergence = true;
+
+ divergentEventIndex = i;
+ newEvent = *cItr;
+ oldEvent = *pItr;
+
+ break;
+ }
+ }
+ }
+
+ if (divergence)
+ {
+ Context::Message msg(ERROR, m_context);
+ msg << "Work-group divergence detected (barrier)" << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+ << endl
+ << "Work-item: " << msg.CURRENT_ENTITY << endl
+ << msg.CURRENT_LOCATION << endl
+ << "fence=0x" << hex << fence << ", "
+ << "num_events=" << dec << events.size() << endl;
+ if (divergentEventIndex >= 0)
+ {
+ msg << "events[" << dec << divergentEventIndex << "]="
+ << newEvent << endl;
+ }
+ msg << endl
+ << "Previous work-items executed:" << endl
+ << m_barrier->instruction << endl
+ << "fence=0x" << hex << m_barrier->fence << ", "
+ << "num_events=" << dec << m_barrier->events.size() << endl;
+ if (divergentEventIndex >= 0)
+ {
+ msg << "events[" << dec << divergentEventIndex << "]="
+ << oldEvent << endl;
+ }
+ msg.send();
+ }
+ }
+
+ m_running.erase(workItem);
+ m_barrier->workItems.insert(workItem);
+}
+
+void WorkGroup::notifyFinished(WorkItem *workItem)
+{
+ m_running.erase(workItem);
+
+ // Check if work-group finished without waiting for all events
+ if (m_running.empty() && !m_barrier && !m_events.empty())
+ {
+ m_context->logError("Work-item finished without waiting for events");
+ }
+}
+
+bool WorkGroup::WorkItemCmp::operator()(const WorkItem *lhs,
+ const WorkItem *rhs) const
+{
+ Size3 lgid = lhs->getGlobalID();
+ Size3 rgid = rhs->getGlobalID();
+ if (lgid.z != rgid.z)
+ {
+ return lgid.z < rgid.z;
+ }
+ if (lgid.y != rgid.y)
+ {
+ return lgid.y < rgid.y;
+ }
+ return lgid.x < rgid.x;
+}
diff --git a/src/core/WorkGroup.h b/src/core/WorkGroup.h
new file mode 100644
index 0000000..88319cf
--- /dev/null
+++ b/src/core/WorkGroup.h
@@ -0,0 +1,100 @@
+// WorkGroup.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#define CLK_LOCAL_MEM_FENCE (1<<0)
+#define CLK_GLOBAL_MEM_FENCE (1<<1)
+
+namespace oclgrind
+{
+ class Context;
+ class Memory;
+ class Kernel;
+ class KernelInvocation;
+ class WorkItem;
+
+ class WorkGroup
+ {
+ public:
+ enum AsyncCopyType{GLOBAL_TO_LOCAL, LOCAL_TO_GLOBAL};
+
+ private:
+ // Comparator for ordering work-items
+ struct WorkItemCmp
+ {
+ bool operator()(const WorkItem *lhs, const WorkItem *rhs) const;
+ };
+ std::set<WorkItem*, WorkItemCmp> m_running;
+
+ typedef struct
+ {
+ const llvm::Instruction *instruction;
+ AsyncCopyType type;
+ size_t dest;
+ size_t src;
+ size_t size;
+ size_t num;
+ size_t srcStride;
+ size_t destStride;
+
+ size_t event;
+ } AsyncCopy;
+
+ typedef struct
+ {
+ const llvm::Instruction *instruction;
+ std::set<WorkItem*, WorkItemCmp> workItems;
+
+ uint64_t fence;
+ std::list<size_t> events;
+ } Barrier;
+
+ public:
+ WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid);
+ virtual ~WorkGroup();
+
+ size_t async_copy(
+ const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ AsyncCopyType type,
+ size_t dest,
+ size_t src,
+ size_t size,
+ size_t num,
+ size_t srcStride,
+ size_t destStride,
+ size_t event);
+ void clearBarrier();
+ const llvm::Instruction* getCurrentBarrier() const;
+ Size3 getGroupID() const;
+ size_t getGroupIndex() const;
+ Size3 getGroupSize() const;
+ Memory* getLocalMemory() const;
+ WorkItem *getNextWorkItem() const;
+ WorkItem *getWorkItem(Size3 localID) const;
+ bool hasBarrier() const;
+ void notifyBarrier(WorkItem *workItem, const llvm::Instruction *instruction,
+ uint64_t fence,
+ std::list<size_t> events=std::list<size_t>());
+ void notifyFinished(WorkItem *workItem);
+
+ private:
+ size_t m_groupIndex;
+ Size3 m_groupID;
+ Size3 m_groupSize;
+ const Context *m_context;
+ Memory *m_localMemory;
+ std::vector<WorkItem*> m_workItems;
+
+ Barrier *m_barrier;
+ size_t m_nextEvent;
+ std::list< std::pair<AsyncCopy,std::set<const WorkItem*> > > m_asyncCopies;
+ std::map < size_t, std::list<AsyncCopy> > m_events;
+ };
+}
diff --git a/src/core/WorkItem.cpp b/src/core/WorkItem.cpp
new file mode 100644
index 0000000..9d37ade
--- /dev/null
+++ b/src/core/WorkItem.cpp
@@ -0,0 +1,1660 @@
+// WorkItem.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/InstIterator.h"
+
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Program.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+struct WorkItem::Position
+{
+ llvm::Function::const_iterator prevBlock;
+ llvm::Function::const_iterator currBlock;
+ llvm::Function::const_iterator nextBlock;
+ llvm::BasicBlock::const_iterator currInst;
+ std::stack<const llvm::Instruction*> callStack;
+ std::stack< std::list<size_t> > allocations;
+};
+
+WorkItem::WorkItem(const KernelInvocation *kernelInvocation,
+ WorkGroup *workGroup, Size3 lid)
+ : m_context(kernelInvocation->getContext()),
+ m_kernelInvocation(kernelInvocation),
+ m_workGroup(workGroup)
+{
+ m_localID = lid;
+
+ // Compute global ID
+ Size3 groupID = workGroup->getGroupID();
+ Size3 groupSize = workGroup->getGroupSize();
+ Size3 globalOffset = kernelInvocation->getGlobalOffset();
+ m_globalID.x = lid.x + groupID.x*groupSize.x + globalOffset.x;
+ m_globalID.y = lid.y + groupID.y*groupSize.y + globalOffset.y;
+ m_globalID.z = lid.z + groupID.z*groupSize.z + globalOffset.z;
+
+ Size3 globalSize = kernelInvocation->getGlobalSize();
+ m_globalIndex = (m_globalID.x +
+ (m_globalID.y +
+ m_globalID.z*globalSize.y) * globalSize.x);
+
+ const Kernel *kernel = kernelInvocation->getKernel();
+
+ // Load interpreter cache
+ m_cache = kernel->getProgram()->getInterpreterCache(kernel->getFunction());
+
+ // Set initial number of values to store based on cache
+ m_values.resize(m_cache->getNumValues());
+
+ m_privateMemory = kernel->getPrivateMemory()->clone();
+
+ // Initialise kernel arguments
+ TypedValueMap::const_iterator argItr;
+ for (argItr = kernel->args_begin(); argItr != kernel->args_end(); argItr++)
+ {
+ setValue(argItr->first, m_pool.clone(argItr->second));
+ }
+
+ // Initialize interpreter state
+ m_state = READY;
+ m_position = new Position;
+ m_position->prevBlock = NULL;
+ m_position->nextBlock = NULL;
+ m_position->currBlock = kernel->getFunction()->begin();
+ m_position->currInst = m_position->currBlock->begin();
+}
+
+WorkItem::~WorkItem()
+{
+ delete m_privateMemory;
+ delete m_position;
+}
+
+void WorkItem::clearBarrier()
+{
+ if (m_state == BARRIER)
+ {
+ m_state = READY;
+ }
+}
+
+void WorkItem::dispatch(const llvm::Instruction *instruction,
+ TypedValue& result)
+{
+ switch (instruction->getOpcode())
+ {
+ case llvm::Instruction::Add:
+ add(instruction, result);
+ break;
+ case llvm::Instruction::Alloca:
+ alloc(instruction, result);
+ break;
+ case llvm::Instruction::And:
+ bwand(instruction, result);
+ break;
+ case llvm::Instruction::AShr:
+ ashr(instruction, result);
+ break;
+ case llvm::Instruction::BitCast:
+ bitcast(instruction, result);
+ break;
+ case llvm::Instruction::Br:
+ br(instruction, result);
+ break;
+ case llvm::Instruction::Call:
+ call(instruction, result);
+ break;
+ case llvm::Instruction::ExtractElement:
+ extractelem(instruction, result);
+ break;
+ case llvm::Instruction::ExtractValue:
+ extractval(instruction, result);
+ break;
+ case llvm::Instruction::FAdd:
+ fadd(instruction, result);
+ break;
+ case llvm::Instruction::FCmp:
+ fcmp(instruction, result);
+ break;
+ case llvm::Instruction::FDiv:
+ fdiv(instruction, result);
+ break;
+ case llvm::Instruction::FMul:
+ fmul(instruction, result);
+ break;
+ case llvm::Instruction::FPExt:
+ fpext(instruction, result);
+ break;
+ case llvm::Instruction::FPToSI:
+ fptosi(instruction, result);
+ break;
+ case llvm::Instruction::FPToUI:
+ fptoui(instruction, result);
+ break;
+ case llvm::Instruction::FPTrunc:
+ fptrunc(instruction, result);
+ break;
+ case llvm::Instruction::FRem:
+ frem(instruction, result);
+ break;
+ case llvm::Instruction::FSub:
+ fsub(instruction, result);
+ break;
+ case llvm::Instruction::GetElementPtr:
+ gep(instruction, result);
+ break;
+ case llvm::Instruction::ICmp:
+ icmp(instruction, result);
+ break;
+ case llvm::Instruction::InsertElement:
+ insertelem(instruction, result);
+ break;
+ case llvm::Instruction::InsertValue:
+ insertval(instruction, result);
+ break;
+ case llvm::Instruction::IntToPtr:
+ inttoptr(instruction, result);
+ break;
+ case llvm::Instruction::Load:
+ load(instruction, result);
+ break;
+ case llvm::Instruction::LShr:
+ lshr(instruction, result);
+ break;
+ case llvm::Instruction::Mul:
+ mul(instruction, result);
+ break;
+ case llvm::Instruction::Or:
+ bwor(instruction, result);
+ break;
+ case llvm::Instruction::PHI:
+ phi(instruction, result);
+ break;
+ case llvm::Instruction::PtrToInt:
+ ptrtoint(instruction, result);
+ break;
+ case llvm::Instruction::Ret:
+ ret(instruction, result);
+ break;
+ case llvm::Instruction::SDiv:
+ sdiv(instruction, result);
+ break;
+ case llvm::Instruction::Select:
+ select(instruction, result);
+ break;
+ case llvm::Instruction::SExt:
+ sext(instruction, result);
+ break;
+ case llvm::Instruction::Shl:
+ shl(instruction, result);
+ break;
+ case llvm::Instruction::ShuffleVector:
+ shuffle(instruction, result);
+ break;
+ case llvm::Instruction::SIToFP:
+ sitofp(instruction, result);
+ break;
+ case llvm::Instruction::SRem:
+ srem(instruction, result);
+ break;
+ case llvm::Instruction::Store:
+ store(instruction, result);
+ break;
+ case llvm::Instruction::Sub:
+ sub(instruction, result);
+ break;
+ case llvm::Instruction::Switch:
+ swtch(instruction, result);
+ break;
+ case llvm::Instruction::Trunc:
+ itrunc(instruction, result);
+ break;
+ case llvm::Instruction::UDiv:
+ udiv(instruction, result);
+ break;
+ case llvm::Instruction::UIToFP:
+ uitofp(instruction, result);
+ break;
+ case llvm::Instruction::URem:
+ urem(instruction, result);
+ break;
+ case llvm::Instruction::Unreachable:
+ FATAL_ERROR("Encountered unreachable instruction");
+ case llvm::Instruction::Xor:
+ bwxor(instruction, result);
+ break;
+ case llvm::Instruction::ZExt:
+ zext(instruction, result);
+ break;
+ default:
+ FATAL_ERROR("Unsupported instruction: %s", instruction->getOpcodeName());
+ }
+}
+
+void WorkItem::execute(const llvm::Instruction *instruction)
+{
+ // Prepare private variable for instruction result
+ pair<unsigned,unsigned> resultSize = getValueSize(instruction);
+
+ // Prepare result
+ TypedValue result = {
+ resultSize.first,
+ resultSize.second,
+ NULL
+ };
+ if (result.size)
+ {
+ result.data = m_pool.alloc(result.size*result.num);
+ }
+
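+ // Commit pending PHI results from the previous block before executing a non-PHI instruction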
+ if (instruction->getOpcode() != llvm::Instruction::PHI &&
+ m_phiTemps.size() > 0)
+ {
+ TypedValueMap::iterator itr;
+ for (itr = m_phiTemps.begin(); itr != m_phiTemps.end(); itr++)
+ {
+ setValue(itr->first, itr->second);
+ }
+ m_phiTemps.clear();
+ }
+
+ // Execute instruction
+ dispatch(instruction, result);
+
+ // Store result
+ if (result.size)
+ {
+ if (instruction->getOpcode() != llvm::Instruction::PHI)
+ {
+ setValue(instruction, result);
+ }
+ else
+ {
+ m_phiTemps[instruction] = result;
+ }
+ }
+
+ m_context->notifyInstructionExecuted(this, instruction, result);
+}
+
+TypedValue WorkItem::getValue(const llvm::Value *key) const
+{
+ return m_values[m_cache->getValueID(key)];
+}
+
+const stack<const llvm::Instruction*>& WorkItem::getCallStack() const
+{
+ return m_position->callStack;
+}
+
+const llvm::Instruction* WorkItem::getCurrentInstruction() const
+{
+ return m_position->currInst;
+}
+
+Size3 WorkItem::getGlobalID() const
+{
+ return m_globalID;
+}
+
+size_t WorkItem::getGlobalIndex() const
+{
+ return m_globalIndex;
+}
+
+Size3 WorkItem::getLocalID() const
+{
+ return m_localID;
+}
+
+Memory* WorkItem::getMemory(unsigned int addrSpace) const
+{
+ switch (addrSpace)
+ {
+ case AddrSpacePrivate:
+ return m_privateMemory;
+ case AddrSpaceGlobal:
+ case AddrSpaceConstant:
+ return m_context->getGlobalMemory();
+ case AddrSpaceLocal:
+ return m_workGroup->getLocalMemory();
+ default:
+ FATAL_ERROR("Unsupported address space: %d", addrSpace);
+ }
+}
+
+TypedValue WorkItem::getOperand(const llvm::Value *operand) const
+{
+ unsigned valID = operand->getValueID();
+ if (valID == llvm::Value::ArgumentVal ||
+ valID == llvm::Value::GlobalVariableVal ||
+ valID >= llvm::Value::InstructionVal)
+ {
+ return getValue(operand);
+ }
+ //else if (valID == llvm::Value::BasicBlockVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::FunctionVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::GlobalAliasVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::BlockAddressVal)
+ //{
+ //}
+ else if (valID == llvm::Value::ConstantExprVal)
+ {
+ pair<unsigned,unsigned> size = getValueSize(operand);
+ TypedValue result;
+ result.size = size.first;
+ result.num = size.second;
+ result.data = m_pool.alloc(getTypeSize(operand->getType()));
+
+ // Use of const_cast here is ugly, but ConstExpr instructions
+ // shouldn't actually modify WorkItem state anyway
+ const_cast<WorkItem*>(this)->dispatch(
+ m_cache->getConstantExpr(operand), result);
+ return result;
+ }
+ else if (valID == llvm::Value::UndefValueVal ||
+ valID == llvm::Value::ConstantAggregateZeroVal ||
+ valID == llvm::Value::ConstantDataArrayVal ||
+ valID == llvm::Value::ConstantDataVectorVal ||
+ valID == llvm::Value::ConstantIntVal ||
+ valID == llvm::Value::ConstantFPVal ||
+ valID == llvm::Value::ConstantArrayVal ||
+ valID == llvm::Value::ConstantStructVal ||
+ valID == llvm::Value::ConstantVectorVal ||
+ valID == llvm::Value::ConstantPointerNullVal)
+ {
+ return m_cache->getConstant(operand);
+ }
+ //else if (valID == llvm::Value::MDNodeVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::MDStringVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::InlineAsmVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::PseudoSourceValueVal)
+ //{
+ //}
+ //else if (valID == llvm::Value::FixedStackPseudoSourceValueVal)
+ //{
+ //}
+ else
+ {
+ FATAL_ERROR("Unhandled operand type: %d", valID);
+ }
+
+ // Unreachable
+ assert(false);
+}
+
+Memory* WorkItem::getPrivateMemory() const
+{
+ return m_privateMemory;
+}
+
+WorkItem::State WorkItem::getState() const
+{
+ return m_state;
+}
+
+const unsigned char* WorkItem::getValueData(const llvm::Value *value) const
+{
+ if (!hasValue(value))
+ {
+ return NULL;
+ }
+ return getValue(value).data;
+}
+
+const llvm::Value* WorkItem::getVariable(std::string name) const
+{
+ VariableMap::const_iterator itr;
+ itr = m_variables.find(name);
+ if (itr == m_variables.end())
+ {
+ return NULL;
+ }
+ return itr->second;
+}
+
+const WorkGroup* WorkItem::getWorkGroup() const
+{
+ return m_workGroup;
+}
+
+bool WorkItem::hasValue(const llvm::Value *key) const
+{
+ return m_cache->hasValue(key);
+}
+
+bool WorkItem::printValue(const llvm::Value *value) const
+{
+ if (!hasValue(value))
+ {
+ return false;
+ }
+
+ printTypedData(value->getType(), getValue(value).data);
+
+ return true;
+}
+
+bool WorkItem::printVariable(string name) const
+{
+ // Find variable
+ const llvm::Value *value = getVariable(name);
+ if (!value)
+ {
+ return false;
+ }
+
+ // Get variable value
+ TypedValue result = getValue(value);
+ const llvm::Type *type = value->getType();
+
+ if (((const llvm::Instruction*)value)->getOpcode()
+ == llvm::Instruction::Alloca)
+ {
+ // If value is alloca result, look-up data at address
+ const llvm::Type *elemType = value->getType()->getPointerElementType();
+ size_t address = result.getPointer();
+
+ unsigned char *data = (unsigned char*)m_privateMemory->getPointer(address);
+ printTypedData(elemType, data);
+ }
+ else
+ {
+ printTypedData(type, result.data);
+ }
+
+ return true;
+}
+
+void WorkItem::setValue(const llvm::Value *key, TypedValue value)
+{
+ m_values[m_cache->getValueID(key)] = value;
+}
+
+WorkItem::State WorkItem::step()
+{
+ assert(m_state == READY);
+
+ // Execute the next instruction
+ execute(m_position->currInst);
+
+ // Check if we've reached the end of the block
+ if (++m_position->currInst == m_position->currBlock->end() ||
+ m_position->nextBlock)
+ {
+ if (m_position->nextBlock)
+ {
+ // Move to next basic block
+ m_position->prevBlock = m_position->currBlock;
+ m_position->currBlock = m_position->nextBlock;
+ m_position->nextBlock = NULL;
+ m_position->currInst = m_position->currBlock->begin();
+ }
+ }
+
+ return m_state;
+}
+
+
+///////////////////////////////
+//// Instruction execution ////
+///////////////////////////////
+
+#define INSTRUCTION(name) \
+ void WorkItem::name(const llvm::Instruction *instruction, TypedValue& result)
+
+INSTRUCTION(add)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) + opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(alloc)
+{
+ const llvm::AllocaInst *allocInst = ((const llvm::AllocaInst*)instruction);
+ const llvm::Type *type = allocInst->getAllocatedType();
+
+ // Perform allocation
+ unsigned size = getTypeSize(type);
+ size_t address = m_privateMemory->allocateBuffer(size);
+ if (!address)
+ FATAL_ERROR("Insufficient private memory (alloca)");
+
+ // Create pointer to alloc'd memory
+ result.setPointer(address);
+
+ // Track allocation in stack frame
+ if (!m_position->allocations.empty())
+ m_position->allocations.top().push_back(address);
+}
+
+INSTRUCTION(ashr)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ uint64_t shiftMask =
+ (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+ * 8 - 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getSInt(i) >> (opB.getUInt(i) & shiftMask), i);
+ }
+}
+
+INSTRUCTION(bitcast)
+{
+ const llvm::Value *op = instruction->getOperand(0);
+
+ // Check for address space casts
+ if (instruction->getType()->isPointerTy())
+ {
+ unsigned srcAddrSpace = op->getType()->getPointerAddressSpace();
+ unsigned dstAddrSpace = instruction->getType()->getPointerAddressSpace();
+ if (srcAddrSpace != dstAddrSpace)
+ {
+ FATAL_ERROR("Invalid pointer cast from %s to %s address spaces",
+ getAddressSpaceName(srcAddrSpace),
+ getAddressSpaceName(dstAddrSpace));
+ }
+ }
+
+ TypedValue operand = getOperand(op);
+ memcpy(result.data, operand.data, result.size*result.num);
+}
+
+INSTRUCTION(br)
+{
+ if (instruction->getNumOperands() == 1)
+ {
+ // Unconditional branch
+ m_position->nextBlock = (const llvm::BasicBlock*)instruction->getOperand(0);
+ }
+ else
+ {
+ // Conditional branch
+ bool pred = getOperand(instruction->getOperand(0)).getUInt();
+ const llvm::Value *iftrue = instruction->getOperand(2);
+ const llvm::Value *iffalse = instruction->getOperand(1);
+ m_position->nextBlock = (const llvm::BasicBlock*)(pred ? iftrue : iffalse);
+ }
+}
+
+INSTRUCTION(bwand)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) & opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(bwor)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) | opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(bwxor)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) ^ opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(call)
+{
+ const llvm::CallInst *callInst = (const llvm::CallInst*)instruction;
+ const llvm::Function *function = callInst->getCalledFunction();
+
+ // Check for indirect function calls
+ if (!callInst->getCalledFunction())
+ {
+ // Resolve indirect function pointer
+ const llvm::Value *func = callInst->getCalledValue();
+ const llvm::Value *funcPtr = ((const llvm::User*)func)->getOperand(0);
+ function = (const llvm::Function*)funcPtr;
+ }
+
+ // Check if function has definition
+ if (!function->isDeclaration())
+ {
+ m_position->callStack.push(m_position->currInst);
+ m_position->allocations.push(list<size_t>());
+ m_position->nextBlock = function->begin();
+
+ // Set function arguments
+ llvm::Function::const_arg_iterator argItr;
+ for (argItr = function->arg_begin();
+ argItr != function->arg_end(); argItr++)
+ {
+ const llvm::Value *arg = callInst->getArgOperand(argItr->getArgNo());
+ setValue(argItr, m_pool.clone(getOperand(arg)));
+ }
+
+ return;
+ }
+
+ // Call builtin function
+ InterpreterCache::Builtin builtin = m_cache->getBuiltin(function);
+ builtin.function.func(this, callInst,
+ builtin.name, builtin.overload,
+ result, builtin.function.op);
+}
+
+INSTRUCTION(extractelem)
+{
+ const llvm::ExtractElementInst *extract =
+ (const llvm::ExtractElementInst*)instruction;
+ unsigned index = getOperand(extract->getIndexOperand()).getUInt();
+ TypedValue operand = getOperand(extract->getVectorOperand());
+ memcpy(result.data, operand.data + result.size*index, result.size);
+}
+
+INSTRUCTION(extractval)
+{
+ const llvm::ExtractValueInst *extract =
+ (const llvm::ExtractValueInst*)instruction;
+ const llvm::Value *agg = extract->getAggregateOperand();
+ llvm::ArrayRef<unsigned int> indices = extract->getIndices();
+
+ // Compute offset for target value
+ int offset = 0;
+ const llvm::Type *type = agg->getType();
+ for (unsigned i = 0; i < indices.size(); i++)
+ {
+ if (type->isArrayTy())
+ {
+ type = type->getArrayElementType();
+ offset += getTypeSize(type) * indices[i];
+ }
+ else if (type->isStructTy())
+ {
+ offset += getStructMemberOffset((const llvm::StructType*)type,
+ indices[i]);
+ type = type->getStructElementType(indices[i]);
+ }
+ else
+ {
+ FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+ }
+ }
+
+ // Copy target value to result
+ memcpy(result.data, getOperand(agg).data + offset, getTypeSize(type));
+}
+
+INSTRUCTION(fadd)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(opA.getFloat(i) + opB.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fcmp)
+{
+ const llvm::CmpInst *cmpInst = (const llvm::CmpInst*)instruction;
+ llvm::CmpInst::Predicate pred = cmpInst->getPredicate();
+
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+
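+ // Vector comparisons set all bits (-1) for true; scalar comparisons use 1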
+ uint64_t t = result.num > 1 ? -1 : 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double a = opA.getFloat(i);
+ double b = opB.getFloat(i);
+
+ uint64_t r;
+ switch (pred)
+ {
+ case llvm::CmpInst::FCMP_OEQ:
+ case llvm::CmpInst::FCMP_UEQ:
+ r = a == b;
+ break;
+ case llvm::CmpInst::FCMP_ONE:
+ case llvm::CmpInst::FCMP_UNE:
+ r = a != b;
+ break;
+ case llvm::CmpInst::FCMP_OGT:
+ case llvm::CmpInst::FCMP_UGT:
+ r = a > b;
+ break;
+ case llvm::CmpInst::FCMP_OGE:
+ case llvm::CmpInst::FCMP_UGE:
+ r = a >= b;
+ break;
+ case llvm::CmpInst::FCMP_OLT:
+ case llvm::CmpInst::FCMP_ULT:
+ r = a < b;
+ break;
+ case llvm::CmpInst::FCMP_OLE:
+ case llvm::CmpInst::FCMP_ULE:
+ r = a <= b;
+ break;
+ case llvm::CmpInst::FCMP_FALSE:
+ r = false;
+ break;
+ case llvm::CmpInst::FCMP_TRUE:
+ r = true;
+ break;
+ case llvm::CmpInst::FCMP_ORD:
+ case llvm::CmpInst::FCMP_UNO:
+ // Result for non-NaN operands; the NaN check below overrides this
+ r = llvm::CmpInst::isOrdered(pred);
+ break;
+ default:
+ FATAL_ERROR("Unsupported FCmp predicate: %d", pred);
+ }
+
+ // Deal with NaN operands
+ if (::isnan(a) || ::isnan(b))
+ {
+ r = !llvm::CmpInst::isOrdered(pred);
+ }
+
+ result.setUInt(r ? t : 0, i);
+ }
+}
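+
+// Note: the ordered and unordered predicates only differ when an operand is
+// NaN. As an illustration, FCMP_OEQ(1.0, NAN) yields false while
+// FCMP_UEQ(1.0, NAN) yields true; the NaN check above implements this. For
+// vector comparisons a true lane is stored as all-ones (t == -1).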
+
+INSTRUCTION(fdiv)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(opA.getFloat(i) / opB.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fmul)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(opA.getFloat(i) * opB.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fpext)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(op.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fptosi)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt((int64_t)op.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fptoui)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt((uint64_t)op.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(frem)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(fmod(opA.getFloat(i), opB.getFloat(i)), i);
+ }
+}
+
+INSTRUCTION(fptrunc)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(op.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(fsub)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(opA.getFloat(i) - opB.getFloat(i), i);
+ }
+}
+
+INSTRUCTION(gep)
+{
+ const llvm::GetElementPtrInst *gepInst =
+ (const llvm::GetElementPtrInst*)instruction;
+
+ // Get base address
+ const llvm::Value *base = gepInst->getPointerOperand();
+ size_t address = getOperand(base).getPointer();
+ const llvm::Type *ptrType = gepInst->getPointerOperandType();
+
+ // Iterate over indices
+ llvm::User::const_op_iterator opItr;
+ for (opItr = gepInst->idx_begin(); opItr != gepInst->idx_end(); opItr++)
+ {
+ int64_t offset = getOperand(opItr->get()).getSInt();
+
+ if (ptrType->isPointerTy())
+ {
+ // Get pointer element size
+ const llvm::Type *elemType = ptrType->getPointerElementType();
+ address += offset*getTypeSize(elemType);
+ ptrType = elemType;
+ }
+ else if (ptrType->isArrayTy())
+ {
+ // Get array element size
+ const llvm::Type *elemType = ptrType->getArrayElementType();
+ address += offset*getTypeSize(elemType);
+ ptrType = elemType;
+ }
+ else if (ptrType->isVectorTy())
+ {
+ // Get vector element size
+ const llvm::Type *elemType = ptrType->getVectorElementType();
+ address += offset*getTypeSize(elemType);
+ ptrType = elemType;
+ }
+ else if (ptrType->isStructTy())
+ {
+ address +=
+ getStructMemberOffset((const llvm::StructType*)ptrType, offset);
+ ptrType = ptrType->getStructElementType(offset);
+ }
+ else
+ {
+ FATAL_ERROR("Unsupported GEP base type: %d", ptrType->getTypeID());
+ }
+ }
+
+ result.setPointer(address);
+}
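+
+// Worked example (hypothetical kernel types, default 4-byte layout assumed):
+// for a GEP on a pointer to struct { int a; float b[4]; } with indices
+// (0, 1, 2), the loop above adds 0 * sizeof(struct) for the first index, the
+// member offset of b (4 bytes) for the second, and 2 * sizeof(float) = 8 for
+// the third, giving the base address plus 12.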
+
+INSTRUCTION(icmp)
+{
+ const llvm::CmpInst *cmpInst = (const llvm::CmpInst*)instruction;
+ llvm::CmpInst::Predicate pred = cmpInst->getPredicate();
+
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+
+ uint64_t t = result.num > 1 ? -1 : 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ // Load operands
+ uint64_t ua = opA.getUInt(i);
+ uint64_t ub = opB.getUInt(i);
+ int64_t sa = opA.getSInt(i);
+ int64_t sb = opB.getSInt(i);
+
+ uint64_t r;
+ switch (pred)
+ {
+ case llvm::CmpInst::ICMP_EQ:
+ r = ua == ub;
+ break;
+ case llvm::CmpInst::ICMP_NE:
+ r = ua != ub;
+ break;
+ case llvm::CmpInst::ICMP_UGT:
+ r = ua > ub;
+ break;
+ case llvm::CmpInst::ICMP_UGE:
+ r = ua >= ub;
+ break;
+ case llvm::CmpInst::ICMP_ULT:
+ r = ua < ub;
+ break;
+ case llvm::CmpInst::ICMP_ULE:
+ r = ua <= ub;
+ break;
+ case llvm::CmpInst::ICMP_SGT:
+ r = sa > sb;
+ break;
+ case llvm::CmpInst::ICMP_SGE:
+ r = sa >= sb;
+ break;
+ case llvm::CmpInst::ICMP_SLT:
+ r = sa < sb;
+ break;
+ case llvm::CmpInst::ICMP_SLE:
+ r = sa <= sb;
+ break;
+ default:
+ FATAL_ERROR("Unsupported ICmp predicate: %d", pred);
+ }
+
+ result.setUInt(r ? t : 0, i);
+ }
+}
+
+INSTRUCTION(insertelem)
+{
+ TypedValue vector = getOperand(instruction->getOperand(0));
+ TypedValue element = getOperand(instruction->getOperand(1));
+ unsigned index = getOperand(instruction->getOperand(2)).getUInt();
+ memcpy(result.data, vector.data, result.size*result.num);
+ memcpy(result.data + index*result.size, element.data, result.size);
+}
+
+INSTRUCTION(insertval)
+{
+ const llvm::InsertValueInst *insert =
+ (const llvm::InsertValueInst*)instruction;
+
+ // Load original aggregate data
+ const llvm::Value *agg = insert->getAggregateOperand();
+ memcpy(result.data, getOperand(agg).data, result.size*result.num);
+
+ // Compute offset for inserted value
+ int offset = 0;
+ llvm::ArrayRef<unsigned int> indices = insert->getIndices();
+ const llvm::Type *type = agg->getType();
+ for (unsigned i = 0; i < indices.size(); i++)
+ {
+ if (type->isArrayTy())
+ {
+ type = type->getArrayElementType();
+ offset += getTypeSize(type) * indices[i];
+ }
+ else if (type->isStructTy())
+ {
+ offset += getStructMemberOffset((const llvm::StructType*)type,
+ indices[i]);
+ type = type->getStructElementType(indices[i]);
+ }
+ else
+ {
+ FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+ }
+ }
+
+ // Copy inserted value into result
+ const llvm::Value *value = insert->getInsertedValueOperand();
+ memcpy(result.data + offset, getOperand(value).data,
+ getTypeSize(value->getType()));
+}
+
+INSTRUCTION(inttoptr)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setPointer(op.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(itrunc)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ memcpy(result.data+i*result.size, op.data+i*op.size, result.size);
+ }
+}
+
+INSTRUCTION(load)
+{
+ const llvm::LoadInst *loadInst = (const llvm::LoadInst*)instruction;
+ unsigned addressSpace = loadInst->getPointerAddressSpace();
+ size_t address = getOperand(loadInst->getPointerOperand()).getPointer();
+
+ // Check address is correctly aligned
+ if (address & (loadInst->getAlignment()-1))
+ {
+ m_context->logError("Invalid memory load - source pointer is "
+ "not aligned to the pointed type");
+ }
+
+ // Load data
+ getMemory(addressSpace)->load(result.data, address, result.size*result.num);
+}
+
+INSTRUCTION(lshr)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ uint64_t shiftMask =
+ (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+ * 8 - 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) >> (opB.getUInt(i) & shiftMask), i);
+ }
+}
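+
+// Note: LLVM leaves shifts by amounts >= the bit width undefined, so the
+// shift amount is masked to the element width. For example, for a char4
+// element (result.size == 1) the mask is 7, so a shift by 9 behaves like a
+// shift by 1; scalar results use a mask of at least 31 (a full 32-bit range).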
+
+INSTRUCTION(mul)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) * opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(phi)
+{
+ const llvm::PHINode *phiNode = (const llvm::PHINode*)instruction;
+ const llvm::Value *value = phiNode->getIncomingValueForBlock(
+ (const llvm::BasicBlock*)m_position->prevBlock);
+ memcpy(result.data, getOperand(value).data, result.size*result.num);
+}
+
+INSTRUCTION(ptrtoint)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(op.getPointer(i), i);
+ }
+}
+
+INSTRUCTION(ret)
+{
+ const llvm::ReturnInst *retInst = (const llvm::ReturnInst*)instruction;
+
+ if (!m_position->callStack.empty())
+ {
+ m_position->currInst = m_position->callStack.top();
+ m_position->currBlock = m_position->currInst->getParent();
+ m_position->callStack.pop();
+
+ // Set return value
+ const llvm::Value *returnVal = retInst->getReturnValue();
+ if (returnVal)
+ {
+ setValue(m_position->currInst, m_pool.clone(getOperand(returnVal)));
+ }
+
+ // Clear stack allocations
+ list<size_t>& allocs = m_position->allocations.top();
+ list<size_t>::iterator itr;
+ for (itr = allocs.begin(); itr != allocs.end(); itr++)
+ {
+ m_privateMemory->deallocateBuffer(*itr);
+ }
+ m_position->allocations.pop();
+ }
+ else
+ {
+ m_position->nextBlock = NULL;
+ m_state = FINISHED;
+ m_workGroup->notifyFinished(this);
+ m_context->notifyWorkItemComplete(this);
+ }
+}
+
+INSTRUCTION(sdiv)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int64_t a = opA.getSInt(i);
+ int64_t b = opB.getSInt(i);
+ int64_t r = 0;
+ if (b && !(a == INT64_MIN && b == -1))
+ {
+ r = a / b;
+ }
+ result.setSInt(r, i);
+ }
+}
+
+INSTRUCTION(select)
+{
+ const llvm::SelectInst *selectInst = (const llvm::SelectInst*)instruction;
+ TypedValue opCondition = getOperand(selectInst->getCondition());
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ const bool cond =
+ selectInst->getCondition()->getType()->isVectorTy() ?
+ opCondition.getUInt(i) :
+ opCondition.getUInt();
+ const llvm::Value *op = cond ?
+ selectInst->getTrueValue() :
+ selectInst->getFalseValue();
+ memcpy(result.data + i*result.size,
+ getOperand(op).data + i*result.size,
+ result.size);
+ }
+}
+
+INSTRUCTION(sext)
+{
+ const llvm::Value *operand = instruction->getOperand(0);
+ TypedValue value = getOperand(operand);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int64_t val = value.getSInt(i);
+ if (operand->getType()->getPrimitiveSizeInBits() == 1)
+ {
+ val = val ? -1 : 0;
+ }
+ result.setSInt(val, i);
+ }
+}
+
+INSTRUCTION(shl)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ uint64_t shiftMask =
+ (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+ * 8 - 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) << (opB.getUInt(i) & shiftMask), i);
+ }
+}
+
+INSTRUCTION(shuffle)
+{
+ const llvm::ShuffleVectorInst *shuffle =
+ (const llvm::ShuffleVectorInst*)instruction;
+
+ const llvm::Value *v1 = shuffle->getOperand(0);
+ const llvm::Value *v2 = shuffle->getOperand(1);
+ TypedValue mask = getOperand(shuffle->getMask());
+
+ unsigned num = v1->getType()->getVectorNumElements();
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ if (shuffle->getMask()->getAggregateElement(i)->getValueID()
+ == llvm::Value::UndefValueVal)
+ {
+ // Don't care / undef
+ continue;
+ }
+
+ const llvm::Value *src = v1;
+ unsigned int index = mask.getUInt(i);
+ if (index >= num)
+ {
+ index -= num;
+ src = v2;
+ }
+ memcpy(result.data + i*result.size,
+ getOperand(src).data + index*result.size, result.size);
+ }
+}
+
+INSTRUCTION(sitofp)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(op.getSInt(i), i);
+ }
+}
+
+INSTRUCTION(srem)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int64_t a = opA.getSInt(i);
+ int64_t b = opB.getSInt(i);
+ int64_t r = 0;
+ if (b && !(a == INT64_MIN && b == -1))
+ {
+ r = a % b;
+ }
+ result.setSInt(r, i);
+ }
+}
+
+INSTRUCTION(store)
+{
+ const llvm::StoreInst *storeInst = (const llvm::StoreInst*)instruction;
+ unsigned addressSpace = storeInst->getPointerAddressSpace();
+ size_t address = getOperand(storeInst->getPointerOperand()).getPointer();
+
+ // Check address is correctly aligned
+ if (address & (storeInst->getAlignment()-1))
+ {
+ m_context->logError("Invalid memory store - source pointer is "
+ "not aligned to the pointed type");
+ }
+
+ // Store data
+ TypedValue operand = getOperand(storeInst->getValueOperand());
+ getMemory(addressSpace)->store(operand.data, address,
+ operand.size*operand.num);
+}
+
+INSTRUCTION(sub)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(opA.getUInt(i) - opB.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(swtch)
+{
+ const llvm::SwitchInst *swtch = (const llvm::SwitchInst*)instruction;
+ const llvm::Value *cond = swtch->getCondition();
+ uint64_t val = getOperand(cond).getUInt();
+ const llvm::ConstantInt *cval =
+ (const llvm::ConstantInt*)llvm::ConstantInt::get(cond->getType(), val);
+ m_position->nextBlock = swtch->findCaseValue(cval).getCaseSuccessor();
+}
+
+INSTRUCTION(udiv)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t a = opA.getUInt(i);
+ uint64_t b = opB.getUInt(i);
+ result.setUInt(b ? a / b : 0, i);
+ }
+}
+
+INSTRUCTION(uitofp)
+{
+ TypedValue op = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(op.getUInt(i), i);
+ }
+}
+
+INSTRUCTION(urem)
+{
+ TypedValue opA = getOperand(instruction->getOperand(0));
+ TypedValue opB = getOperand(instruction->getOperand(1));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t a = opA.getUInt(i);
+ uint64_t b = opB.getUInt(i);
+ result.setUInt(b ? a % b : 0, i);
+ }
+}
+
+INSTRUCTION(zext)
+{
+ TypedValue operand = getOperand(instruction->getOperand(0));
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(operand.getUInt(i), i);
+ }
+}
+
+#undef INSTRUCTION
+
+
+////////////////////////////////
+// WorkItem::InterpreterCache //
+////////////////////////////////
+
+InterpreterCache::InterpreterCache(llvm::Function *kernel)
+{
+ // TODO: Determine this number dynamically?
+ m_valueIDs.reserve(1024);
+
+ // Add global variables to cache
+ // TODO: Only add variables that are used?
+ const llvm::Module *module = kernel->getParent();
+ llvm::Module::const_global_iterator G;
+ for (G = module->global_begin(); G != module->global_end(); G++)
+ {
+ addValueID(G);
+ }
+
+
+ set<llvm::Function*> processed;
+ set<llvm::Function*> pending;
+
+ pending.insert(kernel);
+
+ while (!pending.empty())
+ {
+ // Get next function to process
+ llvm::Function *function = *pending.begin();
+ processed.insert(function);
+ pending.erase(function);
+
+ // Iterate through the function arguments
+ llvm::Function::arg_iterator A;
+ for (A = function->arg_begin(); A != function->arg_end(); A++)
+ {
+ addValueID(A);
+ }
+
+ // Iterate through instructions in function
+ llvm::inst_iterator I;
+ for (I = inst_begin(function); I != inst_end(function); I++)
+ {
+ addValueID(&*I);
+
+ // Check for function calls
+ if (I->getOpcode() == llvm::Instruction::Call)
+ {
+ const llvm::CallInst *call = ((const llvm::CallInst*)&*I);
+ llvm::Function *callee =
+ (llvm::Function*)call->getCalledValue()->stripPointerCasts();
+ if (callee->isDeclaration())
+ {
+ // Resolve builtin function calls
+ addBuiltin(callee);
+ }
+ else if (!processed.count(callee))
+ {
+ // Process called function
+ pending.insert(callee);
+ }
+ }
+
+ // Process operands
+ for (llvm::User::value_op_iterator O = I->value_op_begin();
+ O != I->value_op_end(); O++)
+ {
+ addOperand(*O);
+ }
+ }
+ }
+}
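+
+// Note: the constructor above performs a worklist traversal of the kernel's
+// static call graph, so value IDs, constants and builtin entries are cached
+// for every function reachable from the kernel before execution begins.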
+
+InterpreterCache::~InterpreterCache()
+{
+ ConstantMap::iterator constItr;
+ for (constItr = m_constants.begin();
+ constItr != m_constants.end(); constItr++)
+ {
+ delete[] constItr->second.data;
+ }
+
+ ConstExprMap::iterator constExprItr;
+ for (constExprItr = m_constExpressions.begin();
+ constExprItr != m_constExpressions.end(); constExprItr++)
+ {
+ delete constExprItr->second;
+ }
+}
+
+void InterpreterCache::addBuiltin(
+ const llvm::Function *function)
+{
+ // Check if already in cache
+ InterpreterCache::BuiltinMap::iterator fItr = m_builtins.find(function);
+ if (fItr != m_builtins.end())
+ {
+ return;
+ }
+
+ // Extract unmangled name and overload
+ string name, overload;
+ const string fullname = function->getName().str();
+ if (fullname.compare(0,2, "_Z") == 0)
+ {
+ int len = atoi(fullname.c_str()+2);
+ int start = fullname.find_first_not_of("0123456789", 2);
+ name = fullname.substr(start, len);
+ overload = fullname.substr(start + len);
+ }
+ else
+ {
+ name = fullname;
+ overload = "";
+ }
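+
+ // Example (hypothetical mangled name): for "_Z3dotDv4_fS_" the length
+ // prefix is 3, so name becomes "dot" and overload becomes "Dv4_fS_".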
+
+ // Find builtin function in map
+ BuiltinFunctionMap::iterator bItr = workItemBuiltins.find(name);
+ if (bItr != workItemBuiltins.end())
+ {
+ // Add builtin to cache
+ const InterpreterCache::Builtin builtin = {bItr->second, name, overload};
+ m_builtins[function] = builtin;
+ return;
+ }
+
+ // Check for builtin with matching prefix
+ BuiltinFunctionPrefixList::iterator pItr;
+ for (pItr = workItemPrefixBuiltins.begin();
+ pItr != workItemPrefixBuiltins.end(); pItr++)
+ {
+ if (name.compare(0, pItr->first.length(), pItr->first) == 0)
+ {
+ // Add builtin to cache
+ const InterpreterCache::Builtin builtin = {pItr->second, name, overload};
+ m_builtins[function] = builtin;
+ return;
+ }
+ }
+
+ // Function didn't match any builtins
+ FATAL_ERROR("Undefined external function: %s", name.c_str());
+}
+
+InterpreterCache::Builtin InterpreterCache::getBuiltin(
+ const llvm::Function *function) const
+{
+ return m_builtins.at(function);
+}
+
+void InterpreterCache::addConstant(const llvm::Value *value)
+{
+ // Check if constant already in cache
+ if (m_constants.count(value))
+ {
+ return;
+ }
+
+ // Create constant and add to cache
+ pair<unsigned,unsigned> size = getValueSize(value);
+ TypedValue constant;
+ constant.size = size.first;
+ constant.num = size.second;
+ constant.data = new unsigned char[getTypeSize(value->getType())];
+ getConstantData(constant.data, (const llvm::Constant*)value);
+
+ m_constants[value] = constant;
+}
+
+TypedValue InterpreterCache::getConstant(const llvm::Value *operand) const
+{
+ ConstantMap::const_iterator itr = m_constants.find(operand);
+ if (itr == m_constants.end())
+ {
+ FATAL_ERROR("Constant not found in cache (ID %d)", operand->getValueID());
+ }
+ return itr->second;
+}
+
+const llvm::Instruction* InterpreterCache::getConstantExpr(
+ const llvm::Value *expr) const
+{
+ ConstExprMap::const_iterator itr = m_constExpressions.find(expr);
+ if (itr == m_constExpressions.end())
+ {
+ FATAL_ERROR("Constant expression not found in cache");
+ }
+ return itr->second;
+}
+
+unsigned InterpreterCache::addValueID(const llvm::Value *value)
+{
+ ValueMap::iterator itr = m_valueIDs.find(value);
+ if (itr == m_valueIDs.end())
+ {
+ // Assign next index to value
+ unsigned pos = m_valueIDs.size();
+ itr = m_valueIDs.insert(make_pair(value, pos)).first;
+ }
+ return itr->second;
+}
+
+unsigned InterpreterCache::getValueID(const llvm::Value *value) const
+{
+ ValueMap::const_iterator itr = m_valueIDs.find(value);
+ if (itr == m_valueIDs.end())
+ {
+ FATAL_ERROR("Value not found in cache (ID %d)", value->getValueID());
+ }
+ return itr->second;
+}
+
+unsigned InterpreterCache::getNumValues() const
+{
+ return m_valueIDs.size();
+}
+
+bool InterpreterCache::hasValue(const llvm::Value *value) const
+{
+ return m_valueIDs.count(value);
+}
+
+void InterpreterCache::addOperand(const llvm::Value *operand)
+{
+ addValueID(operand);
+
+ // Resolve constants
+ if (operand->getValueID() == llvm::Value::UndefValueVal ||
+ operand->getValueID() == llvm::Value::ConstantAggregateZeroVal ||
+ operand->getValueID() == llvm::Value::ConstantDataArrayVal ||
+ operand->getValueID() == llvm::Value::ConstantDataVectorVal ||
+ operand->getValueID() == llvm::Value::ConstantIntVal ||
+ operand->getValueID() == llvm::Value::ConstantFPVal ||
+ operand->getValueID() == llvm::Value::ConstantArrayVal ||
+ operand->getValueID() == llvm::Value::ConstantStructVal ||
+ operand->getValueID() == llvm::Value::ConstantVectorVal ||
+ operand->getValueID() == llvm::Value::ConstantPointerNullVal)
+ {
+ addConstant(operand);
+ }
+ else if (operand->getValueID() == llvm::Value::ConstantExprVal)
+ {
+ // Resolve constant expressions
+ const llvm::ConstantExpr *expr = (const llvm::ConstantExpr*)operand;
+ if (!m_constExpressions.count(expr))
+ {
+ for (llvm::User::const_op_iterator O = expr->op_begin();
+ O != expr->op_end(); O++)
+ {
+ addOperand(*O);
+ }
+ m_constExpressions[expr] = getConstExprAsInstruction(expr);
+ // TODO: Resolve actual value?
+ }
+ }
+}
+
+
+//////////////////////////
+// WorkItem::MemoryPool //
+//////////////////////////
+
+WorkItem::MemoryPool::MemoryPool(size_t blockSize) : m_blockSize(blockSize)
+{
+ // Force first allocation to create new block
+ m_offset = m_blockSize;
+}
+
+WorkItem::MemoryPool::~MemoryPool()
+{
+ list<unsigned char*>::iterator itr;
+ for (itr = m_blocks.begin(); itr != m_blocks.end(); itr++)
+ {
+ delete[] *itr;
+ }
+}
+
+unsigned char* WorkItem::MemoryPool::alloc(size_t size)
+{
+ // Check if requested size larger than block size
+ if (size > m_blockSize)
+ {
+ // Oversized buffers allocated separately from main pool
+ unsigned char *buffer = new unsigned char[size];
+ m_blocks.push_back(buffer);
+ return buffer;
+ }
+
+ // Check if enough space in current block
+ if (m_offset + size > m_blockSize)
+ {
+ // Allocate new block
+ m_blocks.push_front(new unsigned char[m_blockSize]);
+ m_offset = 0;
+ }
+ unsigned char *buffer = m_blocks.front() + m_offset;
+ m_offset += size;
+ return buffer;
+}
+
+TypedValue WorkItem::MemoryPool::clone(const TypedValue& source)
+{
+ TypedValue dest;
+ dest.size = source.size;
+ dest.num = source.num;
+ dest.data = alloc(dest.size*dest.num);
+ memcpy(dest.data, source.data, dest.size*dest.num);
+ return dest;
+}
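+
+// Usage sketch (block and request sizes are illustrative only): with the
+// default 1024-byte block, three alloc(256) calls are served from one block
+// at offsets 0, 256 and 512, while a later alloc(2048) exceeds the block size
+// and gets its own oversized buffer. Nothing is freed individually - all
+// blocks are released together when the pool is destroyed.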
diff --git a/src/core/WorkItem.h b/src/core/WorkItem.h
new file mode 100644
index 0000000..ae8380c
--- /dev/null
+++ b/src/core/WorkItem.h
@@ -0,0 +1,213 @@
+// WorkItem.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+namespace llvm
+{
+ class CallInst;
+ class ConstExpr;
+ class DbgValueInst;
+ class Function;
+ class Module;
+}
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+ class KernelInvocation;
+ class Memory;
+ class WorkGroup;
+ class WorkItem;
+ class WorkItemBuiltins;
+
+ // Data structures for builtin functions
+ typedef struct _BuiltinFunction
+ {
+ void (*func)(WorkItem*, const llvm::CallInst*,
+ const std::string&, const std::string&, TypedValue&, void*);
+ void *op;
+ _BuiltinFunction(){};
+ _BuiltinFunction(void (*f)(WorkItem*, const llvm::CallInst*,
+ const std::string&, const std::string&, TypedValue&,
+ void*),
+ void *o) : func(f), op(o) {};
+ } BuiltinFunction;
+ typedef std::unordered_map<std::string,BuiltinFunction> BuiltinFunctionMap;
+ typedef std::list< std::pair<std::string, BuiltinFunction> >
+ BuiltinFunctionPrefixList;
+
+ extern BuiltinFunctionMap workItemBuiltins;
+ extern BuiltinFunctionPrefixList workItemPrefixBuiltins;
+
+ // Per-kernel cache for various interpreter state information
+ class InterpreterCache
+ {
+ public:
+ typedef struct
+ {
+ BuiltinFunction function;
+ std::string name, overload;
+ } Builtin;
+
+ InterpreterCache(llvm::Function *kernel);
+ ~InterpreterCache();
+
+ void addBuiltin(const llvm::Function *function);
+ Builtin getBuiltin(const llvm::Function *function) const;
+
+ void addConstant(const llvm::Value *constant);
+ TypedValue getConstant(const llvm::Value *operand) const;
+ const llvm::Instruction* getConstantExpr(const llvm::Value *expr) const;
+
+ unsigned addValueID(const llvm::Value *value);
+ unsigned getValueID(const llvm::Value *value) const;
+ unsigned getNumValues() const;
+ bool hasValue(const llvm::Value *value) const;
+
+ private:
+ typedef std::unordered_map<const llvm::Value*, unsigned> ValueMap;
+ typedef std::unordered_map<const llvm::Function*, Builtin> BuiltinMap;
+ typedef std::unordered_map<const llvm::Value*, TypedValue> ConstantMap;
+ typedef std::unordered_map<const llvm::Value*, const llvm::Instruction*>
+ ConstExprMap;
+
+ BuiltinMap m_builtins;
+ ConstantMap m_constants;
+ ConstExprMap m_constExpressions;
+ ValueMap m_valueIDs;
+
+ void addOperand(const llvm::Value *value);
+ };
+
+ class WorkItem
+ {
+ friend class WorkItemBuiltins;
+
+ public:
+ enum State {READY, BARRIER, FINISHED};
+
+ private:
+ class MemoryPool
+ {
+ public:
+ MemoryPool(size_t blockSize = 1024);
+ ~MemoryPool();
+ unsigned char* alloc(size_t size);
+ TypedValue clone(const TypedValue& source);
+ private:
+ size_t m_blockSize;
+ size_t m_offset;
+ std::list<unsigned char *> m_blocks;
+ } mutable m_pool;
+
+ public:
+ WorkItem(const KernelInvocation *kernelInvocation,
+ WorkGroup *workGroup, Size3 lid);
+ virtual ~WorkItem();
+
+ void clearBarrier();
+ void dispatch(const llvm::Instruction *instruction, TypedValue& result);
+ void execute(const llvm::Instruction *instruction);
+ const std::stack<const llvm::Instruction*>& getCallStack() const;
+ const llvm::Instruction* getCurrentInstruction() const;
+ Size3 getGlobalID() const;
+ size_t getGlobalIndex() const;
+ Size3 getLocalID() const;
+ TypedValue getOperand(const llvm::Value *operand) const;
+ Memory* getPrivateMemory() const;
+ State getState() const;
+ const unsigned char* getValueData(const llvm::Value *value) const;
+ const llvm::Value* getVariable(std::string name) const;
+ const WorkGroup* getWorkGroup() const;
+ bool printValue(const llvm::Value *value) const;
+ bool printVariable(std::string name) const;
+ State step();
+
+ // SPIR instructions
+ private:
+#define INSTRUCTION(name) \
+ void name(const llvm::Instruction *instruction, TypedValue& result)
+ INSTRUCTION(add);
+ INSTRUCTION(alloc);
+ INSTRUCTION(ashr);
+ INSTRUCTION(bitcast);
+ INSTRUCTION(br);
+ INSTRUCTION(bwand);
+ INSTRUCTION(bwor);
+ INSTRUCTION(bwxor);
+ INSTRUCTION(call);
+ INSTRUCTION(extractelem);
+ INSTRUCTION(extractval);
+ INSTRUCTION(fadd);
+ INSTRUCTION(fcmp);
+ INSTRUCTION(fdiv);
+ INSTRUCTION(fmul);
+ INSTRUCTION(fpext);
+ INSTRUCTION(fptosi);
+ INSTRUCTION(fptoui);
+ INSTRUCTION(fptrunc);
+ INSTRUCTION(frem);
+ INSTRUCTION(fsub);
+ INSTRUCTION(gep);
+ INSTRUCTION(icmp);
+ INSTRUCTION(insertelem);
+ INSTRUCTION(insertval);
+ INSTRUCTION(inttoptr);
+ INSTRUCTION(itrunc);
+ INSTRUCTION(load);
+ INSTRUCTION(lshr);
+ INSTRUCTION(mul);
+ INSTRUCTION(phi);
+ INSTRUCTION(ptrtoint);
+ INSTRUCTION(ret);
+ INSTRUCTION(sdiv);
+ INSTRUCTION(select);
+ INSTRUCTION(sext);
+ INSTRUCTION(shl);
+ INSTRUCTION(shuffle);
+ INSTRUCTION(sitofp);
+ INSTRUCTION(srem);
+ INSTRUCTION(store);
+ INSTRUCTION(sub);
+ INSTRUCTION(swtch);
+ INSTRUCTION(udiv);
+ INSTRUCTION(uitofp);
+ INSTRUCTION(urem);
+ INSTRUCTION(zext);
+#undef INSTRUCTION
+
+ private:
+ typedef std::map<std::string, const llvm::Value*> VariableMap;
+
+ size_t m_globalIndex;
+ Size3 m_globalID;
+ Size3 m_localID;
+ TypedValueMap m_phiTemps;
+ VariableMap m_variables;
+ const Context *m_context;
+ const KernelInvocation *m_kernelInvocation;
+ Memory *m_privateMemory;
+ WorkGroup *m_workGroup;
+
+ State m_state;
+ struct Position;
+ Position *m_position;
+
+ Memory* getMemory(unsigned int addrSpace) const;
+
+ // Store for instruction results and other operand values
+ std::vector<TypedValue> m_values;
+ TypedValue getValue(const llvm::Value *key) const;
+ bool hasValue(const llvm::Value *key) const;
+ void setValue(const llvm::Value *key, TypedValue value);
+
+ const InterpreterCache *m_cache;
+ };
+}
diff --git a/src/core/WorkItemBuiltins.cpp b/src/core/WorkItemBuiltins.cpp
new file mode 100644
index 0000000..cce6da6
--- /dev/null
+++ b/src/core/WorkItemBuiltins.cpp
@@ -0,0 +1,3561 @@
+// WorkItemBuiltins.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+#include <algorithm>
+#include <fenv.h>
+#include <mutex>
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#if LLVM_VERSION > 36
+#include "llvm/IR/DebugInfoMetadata.h"
+#endif
+
+#include "CL/cl.h"
+#include "Context.h"
+#include "half.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+#define CLK_NORMALIZED_COORDS_TRUE 0x0001
+
+#define CLK_ADDRESS_NONE 0x0000
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002
+#define CLK_ADDRESS_CLAMP 0x0004
+#define CLK_ADDRESS_REPEAT 0x0006
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008
+#define CLK_ADDRESS_MASK 0x000E
+
+#define CLK_FILTER_NEAREST 0x0010
+#define CLK_FILTER_LINEAR 0x0020
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
+namespace oclgrind
+{
+ static mutex printfMutex;
+
+ class WorkItemBuiltins
+ {
+ // Utility macros for creating builtins
+#define DEFINE_BUILTIN(name) \
+ static void name(WorkItem *workItem, const llvm::CallInst *callInst, \
+ const string& fnName, const string& overload, \
+ TypedValue& result, void *)
+#define ARG(i) (callInst->getArgOperand(i))
+#define UARGV(i,v) workItem->getOperand(ARG(i)).getUInt(v)
+#define SARGV(i,v) workItem->getOperand(ARG(i)).getSInt(v)
+#define FARGV(i,v) workItem->getOperand(ARG(i)).getFloat(v)
+#define PARGV(i,v) workItem->getOperand(ARG(i)).getPointer(v)
+#define UARG(i) UARGV(i, 0)
+#define SARG(i) SARGV(i, 0)
+#define FARG(i) FARGV(i, 0)
+#define PARG(i) PARGV(i, 0)
+
+ // Functions that apply generic builtins to each component of a vector
+ static void f1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, double (*func)(double))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(func(FARGV(0, i)), i);
+ }
+ }
+ static void f2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, double (*func)(double, double))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(func(FARGV(0, i), FARGV(1, i)), i);
+ }
+ }
+ static void f3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, double (*func)(double, double, double))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(func(FARGV(0, i), FARGV(1, i), FARGV(2, i)), i);
+ }
+ }
+ static void u1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, uint64_t (*func)(uint64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(func(UARGV(0, i)), i);
+ }
+ }
+ static void u2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, uint64_t (*func)(uint64_t, uint64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(func(UARGV(0, i), UARGV(1, i)), i);
+ }
+ }
+ static void u3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, uint64_t (*func)(uint64_t, uint64_t, uint64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(func(UARGV(0, i), UARGV(1, i), UARGV(2, i)), i);
+ }
+ }
+ static void s1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, int64_t (*func)(int64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(func(SARGV(0, i)), i);
+ }
+ }
+ static void s2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, int64_t (*func)(int64_t, int64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(func(SARGV(0, i), SARGV(1, i)), i);
+ }
+ }
+ static void s3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, int64_t (*func)(int64_t, int64_t, int64_t))
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(func(SARGV(0, i), SARGV(1, i), SARGV(2, i)), i);
+ }
+ }
+ static void rel1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, int64_t (*func)(double))
+ {
+ int64_t t = result.num > 1 ? -1 : 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(func(FARGV(0, i))*t, i);
+ }
+ }
+ static void rel2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+ const string& name, const string& overload,
+ TypedValue& result, int64_t (*func)(double, double))
+ {
+ int64_t t = result.num > 1 ? -1 : 1;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(func(FARGV(0, i), FARGV(1, i))*t, i);
+ }
+ }
+
+ // Extract the (first) argument type from an overload string
+ static char getOverloadArgType(const string& overload)
+ {
+ char type = overload[0];
+ if (type == 'D')
+ {
+ char *typestr;
+ strtol(overload.c_str() + 2, &typestr, 10);
+ type = typestr[1];
+ }
+ return type;
+ }
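+
+ // Examples (hypothetical overload strings): "ff" (two scalar floats)
+ // returns 'f'; "Dv4_jS_" (two uint4 vectors) skips the "Dv" prefix,
+ // parses the vector width and returns the element type 'j'.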
+
+
+ ///////////////////////////////////////
+ // Async Copy and Prefetch Functions //
+ ///////////////////////////////////////
+
+ DEFINE_BUILTIN(async_work_group_copy)
+ {
+ int arg = 0;
+
+ // Get src/dest addresses
+ const llvm::Value *destOp = ARG(arg++);
+ const llvm::Value *srcOp = ARG(arg++);
+ size_t dest = workItem->getOperand(destOp).getPointer();
+ size_t src = workItem->getOperand(srcOp).getPointer();
+
+ // Get size of copy
+ unsigned elemSize =
+ getTypeSize(destOp->getType()->getPointerElementType());
+ uint64_t num = UARG(arg++);
+
+ // Get stride
+ uint64_t stride = 1;
+ size_t srcStride = 1;
+ size_t destStride = 1;
+ if (fnName == "async_work_group_strided_copy")
+ {
+ stride = UARG(arg++);
+ }
+
+ size_t event = UARG(arg++);
+
+ // Get type of copy
+ WorkGroup::AsyncCopyType type;
+ if (destOp->getType()->getPointerAddressSpace() == AddrSpaceLocal)
+ {
+ type = WorkGroup::GLOBAL_TO_LOCAL;
+ srcStride = stride;
+ }
+ else
+ {
+ type = WorkGroup::LOCAL_TO_GLOBAL;
+ destStride = stride;
+ }
+
+ // Register copy
+ event = workItem->m_workGroup->async_copy(
+ workItem,
+ callInst,
+ type,
+ dest,
+ src,
+ elemSize,
+ num,
+ srcStride,
+ destStride,
+ event);
+ result.setUInt(event);
+ }
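+
+ // Example (illustrative arguments): a kernel call
+ // async_work_group_strided_copy(dst_local, src_global, 64, 2, 0) reaches
+ // here with num = 64 and stride = 2; because the destination is __local
+ // the copy is registered as GLOBAL_TO_LOCAL with srcStride = 2.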
+
+ DEFINE_BUILTIN(wait_group_events)
+ {
+ uint64_t num = UARG(0);
+ size_t address = PARG(1);
+ list<size_t> events;
+ for (unsigned i = 0; i < num; i++)
+ {
+ size_t event;
+ if (!workItem->m_privateMemory->load((unsigned char*)&event,
+ address, sizeof(size_t)))
+ {
+ return;
+ }
+ events.push_back(event);
+ address += sizeof(size_t);
+ }
+ workItem->m_state = WorkItem::BARRIER;
+ workItem->m_workGroup->notifyBarrier(workItem, callInst,
+ CLK_LOCAL_MEM_FENCE, events);
+ }
+
+ DEFINE_BUILTIN(prefetch)
+ {
+ // Do nothing.
+ }
+
+
+ //////////////////////
+ // Atomic Functions //
+ //////////////////////
+
+ DEFINE_BUILTIN(atomic_add)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_add");
+ }
+ uint32_t old = memory->atomic(AtomicAdd, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_and)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_and");
+ }
+ uint32_t old = memory->atomic(AtomicAnd, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_cmpxchg)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_cmpxchg");
+ }
+ uint32_t old = memory->atomicCmpxchg(address, UARG(1), UARG(2));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_dec)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_dec");
+ }
+ uint32_t old = memory->atomic(AtomicDec, address);
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_inc)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_dec");
+ }
+ uint32_t old = memory->atomic(AtomicInc, address);
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_max)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_max");
+ }
+ uint32_t old = memory->atomic(AtomicMax, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_min)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_min");
+ }
+ uint32_t old = memory->atomic(AtomicMin, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_or)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_or");
+ }
+ uint32_t old = memory->atomic(AtomicOr, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_sub)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_sub");
+ }
+ uint32_t old = memory->atomic(AtomicSub, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_xchg)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_xchg");
+ }
+ uint32_t old = memory->atomic(AtomicXchg, address, UARG(1));
+ result.setUInt(old);
+ }
+
+ DEFINE_BUILTIN(atomic_xor)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+
+ size_t address = PARG(0);
+ // Verify the address is 4-byte aligned
+ if ((address & 0x3) != 0) {
+ workItem->m_context->logError("Unaligned address on atomic_xor");
+ }
+ uint32_t old = memory->atomic(AtomicXor, address, UARG(1));
+ result.setUInt(old);
+ }
+
+
+ //////////////////////
+ // Common Functions //
+ //////////////////////
+
+ template<typename T> static T _max_(T a, T b){return a > b ? a : b;}
+ template<typename T> static T _min_(T a, T b){return a < b ? a : b;}
+ template<typename T> static T _clamp_(T x, T min, T max)
+ {
+ return _min_(_max_(x, min), max);
+ }
+
+ static double _degrees_(double x)
+ {
+ return x * (180 / M_PI);
+ }
+
+ static double _radians_(double x)
+ {
+ return x * (M_PI / 180);
+ }
+
+ static double _sign_(double x)
+ {
+ if (::isnan(x)) return 0.0;
+ if (x > 0.0) return 1.0;
+ if (x < 0.0) return -1.0;
+ return x; // x is +0.0 or -0.0 here; preserve the sign of zero
+ }
+
+ DEFINE_BUILTIN(clamp)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'f':
+ case 'd':
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ f3arg(workItem, callInst, fnName, overload, result, _clamp_);
+ }
+ else
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double minval = FARG(1);
+ double maxval = FARG(2);
+ result.setFloat(_clamp_(x, minval, maxval), i);
+ }
+ }
+ break;
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ u3arg(workItem, callInst, fnName, overload, result, _clamp_);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ s3arg(workItem, callInst, fnName, overload, result, _clamp_);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+
+ DEFINE_BUILTIN(max)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'f':
+ case 'd':
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ f2arg(workItem, callInst, fnName, overload, result, fmax);
+ }
+ else
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double y = FARG(1);
+ result.setFloat(_max_(x, y), i);
+ }
+ }
+ break;
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ u2arg(workItem, callInst, fnName, overload, result, _max_);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ s2arg(workItem, callInst, fnName, overload, result, _max_);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+
+ DEFINE_BUILTIN(min)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'f':
+ case 'd':
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ f2arg(workItem, callInst, fnName, overload, result, fmin);
+ }
+ else
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double y = FARG(1);
+ result.setFloat(_min_(x, y), i);
+ }
+ }
+ break;
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ u2arg(workItem, callInst, fnName, overload, result, _min_);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ s2arg(workItem, callInst, fnName, overload, result, _min_);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+
+ DEFINE_BUILTIN(mix)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double y = FARGV(1, i);
+ double a = ARG(2)->getType()->isVectorTy() ? FARGV(2, i) : FARG(2);
+ double r = x + (y - x) * a;
+ result.setFloat(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(smoothstep)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double edge0 = ARG(0)->getType()->isVectorTy() ? FARGV(0, i) : FARG(0);
+ double edge1 = ARG(1)->getType()->isVectorTy() ? FARGV(1, i) : FARG(1);
+ double x = FARGV(2, i);
+ double t = _clamp_<double>((x - edge0) / (edge1 - edge0), 0, 1);
+ double r = t * t * (3 - 2*t);
+ result.setFloat(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(step)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double edge = ARG(0)->getType()->isVectorTy() ? FARGV(0, i) : FARG(0);
+ double x = FARGV(1, i);
+ double r = (x < edge) ? 0.0 : 1.0;
+ result.setFloat(r, i);
+ }
+ }
+
+
+ /////////////////////////
+ // Geometric Functions //
+ /////////////////////////
+
+ DEFINE_BUILTIN(cross)
+ {
+ double u1 = FARGV(0, 0);
+ double u2 = FARGV(0, 1);
+ double u3 = FARGV(0, 2);
+ double v1 = FARGV(1, 0);
+ double v2 = FARGV(1, 1);
+ double v3 = FARGV(1, 2);
+ result.setFloat(u2*v3 - u3*v2, 0);
+ result.setFloat(u3*v1 - u1*v3, 1);
+ result.setFloat(u1*v2 - u2*v1, 2);
+ result.setFloat(0, 3);
+ }
+
+ DEFINE_BUILTIN(dot)
+ {
+ unsigned num = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ num = ARG(0)->getType()->getVectorNumElements();
+ }
+
+ double r = 0.f;
+ for (unsigned i = 0; i < num; i++)
+ {
+ double a = FARGV(0, i);
+ double b = FARGV(1, i);
+ r += a * b;
+ }
+ result.setFloat(r);
+ }
+
+ DEFINE_BUILTIN(distance)
+ {
+ unsigned num = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ num = ARG(0)->getType()->getVectorNumElements();
+ }
+
+ double distSq = 0.0;
+ for (unsigned i = 0; i < num; i++)
+ {
+ double diff = FARGV(0,i) - FARGV(1,i);
+ distSq += diff*diff;
+ }
+ result.setFloat(sqrt(distSq));
+ }
+
+ DEFINE_BUILTIN(length)
+ {
+ unsigned num = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ num = ARG(0)->getType()->getVectorNumElements();
+ }
+
+ double lengthSq = 0.0;
+ for (unsigned i = 0; i < num; i++)
+ {
+ lengthSq += FARGV(0, i) * FARGV(0, i);
+ }
+ result.setFloat(sqrt(lengthSq));
+ }
+
+ DEFINE_BUILTIN(normalize)
+ {
+ double lengthSq = 0.0;
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ lengthSq += FARGV(0, i) * FARGV(0, i);
+ }
+ double length = sqrt(lengthSq);
+
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(FARGV(0, i)/length, i);
+ }
+ }
+
+
+ /////////////////////
+ // Image Functions //
+ /////////////////////
+
+ static size_t getChannelSize(const cl_image_format& format)
+ {
+ switch (format.image_channel_data_type)
+ {
+ case CL_SNORM_INT8:
+ case CL_UNORM_INT8:
+ case CL_SIGNED_INT8:
+ case CL_UNSIGNED_INT8:
+ return 1;
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT16:
+ case CL_SIGNED_INT16:
+ case CL_UNSIGNED_INT16:
+ case CL_HALF_FLOAT:
+ return 2;
+ case CL_SIGNED_INT32:
+ case CL_UNSIGNED_INT32:
+ case CL_FLOAT:
+ return 4;
+ default:
+ return 0;
+ }
+ }
+
+ static size_t getNumChannels(const cl_image_format& format)
+ {
+ switch (format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_A:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ return 1;
+ case CL_RG:
+ case CL_RGx:
+ case CL_RA:
+ return 2;
+ case CL_RGB:
+ case CL_RGBx:
+ return 3;
+ case CL_RGBA:
+ case CL_ARGB:
+ case CL_BGRA:
+ return 4;
+ default:
+ return 0;
+ }
+ }
+
+ static bool hasZeroAlphaBorder(const cl_image_format& format)
+ {
+ switch (format.image_channel_order)
+ {
+ case CL_A:
+ case CL_INTENSITY:
+ case CL_Rx:
+ case CL_RA:
+ case CL_RGx:
+ case CL_RGBx:
+ case CL_ARGB:
+ case CL_BGRA:
+ case CL_RGBA:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ DEFINE_BUILTIN(get_image_array_size)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setUInt(image->desc.image_array_size);
+ }
+
+ DEFINE_BUILTIN(get_image_channel_data_type)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setSInt(image->format.image_channel_data_type);
+ }
+
+ DEFINE_BUILTIN(get_image_channel_order)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setSInt(image->format.image_channel_order);
+ }
+
+ DEFINE_BUILTIN(get_image_dim)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ result.setSInt(image->desc.image_width, 0);
+ result.setSInt(image->desc.image_height, 1);
+ if (result.num > 2)
+ {
+ result.setSInt(image->desc.image_depth, 2);
+ result.setSInt(0, 3);
+ }
+ }
+
+ DEFINE_BUILTIN(get_image_depth)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setSInt(image->desc.image_depth);
+ }
+
+ DEFINE_BUILTIN(get_image_height)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setSInt(image->desc.image_height);
+ }
+
+ DEFINE_BUILTIN(get_image_width)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+ result.setSInt(image->desc.image_width);
+ }
+
+ static inline float getCoordinate(const llvm::Value *value, int index,
+ char type, WorkItem *workItem)
+ {
+ switch (type)
+ {
+ case 'i':
+ return workItem->getOperand(value).getSInt(index);
+ case 'f':
+ return workItem->getOperand(value).getFloat(index);
+ default:
+ FATAL_ERROR("Unsupported coordinate type: '%c'", type);
+ }
+ }
+
+ static inline int getNearestCoordinate(uint32_t sampler,
+ float n, // Normalized
+ float u, // Unormalized
+ size_t size)
+ {
+ switch (sampler & CLK_ADDRESS_MASK)
+ {
+ case CLK_ADDRESS_NONE:
+ return floor(u);
+ case CLK_ADDRESS_CLAMP_TO_EDGE:
+ return _clamp_<int>(floor(u), 0, size - 1);
+ case CLK_ADDRESS_CLAMP:
+ return _clamp_<int>(floor(u), -1, size);
+ case CLK_ADDRESS_REPEAT:
+ return (int)floorf((n - floorf(n))*size) % size;
+ case CLK_ADDRESS_MIRRORED_REPEAT:
+ return _min_<int>((int)floorf(fabsf(n - 2.f * rintf(0.5f*n)) * size),
+ size - 1);
+ default:
+ FATAL_ERROR("Unsupported sampler addressing mode: %X",
+ sampler & CLK_ADDRESS_MASK);
+ }
+ }
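+
+ // Example (illustrative values): with CLK_ADDRESS_REPEAT and an image
+ // 8 pixels wide, a normalized coordinate n = 1.25 wraps to
+ // floor((1.25 - 1.0) * 8) % 8 = 2.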
+
+ static inline float getAdjacentCoordinates(uint32_t sampler,
+ float n, // Normalized
+ float u, // Unnormalized
+ size_t size,
+ int *c0, int *c1)
+ {
+ switch (sampler & CLK_ADDRESS_MASK)
+ {
+ case CLK_ADDRESS_NONE:
+ *c0 = floor(u);
+ *c1 = floor(u) + 1;
+ return u;
+ case CLK_ADDRESS_CLAMP_TO_EDGE:
+ *c0 = _clamp_<int>(floorf(u - 0.5f), 0, size - 1);
+ *c1 = _clamp_<int>(floorf(u - 0.5f) + 1, 0, size - 1);
+ return u;
+ case CLK_ADDRESS_CLAMP:
+ *c0 = _clamp_<int>((floorf(u - 0.5f)), -1, size);
+ *c1 = _clamp_<int>((floorf(u - 0.5f)) + 1, -1, size);
+ return u;
+ case CLK_ADDRESS_REPEAT:
+ {
+ u = (n - floorf(n)) * size;
+ *c0 = (int)floorf(u - 0.5f);
+ *c1 = *c0 + 1;
+ if (*c0 < 0) *c0 += size;
+ if (*c1 >= size) *c1 -= size;
+ return u;
+ }
+ case CLK_ADDRESS_MIRRORED_REPEAT:
+ {
+ u = fabsf(n - 2.0f * rintf(0.5f * n)) * size;
+ *c0 = (int)floorf(u - 0.5f);
+ *c1 = *c0 + 1;
+ *c0 = _max_(*c0, 0);
+ *c1 = _min_(*c1, (int)size-1);
+ return u;
+ }
+ default:
+ FATAL_ERROR("Unsupported sampler addressing mode: %X",
+ sampler & CLK_ADDRESS_MASK);
+ }
+ }
+
+ static inline int getInputChannel(const cl_image_format& format,
+ int output, float *ret)
+ {
+ int input = output;
+ switch (format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ if (output == 1)
+ {
+ *ret = 0.f;
+ return -1;
+ }
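+ // Intentional fall-through to handle channels 2 and 3 as well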
+ case CL_RG:
+ case CL_RGx:
+ if (output == 2)
+ {
+ *ret = 0.f;
+ return -1;
+ }
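+ // Intentional fall-through: channel 3 (alpha) defaults to 1.0 below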
+ case CL_RGB:
+ case CL_RGBx:
+ if (output == 3)
+ {
+ *ret = 1.f;
+ return -1;
+ }
+ break;
+ case CL_RGBA:
+ break;
+ case CL_BGRA:
+ if (output == 0) input = 2;
+ if (output == 2) input = 0;
+ break;
+ case CL_ARGB:
+ if (output == 0) input = 1;
+ if (output == 1) input = 2;
+ if (output == 2) input = 3;
+ if (output == 3) input = 0;
+ break;
+ case CL_A:
+ if (output == 3) input = 0;
+ else
+ {
+ *ret = 0.f;
+ return -1;
+ }
+ break;
+ case CL_RA:
+ if (output == 3) input = 1;
+ else if (output != 0)
+ {
+ *ret = 0.f;
+ return -1;
+ }
+ break;
+ case CL_INTENSITY:
+ input = 0;
+ break;
+ case CL_LUMINANCE:
+ if (output == 3)
+ {
+ *ret = 1.f;
+ return -1;
+ }
+ input = 0;
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel order: %X",
+ format.image_channel_order);
+ }
+ return input;
+ }
+
+ static inline float readNormalizedColor(const Image *image,
+ WorkItem *workItem,
+ int i, int j, int k,
+ int layer, int c)
+ {
+ // Check for out-of-range coordinates
+ if (i < 0 || i >= image->desc.image_width ||
+ j < 0 || j >= image->desc.image_height ||
+ k < 0 || k >= image->desc.image_depth)
+ {
+ // Return border color
+ if (c == 3 && !hasZeroAlphaBorder(image->format))
+ {
+ return 1.f;
+ }
+ return 0.f;
+ }
+
+ // Remap channels
+ float ret;
+ int channel = getInputChannel(image->format, c, &ret);
+ if (channel < 0)
+ {
+ return ret;
+ }
+
+ // Calculate pixel address
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t address = image->address
+ + (i + (j + (k + layer*image->desc.image_depth)
+ * image->desc.image_height)
+ * image->desc.image_width) * pixelSize
+ + channel*channelSize;
+
+ // Load channel data
+ unsigned char *data = workItem->m_pool.alloc(channelSize);
+ if (!workItem->getMemory(AddrSpaceGlobal)->load(data, address,
+ channelSize))
+ {
+ return 0.f;
+ }
+
+ // Compute normalized color value
+ float color;
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_SNORM_INT8:
+ color = _clamp_(*(int8_t*)data / 127.f, -1.f, 1.f);
+ break;
+ case CL_UNORM_INT8:
+ color = _clamp_(*(uint8_t*)data / 255.f, 0.f, 1.f);
+ break;
+ case CL_SNORM_INT16:
+ color = _clamp_(*(int16_t*)data / 32767.f, -1.f, 1.f);
+ break;
+ case CL_UNORM_INT16:
+ color = _clamp_(*(uint16_t*)data / 65535.f, 0.f, 1.f);
+ break;
+ case CL_FLOAT:
+ color = *(float*)data;
+ break;
+ case CL_HALF_FLOAT:
+ color = halfToFloat(*(uint16_t*)data);
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+
+ return color;
+ }
+
+ static inline int32_t readSignedColor(const Image *image,
+ WorkItem *workItem,
+ int i, int j, int k,
+ int layer, int c)
+ {
+ // Check for out-of-range coordinates
+ if (i < 0 || i >= image->desc.image_width ||
+ j < 0 || j >= image->desc.image_height ||
+ k < 0 || k >= image->desc.image_depth)
+ {
+ // Return border color
+ if (c == 3 && !hasZeroAlphaBorder(image->format))
+ {
+ return 1;
+ }
+ return 0;
+ }
+
+ // Remap channels
+ float ret;
+ int channel = getInputChannel(image->format, c, &ret);
+ if (channel < 0)
+ {
+ return ret;
+ }
+
+ // Calculate pixel address
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t address = image->address
+ + (i + (j + (k + layer*image->desc.image_depth)
+ * image->desc.image_height)
+ * image->desc.image_width) * pixelSize
+ + channel*channelSize;
+
+ // Load channel data
+ unsigned char *data = workItem->m_pool.alloc(channelSize);
+ if (!workItem->getMemory(AddrSpaceGlobal)->load(data, address,
+ channelSize))
+ {
+ return 0;
+ }
+
+ // Compute unnormalized color value
+ int32_t color;
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_SIGNED_INT8:
+ color = *(int8_t*)data;
+ break;
+ case CL_SIGNED_INT16:
+ color = *(int16_t*)data;
+ break;
+ case CL_SIGNED_INT32:
+ color = *(int32_t*)data;
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+
+ return color;
+ }
+
+ static inline uint32_t readUnsignedColor(const Image *image,
+ WorkItem *workItem,
+ int i, int j, int k,
+ int layer, int c)
+ {
+ // Check for out-of-range coordinates
+ if (i < 0 || i >= image->desc.image_width ||
+ j < 0 || j >= image->desc.image_height ||
+ k < 0 || k >= image->desc.image_depth)
+ {
+ // Return border color
+ if (c == 3 && !hasZeroAlphaBorder(image->format))
+ {
+ return 1;
+ }
+ return 0;
+ }
+
+ // Remap channels
+ float ret;
+ int channel = getInputChannel(image->format, c, &ret);
+ if (channel < 0)
+ {
+ return ret;
+ }
+
+ // Calculate pixel address
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t address = image->address
+ + (i + (j + (k + layer*image->desc.image_depth)
+ * image->desc.image_height)
+ * image->desc.image_width) * pixelSize
+ + channel*channelSize;
+
+ // Load channel data
+ unsigned char *data = workItem->m_pool.alloc(channelSize);
+ if (!workItem->getMemory(AddrSpaceGlobal)->load(data, address,
+ channelSize))
+ {
+ return 0;
+ }
+
+ // Load color value
+ uint32_t color;
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_UNSIGNED_INT8:
+ color = *(uint8_t*)data;
+ break;
+ case CL_UNSIGNED_INT16:
+ color = *(uint16_t*)data;
+ break;
+ case CL_UNSIGNED_INT32:
+ color = *(uint32_t*)data;
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+
+ return color;
+ }
+
+ static inline float frac(float x)
+ {
+ return x - floorf(x);
+ }
+
+ static inline float interpolate(float v000, float v010,
+ float v100, float v110,
+ float v001, float v011,
+ float v101, float v111,
+ float a, float b, float c)
+ {
+ return (1-a) * (1-b) * (1-c) * v000
+ + a * (1-b) * (1-c) * v100
+ + (1-a) * b * (1-c) * v010
+ + a * b * (1-c) * v110
+ + (1-a) * (1-b) * c * v001
+ + a * (1-b) * c * v101
+ + (1-a) * b * c * v011
+ + a * b * c * v111;
+ }
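+
+ // Note: this is standard trilinear interpolation. As a quick check, with
+ // c == 0 (as for 2D reads) it reduces to bilinear interpolation, and with
+ // a = b = 0.5 and c = 0 it returns the average of v000, v100, v010, v110.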
+
+ DEFINE_BUILTIN(read_imagef)
+ {
+ const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int coordIndex = 1;
+
+ // Check for sampler version
+ if (callInst->getNumArgOperands() > 2)
+ {
+ sampler = UARG(1);
+ coordIndex = 2;
+ }
+
+ // Get coordinates
+ float s = 0.f, t = 0.f, r = 0.f;
+ char coordType = *overload.rbegin();
+ s = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+ if (ARG(coordIndex)->getType()->isVectorTy())
+ {
+ t = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+ if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+ {
+ r = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+ }
+ }
+
+ // Get unnormalized coordinates
+ float u = 0.f, v = 0.f, w = 0.f;
+ bool normCoords = sampler & CLK_NORMALIZED_COORDS_TRUE;
+ if (normCoords)
+ {
+ u = s * image->desc.image_width;
+ v = t * image->desc.image_height;
+ w = r * image->desc.image_depth;
+ }
+ else
+ {
+ u = s;
+ v = t;
+ w = r;
+ }
+
+ // Get array layer index
+ int layer = 0;
+ if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(t), 0, image->desc.image_array_size - 1);
+ v = t = 0.f;
+ }
+ else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(r), 0, image->desc.image_array_size - 1);
+ w = r = 0.f;
+ }
+
+ float values[4];
+ if (sampler & CLK_FILTER_LINEAR)
+ {
+ // Get coordinates of adjacent pixels
+ int i0 = 0, i1 = 0, j0 = 0, j1 = 0, k0 = 0, k1 = 0;
+ u = getAdjacentCoordinates(sampler, s, u, image->desc.image_width,
+ &i0, &i1);
+ v = getAdjacentCoordinates(sampler, t, v, image->desc.image_height,
+ &j0, &j1);
+ w = getAdjacentCoordinates(sampler, r, w, image->desc.image_depth,
+ &k0, &k1);
+
+ // Collapse the unused y and z coordinates for 1D and 2D images
+ if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D ||
+ image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ {
+ j0 = j1;
+ k0 = k1;
+ }
+ else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D ||
+ image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ {
+ k0 = k1;
+ }
+
+ // Perform linear interpolation
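+ // The 0.5 offsets account for sampling at texel centres when deriving
+ // the interpolation weights.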
+ float a = frac(u - 0.5f);
+ float b = frac(v - 0.5f);
+ float c = frac(w - 0.5f);
+ for (int i = 0; i < 4; i++)
+ {
+ values[i] = interpolate(
+ readNormalizedColor(image, workItem, i0, j0, k0, layer, i),
+ readNormalizedColor(image, workItem, i0, j1, k0, layer, i),
+ readNormalizedColor(image, workItem, i1, j0, k0, layer, i),
+ readNormalizedColor(image, workItem, i1, j1, k0, layer, i),
+ readNormalizedColor(image, workItem, i0, j0, k1, layer, i),
+ readNormalizedColor(image, workItem, i0, j1, k1, layer, i),
+ readNormalizedColor(image, workItem, i1, j0, k1, layer, i),
+ readNormalizedColor(image, workItem, i1, j1, k1, layer, i),
+ a, b, c);
+ }
+ }
+ else
+ {
+ // Read values from nearest pixel
+ int i = getNearestCoordinate(sampler, s, u, image->desc.image_width);
+ int j = getNearestCoordinate(sampler, t, v, image->desc.image_height);
+ int k = getNearestCoordinate(sampler, r, w, image->desc.image_depth);
+ values[0] = readNormalizedColor(image, workItem, i, j, k, layer, 0);
+ values[1] = readNormalizedColor(image, workItem, i, j, k, layer, 1);
+ values[2] = readNormalizedColor(image, workItem, i, j, k, layer, 2);
+ values[3] = readNormalizedColor(image, workItem, i, j, k, layer, 3);
+ }
+
+ // Store values in result
+ for (int i = 0; i < 4; i++)
+ {
+ result.setFloat(values[i], i);
+ }
+ }
+
+ DEFINE_BUILTIN(read_imagei)
+ {
+ const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int coordIndex = 1;
+
+ // Check for sampler version
+ if (callInst->getNumArgOperands() > 2)
+ {
+ sampler = UARG(1);
+ coordIndex = 2;
+ }
+
+ // Get coordinates
+ float s = 0.f, t = 0.f, r = 0.f;
+ char coordType = *overload.rbegin();
+ s = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+ if (ARG(coordIndex)->getType()->isVectorTy())
+ {
+ t = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+ if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+ {
+ r = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+ }
+ }
+
+ // Get unnormalized coordinates
+ float u = 0.f, v = 0.f, w = 0.f;
+ bool normCoords = sampler & CLK_NORMALIZED_COORDS_TRUE;
+ if (normCoords)
+ {
+ u = s * image->desc.image_width;
+ v = t * image->desc.image_height;
+ w = r * image->desc.image_depth;
+ }
+ else
+ {
+ u = s;
+ v = t;
+ w = r;
+ }
+
+ // Get array layer index
+ int layer = 0;
+ if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(t), 0, image->desc.image_array_size - 1);
+ v = t = 0.f;
+ }
+ else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(r), 0, image->desc.image_array_size - 1);
+ w = r = 0.f;
+ }
+
+ // Read values from nearest pixel
+ int32_t values[4];
+ int i = getNearestCoordinate(sampler, s, u, image->desc.image_width);
+ int j = getNearestCoordinate(sampler, t, v, image->desc.image_height);
+ int k = getNearestCoordinate(sampler, r, w, image->desc.image_depth);
+ values[0] = readSignedColor(image, workItem, i, j, k, layer, 0);
+ values[1] = readSignedColor(image, workItem, i, j, k, layer, 1);
+ values[2] = readSignedColor(image, workItem, i, j, k, layer, 2);
+ values[3] = readSignedColor(image, workItem, i, j, k, layer, 3);
+
+ // Store values in result
+ for (int i = 0; i < 4; i++)
+ {
+ result.setSInt(values[i], i);
+ }
+ }
+
+ DEFINE_BUILTIN(read_imageui)
+ {
+ const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+ int coordIndex = 1;
+
+ // Check for sampler version
+ if (callInst->getNumArgOperands() > 2)
+ {
+ sampler = UARG(1);
+ coordIndex = 2;
+ }
+
+ // Get coordinates
+ float s = 0.f, t = 0.f, r = 0.f;
+ char coordType = *overload.rbegin();
+ s = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+ if (ARG(coordIndex)->getType()->isVectorTy())
+ {
+ t = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+ if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+ {
+ r = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+ }
+ }
+
+ // Get unnormalized coordinates
+ float u = 0.f, v = 0.f, w = 0.f;
+ bool normCoords = sampler & CLK_NORMALIZED_COORDS_TRUE;
+ if (normCoords)
+ {
+ u = s * image->desc.image_width;
+ v = t * image->desc.image_height;
+ w = r * image->desc.image_depth;
+ }
+ else
+ {
+ u = s;
+ v = t;
+ w = r;
+ }
+
+ // Get array layer index
+ int layer = 0;
+ if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(t), 0, image->desc.image_array_size - 1);
+ v = t = 0.f;
+ }
+ else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ {
+ layer = _clamp_<int>(rintf(r), 0, image->desc.image_array_size - 1);
+ w = r = 0.f;
+ }
+
+ // Read values from nearest pixel
+ uint32_t values[4];
+ int i = getNearestCoordinate(sampler, s, u, image->desc.image_width);
+ int j = getNearestCoordinate(sampler, t, v, image->desc.image_height);
+ int k = getNearestCoordinate(sampler, r, w, image->desc.image_depth);
+ values[0] = readUnsignedColor(image, workItem, i, j, k, layer, 0);
+ values[1] = readUnsignedColor(image, workItem, i, j, k, layer, 1);
+ values[2] = readUnsignedColor(image, workItem, i, j, k, layer, 2);
+ values[3] = readUnsignedColor(image, workItem, i, j, k, layer, 3);
+
+ // Store values in result
+ for (int i = 0; i < 4; i++)
+ {
+ result.setUInt(values[i], i);
+ }
+ }
+
+ DEFINE_BUILTIN(write_imagef)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ // Get pixel coordinates
+ int x, y = 0, z = 0;
+ x = SARGV(1, 0);
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ y = SARGV(1, 1);
+ if (ARG(1)->getType()->getVectorNumElements() > 2)
+ {
+ z = SARGV(1, 2);
+ }
+ }
+
+ // Get color data
+ float values[4] =
+ {
+ (float)FARGV(2, 0),
+ (float)FARGV(2, 1),
+ (float)FARGV(2, 2),
+ (float)FARGV(2, 3),
+ };
+
+ // Re-order color values
+ switch (image->format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_RG:
+ case CL_RGx:
+ case CL_RGB:
+ case CL_RGBx:
+ case CL_RGBA:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ break;
+ case CL_A:
+ values[0] = values[3];
+ break;
+ case CL_RA:
+ values[1] = values[3];
+ break;
+ case CL_ARGB:
+ swap(values[2], values[3]);
+ swap(values[1], values[2]);
+ swap(values[0], values[1]);
+ break;
+ case CL_BGRA:
+ swap(values[0], values[2]);
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel order: %X",
+ image->format.image_channel_order);
+ }
+
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t pixelAddress = image->address
+ + (x + (y + z*image->desc.image_height)
+ * image->desc.image_width) * pixelSize;
+
+ // Generate channel values
+ Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+ unsigned char *data = workItem->m_pool.alloc(channelSize*numChannels);
+ for (unsigned i = 0; i < numChannels; i++)
+ {
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_SNORM_INT8:
+ ((int8_t*)data)[i] = rint(_clamp_(values[i] * 127.f,
+ -128.f, 127.f));
+ break;
+ case CL_UNORM_INT8:
+ data[i] = rint(_clamp_(values[i] * 255.f, 0.f, 255.f));
+ break;
+ case CL_SNORM_INT16:
+ ((int16_t*)data)[i] = rint(_clamp_(values[i] * 32767.f,
+ -32768.f, 32767.f));
+ break;
+ case CL_UNORM_INT16:
+ ((uint16_t*)data)[i] = rint(_clamp_(values[i] * 65535.f,
+ 0.f, 65535.f));
+ break;
+ case CL_FLOAT:
+ ((float*)data)[i] = values[i];
+ break;
+ case CL_HALF_FLOAT:
+ ((uint16_t*)data)[i] = floatToHalf(values[i]);
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+ }
+
+ // Write pixel data
+ memory->store(data, pixelAddress, channelSize*numChannels);
+ }
+
+ DEFINE_BUILTIN(write_imagei)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ // Get pixel coordinates
+ int x, y = 0, z = 0;
+ x = SARGV(1, 0);
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ y = SARGV(1, 1);
+ if (ARG(1)->getType()->getVectorNumElements() > 2)
+ {
+ z = SARGV(1, 2);
+ }
+ }
+
+ // Get color data
+ int32_t values[4] =
+ {
+ (int32_t)SARGV(2, 0),
+ (int32_t)SARGV(2, 1),
+ (int32_t)SARGV(2, 2),
+ (int32_t)SARGV(2, 3),
+ };
+
+ // Re-order color values
+ switch (image->format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_RG:
+ case CL_RGx:
+ case CL_RGB:
+ case CL_RGBx:
+ case CL_RGBA:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ break;
+ case CL_A:
+ values[0] = values[3];
+ break;
+ case CL_RA:
+ values[1] = values[3];
+ break;
+ case CL_ARGB:
+ swap(values[2], values[3]);
+ swap(values[1], values[2]);
+ swap(values[0], values[1]);
+ break;
+ case CL_BGRA:
+ swap(values[0], values[2]);
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel order: %X",
+ image->format.image_channel_order);
+ }
+
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t pixelAddress = image->address
+ + (x + (y + z*image->desc.image_height)
+ * image->desc.image_width) * pixelSize;
+
+ // Generate channel values
+ Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+ unsigned char *data = workItem->m_pool.alloc(channelSize*numChannels);
+ for (unsigned i = 0; i < numChannels; i++)
+ {
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_SIGNED_INT8:
+ ((int8_t*)data)[i] = _clamp_(values[i], -128, 127);
+ break;
+ case CL_SIGNED_INT16:
+ ((int16_t*)data)[i] = _clamp_(values[i], -32768, 32767);
+ break;
+ case CL_SIGNED_INT32:
+ ((int32_t*)data)[i] = values[i];
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+ }
+
+ // Write pixel data
+ memory->store(data, pixelAddress, channelSize*numChannels);
+ }
+
+ DEFINE_BUILTIN(write_imageui)
+ {
+ Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+
+ // Get pixel coordinates
+ int x, y = 0, z = 0;
+ x = SARGV(1, 0);
+ if (ARG(1)->getType()->isVectorTy())
+ {
+ y = SARGV(1, 1);
+ if (ARG(1)->getType()->getVectorNumElements() > 2)
+ {
+ z = SARGV(1, 2);
+ }
+ }
+
+ // Get color data
+ uint32_t values[4] =
+ {
+ (uint32_t)SARGV(2, 0),
+ (uint32_t)SARGV(2, 1),
+ (uint32_t)SARGV(2, 2),
+ (uint32_t)SARGV(2, 3),
+ };
+
+ // Re-order color values
+ switch (image->format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_RG:
+ case CL_RGx:
+ case CL_RGB:
+ case CL_RGBx:
+ case CL_RGBA:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ break;
+ case CL_A:
+ values[0] = values[3];
+ break;
+ case CL_RA:
+ values[1] = values[3];
+ break;
+ case CL_ARGB:
+ swap(values[2], values[3]);
+ swap(values[1], values[2]);
+ swap(values[0], values[1]);
+ break;
+ case CL_BGRA:
+ swap(values[0], values[2]);
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel order: %X",
+ image->format.image_channel_order);
+ }
+
+ size_t channelSize = getChannelSize(image->format);
+ size_t numChannels = getNumChannels(image->format);
+ size_t pixelSize = channelSize*numChannels;
+ size_t pixelAddress = image->address
+ + (x + (y + z*image->desc.image_height)
+ * image->desc.image_width) * pixelSize;
+
+ // Generate channel values
+ Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+ unsigned char *data = workItem->m_pool.alloc(channelSize*numChannels);
+ for (unsigned i = 0; i < numChannels; i++)
+ {
+ switch (image->format.image_channel_data_type)
+ {
+ case CL_UNSIGNED_INT8:
+ ((uint8_t*)data)[i] = _min_<uint32_t>(values[i], UINT8_MAX);
+ break;
+ case CL_UNSIGNED_INT16:
+ ((uint16_t*)data)[i] = _min_<uint32_t>(values[i], UINT16_MAX);
+ break;
+ case CL_UNSIGNED_INT32:
+ ((uint32_t*)data)[i] = values[i];
+ break;
+ default:
+ FATAL_ERROR("Unsupported image channel data type: %X",
+ image->format.image_channel_data_type);
+ }
+ }
+
+ // Write pixel data
+ memory->store(data, pixelAddress, channelSize*numChannels);
+ }
+
+
+ ///////////////////////
+ // Integer Functions //
+ ///////////////////////
+
+ DEFINE_BUILTIN(abs_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ result.setUInt(UARGV(0,i), i);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ result.setSInt(abs(SARGV(0,i)), i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(abs_diff)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ {
+ uint64_t a = UARGV(0, i);
+ uint64_t b = UARGV(1, i);
+ result.setUInt(_max_(a,b) - _min_(a,b), i);
+ break;
+ }
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t a = SARGV(0, i);
+ int64_t b = SARGV(1, i);
+ result.setSInt(_max_(a,b) - _min_(a,b), i);
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(add_sat)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t uresult = UARGV(0,i) + UARGV(1,i);
+ int64_t sresult = SARGV(0,i) + SARGV(1,i);
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ uresult = _min_<uint64_t>(uresult, UINT8_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 't':
+ uresult = _min_<uint64_t>(uresult, UINT16_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 'j':
+ uresult = _min_<uint64_t>(uresult, UINT32_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 'm':
+ uresult = (UARGV(1, i) > uresult) ? UINT64_MAX : uresult;
+ result.setUInt(uresult, i);
+ break;
+ case 'c':
+ sresult = _clamp_<int64_t>(sresult, INT8_MIN, INT8_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 's':
+ sresult = _clamp_<int64_t>(sresult, INT16_MIN, INT16_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'i':
+ sresult = _clamp_<int64_t>(sresult, INT32_MIN, INT32_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'l':
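+ // 64-bit addition cannot be widened, so detect overflow from the signs:
+ // it has occurred when both operands share a sign that the result lacks.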
+ if ((SARGV(0,i)>0) == (SARGV(1,i)>0) &&
+ (SARGV(0,i)>0) != (sresult>0))
+ {
+ sresult = (SARGV(0,i)>0) ? INT64_MAX : INT64_MIN;
+ }
+ result.setSInt(sresult, i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(clz)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t x = UARGV(0, i);
+ int nz = 0;
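+ // Count the number of significant bits in x, then subtract from the
+ // result width (in bits) to get the leading-zero count.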
+ while (x)
+ {
+ x >>= 1;
+ nz++;
+ }
+
+ uint64_t r = ((result.size<<3) - nz);
+ result.setUInt(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(hadd)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ {
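+ // The sum may carry out of 64 bits; recover that carry and fold it
+ // back into the top bit of the halved result.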
+ uint64_t a = UARGV(0, i);
+ uint64_t b = UARGV(1, i);
+ uint64_t c = (a > UINT64_MAX-b) ? (1UL<<63) : 0;
+ result.setUInt(((a + b) >> 1) | c, i);
+ break;
+ }
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t a = SARGV(0, i);
+ int64_t b = SARGV(1, i);
+ int64_t c = (a & b) & 1;
+ result.setSInt((a>>1) + (b>>1) + c, i);
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ static uint64_t _mad_(uint64_t a, uint64_t b, uint64_t c)
+ {
+ return a*b + c;
+ }
+
+ static uint64_t _umul_hi_(uint64_t x, uint64_t y, uint64_t bits)
+ {
+ if (bits == 64)
+ {
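+ // No portable 128-bit type, so build the high 64 bits of the product
+ // from the four 32x32-bit partial products.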
+ uint64_t xl = x & UINT32_MAX;
+ uint64_t xh = x >> 32;
+ uint64_t yl = y & UINT32_MAX;
+ uint64_t yh = y >> 32;
+
+ uint64_t xlyl = xl*yl;
+ uint64_t xlyh = xl*yh;
+ uint64_t xhyl = xh*yl;
+ uint64_t xhyh = xh*yh;
+
+ uint64_t a = xhyl + ((xlyl)>>32);
+ uint64_t al = a & UINT32_MAX;
+ uint64_t ah = a >> 32;
+ uint64_t b = ((al + xlyh)>>32) + ah;
+
+ return xhyh + b;
+ }
+ else
+ {
+ return (x*y) >> bits;
+ }
+ }
+
+ static int64_t _smul_hi_(int64_t x, int64_t y, int64_t bits)
+ {
+ if (bits == 64)
+ {
+ int64_t xl = x & UINT32_MAX;
+ int64_t xh = x >> 32;
+ int64_t yl = y & UINT32_MAX;
+ int64_t yh = y >> 32;
+
+ int64_t xlyl = xl*yl;
+ int64_t xlyh = xl*yh;
+ int64_t xhyl = xh*yl;
+ int64_t xhyh = xh*yh;
+
+ int64_t a = xhyl + ((xlyl>>32) & UINT32_MAX);
+ int64_t al = a & UINT32_MAX;
+ int64_t ah = a >> 32;
+ int64_t b = ((al + xlyh)>>32) + ah;
+
+ return xhyh + b;
+ }
+ else
+ {
+ return (x*y) >> bits;
+ }
+ }
+
+ DEFINE_BUILTIN(mad_hi)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ {
+ uint64_t r =
+ _umul_hi_(UARGV(0, i), UARGV(1, i), result.size<<3) + UARGV(2, i);
+ result.setUInt(r, i);
+ break;
+ }
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t r =
+ _smul_hi_(SARGV(0, i), SARGV(1, i), result.size<<3) + SARGV(2, i);
+ result.setSInt(r, i);
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(mad_sat)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t uresult = UARGV(0,i)*UARGV(1,i) + UARGV(2,i);
+ int64_t sresult = SARGV(0,i)*SARGV(1,i) + SARGV(2,i);
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ uresult = _min_<uint64_t>(uresult, UINT8_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 't':
+ uresult = _min_<uint64_t>(uresult, UINT16_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 'j':
+ uresult = _min_<uint64_t>(uresult, UINT32_MAX);
+ result.setUInt(uresult, i);
+ break;
+ case 'm':
+ {
+ uint64_t hi = _umul_hi_(UARGV(0, i), UARGV(1, i), 64);
+ if (hi || UARGV(2, i) > uresult)
+ {
+ uresult = UINT64_MAX;
+ }
+ result.setUInt(uresult, i);
+ break;
+ }
+ case 'c':
+ sresult = _clamp_<int64_t>(sresult, INT8_MIN, INT8_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 's':
+ sresult = _clamp_<int64_t>(sresult, INT16_MIN, INT16_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'i':
+ sresult = _clamp_<int64_t>(sresult, INT32_MIN, INT32_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'l':
+ // Check for overflow in multiplication
+ if (_smul_hi_(SARGV(0, i), SARGV(1, i), 64))
+ {
+ sresult = (SARGV(0,i)>0) ^ (SARGV(1,i)>0) ? INT64_MIN : INT64_MAX;
+ }
+ else
+ {
+ // Check for overflow in addition
+ int64_t m = SARGV(0, i) * SARGV(1, i);
+ if ((m>0) == (SARGV(2,i)>0) && (m>0) != (sresult>0))
+ {
+ sresult = (m>0) ? INT64_MAX : INT64_MIN;
+ }
+ }
+ result.setSInt(sresult, i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ static uint64_t _mul_(uint64_t a, uint64_t b)
+ {
+ return a*b;
+ }
+
+ DEFINE_BUILTIN(mul_hi)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ {
+ uint64_t r = _umul_hi_(UARGV(0, i), UARGV(1, i), result.size<<3);
+ result.setUInt(r, i);
+ break;
+ }
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t r = _smul_hi_(SARGV(0, i), SARGV(1, i), result.size<<3);
+ result.setSInt(r, i);
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ static uint64_t _popcount_(uint64_t x)
+ {
+ int i = 0;
+ while (x)
+ {
+ i += (x & 0x1);
+ x >>= 1;
+ }
+ return i;
+ }
+
+ DEFINE_BUILTIN(rhadd)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ {
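+ // As in hadd, but rounding up: the extra +1 can also carry out of
+ // 64 bits.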
+ uint64_t a = UARGV(0, i);
+ uint64_t b = UARGV(1, i);
+ uint64_t c = (a > UINT64_MAX-(b+1)) ? (1UL<<63) : 0;
+ result.setUInt(((a + b + 1) >> 1) | c, i);
+ break;
+ }
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t a = SARGV(0, i);
+ int64_t b = SARGV(1, i);
+ int64_t c = (a | b) & 1;
+ result.setSInt((a>>1) + (b>>1) + c, i);
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(rotate)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t width = (result.size << 3);
+ uint64_t v = UARGV(0, i);
+ uint64_t ls = UARGV(1, i) % width;
+ uint64_t rs = width - ls;
+ result.setUInt((v << ls) | (v >> rs), i);
+ }
+ }
+
+ DEFINE_BUILTIN(sub_sat)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t uresult = UARGV(0,i) - UARGV(1,i);
+ int64_t sresult = SARGV(0,i) - SARGV(1,i);
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ uresult = uresult > UINT8_MAX ? 0 : uresult;
+ result.setUInt(uresult, i);
+ break;
+ case 't':
+ uresult = uresult > UINT16_MAX ? 0 : uresult;
+ result.setUInt(uresult, i);
+ break;
+ case 'j':
+ uresult = uresult > UINT32_MAX ? 0 : uresult;
+ result.setUInt(uresult, i);
+ break;
+ case 'm':
+ uresult = (UARGV(1, i) > UARGV(0, i)) ? 0 : uresult;
+ result.setUInt(uresult, i);
+ break;
+ case 'c':
+ sresult = _clamp_<int64_t>(sresult, INT8_MIN, INT8_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 's':
+ sresult = _clamp_<int64_t>(sresult, INT16_MIN, INT16_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'i':
+ sresult = _clamp_<int64_t>(sresult, INT32_MIN, INT32_MAX);
+ result.setSInt(sresult, i);
+ break;
+ case 'l':
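+ // Detect 64-bit overflow from the signs: it can only occur when the
+ // operands differ in sign and the result's sign differs from the
+ // first operand's.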
+ if ((SARGV(0,i)>0) != (SARGV(1,i)>0) &&
+ (SARGV(0,i)>0) != (sresult>0))
+ {
+ sresult = (SARGV(0,i)>0) ? INT64_MAX : INT64_MIN;
+ }
+ result.setSInt(sresult, i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(upsample)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
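+ // Place the first argument in the upper half of the result and the
+ // second in the lower half (result.size is in bytes, so shift by half
+ // the result width: size*4 bits).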
+ uint64_t r = (UARGV(0,i)<<(result.size<<2)) | UARGV(1, i);
+ result.setUInt(r, i);
+ }
+ }
+
+
+ ////////////////////
+ // Math Functions //
+ ////////////////////
+
+ static double _acospi_(double x){ return (acos(x) / M_PI); }
+ static double _asinpi_(double x){ return (asin(x) / M_PI); }
+ static double _atanpi_(double x){ return (atan(x) / M_PI); }
+ static double _atan2pi_(double x, double y){ return (atan2(x, y) / M_PI); }
+ static double _cospi_(double x){ return (cos(x * M_PI)); }
+ static double _exp10_(double x){ return pow(10, x); }
+ static double _fdivide_(double x, double y){ return x/y; }
+ static double _frecip_(double x){ return 1.0/x; }
+ static double _rsqrt_(double x){ return 1.0 / sqrt(x); }
+ static double _sinpi_(double x){ return (sin(x * M_PI)); }
+ static double _tanpi_(double x){ return (tan(x * M_PI)); }
+
+ static double _fma_(double a, double b, double c)
+ {
+ return a*b + c;
+ }
+
+ static double _maxmag_(double x, double y)
+ {
+ double _x = fabs(x);
+ double _y = fabs(y);
+ if (_x > _y)
+ {
+ return x;
+ }
+ else if (_y > _x)
+ {
+ return y;
+ }
+ else
+ {
+ return fmax(x, y);
+ }
+ }
+
+ static double _minmag_(double x, double y)
+ {
+ double _x = fabs(x);
+ double _y = fabs(y);
+ if (_x < _y)
+ {
+ return x;
+ }
+ else if (_y < _x)
+ {
+ return y;
+ }
+ else
+ {
+ return fmin(x, y);
+ }
+ }
+
+ DEFINE_BUILTIN(fract)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+
+ size_t iptr = PARG(1);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double fl = floor(x);
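+ // Clamp the fractional part to just below 1.0 so that tiny negative
+ // inputs do not round the result up to 1.0.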
+#if defined(_WIN32) && !defined(__MINGW32__)
+ double r = fmin(x - fl, nextafter(1, 0));
+#else
+ double r = fmin(x - fl, 0x1.fffffep-1f);
+#endif
+
+ size_t offset = i*result.size;
+ result.setFloat(fl, i);
+ memory->store(result.data + offset, iptr + offset, result.size);
+ result.setFloat(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(frexp_builtin)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+
+ size_t iptr = PARG(1);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int32_t e;
+ double r = frexp(FARGV(0, i), &e);
+ memory->store((const unsigned char*)&e, iptr + i*4, 4);
+ result.setFloat(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(ilogb_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setSInt(ilogb(FARGV(0, i)), i);
+ }
+ }
+
+ DEFINE_BUILTIN(ldexp_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(ldexp(FARGV(0, i), SARGV(1, i)), i);
+ }
+ }
+
+ DEFINE_BUILTIN(lgamma_r)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+
+ size_t signp = PARG(1);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double r = lgamma(FARGV(0, i));
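+ // lgamma only gives log|gamma(x)|, so recover the sign from tgamma.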
+ int32_t s = (tgamma(FARGV(0, i)) < 0 ? -1 : 1);
+ memory->store((const unsigned char*)&s, signp + i*4, 4);
+ result.setFloat(r, i);
+ }
+ }
+
+ DEFINE_BUILTIN(modf_builtin)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+
+ size_t iptr = PARG(1);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double integral = trunc(x);
+ double fractional = copysign(::isinf(x) ? 0.0 : x - integral, x);
+
+ size_t offset = i*result.size;
+ result.setFloat(integral, i);
+ memory->store(result.data + offset, iptr + offset, result.size);
+ result.setFloat(fractional, i);
+ }
+ }
+
+ DEFINE_BUILTIN(nan_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setFloat(nan(""), i);
+ }
+ }
+
+ DEFINE_BUILTIN(nextafter_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ if (result.size == 4)
+ result.setFloat(nextafterf(FARGV(0, i), FARGV(1, i)), i);
+ else
+ result.setFloat(nextafter(FARGV(0, i), FARGV(1, i)), i);
+ }
+ }
+
+ DEFINE_BUILTIN(pown)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ int32_t y = SARGV(1, i);
+ result.setFloat(pow(x, y), i);
+ }
+ }
+
+ DEFINE_BUILTIN(remquo_builtin)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(2)->getType()->getPointerAddressSpace());
+
+ size_t quop = PARG(2);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ double y = FARGV(1, i);
+
+ int32_t quo;
+ double rem = remquo(x, y, &quo);
+ memory->store((const unsigned char*)&quo, quop + i*4, 4);
+ result.setFloat(rem, i);
+ }
+ }
+
+ DEFINE_BUILTIN(rootn)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ int y = SARGV(1, i);
+ result.setFloat(pow(x, (double)(1.0/y)), i);
+ }
+ }
+
+ DEFINE_BUILTIN(sincos)
+ {
+ Memory *memory =
+ workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+
+ size_t cv = PARG(1);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ double x = FARGV(0, i);
+ size_t offset = i*result.size;
+ result.setFloat(cos(x), i);
+ memory->store(result.data + offset, cv + offset, result.size);
+ result.setFloat(sin(x), i);
+ }
+ }
+
+
+ ////////////////////////////
+ // Misc. Vector Functions //
+ ////////////////////////////
+
+ DEFINE_BUILTIN(shuffle_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ result.setUInt(UARGV(0, UARGV(1, i)), i);
+ }
+ }
+
+ DEFINE_BUILTIN(shuffle2_builtin)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t m = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ m = ARG(0)->getType()->getVectorNumElements();
+ }
+
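+ // Indices in [m, 2m) select from the second input vector; fold them
+ // back into range and switch the source operand.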
+ uint64_t src = 0;
+ uint64_t index = UARGV(2, i);
+ if (index >= m)
+ {
+ index -= m;
+ src = 1;
+ }
+ result.setUInt(UARGV(src, index), i);
+ }
+ }
+
+
+ //////////////////////////
+ // Relational Functions //
+ //////////////////////////
+
+ static int64_t _iseq_(double x, double y){ return x == y; }
+ static int64_t _isneq_(double x, double y){ return x != y; }
+ static int64_t _isgt_(double x, double y){ return isgreater(x, y); }
+ static int64_t _isge_(double x, double y){ return isgreaterequal(x, y); }
+ static int64_t _islt_(double x, double y){ return isless(x, y); }
+ static int64_t _isle_(double x, double y){ return islessequal(x, y); }
+ static int64_t _islg_(double x, double y){ return islessgreater(x, y); }
+ static int64_t _isfin_(double x){ return isfinite(x); }
+ static int64_t _isinf_(double x){ return ::isinf(x); }
+ static int64_t _isnan_(double x){ return ::isnan(x); }
+ static int64_t _isnorm_(double x){ return isnormal(x); }
+ static int64_t _isord_(double x, double y){ return !isunordered(x, y); }
+ static int64_t _isuord_(double x, double y){ return isunordered(x, y); }
+ static int64_t _signbit_(double x){ return signbit(x); }
+
+ DEFINE_BUILTIN(all)
+ {
+ unsigned num = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ num = ARG(0)->getType()->getVectorNumElements();
+ }
+
+ for (unsigned i = 0; i < num; i++)
+ {
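+ // A component counts as true when its most-significant (sign) bit is set.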
+ if (!(SARGV(0, i) & INT64_MIN))
+ {
+ result.setSInt(0);
+ return;
+ }
+ }
+ result.setSInt(1);
+ }
+
+ DEFINE_BUILTIN(any)
+ {
+ unsigned num = 1;
+ if (ARG(0)->getType()->isVectorTy())
+ {
+ num = ARG(0)->getType()->getVectorNumElements();
+ }
+
+ for (unsigned i = 0; i < num; i++)
+ {
+ if (SARGV(0, i) & INT64_MIN)
+ {
+ result.setSInt(1);
+ return;
+ }
+ }
+ result.setSInt(0);
+ }
+
+ static uint64_t _ibitselect_(uint64_t a, uint64_t b, uint64_t c)
+ {
+ return ((a & ~c) | (b & c));
+ }
+
+ static double _fbitselect_(double a, double b, double c)
+ {
+ uint64_t _a = *(uint64_t*)&a;
+ uint64_t _b = *(uint64_t*)&b;
+ uint64_t _c = *(uint64_t*)&c;
+ uint64_t _r = _ibitselect_(_a, _b, _c);
+ return *(double*)&_r;
+ }
+
+ DEFINE_BUILTIN(bitselect)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'f':
+ case 'd':
+ f3arg(workItem, callInst, fnName, overload, result, _fbitselect_);
+ break;
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ u3arg(workItem, callInst, fnName, overload, result, _ibitselect_);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+
+ DEFINE_BUILTIN(select_builtin)
+ {
+ char type = getOverloadArgType(overload);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int64_t c = SARGV(2, i);
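+ // Vector selects test the MSB of each component of c; the scalar form
+ // treats any non-zero c as true.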
+ bool _c = (result.num > 1) ? c & INT64_MIN : c;
+ switch (type)
+ {
+ case 'f':
+ case 'd':
+ result.setFloat(_c ? FARGV(1, i) : FARGV(0, i), i);
+ break;
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ result.setSInt(_c ? SARGV(1, i) : SARGV(0, i), i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+
+ ///////////////////////////////
+ // Synchronization Functions //
+ ///////////////////////////////
+
+ DEFINE_BUILTIN(barrier)
+ {
+ workItem->m_state = WorkItem::BARRIER;
+ workItem->m_workGroup->notifyBarrier(workItem, callInst, UARG(0));
+ }
+
+ DEFINE_BUILTIN(mem_fence)
+ {
+ // TODO: Implement?
+ }
+
+
+ //////////////////////////////////////////
+ // Vector Data Load and Store Functions //
+ //////////////////////////////////////////
+
+ DEFINE_BUILTIN(vload)
+ {
+ size_t base = PARG(1);
+ unsigned int addressSpace = ARG(1)->getType()->getPointerAddressSpace();
+ uint64_t offset = UARG(0);
+
+ size_t address = base + offset*result.size*result.num;
+ size_t size = result.size*result.num;
+ workItem->getMemory(addressSpace)->load(result.data, address, size);
+ }
+
+ DEFINE_BUILTIN(vstore)
+ {
+ const llvm::Value *value = ARG(0);
+ unsigned size = getTypeSize(value->getType());
+ if (isVector3(value))
+ {
+ // 3-element vectors occupy the same storage as 4-element vectors,
+ // but the vstore address offset must use the true 3-element size.
+ size = (size/4) * 3;
+ }
+
+ size_t base = PARG(2);
+ unsigned int addressSpace = ARG(2)->getType()->getPointerAddressSpace();
+ uint64_t offset = UARG(1);
+
+ size_t address = base + offset*size;
+ unsigned char *data = workItem->getOperand(value).data;
+ workItem->getMemory(addressSpace)->store(data, address, size);
+ }
+
+ DEFINE_BUILTIN(vload_half)
+ {
+ size_t base = PARG(1);
+ unsigned int addressSpace = ARG(1)->getType()->getPointerAddressSpace();
+ uint64_t offset = UARG(0);
+
+ size_t address;
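+ // vloada_half3 addresses are strided by four half elements; all other
+ // variants stride by the actual vector width.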
+ if (fnName.compare(0, 6, "vloada") == 0 && result.num == 3)
+ {
+ address = base + offset*sizeof(cl_half)*4;
+ }
+ else
+ {
+ address = base + offset*sizeof(cl_half)*result.num;
+ }
+ size_t size = sizeof(cl_half)*result.num;
+ uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*result.num);
+ workItem->getMemory(addressSpace)->load((unsigned char*)halfData,
+ address, size);
+
+ // Convert to floats
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ ((float*)result.data)[i] = halfToFloat(halfData[i]);
+ }
+ }
+
+ DEFINE_BUILTIN(vstore_half)
+ {
+ const llvm::Value *value = ARG(0);
+ unsigned size = getTypeSize(value->getType());
+ if (isVector3(value))
+ {
+ // 3-element vectors occupy the same storage as 4-element vectors,
+ // but the vstore address offset must use the true 3-element size.
+ size = (size/4) * 3;
+ }
+
+ size_t base = PARG(2);
+ unsigned int addressSpace = ARG(2)->getType()->getPointerAddressSpace();
+ uint64_t offset = UARG(1);
+
+ // Convert to halfs
+ unsigned char *data = workItem->getOperand(value).data;
+ size_t num = size / sizeof(float);
+ size = num*sizeof(cl_half);
+ uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*num);
+ HalfRoundMode rmode = Half_RTE; // The Oclgrind device's round mode
+ if (fnName.find("_rtz") != std::string::npos)
+ rmode = Half_RTZ;
+ else if (fnName.find("_rtn") != std::string::npos)
+ rmode = Half_RTN;
+ else if (fnName.find("_rtp") != std::string::npos)
+ rmode = Half_RTP;
+
+ for (unsigned i = 0; i < num; i++)
+ {
+ halfData[i] = floatToHalf(((float*)data)[i], rmode);
+ }
+
+ size_t address;
+ if (fnName.compare(0, 7, "vstorea") == 0 && num == 3)
+ {
+ address = base + offset*sizeof(cl_half)*4;
+ }
+ else
+ {
+ address = base + offset*sizeof(cl_half)*num;
+ }
+
+ workItem->getMemory(addressSpace)->store((unsigned char*)halfData,
+ address, size);
+ }
+
+
+ /////////////////////////
+ // Work-Item Functions //
+ /////////////////////////
+
+ DEFINE_BUILTIN(get_global_id)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ? workItem->m_globalID[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_global_size)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ?
+ workItem->m_kernelInvocation->getGlobalSize()[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_global_offset)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ?
+ workItem->m_kernelInvocation->getGlobalOffset()[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_group_id)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ? workItem->m_workGroup->getGroupID()[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_local_id)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ? workItem->m_localID[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_local_size)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = dim < 3 ? workItem->m_workGroup->getGroupSize()[dim] : 0;
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_num_groups)
+ {
+ uint64_t dim = UARG(0);
+ size_t r = 0;
+ if (dim < 3)
+ {
+ r = workItem->m_kernelInvocation->getNumGroups()[dim];
+ }
+ result.setUInt(r);
+ }
+
+ DEFINE_BUILTIN(get_work_dim)
+ {
+ result.setUInt(workItem->m_kernelInvocation->getWorkDim());
+ }
+
+
+ /////////////////////
+ // Other Functions //
+ /////////////////////
+
+ DEFINE_BUILTIN(convert_float)
+ {
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ result.setFloat((float)UARGV(0, i), i);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ result.setFloat((float)SARGV(0, i), i);
+ break;
+ case 'f':
+ case 'd':
+ result.setFloat(FARGV(0, i), i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ }
+ }
+
+ DEFINE_BUILTIN(convert_half)
+ {
+ float f;
+ HalfRoundMode rmode = Half_RTE;
+ if (fnName.find("_rtz") != std::string::npos)
+ rmode = Half_RTZ;
+ else if (fnName.find("_rtn") != std::string::npos)
+ rmode = Half_RTN;
+ else if (fnName.find("_rtp") != std::string::npos)
+ rmode = Half_RTP;
+ const char srcType = getOverloadArgType(overload);
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ switch (srcType)
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ f = (float)UARGV(0, i);
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ f = (float)SARGV(0, i);
+ break;
+ case 'd':
+ case 'f':
+ f = FARGV(0, i);
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+ result.setUInt(floatToHalf(f, rmode), i);
+ }
+ }
+
+ static void setConvertRoundingMode(const string& name)
+ {
+ size_t rpos = name.find("_rt");
+ if (rpos != string::npos)
+ {
+ switch (name[rpos+3])
+ {
+ case 'e':
+ fesetround(FE_TONEAREST);
+ break;
+ case 'z':
+ fesetround(FE_TOWARDZERO);
+ break;
+ case 'p':
+ fesetround(FE_UPWARD);
+ break;
+ case 'n':
+ fesetround(FE_DOWNWARD);
+ break;
+ default:
+ FATAL_ERROR("Unsupported rounding mode: %c", name[rpos=3]);
+ }
+ }
+ else
+ {
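+ // No rounding-mode suffix: fall back to round-toward-zero.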
+ fesetround(FE_TOWARDZERO);
+ }
+ }
+
+ DEFINE_BUILTIN(convert_uint)
+ {
+ // Check for saturation modifier
+ bool sat = fnName.find("_sat") != string::npos;
+ uint64_t max = (result.size == 8) ? UINT64_MAX : (1UL<<(result.size*8)) - 1;
+
+ // Use rounding mode
+ const int origRnd = fegetround();
+ setConvertRoundingMode(fnName);
+
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ uint64_t r;
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ r = UARGV(0, i);
+ if (sat)
+ {
+ r = _min_(r, max);
+ }
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ {
+ int64_t si = SARGV(0, i);
+ r = si;
+ if (sat)
+ {
+ if (si < 0)
+ {
+ r = 0;
+ }
+ else if (si > max)
+ {
+ r = max;
+ }
+ }
+ break;
+ }
+ case 'f':
+ case 'd':
+ if (sat)
+ {
+ r = rint(_clamp_(FARGV(0, i), 0.0, (double)max));
+ }
+ else
+ {
+ r = rint(FARGV(0, i));
+ }
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+
+ result.setUInt(r, i);
+ }
+ fesetround(origRnd);
+ }
+
+ DEFINE_BUILTIN(convert_sint)
+ {
+ // Check for saturation modifier
+ bool sat = fnName.find("_sat") != string::npos;
+ int64_t min, max;
+ switch (result.size)
+ {
+ case 1:
+ min = INT8_MIN;
+ max = INT8_MAX;
+ break;
+ case 2:
+ min = INT16_MIN;
+ max = INT16_MAX;
+ break;
+ case 4:
+ min = INT32_MIN;
+ max = INT32_MAX;
+ break;
+ case 8:
+ min = INT64_MIN;
+ max = INT64_MAX;
+ break;
+ }
+
+ // Use rounding mode
+ const int origRnd = fegetround();
+ setConvertRoundingMode(fnName);
+
+ for (unsigned i = 0; i < result.num; i++)
+ {
+ int64_t r;
+ switch (getOverloadArgType(overload))
+ {
+ case 'h':
+ case 't':
+ case 'j':
+ case 'm':
+ r = UARGV(0, i);
+ if (sat)
+ {
+ r = _min_((uint64_t)r, (uint64_t)max);
+ }
+ break;
+ case 'c':
+ case 's':
+ case 'i':
+ case 'l':
+ r = SARGV(0, i);
+ if (sat)
+ {
+ r = _clamp_(r, min, max);
+ }
+ break;
+ case 'f':
+ case 'd':
+ if (sat)
+ {
+ r = rint(_clamp_(FARGV(0, i), (double)min, (double)max));
+ }
+ else
+ {
+ r = rint(FARGV(0, i));
+ }
+ break;
+ default:
+ FATAL_ERROR("Unsupported argument type: %c",
+ getOverloadArgType(overload));
+ }
+
+ result.setSInt(r, i);
+ }
+ fesetround(origRnd);
+ }
+
+ DEFINE_BUILTIN(printf_builtin)
+ {
+ lock_guard<mutex> lck(printfMutex);
+
+ size_t formatPtr = workItem->getOperand(ARG(0)).getPointer();
+ Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+
+ int arg = 1;
+ while (true)
+ {
+ char c;
+ memory->load((unsigned char*)&c, formatPtr++);
+ if (c == '\0')
+ {
+ break;
+ }
+
+ if (c == '%')
+ {
+ unsigned vectorWidth = 1;
+ string format = "%";
+ while (true)
+ {
+ memory->load((unsigned char*)&c, formatPtr++);
+ if (c == '\0')
+ {
+ cout << format;
+ break;
+ }
+
+ if (c == 'v')
+ {
+ // Load vector width specifier
+ memory->load((unsigned char*)&c, formatPtr++);
+ vectorWidth = c - '0';
+ if (vectorWidth == 1)
+ {
+ // A vector width beginning with '1' can only be 16; skip the second digit
+ vectorWidth = 16;
+ formatPtr++;
+ }
+
+ continue;
+ }
+
+ // Ignore all 'h' specifiers
+ if (c == 'h')
+ continue;
+
+ format += c;
+ bool done = false;
+ switch (c)
+ {
+ case 'c':
+ case 'd':
+ case 'i':
+ for (unsigned i = 0; i < vectorWidth; i++)
+ {
+ if (i > 0)
+ printf(",");
+ printf(format.c_str(), SARGV(arg, i));
+ }
+ arg++;
+ done = true;
+ break;
+ case 'o':
+ case 'u':
+ case 'x':
+ case 'X':
+ case 'p':
+ for (unsigned i = 0; i < vectorWidth; i++)
+ {
+ if (i > 0)
+ printf(",");
+ printf(format.c_str(), UARGV(arg, i));
+ }
+ arg++;
+ done = true;
+ break;
+ case 'f':
+ case 'F':
+ case 'e':
+ case 'E':
+ case 'g':
+ case 'G':
+ case 'a':
+ case 'A':
+ for (unsigned i = 0; i < vectorWidth; i++)
+ {
+ if (i > 0)
+ printf(",");
+ printf(format.c_str(), FARGV(arg, i));
+ }
+ arg++;
+ done = true;
+ break;
+ case 's':
+ {
+ size_t ptr = UARG(arg++);
+ if (!ptr)
+ {
+ // Special case for printing NULL pointer
+ printf(format.c_str(), NULL);
+ }
+ else
+ {
+ // Load string from memory
+ char c;
+ string str = "";
+ while (true)
+ {
+ if (!memory->load((unsigned char*)&c, ptr++))
+ break;
+ if (c == '\0')
+ break;
+ str += c;
+ }
+
+ printf(format.c_str(), str.c_str());
+ }
+ done = true;
+ break;
+ }
+ case '%':
+ printf("%%");
+ done = true;
+ break;
+ }
+ if (done)
+ {
+ break;
+ }
+ }
+ if (c == '\0')
+ {
+ break;
+ }
+ }
+ else
+ {
+ cout << c;
+ }
+ }
+ }
+
+
+ /////////////////////
+ // LLVM Intrinsics //
+ /////////////////////
+
+ DEFINE_BUILTIN(llvm_dbg_declare)
+ {
+ const llvm::DbgDeclareInst *dbgInst =
+ (const llvm::DbgDeclareInst*)callInst;
+ const llvm::Value *addr = dbgInst->getAddress();
+
+#if LLVM_VERSION > 36
+ const llvm::DILocalVariable *var = dbgInst->getVariable();
+ workItem->m_variables[var->getName()] = addr;
+#else
+ const llvm::MDNode *var = dbgInst->getVariable();
+ llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
+ if (str)
+ {
+ // TODO: There must be a better way of getting the variable name...
+ unsigned length = str->getLength();
+ string fullName = str->getString().str(); // keep the copy alive for c_str()
+ const char *name = fullName.c_str();
+ if (length > strlen(name) + 1)
+ {
+ name += strlen(name) + 1;
+ workItem->m_variables[name] = addr;
+ }
+ }
+#endif
+ }
+
+ DEFINE_BUILTIN(llvm_dbg_value)
+ {
+ const llvm::DbgValueInst *dbgInst = (const llvm::DbgValueInst*)callInst;
+ const llvm::Value *value = dbgInst->getValue();
+
+ // TODO: Use offset?
+ //uint64_t offset = dbgInst->getOffset();
+
+#if LLVM_VERSION > 36
+ const llvm::DILocalVariable *var = dbgInst->getVariable();
+ workItem->m_variables[var->getName()] = value;
+#else
+ const llvm::MDNode *var = dbgInst->getVariable();
+ llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
+ if (str)
+ {
+ // TODO: There must be a better way of getting the variable name...
+ unsigned length = str->getLength();
+ string fullName = str->getString().str(); // keep the copy alive for c_str()
+ const char *name = fullName.c_str();
+ if (length > strlen(name) + 1)
+ {
+ name += strlen(name) + 1;
+ workItem->m_variables[name] = value;
+ }
+ }
+#endif
+ }
+
+ DEFINE_BUILTIN(llvm_lifetime_start)
+ {
+ // TODO: Implement?
+ }
+
+ DEFINE_BUILTIN(llvm_lifetime_end)
+ {
+ // TODO: Implement?
+ }
+
+ DEFINE_BUILTIN(llvm_memcpy)
+ {
+ const llvm::MemCpyInst *memcpyInst = (const llvm::MemCpyInst*)callInst;
+ size_t dest = workItem->getOperand(memcpyInst->getDest()).getPointer();
+ size_t src = workItem->getOperand(memcpyInst->getSource()).getPointer();
+ size_t size = workItem->getOperand(memcpyInst->getLength()).getUInt();
+ unsigned destAddrSpace = memcpyInst->getDestAddressSpace();
+ unsigned srcAddrSpace = memcpyInst->getSourceAddressSpace();
+
+ unsigned char *buffer = workItem->m_pool.alloc(size);
+ workItem->getMemory(srcAddrSpace)->load(buffer, src, size);
+ workItem->getMemory(destAddrSpace)->store(buffer, dest, size);
+ }
+
+ DEFINE_BUILTIN(llvm_memset)
+ {
+ const llvm::MemSetInst *memsetInst = (const llvm::MemSetInst*)callInst;
+ size_t dest = workItem->getOperand(memsetInst->getDest()).getPointer();
+ size_t size = workItem->getOperand(memsetInst->getLength()).getUInt();
+ unsigned addressSpace = memsetInst->getDestAddressSpace();
+
+ unsigned char *buffer = workItem->m_pool.alloc(size);
+ unsigned char value = UARG(1);
+ memset(buffer, value, size);
+ workItem->getMemory(addressSpace)->store(buffer, dest, size);
+ }
+
+ DEFINE_BUILTIN(llvm_trap)
+ {
+ FATAL_ERROR("Encountered trap instruction");
+ }
+
+ public:
+ static BuiltinFunctionMap initBuiltins();
+ };
+
+ // Utility macros for generating builtin function map
+#define CAST \
+ void(*)(WorkItem*, const llvm::CallInst*, \
+ const std::string&, const std::string&, TypedValue& result, void*)
+#define F1ARG(name) (double(*)(double))name
+#define F2ARG(name) (double(*)(double,double))name
+#define F3ARG(name) (double(*)(double,double,double))name
+#define ADD_BUILTIN(name, func, op) \
+ builtins[name] = BuiltinFunction((CAST)func, (void*)op);
+#define ADD_PREFIX_BUILTIN(name, func, op) \
+ workItemPrefixBuiltins.push_back( \
+ make_pair(name, BuiltinFunction((CAST)func, (void*)op)));
+
+ // Generate builtin function map
+ BuiltinFunctionPrefixList workItemPrefixBuiltins;
+ BuiltinFunctionMap workItemBuiltins = WorkItemBuiltins::initBuiltins();
+ BuiltinFunctionMap WorkItemBuiltins::initBuiltins()
+ {
+ BuiltinFunctionMap builtins;
+
+ // Async Copy and Prefetch Functions
+ ADD_BUILTIN("async_work_group_copy", async_work_group_copy, NULL);
+ ADD_BUILTIN("async_work_group_strided_copy", async_work_group_copy, NULL);
+ ADD_BUILTIN("wait_group_events", wait_group_events, NULL);
+ ADD_BUILTIN("prefetch", prefetch, NULL);
+
+ // Atomic Functions
+ ADD_BUILTIN("atom_add", atomic_add, NULL);
+ ADD_BUILTIN("atomic_add", atomic_add, NULL);
+ ADD_BUILTIN("atom_and", atomic_and, NULL);
+ ADD_BUILTIN("atomic_and", atomic_and, NULL);
+ ADD_BUILTIN("atom_cmpxchg", atomic_cmpxchg, NULL);
+ ADD_BUILTIN("atomic_cmpxchg", atomic_cmpxchg, NULL);
+ ADD_BUILTIN("atom_dec", atomic_dec, NULL);
+ ADD_BUILTIN("atomic_dec", atomic_dec, NULL);
+ ADD_BUILTIN("atom_inc", atomic_inc, NULL);
+ ADD_BUILTIN("atomic_inc", atomic_inc, NULL);
+ ADD_BUILTIN("atom_max", atomic_max, NULL);
+ ADD_BUILTIN("atomic_max", atomic_max, NULL);
+ ADD_BUILTIN("atom_min", atomic_min, NULL);
+ ADD_BUILTIN("atomic_min", atomic_min, NULL);
+ ADD_BUILTIN("atom_or", atomic_or, NULL);
+ ADD_BUILTIN("atomic_or", atomic_or, NULL);
+ ADD_BUILTIN("atom_sub", atomic_sub, NULL);
+ ADD_BUILTIN("atomic_sub", atomic_sub, NULL);
+ ADD_BUILTIN("atom_xchg", atomic_xchg, NULL);
+ ADD_BUILTIN("atomic_xchg", atomic_xchg, NULL);
+ ADD_BUILTIN("atom_xor", atomic_xor, NULL);
+ ADD_BUILTIN("atomic_xor", atomic_xor, NULL);
+
+ // Common Functions
+ ADD_BUILTIN("clamp", clamp, NULL);
+ ADD_BUILTIN("degrees", f1arg, _degrees_);
+ ADD_BUILTIN("max", max, NULL);
+ ADD_BUILTIN("min", min, NULL);
+ ADD_BUILTIN("mix", mix, NULL);
+ ADD_BUILTIN("radians", f1arg, _radians_);
+ ADD_BUILTIN("sign", f1arg, _sign_);
+ ADD_BUILTIN("smoothstep", smoothstep, NULL);
+ ADD_BUILTIN("step", step, NULL);
+
+ // Geometric Functions
+ ADD_BUILTIN("cross", cross, NULL);
+ ADD_BUILTIN("dot", dot, NULL);
+ ADD_BUILTIN("distance", distance, NULL);
+ ADD_BUILTIN("length", length, NULL);
+ ADD_BUILTIN("normalize", normalize, NULL);
+ ADD_BUILTIN("fast_distance", distance, NULL);
+ ADD_BUILTIN("fast_length", length, NULL);
+ ADD_BUILTIN("fast_normalize", normalize, NULL);
+
+ // Image Functions
+ ADD_BUILTIN("get_image_array_size", get_image_array_size, NULL);
+ ADD_BUILTIN("get_image_channel_data_type",
+ get_image_channel_data_type, NULL);
+ ADD_BUILTIN("get_image_channel_order", get_image_channel_order, NULL);
+ ADD_BUILTIN("get_image_dim", get_image_dim, NULL);
+ ADD_BUILTIN("get_image_depth", get_image_depth, NULL);
+ ADD_BUILTIN("get_image_height", get_image_height, NULL);
+ ADD_BUILTIN("get_image_width", get_image_width, NULL);
+ ADD_BUILTIN("read_imagef", read_imagef, NULL);
+ ADD_BUILTIN("read_imagei", read_imagei, NULL);
+ ADD_BUILTIN("read_imageui", read_imageui, NULL);
+ ADD_BUILTIN("write_imagef", write_imagef, NULL);
+ ADD_BUILTIN("write_imagei", write_imagei, NULL);
+ ADD_BUILTIN("write_imageui", write_imageui, NULL);
+
+ // Integer Functions
+ ADD_BUILTIN("abs", abs_builtin, NULL);
+ ADD_BUILTIN("abs_diff", abs_diff, NULL);
+ ADD_BUILTIN("add_sat", add_sat, NULL);
+ ADD_BUILTIN("clz", clz, NULL);
+ ADD_BUILTIN("hadd", hadd, NULL);
+ ADD_BUILTIN("mad24", u3arg, _mad_);
+ ADD_BUILTIN("mad_hi", mad_hi, NULL);
+ ADD_BUILTIN("mad_sat", mad_sat, NULL);
+ ADD_BUILTIN("mul24", u2arg, _mul_);
+ ADD_BUILTIN("mul_hi", mul_hi, NULL);
+ ADD_BUILTIN("popcount", u1arg, _popcount_);
+ ADD_BUILTIN("rhadd", rhadd, NULL);
+ ADD_BUILTIN("rotate", rotate, NULL);
+ ADD_BUILTIN("sub_sat", sub_sat, NULL);
+ ADD_BUILTIN("upsample", upsample, NULL);
+
+ // Math Functions
+ ADD_BUILTIN("acos", f1arg, F1ARG(acos));
+ ADD_BUILTIN("acosh", f1arg, F1ARG(acosh));
+ ADD_BUILTIN("acospi", f1arg, _acospi_);
+ ADD_BUILTIN("asin", f1arg, F1ARG(asin));
+ ADD_BUILTIN("asinh", f1arg, F1ARG(asinh));
+ ADD_BUILTIN("asinpi", f1arg, _asinpi_);
+ ADD_BUILTIN("atan", f1arg, F1ARG(atan));
+ ADD_BUILTIN("atan2", f2arg, F2ARG(atan2));
+ ADD_BUILTIN("atanh", f1arg, F1ARG(atanh));
+ ADD_BUILTIN("atanpi", f1arg, _atanpi_);
+ ADD_BUILTIN("atan2pi", f2arg, _atan2pi_);
+ ADD_BUILTIN("cbrt", f1arg, F1ARG(cbrt));
+ ADD_BUILTIN("ceil", f1arg, F1ARG(ceil));
+ ADD_BUILTIN("copysign", f2arg, F2ARG(copysign));
+ ADD_BUILTIN("cos", f1arg, F1ARG(cos));
+ ADD_BUILTIN("cosh", f1arg, F1ARG(cosh));
+ ADD_BUILTIN("cospi", f1arg, _cospi_);
+ ADD_BUILTIN("erfc", f1arg, F1ARG(erfc));
+ ADD_BUILTIN("erf", f1arg, F1ARG(erf));
+ ADD_BUILTIN("exp", f1arg, F1ARG(exp));
+ ADD_BUILTIN("exp2", f1arg, F1ARG(exp2));
+ ADD_BUILTIN("exp10", f1arg, _exp10_);
+ ADD_BUILTIN("expm1", f1arg, F1ARG(expm1));
+ ADD_BUILTIN("fabs", f1arg, F1ARG(fabs));
+ ADD_BUILTIN("fdim", f2arg, F2ARG(fdim));
+ ADD_BUILTIN("floor", f1arg, F1ARG(floor));
+ ADD_BUILTIN("fma", f3arg, F3ARG(_fma_));
+ ADD_BUILTIN("fmax", f2arg, F2ARG(fmax));
+ ADD_BUILTIN("fmin", f2arg, F2ARG(fmin));
+ ADD_BUILTIN("fmod", f2arg, F2ARG(fmod));
+ ADD_BUILTIN("fract", fract, NULL);
+ ADD_BUILTIN("frexp", frexp_builtin, NULL);
+ ADD_BUILTIN("hypot", f2arg, F2ARG(hypot));
+ ADD_BUILTIN("ilogb", ilogb_builtin, NULL);
+ ADD_BUILTIN("ldexp", ldexp_builtin, NULL);
+ ADD_BUILTIN("lgamma", f1arg, F1ARG(lgamma));
+ ADD_BUILTIN("lgamma_r", lgamma_r, NULL);
+ ADD_BUILTIN("log", f1arg, F1ARG(log));
+ ADD_BUILTIN("log2", f1arg, F1ARG(log2));
+ ADD_BUILTIN("log10", f1arg, F1ARG(log10));
+ ADD_BUILTIN("log1p", f1arg, F1ARG(log1p));
+ ADD_BUILTIN("logb", f1arg, F1ARG(logb));
+ ADD_BUILTIN("mad", f3arg, F3ARG(_fma_));
+ ADD_BUILTIN("maxmag", f2arg, _maxmag_);
+ ADD_BUILTIN("minmag", f2arg, _minmag_);
+ ADD_BUILTIN("modf", modf_builtin, NULL);
+ ADD_BUILTIN("nan", nan_builtin, NULL);
+ ADD_BUILTIN("nanf", nan_builtin, NULL);
+ ADD_BUILTIN("nextafter", nextafter_builtin, NULL);
+ ADD_BUILTIN("pow", f2arg, F2ARG(pow));
+ ADD_BUILTIN("pown", pown, NULL);
+ ADD_BUILTIN("powr", f2arg, F2ARG(pow));
+ ADD_BUILTIN("remainder", f2arg, F2ARG(remainder));
+ ADD_BUILTIN("remquo", remquo_builtin, NULL);
+ ADD_BUILTIN("rint", f1arg, F1ARG(rint));
+ ADD_BUILTIN("rootn", rootn, NULL);
+ ADD_BUILTIN("round", f1arg, F1ARG(round));
+ ADD_BUILTIN("rsqrt", f1arg, _rsqrt_);
+ ADD_BUILTIN("sin", f1arg, F1ARG(sin));
+ ADD_BUILTIN("sinh", f1arg, F1ARG(sinh));
+ ADD_BUILTIN("sinpi", f1arg, _sinpi_);
+ ADD_BUILTIN("sincos", sincos, NULL);
+ ADD_BUILTIN("sqrt", f1arg, F1ARG(sqrt));
+ ADD_BUILTIN("tan", f1arg, F1ARG(tan));
+ ADD_BUILTIN("tanh", f1arg, F1ARG(tanh));
+ ADD_BUILTIN("tanpi", f1arg, _tanpi_);
+ ADD_BUILTIN("tgamma", f1arg, F1ARG(tgamma));
+ ADD_BUILTIN("trunc", f1arg, F1ARG(trunc));
+
+ // Native Math Functions
+ ADD_BUILTIN("half_cos", f1arg, F1ARG(cos));
+ ADD_BUILTIN("native_cos", f1arg, F1ARG(cos));
+ ADD_BUILTIN("half_divide", f2arg, _fdivide_);
+ ADD_BUILTIN("native_divide", f2arg, _fdivide_);
+ ADD_BUILTIN("half_exp", f1arg, F1ARG(exp));
+ ADD_BUILTIN("native_exp", f1arg, F1ARG(exp));
+ ADD_BUILTIN("half_exp2", f1arg, F1ARG(exp2));
+ ADD_BUILTIN("native_exp2", f1arg, F1ARG(exp2));
+ ADD_BUILTIN("half_exp10", f1arg, _exp10_);
+ ADD_BUILTIN("native_exp10", f1arg, _exp10_);
+ ADD_BUILTIN("half_log", f1arg, F1ARG(log));
+ ADD_BUILTIN("native_log", f1arg, F1ARG(log));
+ ADD_BUILTIN("half_log2", f1arg, F1ARG(log2));
+ ADD_BUILTIN("native_log2", f1arg, F1ARG(log2));
+ ADD_BUILTIN("half_log10", f1arg, F1ARG(log10));
+ ADD_BUILTIN("native_log10", f1arg, F1ARG(log10));
+ ADD_BUILTIN("half_powr", f2arg, F2ARG(pow));
+ ADD_BUILTIN("native_powr", f2arg, F2ARG(pow));
+ ADD_BUILTIN("half_recip", f1arg, _frecip_);
+ ADD_BUILTIN("native_recip", f1arg, _frecip_);
+ ADD_BUILTIN("half_rsqrt", f1arg, _rsqrt_);
+ ADD_BUILTIN("native_rsqrt", f1arg, _rsqrt_);
+ ADD_BUILTIN("half_sin", f1arg, F1ARG(sin));
+ ADD_BUILTIN("native_sin", f1arg, F1ARG(sin));
+ ADD_BUILTIN("half_sqrt", f1arg, F1ARG(sqrt));
+ ADD_BUILTIN("native_sqrt", f1arg, F1ARG(sqrt));
+ ADD_BUILTIN("half_tan", f1arg, F1ARG(tan));
+ ADD_BUILTIN("native_tan", f1arg, F1ARG(tan));
+
+ // Misc. Vector Functions
+ ADD_BUILTIN("shuffle", shuffle_builtin, NULL);
+ ADD_BUILTIN("shuffle2", shuffle2_builtin, NULL);
+
+ // Relational Functions
+ ADD_BUILTIN("all", all, NULL);
+ ADD_BUILTIN("any", any, NULL);
+ ADD_BUILTIN("bitselect", bitselect, NULL);
+ ADD_BUILTIN("isequal", rel2arg, _iseq_);
+ ADD_BUILTIN("isnotequal", rel2arg, _isneq_);
+ ADD_BUILTIN("isgreater", rel2arg, _isgt_);
+ ADD_BUILTIN("isgreaterequal", rel2arg, _isge_);
+ ADD_BUILTIN("isless", rel2arg, _islt_);
+ ADD_BUILTIN("islessequal", rel2arg, _isle_);
+ ADD_BUILTIN("islessgreater", rel2arg, _islg_);
+ ADD_BUILTIN("isfinite", rel1arg, _isfin_);
+ ADD_BUILTIN("isinf", rel1arg, _isinf_);
+ ADD_BUILTIN("isnan", rel1arg, _isnan_);
+ ADD_BUILTIN("isnormal", rel1arg, _isnorm_);
+ ADD_BUILTIN("isordered", rel2arg, _isord_);
+ ADD_BUILTIN("isunordered", rel2arg, _isuord_);
+ ADD_BUILTIN("select", select_builtin, NULL);
+ ADD_BUILTIN("signbit", rel1arg, _signbit_);
+
+ // Synchronization Functions
+ ADD_BUILTIN("barrier", barrier, NULL);
+ ADD_BUILTIN("mem_fence", mem_fence, NULL);
+ ADD_BUILTIN("read_mem_fence", mem_fence, NULL);
+ ADD_BUILTIN("write_mem_fence", mem_fence, NULL);
+
+ // Vector Data Load and Store Functions
+ ADD_PREFIX_BUILTIN("vload_half", vload_half, NULL);
+ ADD_PREFIX_BUILTIN("vloada_half", vload_half, NULL);
+ ADD_PREFIX_BUILTIN("vstore_half", vstore_half, NULL);
+ ADD_PREFIX_BUILTIN("vstorea_half", vstore_half, NULL);
+ ADD_PREFIX_BUILTIN("vload", vload, NULL);
+ ADD_PREFIX_BUILTIN("vstore", vstore, NULL);
+
+ // Work-Item Functions
+ ADD_BUILTIN("get_global_id", get_global_id, NULL);
+ ADD_BUILTIN("get_global_size", get_global_size, NULL);
+ ADD_BUILTIN("get_global_offset", get_global_offset, NULL);
+ ADD_BUILTIN("get_group_id", get_group_id, NULL);
+ ADD_BUILTIN("get_local_id", get_local_id, NULL);
+ ADD_BUILTIN("get_local_size", get_local_size, NULL);
+ ADD_BUILTIN("get_num_groups", get_num_groups, NULL);
+ ADD_BUILTIN("get_work_dim", get_work_dim, NULL);
+
+ // Other Functions
+ ADD_PREFIX_BUILTIN("convert_half", convert_half, NULL);
+ ADD_PREFIX_BUILTIN("convert_float", convert_float, NULL);
+ ADD_PREFIX_BUILTIN("convert_double", convert_float, NULL);
+ ADD_PREFIX_BUILTIN("convert_u", convert_uint, NULL);
+ ADD_PREFIX_BUILTIN("convert_", convert_sint, NULL);
+ ADD_BUILTIN("printf", printf_builtin, NULL);
+
+ // LLVM Intrinsics
+ ADD_BUILTIN("llvm.dbg.declare", llvm_dbg_declare, NULL);
+ ADD_BUILTIN("llvm.dbg.value", llvm_dbg_value, NULL);
+ ADD_BUILTIN("llvm.lifetime.start", llvm_lifetime_start, NULL);
+ ADD_BUILTIN("llvm.lifetime.end", llvm_lifetime_end, NULL);
+ ADD_PREFIX_BUILTIN("llvm.memcpy", llvm_memcpy, NULL);
+ ADD_PREFIX_BUILTIN("llvm.memmove", llvm_memcpy, NULL);
+ ADD_PREFIX_BUILTIN("llvm.memset", llvm_memset, NULL);
+ ADD_PREFIX_BUILTIN("llvm.fmuladd", f3arg, F3ARG(_fma_));
+ ADD_BUILTIN("llvm.trap", llvm_trap, NULL);
+
+ return builtins;
+ }
+}
diff --git a/src/core/clc.h b/src/core/clc.h
new file mode 100644
index 0000000..320ddce
--- /dev/null
+++ b/src/core/clc.h
@@ -0,0 +1,1035 @@
+// clc.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+
+#if defined(__SPIR32__)
+ typedef uint size_t;
+ typedef int ptrdiff_t;
+#else
+ typedef ulong size_t;
+ typedef long ptrdiff_t;
+#endif
+typedef size_t uintptr_t;
+typedef ptrdiff_t intptr_t;
+
+#define event_t size_t
+
+#define TYPEDEF_VECTOR(type) \
+ typedef __attribute__((ext_vector_type(2))) type type##2; \
+ typedef __attribute__((ext_vector_type(3))) type type##3; \
+ typedef __attribute__((ext_vector_type(4))) type type##4; \
+ typedef __attribute__((ext_vector_type(8))) type type##8; \
+ typedef __attribute__((ext_vector_type(16))) type type##16;
+TYPEDEF_VECTOR(char);
+TYPEDEF_VECTOR(uchar);
+TYPEDEF_VECTOR(short);
+TYPEDEF_VECTOR(ushort);
+TYPEDEF_VECTOR(int);
+TYPEDEF_VECTOR(uint);
+TYPEDEF_VECTOR(long);
+TYPEDEF_VECTOR(ulong);
+TYPEDEF_VECTOR(float);
+TYPEDEF_VECTOR(double);
+
+#define __ENDIAN_LITTLE__ 1
+#define __OPENCL_VERSION__ 120
+#define __OPENCL_C_VERSION__ 120
+#define __IMAGE_SUPPORT__ 1
+#define __kernel_exec(X, typen) __kernel \
+ __attribute__((work_group_size_hint(X, 1, 1))) \
+ __attribute__((vec_type_hint(typen)))
+
+#define CHAR_BIT 8
+#define SCHAR_MAX 127
+#define SCHAR_MIN (-128)
+#define UCHAR_MAX 255
+#define CHAR_MAX SCHAR_MAX
+#define CHAR_MIN SCHAR_MIN
+#define USHRT_MAX 65535
+#define SHRT_MAX 32767
+#define SHRT_MIN (-32768)
+#define UINT_MAX 0xffffffff
+#define INT_MAX 2147483647
+#define INT_MIN (-2147483647-1)
+#define ULONG_MAX 0xffffffffffffffffUL
+#define LONG_MAX ((long)0x7fffffffffffffffL)
+#define LONG_MIN ((long)(-0x7fffffffffffffffL-1))
+
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define DBL_DIG 15
+#define DBL_MANT_DIG 53
+#define DBL_MAX_10_EXP +308
+#define DBL_MAX_EXP +1024
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP -1021
+#define DBL_RADIX 2
+#define DBL_MAX 0x1.fffffffffffffp1023
+#define DBL_MIN 0x1.0p-1022
+#define DBL_EPSILON 0x1.0p-52
+
+#define FP_ILOGB0 INT_MIN
+#define FP_ILOGBNAN INT_MIN
+
+#define M_E_F 2.71828182845904523536028747135266250f
+#define M_LOG2E_F 1.44269504088896340735992468100189214f
+#define M_LOG10E_F 0.434294481903251827651128918916605082f
+#define M_LN2_F 0.693147180559945309417232121458176568f
+#define M_LN10_F 2.3025850929940456840179914546843642f
+#define M_PI_F 3.14159265358979323846264338327950288f
+#define M_PI_2_F 1.57079632679489661923132169163975144f
+#define M_PI_4_F 0.785398163397448309615660845819875721f
+#define M_1_PI_F 0.318309886183790671537767526745028724f
+#define M_2_PI_F 0.636619772367581343075535053490057448f
+#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
+#define M_SQRT2_F 1.41421356237309504880168872420969808f
+#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
+
+#define M_E 2.71828182845904523536028747135266250
+#define M_LOG2E 1.44269504088896340735992468100189214
+#define M_LOG10E 0.434294481903251827651128918916605082
+#define M_LN2 0.693147180559945309417232121458176568
+#define M_LN10 2.30258509299404568401799145468436421
+#define M_PI 3.14159265358979323846264338327950288
+#define M_PI_2 1.57079632679489661923132169163975144
+#define M_PI_4 0.785398163397448309615660845819875721
+#define M_1_PI 0.318309886183790671537767526745028724
+#define M_2_PI 0.636619772367581343075535053490057448
+#define M_2_SQRTPI 1.12837916709551257389615890312154517
+#define M_SQRT2 1.41421356237309504880168872420969808
+#define M_SQRT1_2 0.707106781186547524400844362104849039
+
+#define MAXFLOAT ((float)3.40282346638528860e+38)
+#define HUGE_VALF __builtin_huge_valf()
+#define HUGE_VAL __builtin_huge_val()
+#define INFINITY __builtin_inff()
+#define NAN __builtin_nanf(0)
+
+#define CLK_SNORM_INT8 0x10D0
+#define CLK_SNORM_INT16 0x10D1
+#define CLK_UNORM_INT8 0x10D2
+#define CLK_UNORM_INT16 0x10D3
+#define CLK_UNORM_SHORT_565 0x10D4
+#define CLK_UNORM_SHORT_555 0x10D5
+#define CLK_UNORM_INT_101010 0x10D6
+#define CLK_SIGNED_INT8 0x10D7
+#define CLK_SIGNED_INT16 0x10D8
+#define CLK_SIGNED_INT32 0x10D9
+#define CLK_UNSIGNED_INT8 0x10DA
+#define CLK_UNSIGNED_INT16 0x10DB
+#define CLK_UNSIGNED_INT32 0x10DC
+#define CLK_HALF_FLOAT 0x10DD
+#define CLK_FLOAT 0x10DE
+#define CLK_UNORM_INT24 0x10DF
+
+#define CLK_R 0x10B0
+#define CLK_A 0x10B1
+#define CLK_RG 0x10B2
+#define CLK_RA 0x10B3
+#define CLK_RGB 0x10B4
+#define CLK_RGBA 0x10B5
+#define CLK_BGRA 0x10B6
+#define CLK_ARGB 0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx 0x10BA
+#define CLK_RGx 0x10BB
+#define CLK_RGBx 0x10BC
+#define CLK_DEPTH 0x10BD
+#define CLK_DEPTH_STENCIL 0x10BE
+
+#define CLK_NORMALIZED_COORDS_FALSE 0x0000
+#define CLK_NORMALIZED_COORDS_TRUE 0x0001
+
+#define CLK_ADDRESS_NONE 0x0000
+#define CLK_ADDRESS_CLAMP_TO_EDGE 0x0002
+#define CLK_ADDRESS_CLAMP 0x0004
+#define CLK_ADDRESS_REPEAT 0x0006
+#define CLK_ADDRESS_MIRRORED_REPEAT 0x0008
+
+#define CLK_FILTER_NEAREST 0x0010
+#define CLK_FILTER_LINEAR 0x0020
+
+#define __OVERLOAD__ __attribute__((__overloadable__))
+
+#define BUILTIN_1ARG(rtype, type0, name) \
+ rtype __OVERLOAD__ name(type0 a); \
+ rtype##2 __OVERLOAD__ name(type0##2 a); \
+ rtype##3 __OVERLOAD__ name(type0##3 a); \
+ rtype##4 __OVERLOAD__ name(type0##4 a); \
+ rtype##8 __OVERLOAD__ name(type0##8 a); \
+ rtype##16 __OVERLOAD__ name(type0##16 a);
+#define BUILTIN_2ARG(rtype, type0, type1, name) \
+ rtype __OVERLOAD__ name(type0 a, type1 b); \
+ rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b); \
+ rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b); \
+ rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b); \
+ rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b); \
+ rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b);
+#define BUILTIN_3ARG(rtype, type0, type1, type2, name) \
+ rtype __OVERLOAD__ name(type0 a, type1 b, type2 c); \
+ rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b, type2##2 c); \
+ rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b, type2##3 c); \
+ rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b, type2##4 c); \
+ rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b, type2##8 c); \
+ rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b, type2##16 c);
+
+#define BUILTIN_1ARG_INTEGERS(name) \
+ BUILTIN_1ARG(char, char, name) \
+ BUILTIN_1ARG(uchar, uchar, name) \
+ BUILTIN_1ARG(short, short, name) \
+ BUILTIN_1ARG(ushort, ushort, name) \
+ BUILTIN_1ARG(int, int, name) \
+ BUILTIN_1ARG(uint, uint, name) \
+ BUILTIN_1ARG(long, long, name) \
+ BUILTIN_1ARG(ulong, ulong, name);
+#define BUILTIN_2ARG_INTEGERS(name) \
+ BUILTIN_2ARG(char, char, char, name) \
+ BUILTIN_2ARG(uchar, uchar, uchar, name) \
+ BUILTIN_2ARG(short, short, short, name) \
+ BUILTIN_2ARG(ushort, ushort, ushort, name) \
+ BUILTIN_2ARG(int, int, int, name) \
+ BUILTIN_2ARG(uint, uint, uint, name) \
+ BUILTIN_2ARG(long, long, long, name) \
+ BUILTIN_2ARG(ulong, ulong, ulong, name);
+#define BUILTIN_3ARG_INTEGERS(name) \
+ BUILTIN_3ARG(char, char, char, char, name) \
+ BUILTIN_3ARG(uchar, uchar, uchar, uchar, name) \
+ BUILTIN_3ARG(short, short, short, short, name) \
+ BUILTIN_3ARG(ushort, ushort, ushort, ushort, name) \
+ BUILTIN_3ARG(int, int, int, int, name) \
+ BUILTIN_3ARG(uint, uint, uint, uint, name) \
+ BUILTIN_3ARG(long, long, long, long, name) \
+ BUILTIN_3ARG(ulong, ulong, ulong, ulong, name);
+
+#define BUILTIN_1ARG_FLOATS(name) \
+ BUILTIN_1ARG(float, float, name) \
+ BUILTIN_1ARG(double, double, name);
+#define BUILTIN_2ARG_FLOATS(name) \
+ BUILTIN_2ARG(float, float, float, name) \
+ BUILTIN_2ARG(double, double, double, name);
+#define BUILTIN_3ARG_FLOATS(name) \
+ BUILTIN_3ARG(float, float, float, float, name) \
+ BUILTIN_3ARG(double, double, double, double, name);
+
+
+///////////////////////////////////////
+// Async Copy and Prefetch Functions //
+///////////////////////////////////////
+
+#define ASYNC_COPY_TYPE(type) \
+ event_t __OVERLOAD__ async_work_group_copy(__local type*, const __global type*, size_t, event_t); \
+ event_t __OVERLOAD__ async_work_group_copy(__global type*, const __local type*, size_t, event_t); \
+ event_t __OVERLOAD__ async_work_group_strided_copy(__local type*, const __global type*, size_t, size_t, event_t); \
+ event_t __OVERLOAD__ async_work_group_strided_copy(__global type*, const __local type*, size_t, size_t, event_t);
+#define ASYNC_COPY(type) \
+ ASYNC_COPY_TYPE(type) \
+ ASYNC_COPY_TYPE(type##2) \
+ ASYNC_COPY_TYPE(type##3) \
+ ASYNC_COPY_TYPE(type##4) \
+ ASYNC_COPY_TYPE(type##8) \
+ ASYNC_COPY_TYPE(type##16);
+ASYNC_COPY(char);
+ASYNC_COPY(uchar);
+ASYNC_COPY(short);
+ASYNC_COPY(ushort);
+ASYNC_COPY(int);
+ASYNC_COPY(uint);
+ASYNC_COPY(long);
+ASYNC_COPY(ulong);
+ASYNC_COPY(float);
+ASYNC_COPY(double);
+
+void wait_group_events(int, event_t*);
+
+#define PREFETCH(type) \
+ void __OVERLOAD__ prefetch(const __global type*, size_t); \
+ void __OVERLOAD__ prefetch(const __global type##2*, size_t); \
+ void __OVERLOAD__ prefetch(const __global type##3*, size_t); \
+ void __OVERLOAD__ prefetch(const __global type##4*, size_t); \
+ void __OVERLOAD__ prefetch(const __global type##8*, size_t); \
+ void __OVERLOAD__ prefetch(const __global type##16*, size_t);
+PREFETCH(char);
+PREFETCH(uchar);
+PREFETCH(short);
+PREFETCH(ushort);
+PREFETCH(int);
+PREFETCH(uint);
+PREFETCH(long);
+PREFETCH(ulong);
+PREFETCH(float);
+PREFETCH(double);
+
+
+//////////////////////
+// Atomic Functions //
+//////////////////////
+
+#define ATOMIC_0ARG_DEF(name, type) \
+ type __OVERLOAD__ name(volatile __global type *p); \
+ type __OVERLOAD__ name(volatile __local type *p);
+#define ATOMIC_0ARG(name) \
+ ATOMIC_0ARG_DEF(atom_##name, int); \
+ ATOMIC_0ARG_DEF(atom_##name, uint); \
+ ATOMIC_0ARG_DEF(atomic_##name, int); \
+ ATOMIC_0ARG_DEF(atomic_##name, uint);
+
+#define ATOMIC_1ARG_DEF(name, type) \
+ type __OVERLOAD__ name(volatile __global type *p, type val); \
+ type __OVERLOAD__ name(volatile __local type *p, type val);
+#define ATOMIC_1ARG(name) \
+ ATOMIC_1ARG_DEF(atom_##name, int); \
+ ATOMIC_1ARG_DEF(atom_##name, uint); \
+ ATOMIC_1ARG_DEF(atomic_##name, int); \
+ ATOMIC_1ARG_DEF(atomic_##name, uint);
+
+ATOMIC_1ARG(add);
+ATOMIC_1ARG(and);
+ATOMIC_0ARG(dec);
+ATOMIC_0ARG(inc);
+ATOMIC_1ARG(max);
+ATOMIC_1ARG(min);
+ATOMIC_1ARG(or);
+ATOMIC_1ARG(sub);
+ATOMIC_1ARG(xchg);
+ATOMIC_1ARG_DEF(atom_xchg, float);
+ATOMIC_1ARG_DEF(atomic_xchg, float);
+ATOMIC_1ARG(xor);
+
+int __OVERLOAD__ atom_cmpxchg(volatile __global int *p, int cmp, int val);
+int __OVERLOAD__ atom_cmpxchg(volatile __local int *p, int cmp, int val);
+uint __OVERLOAD__ atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
+uint __OVERLOAD__ atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+int __OVERLOAD__ atomic_cmpxchg(volatile __global int *p, int cmp, int val);
+int __OVERLOAD__ atomic_cmpxchg(volatile __local int *p, int cmp, int val);
+uint __OVERLOAD__ atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
+uint __OVERLOAD__ atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+
+
+//////////////////////
+// Common Functions //
+//////////////////////
+
+#define ABS(type) \
+ u##type __OVERLOAD__ abs(type); \
+ u##type __OVERLOAD__ abs(u##type);
+#define ABS_DIFF(type) \
+ u##type __OVERLOAD__ abs_diff(type, type); \
+ u##type __OVERLOAD__ abs_diff(u##type, u##type);
+#define ABS_BOTH(type) \
+ ABS(type); \
+ ABS_DIFF(type);
+#define ABS_ALL(type) \
+ ABS_BOTH(type); \
+ ABS_BOTH(type##2); \
+ ABS_BOTH(type##3); \
+ ABS_BOTH(type##4); \
+ ABS_BOTH(type##8); \
+ ABS_BOTH(type##16);
+
+ABS_ALL(char);
+ABS_ALL(short);
+ABS_ALL(int);
+ABS_ALL(long);
+BUILTIN_3ARG_FLOATS(clamp);
+BUILTIN_1ARG_FLOATS(degrees);
+BUILTIN_2ARG_FLOATS(max);
+BUILTIN_2ARG_FLOATS(min);
+BUILTIN_3ARG_FLOATS(mix);
+BUILTIN_1ARG_FLOATS(radians);
+BUILTIN_1ARG_FLOATS(sign);
+BUILTIN_3ARG_FLOATS(smoothstep);
+BUILTIN_2ARG_FLOATS(step);
+
+#define COMMON_SCALAR(type, n) \
+ type##n __OVERLOAD__ clamp(type##n, type, type); \
+ type##n __OVERLOAD__ max(type##n, type); \
+ type##n __OVERLOAD__ min(type##n, type); \
+ type##n __OVERLOAD__ mix(type##n, type##n, type); \
+ type##n __OVERLOAD__ smoothstep(type, type, type##n); \
+ type##n __OVERLOAD__ step(type, type##n);
+COMMON_SCALAR(float, 2);
+COMMON_SCALAR(float, 3);
+COMMON_SCALAR(float, 4);
+COMMON_SCALAR(float, 8);
+COMMON_SCALAR(float, 16);
+COMMON_SCALAR(double, 2);
+COMMON_SCALAR(double, 3);
+COMMON_SCALAR(double, 4);
+COMMON_SCALAR(double, 8);
+COMMON_SCALAR(double, 16);
+
+
+/////////////////////////
+// Geometric Functions //
+/////////////////////////
+
+#define GEOM_1ARG(type, name) \
+ type __OVERLOAD__ name(type); \
+ type __OVERLOAD__ name(type##2); \
+ type __OVERLOAD__ name(type##3); \
+ type __OVERLOAD__ name(type##4); \
+ type __OVERLOAD__ name(type##8); \
+ type __OVERLOAD__ name(type##16);
+#define GEOM_2ARG(type, name) \
+ type __OVERLOAD__ name(type, type); \
+ type __OVERLOAD__ name(type##2, type##2); \
+ type __OVERLOAD__ name(type##3, type##3); \
+ type __OVERLOAD__ name(type##4, type##4); \
+ type __OVERLOAD__ name(type##8, type##8); \
+ type __OVERLOAD__ name(type##16, type##16);
+
+float4 __OVERLOAD__ cross(float4, float4);
+float3 __OVERLOAD__ cross(float3, float3);
+double4 __OVERLOAD__ cross(double4, double4);
+double3 __OVERLOAD__ cross(double3, double3);
+GEOM_2ARG(float, dot);
+GEOM_2ARG(double, dot);
+GEOM_2ARG(float, distance);
+GEOM_2ARG(double, distance);
+GEOM_1ARG(float, length);
+GEOM_1ARG(double, length);
+BUILTIN_1ARG_FLOATS(normalize);
+GEOM_2ARG(float, fast_distance);
+GEOM_2ARG(double, fast_distance);
+GEOM_1ARG(float, fast_length);
+GEOM_1ARG(double, fast_length);
+BUILTIN_1ARG_FLOATS(fast_normalize);
+
+
+/////////////////////
+// Image Functions //
+/////////////////////
+
+size_t __OVERLOAD__ get_image_array_size(image1d_array_t image);
+size_t __OVERLOAD__ get_image_array_size(image2d_array_t image);
+
+int __OVERLOAD__ get_image_channel_data_type(image1d_t image);
+int __OVERLOAD__ get_image_channel_data_type(image1d_buffer_t image);
+int __OVERLOAD__ get_image_channel_data_type(image1d_array_t image);
+int __OVERLOAD__ get_image_channel_data_type(image2d_t image);
+int __OVERLOAD__ get_image_channel_data_type(image2d_array_t image);
+int __OVERLOAD__ get_image_channel_data_type(image3d_t image);
+
+int __OVERLOAD__ get_image_channel_order(image1d_t image);
+int __OVERLOAD__ get_image_channel_order(image1d_buffer_t image);
+int __OVERLOAD__ get_image_channel_order(image1d_array_t image);
+int __OVERLOAD__ get_image_channel_order(image2d_t image);
+int __OVERLOAD__ get_image_channel_order(image2d_array_t image);
+int __OVERLOAD__ get_image_channel_order(image3d_t image);
+
+int2 __OVERLOAD__ get_image_dim(image2d_t image);
+int2 __OVERLOAD__ get_image_dim(image2d_array_t image);
+int4 __OVERLOAD__ get_image_dim(image3d_t image);
+
+int __OVERLOAD__ get_image_depth(image3d_t image);
+int __OVERLOAD__ get_image_height(image2d_t image);
+int __OVERLOAD__ get_image_height(image2d_array_t image);
+int __OVERLOAD__ get_image_height(image3d_t image);
+int __OVERLOAD__ get_image_width(image1d_t image);
+int __OVERLOAD__ get_image_width(image1d_buffer_t image);
+int __OVERLOAD__ get_image_width(image1d_array_t image);
+int __OVERLOAD__ get_image_width(image2d_t image);
+int __OVERLOAD__ get_image_width(image2d_array_t image);
+int __OVERLOAD__ get_image_width(image3d_t image);
+
+float4 __OVERLOAD__ read_imagef(image1d_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_buffer_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, int4);
+float4 __OVERLOAD__ read_imagef(image3d_t, int4);
+
+float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, float);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, int2);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, float2);
+float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, float2);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, int4);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, float4);
+float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, int4);
+float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, float4);
+
+int4 __OVERLOAD__ read_imagei(image1d_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_buffer_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, int4);
+int4 __OVERLOAD__ read_imagei(image3d_t, int4);
+
+int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, float);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, int2);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, float2);
+int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, float2);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, int4);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, float4);
+int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, int4);
+int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, float4);
+
+uint4 __OVERLOAD__ read_imageui(image1d_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_buffer_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, int4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, int4);
+
+uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, float);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, int2);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, float2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, float2);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, int4);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, float4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, int4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, float4);
+
+void __OVERLOAD__ write_imagef(image1d_t, int, float4);
+void __OVERLOAD__ write_imagef(image1d_array_t, int2, float4);
+void __OVERLOAD__ write_imagef(image2d_t, int2, float4);
+void __OVERLOAD__ write_imagef(image2d_array_t, int4, float4);
+void __OVERLOAD__ write_imagef(image3d_t, int4, float4);
+void __OVERLOAD__ write_imagei(image1d_t, int, int4);
+void __OVERLOAD__ write_imagei(image1d_array_t, int2, int4);
+void __OVERLOAD__ write_imagei(image2d_t, int2, int4);
+void __OVERLOAD__ write_imagei(image2d_array_t, int4, int4);
+void __OVERLOAD__ write_imagei(image3d_t, int4, int4);
+void __OVERLOAD__ write_imageui(image1d_t, int, uint4);
+void __OVERLOAD__ write_imageui(image1d_array_t, int2, uint4);
+void __OVERLOAD__ write_imageui(image2d_t, int2, uint4);
+void __OVERLOAD__ write_imageui(image2d_array_t, int4, uint4);
+void __OVERLOAD__ write_imageui(image3d_t, int4, uint4);
+
+
+///////////////////////
+// Integer Functions //
+///////////////////////
+
+BUILTIN_2ARG_INTEGERS(add_sat);
+BUILTIN_3ARG_INTEGERS(clamp);
+BUILTIN_1ARG_INTEGERS(clz);
+BUILTIN_2ARG_INTEGERS(hadd);
+BUILTIN_3ARG(int, int, int, int, mad24);
+BUILTIN_3ARG(uint, uint, uint, uint, mad24);
+BUILTIN_3ARG_INTEGERS(mad_hi);
+BUILTIN_3ARG_INTEGERS(mad_sat);
+BUILTIN_2ARG_INTEGERS(max);
+BUILTIN_2ARG_INTEGERS(min);
+BUILTIN_2ARG(int, int, int, mul24);
+BUILTIN_2ARG(uint, uint, uint, mul24);
+BUILTIN_2ARG_INTEGERS(mul_hi);
+BUILTIN_1ARG_INTEGERS(popcount);
+BUILTIN_2ARG_INTEGERS(rhadd);
+BUILTIN_2ARG_INTEGERS(rotate);
+BUILTIN_2ARG_INTEGERS(sub_sat);
+#define UPSAMPLE_SIZES(out, in1, in2) \
+ out __OVERLOAD__ upsample(in1, in2); \
+ out##2 __OVERLOAD__ upsample(in1##2, in2##2); \
+ out##3 __OVERLOAD__ upsample(in1##3, in2##3); \
+ out##4 __OVERLOAD__ upsample(in1##4, in2##4); \
+ out##8 __OVERLOAD__ upsample(in1##8, in2##8); \
+ out##16 __OVERLOAD__ upsample(in1##16, in2##16);
+#define UPSAMPLE(out, in) \
+ UPSAMPLE_SIZES(out, in, u##in); \
+ UPSAMPLE_SIZES(u##out, u##in, u##in);
+UPSAMPLE(short, char);
+UPSAMPLE(int, short);
+UPSAMPLE(long, int);
+
+
+////////////////////
+// Math Functions //
+////////////////////
+
+#define BUILTIN_2TYPE_PTR(type1, type2, name) \
+ type1 __OVERLOAD__ name(type1, __global type2*); \
+ type1 __OVERLOAD__ name(type1, __local type2*); \
+ type1 __OVERLOAD__ name(type1, __private type2*);
+#define BUILTIN_PTR_ARG(type1, type2, name) \
+ BUILTIN_2TYPE_PTR(type1, type2, name) \
+ BUILTIN_2TYPE_PTR(type1##2, type2##2, name) \
+ BUILTIN_2TYPE_PTR(type1##3, type2##3, name) \
+ BUILTIN_2TYPE_PTR(type1##4, type2##4, name) \
+ BUILTIN_2TYPE_PTR(type1##8, type2##8, name) \
+ BUILTIN_2TYPE_PTR(type1##16, type2##16, name);
+#define REMQUO(type, addrspace) \
+ type __OVERLOAD__ remquo(type, type, addrspace int*); \
+ type##2 __OVERLOAD__ remquo(type##2, type##2, addrspace int2*); \
+ type##3 __OVERLOAD__ remquo(type##3, type##3, addrspace int3*); \
+ type##4 __OVERLOAD__ remquo(type##4, type##4, addrspace int4*); \
+ type##8 __OVERLOAD__ remquo(type##8, type##8, addrspace int8*); \
+ type##16 __OVERLOAD__ remquo(type##16, type##16, addrspace int16*);
+
+BUILTIN_1ARG_FLOATS(acos);
+BUILTIN_1ARG_FLOATS(acosh);
+BUILTIN_1ARG_FLOATS(acospi);
+BUILTIN_1ARG_FLOATS(asin);
+BUILTIN_1ARG_FLOATS(asinh);
+BUILTIN_1ARG_FLOATS(asinpi);
+BUILTIN_1ARG_FLOATS(atan);
+BUILTIN_2ARG_FLOATS(atan2);
+BUILTIN_1ARG_FLOATS(atanh);
+BUILTIN_1ARG_FLOATS(atanpi);
+BUILTIN_2ARG_FLOATS(atan2pi);
+BUILTIN_1ARG_FLOATS(cbrt);
+BUILTIN_1ARG_FLOATS(ceil);
+BUILTIN_2ARG_FLOATS(copysign);
+BUILTIN_1ARG_FLOATS(cos);
+BUILTIN_1ARG_FLOATS(cosh);
+BUILTIN_1ARG_FLOATS(cospi);
+BUILTIN_1ARG_FLOATS(erfc);
+BUILTIN_1ARG_FLOATS(erf);
+BUILTIN_1ARG_FLOATS(exp);
+BUILTIN_1ARG_FLOATS(exp2);
+BUILTIN_1ARG_FLOATS(exp10);
+BUILTIN_1ARG_FLOATS(expm1);
+BUILTIN_1ARG_FLOATS(fabs);
+BUILTIN_2ARG_FLOATS(fdim);
+BUILTIN_1ARG_FLOATS(floor);
+BUILTIN_3ARG_FLOATS(fma);
+BUILTIN_2ARG_FLOATS(fmax);
+BUILTIN_2ARG_FLOATS(fmin);
+BUILTIN_2ARG_FLOATS(fmod);
+BUILTIN_PTR_ARG(float, float, fract);
+BUILTIN_PTR_ARG(double, double, fract);
+BUILTIN_PTR_ARG(float, int, frexp);
+BUILTIN_PTR_ARG(double, int, frexp);
+BUILTIN_2ARG_FLOATS(hypot);
+BUILTIN_1ARG(int, float, ilogb);
+BUILTIN_1ARG(int, double, ilogb);
+BUILTIN_2ARG(float, float, int, ldexp);
+BUILTIN_2ARG(double, double, int, ldexp);
+BUILTIN_1ARG_FLOATS(lgamma);
+BUILTIN_PTR_ARG(float, int, lgamma_r);
+BUILTIN_PTR_ARG(double, int, lgamma_r);
+BUILTIN_1ARG_FLOATS(log);
+BUILTIN_1ARG_FLOATS(log2);
+BUILTIN_1ARG_FLOATS(log10);
+BUILTIN_1ARG_FLOATS(log1p);
+BUILTIN_1ARG_FLOATS(logb);
+BUILTIN_3ARG_FLOATS(mad);
+BUILTIN_2ARG_FLOATS(maxmag);
+BUILTIN_2ARG_FLOATS(minmag);
+BUILTIN_PTR_ARG(float, float, modf);
+BUILTIN_PTR_ARG(double, double, modf);
+BUILTIN_1ARG(float, uint, nan);
+BUILTIN_1ARG(double, ulong, nan);
+BUILTIN_2ARG_FLOATS(nextafter);
+BUILTIN_2ARG_FLOATS(pow);
+BUILTIN_2ARG(float, float, int, pown);
+BUILTIN_2ARG(double, double, int, pown);
+BUILTIN_2ARG_FLOATS(powr);
+BUILTIN_2ARG_FLOATS(remainder);
+REMQUO(float, global);
+REMQUO(float, local);
+REMQUO(float, private);
+REMQUO(double, global);
+REMQUO(double, local);
+REMQUO(double, private);
+BUILTIN_1ARG_FLOATS(rint);
+BUILTIN_2ARG(float, float, int, rootn);
+BUILTIN_2ARG(double, double, int, rootn);
+BUILTIN_1ARG_FLOATS(round);
+BUILTIN_1ARG_FLOATS(rsqrt);
+BUILTIN_1ARG_FLOATS(sin);
+BUILTIN_1ARG_FLOATS(sinpi);
+BUILTIN_1ARG_FLOATS(sinh);
+BUILTIN_PTR_ARG(float, float, sincos);
+BUILTIN_PTR_ARG(double, double, sincos);
+BUILTIN_1ARG_FLOATS(sqrt);
+BUILTIN_1ARG_FLOATS(tan);
+BUILTIN_1ARG_FLOATS(tanh);
+BUILTIN_1ARG_FLOATS(tanpi);
+BUILTIN_1ARG_FLOATS(tgamma);
+BUILTIN_1ARG_FLOATS(trunc);
+
+// Native math functions
+BUILTIN_1ARG_FLOATS(half_cos);
+BUILTIN_1ARG_FLOATS(native_cos);
+BUILTIN_2ARG_FLOATS(half_divide);
+BUILTIN_2ARG_FLOATS(native_divide);
+BUILTIN_1ARG_FLOATS(half_exp);
+BUILTIN_1ARG_FLOATS(native_exp);
+BUILTIN_1ARG_FLOATS(half_exp2);
+BUILTIN_1ARG_FLOATS(native_exp2);
+BUILTIN_1ARG_FLOATS(half_exp10);
+BUILTIN_1ARG_FLOATS(native_exp10);
+BUILTIN_1ARG_FLOATS(half_log);
+BUILTIN_1ARG_FLOATS(native_log);
+BUILTIN_1ARG_FLOATS(half_log2);
+BUILTIN_1ARG_FLOATS(native_log2);
+BUILTIN_1ARG_FLOATS(half_log10);
+BUILTIN_1ARG_FLOATS(native_log10);
+BUILTIN_2ARG_FLOATS(half_powr);
+BUILTIN_2ARG_FLOATS(native_powr);
+BUILTIN_1ARG_FLOATS(half_recip);
+BUILTIN_1ARG_FLOATS(native_recip);
+BUILTIN_1ARG_FLOATS(half_rsqrt);
+BUILTIN_1ARG_FLOATS(native_rsqrt);
+BUILTIN_1ARG_FLOATS(half_sin);
+BUILTIN_1ARG_FLOATS(native_sin);
+BUILTIN_1ARG_FLOATS(half_sqrt);
+BUILTIN_1ARG_FLOATS(native_sqrt);
+BUILTIN_1ARG_FLOATS(half_tan);
+BUILTIN_1ARG_FLOATS(native_tan);
+
+
+
+////////////////////////////
+// Misc. Vector Functions //
+////////////////////////////
+
+#define SHUFFLE_TYPE(ret, type, mask) \
+ ret __OVERLOAD__ shuffle(type, mask); \
+ ret##2 __OVERLOAD__ shuffle(type, mask##2); \
+ ret##3 __OVERLOAD__ shuffle(type, mask##3); \
+ ret##4 __OVERLOAD__ shuffle(type, mask##4); \
+ ret##8 __OVERLOAD__ shuffle(type, mask##8); \
+ ret##16 __OVERLOAD__ shuffle(type, mask##16);
+#define SHUFFLE(type, mask) \
+ SHUFFLE_TYPE(type, type, mask); \
+ SHUFFLE_TYPE(type, type##2, mask); \
+ SHUFFLE_TYPE(type, type##3, mask); \
+ SHUFFLE_TYPE(type, type##4, mask); \
+ SHUFFLE_TYPE(type, type##8, mask); \
+ SHUFFLE_TYPE(type, type##16, mask);
+SHUFFLE(char, uchar);
+SHUFFLE(uchar, uchar);
+SHUFFLE(short, ushort);
+SHUFFLE(ushort, ushort);
+SHUFFLE(int, uint);
+SHUFFLE(uint, uint);
+SHUFFLE(long, ulong);
+SHUFFLE(ulong, ulong);
+SHUFFLE(float, uint);
+SHUFFLE(double, ulong);
+
+#define SHUFFLE2_TYPE(ret, type, mask) \
+ ret __OVERLOAD__ shuffle2(type, type, mask); \
+ ret##2 __OVERLOAD__ shuffle2(type, type, mask##2); \
+ ret##3 __OVERLOAD__ shuffle2(type, type, mask##3); \
+ ret##4 __OVERLOAD__ shuffle2(type, type, mask##4); \
+ ret##8 __OVERLOAD__ shuffle2(type, type, mask##8); \
+ ret##16 __OVERLOAD__ shuffle2(type, type, mask##16);
+#define SHUFFLE2(type, mask) \
+ SHUFFLE2_TYPE(type, type, mask); \
+ SHUFFLE2_TYPE(type, type##2, mask); \
+ SHUFFLE2_TYPE(type, type##3, mask); \
+ SHUFFLE2_TYPE(type, type##4, mask); \
+ SHUFFLE2_TYPE(type, type##8, mask); \
+ SHUFFLE2_TYPE(type, type##16, mask);
+SHUFFLE2(char, uchar);
+SHUFFLE2(uchar, uchar);
+SHUFFLE2(short, ushort);
+SHUFFLE2(ushort, ushort);
+SHUFFLE2(int, uint);
+SHUFFLE2(uint, uint);
+SHUFFLE2(long, ulong);
+SHUFFLE2(ulong, ulong);
+SHUFFLE2(float, uint);
+SHUFFLE2(double, ulong);
+
+
+//////////////////////////
+// Relational Functions //
+//////////////////////////
+
+#define BUILTIN_ANYALL(name, type) \
+ int __OVERLOAD__ name(type); \
+ int __OVERLOAD__ name(type##2); \
+ int __OVERLOAD__ name(type##3); \
+ int __OVERLOAD__ name(type##4); \
+ int __OVERLOAD__ name(type##8); \
+ int __OVERLOAD__ name(type##16);
+#define REL_1ARG(name) \
+ BUILTIN_1ARG(int, float, name); \
+ BUILTIN_1ARG(long, double, name);
+#define REL_2ARG(name) \
+ BUILTIN_2ARG(int, float, float, name); \
+ BUILTIN_2ARG(long, double, double, name);
+BUILTIN_ANYALL(all, char);
+BUILTIN_ANYALL(all, short);
+BUILTIN_ANYALL(all, int);
+BUILTIN_ANYALL(all, long);
+BUILTIN_ANYALL(any, char);
+BUILTIN_ANYALL(any, short);
+BUILTIN_ANYALL(any, int);
+BUILTIN_ANYALL(any, long);
+BUILTIN_3ARG_FLOATS(bitselect);
+BUILTIN_3ARG_INTEGERS(bitselect);
+REL_2ARG(isequal);
+REL_2ARG(isnotequal);
+REL_2ARG(isgreater);
+REL_2ARG(isgreaterequal);
+REL_2ARG(isless);
+REL_2ARG(islessequal);
+REL_2ARG(islessgreater);
+REL_1ARG(isfinite);
+REL_1ARG(isinf);
+REL_1ARG(isnan);
+REL_1ARG(isnormal);
+REL_2ARG(isordered);
+REL_2ARG(isunordered);
+REL_1ARG(signbit);
+
+#define SELECT_TYPE(type, ctype) \
+ type __OVERLOAD__ select(type, type, ctype); \
+ type __OVERLOAD__ select(type, type, u##ctype);
+#define SELECT(type, ctype) \
+ SELECT_TYPE(type, ctype) \
+ SELECT_TYPE(type##2, ctype##2) \
+ SELECT_TYPE(type##3, ctype##3) \
+ SELECT_TYPE(type##4, ctype##4) \
+ SELECT_TYPE(type##8, ctype##8) \
+ SELECT_TYPE(type##16, ctype##16);
+SELECT(char, char);
+SELECT(uchar, char);
+SELECT(short, short);
+SELECT(ushort, short);
+SELECT(int, int);
+SELECT(uint, int);
+SELECT(long, long);
+SELECT(ulong, long);
+SELECT(float, int);
+SELECT(double, long);
+
+
+///////////////////////////////
+// Synchronization Functions //
+///////////////////////////////
+
+typedef uint cl_mem_fence_flags;
+#define CLK_LOCAL_MEM_FENCE (1<<0)
+#define CLK_GLOBAL_MEM_FENCE (1<<1)
+
+void barrier(cl_mem_fence_flags);
+void mem_fence(cl_mem_fence_flags);
+void read_mem_fence(cl_mem_fence_flags);
+void write_mem_fence(cl_mem_fence_flags);
+
+
+//////////////////////////////////////////
+// Vector Data Load and Store Functions //
+//////////////////////////////////////////
+
+#define VLOAD_ADDRSPACE(type, width) \
+ type##width __OVERLOAD__ vload##width(size_t, const __private type*); \
+ type##width __OVERLOAD__ vload##width(size_t, const __local type*); \
+ type##width __OVERLOAD__ vload##width(size_t, const __global type*); \
+ type##width __OVERLOAD__ vload##width(size_t, const __constant type*);
+
+#define VSTORE_ADDRSPACE(type, width) \
+ void __OVERLOAD__ vstore##width(type##width, size_t, __local type*); \
+ void __OVERLOAD__ vstore##width(type##width, size_t, __global type*); \
+ void __OVERLOAD__ vstore##width(type##width, size_t, __private type*);
+
+#define V_ADDRSPACE(macro, type) \
+ macro(type, 2) \
+ macro(type, 3) \
+ macro(type, 4) \
+ macro(type, 8) \
+ macro(type, 16);
+
+#define VLOADSTORE(type) \
+ V_ADDRSPACE(VLOAD_ADDRSPACE, type); \
+ V_ADDRSPACE(VSTORE_ADDRSPACE, type);
+
+VLOADSTORE(char);
+VLOADSTORE(uchar);
+VLOADSTORE(short);
+VLOADSTORE(ushort);
+VLOADSTORE(int);
+VLOADSTORE(uint);
+VLOADSTORE(long);
+VLOADSTORE(ulong);
+VLOADSTORE(float);
+VLOADSTORE(double);
+
+#define VLOAD_HALF_WIDTH(n) \
+ float##n __OVERLOAD__ vload_half##n(size_t, const __private half*); \
+ float##n __OVERLOAD__ vloada_half##n(size_t, const __private half*); \
+ float##n __OVERLOAD__ vload_half##n(size_t, const __local half*); \
+ float##n __OVERLOAD__ vloada_half##n(size_t, const __local half*); \
+ float##n __OVERLOAD__ vload_half##n(size_t, const __global half*); \
+ float##n __OVERLOAD__ vloada_half##n(size_t, const __global half*); \
+ float##n __OVERLOAD__ vload_half##n(size_t, const __constant half*); \
+ float##n __OVERLOAD__ vloada_half##n(size_t, const __constant half*);
+#define VSTORE_HALF_ADDRSPACE(func, type) \
+ void __OVERLOAD__ func(type, size_t, const __private half*); \
+ void __OVERLOAD__ func(type, size_t, const __local half*); \
+ void __OVERLOAD__ func(type, size_t, const __global half*); \
+ void __OVERLOAD__ func(type, size_t, const __constant half*);
+#define VSTORE_HALF_ROUND(func, type) \
+ VSTORE_HALF_ADDRSPACE(func, type); \
+ VSTORE_HALF_ADDRSPACE(func##_rte, type); \
+ VSTORE_HALF_ADDRSPACE(func##_rtz, type); \
+ VSTORE_HALF_ADDRSPACE(func##_rtp, type); \
+ VSTORE_HALF_ADDRSPACE(func##_rtn, type);
+#define VSTORE_HALF_WIDTH(n) \
+ VSTORE_HALF_ROUND(vstore_half##n, float##n); \
+ VSTORE_HALF_ROUND(vstorea_half##n, float##n);
+#define VLOADSTORE_HALF_WIDTH(n) \
+ VLOAD_HALF_WIDTH(n); \
+ VSTORE_HALF_WIDTH(n);
+VLOADSTORE_HALF_WIDTH();
+VLOADSTORE_HALF_WIDTH(2);
+VLOADSTORE_HALF_WIDTH(3);
+VLOADSTORE_HALF_WIDTH(4);
+VLOADSTORE_HALF_WIDTH(8);
+VLOADSTORE_HALF_WIDTH(16);
+
+
+/////////////////////////
+// Work-Item Functions //
+/////////////////////////
+
+size_t get_global_id(uint dim);
+size_t get_global_size(uint dim);
+size_t get_global_offset(uint dim);
+size_t get_group_id(uint dim);
+size_t get_local_id(uint dim);
+size_t get_local_size(uint dim);
+size_t get_num_groups(uint dim);
+uint get_work_dim(void);
+
+
+
+/////////////////////
+// Other Functions //
+/////////////////////
+
+int printf(__constant char * restrict, ...);
+
+
+/////////////////
+// Conversions //
+/////////////////
+
+#define as_char( _x ) __builtin_astype( _x, char )
+#define as_char2( _x ) __builtin_astype( _x, char2 )
+#define as_char3( _x ) __builtin_astype( _x, char3 )
+#define as_char4( _x ) __builtin_astype( _x, char4 )
+#define as_char8( _x ) __builtin_astype( _x, char8 )
+#define as_char16( _x ) __builtin_astype( _x, char16 )
+#define as_uchar( _x ) __builtin_astype( _x, uchar )
+#define as_uchar2( _x ) __builtin_astype( _x, uchar2 )
+#define as_uchar3( _x ) __builtin_astype( _x, uchar3 )
+#define as_uchar4( _x ) __builtin_astype( _x, uchar4 )
+#define as_uchar8( _x ) __builtin_astype( _x, uchar8 )
+#define as_uchar16( _x ) __builtin_astype( _x, uchar16 )
+#define as_short( _x ) __builtin_astype( _x, short )
+#define as_short2( _x ) __builtin_astype( _x, short2 )
+#define as_short3( _x ) __builtin_astype( _x, short3 )
+#define as_short4( _x ) __builtin_astype( _x, short4 )
+#define as_short8( _x ) __builtin_astype( _x, short8 )
+#define as_short16( _x ) __builtin_astype( _x, short16 )
+#define as_ushort( _x ) __builtin_astype( _x, ushort )
+#define as_ushort2( _x ) __builtin_astype( _x, ushort2 )
+#define as_ushort3( _x ) __builtin_astype( _x, ushort3 )
+#define as_ushort4( _x ) __builtin_astype( _x, ushort4 )
+#define as_ushort8( _x ) __builtin_astype( _x, ushort8 )
+#define as_ushort16( _x ) __builtin_astype( _x, ushort16 )
+#define as_int( _x ) __builtin_astype( _x, int )
+#define as_int2( _x ) __builtin_astype( _x, int2 )
+#define as_int3( _x ) __builtin_astype( _x, int3 )
+#define as_int4( _x ) __builtin_astype( _x, int4 )
+#define as_int8( _x ) __builtin_astype( _x, int8 )
+#define as_int16( _x ) __builtin_astype( _x, int16 )
+#define as_uint( _x ) __builtin_astype( _x, uint )
+#define as_uint2( _x ) __builtin_astype( _x, uint2 )
+#define as_uint3( _x ) __builtin_astype( _x, uint3 )
+#define as_uint4( _x ) __builtin_astype( _x, uint4 )
+#define as_uint8( _x ) __builtin_astype( _x, uint8 )
+#define as_uint16( _x ) __builtin_astype( _x, uint16 )
+#define as_long( _x ) __builtin_astype( _x, long )
+#define as_long2( _x ) __builtin_astype( _x, long2 )
+#define as_long3( _x ) __builtin_astype( _x, long3 )
+#define as_long4( _x ) __builtin_astype( _x, long4 )
+#define as_long8( _x ) __builtin_astype( _x, long8 )
+#define as_long16( _x ) __builtin_astype( _x, long16 )
+#define as_ulong( _x ) __builtin_astype( _x, ulong )
+#define as_ulong2( _x ) __builtin_astype( _x, ulong2 )
+#define as_ulong3( _x ) __builtin_astype( _x, ulong3 )
+#define as_ulong4( _x ) __builtin_astype( _x, ulong4 )
+#define as_ulong8( _x ) __builtin_astype( _x, ulong8 )
+#define as_ulong16( _x ) __builtin_astype( _x, ulong16 )
+#define as_float( _x ) __builtin_astype( _x, float )
+#define as_float2( _x ) __builtin_astype( _x, float2 )
+#define as_float3( _x ) __builtin_astype( _x, float3 )
+#define as_float4( _x ) __builtin_astype( _x, float4 )
+#define as_float8( _x ) __builtin_astype( _x, float8 )
+#define as_float16( _x ) __builtin_astype( _x, float16 )
+#define as_double( _x ) __builtin_astype( _x, double )
+#define as_double2( _x ) __builtin_astype( _x, double2 )
+#define as_double3( _x ) __builtin_astype( _x, double3 )
+#define as_double4( _x ) __builtin_astype( _x, double4 )
+#define as_double8( _x ) __builtin_astype( _x, double8 )
+#define as_double16( _x ) __builtin_astype( _x, double16 )
+#define as_size_t( _x ) __builtin_astype( _x, size_t )
+#define as_ptrdiff_t( _x ) __builtin_astype( _x, ptrdiff_t )
+#define as_uintptr_t( _x ) __builtin_astype( _x, uintptr_t )
+#define as_intptr_t( _x ) __builtin_astype( _x, intptr_t )
+
+#define CONVERT_TYPE_SIZE(out, in) \
+ out __OVERLOAD__ convert_##out(in); \
+ out __OVERLOAD__ convert_##out##_rte(in); \
+ out __OVERLOAD__ convert_##out##_rtz(in); \
+ out __OVERLOAD__ convert_##out##_rtp(in); \
+ out __OVERLOAD__ convert_##out##_rtn(in); \
+ out __OVERLOAD__ convert_##out##_sat(in); \
+ out __OVERLOAD__ convert_##out##_sat_rte(in); \
+ out __OVERLOAD__ convert_##out##_sat_rtz(in); \
+ out __OVERLOAD__ convert_##out##_sat_rtp(in); \
+ out __OVERLOAD__ convert_##out##_sat_rtn(in);
+#define CONVERT_TYPE(out, in) \
+ CONVERT_TYPE_SIZE(out, in); \
+ CONVERT_TYPE_SIZE(out##2, in##2); \
+ CONVERT_TYPE_SIZE(out##3, in##3); \
+ CONVERT_TYPE_SIZE(out##4, in##4); \
+ CONVERT_TYPE_SIZE(out##8, in##8); \
+ CONVERT_TYPE_SIZE(out##16, in##16);
+#define CONVERT(out) \
+ CONVERT_TYPE(out, char); \
+ CONVERT_TYPE(out, uchar); \
+ CONVERT_TYPE(out, short); \
+ CONVERT_TYPE(out, ushort); \
+ CONVERT_TYPE(out, int); \
+ CONVERT_TYPE(out, uint); \
+ CONVERT_TYPE(out, long); \
+ CONVERT_TYPE(out, ulong); \
+ CONVERT_TYPE(out, float); \
+ CONVERT_TYPE(out, double);
+
+CONVERT(char);
+CONVERT(uchar);
+CONVERT(short);
+CONVERT(ushort);
+CONVERT(int);
+CONVERT(uint);
+CONVERT(long);
+CONVERT(ulong);
+CONVERT(float);
+CONVERT(double);
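
As a rough illustration (hypothetical kernel, not part of the upstream sources), the declarations above are what allow an ordinary OpenCL C kernel to resolve its builtin calls when interpreted by Oclgrind; a minimal sketch using a few of them (get_global_id, vload4, mad, vstore4) might look like:

    // Hypothetical example kernel; exercises builtins declared in clc.h.
    __kernel void saxpy4(__global const float *x,
                         __global const float *y,
                         __global float *out,
                         float a)
    {
      size_t i = get_global_id(0);               // Work-Item Functions
      float4 xv = vload4(i, x);                  // Vector Data Load and Store
      float4 yv = vload4(i, y);
      vstore4(mad((float4)(a), xv, yv), i, out); // mad via BUILTIN_3ARG_FLOATS
    }
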
diff --git a/src/core/common.cpp b/src/core/common.cpp
new file mode 100644
index 0000000..3f849fa
--- /dev/null
+++ b/src/core/common.cpp
@@ -0,0 +1,712 @@
+// common.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/raw_os_ostream.h"
+
+using namespace oclgrind;
+using namespace std;
+
+namespace oclgrind
+{
+ _Size3_::_Size3_()
+ {
+ x = y = z = 0;
+ }
+
+ _Size3_::_Size3_(size_t _x, size_t _y, size_t _z)
+ {
+ x = _x;
+ y = _y;
+ z = _z;
+ }
+
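+ // Decompose a linear index into 3D coordinates (x varies fastest)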
+ _Size3_::_Size3_(size_t linear, _Size3_ dimensions)
+ {
+ x = linear % dimensions.x;
+ y = (linear / dimensions.x) % dimensions.y;
+ z = (linear / (dimensions.x * dimensions.y));
+ }
+
+ size_t& Size3::operator[](unsigned i)
+ {
+ switch (i)
+ {
+ case 0:
+ return x;
+ case 1:
+ return y;
+ case 2:
+ return z;
+ default:
+ assert(false && "Size3 index out of range");
+ }
+ }
+
+ const size_t& Size3::operator[](unsigned i) const
+ {
+ switch (i)
+ {
+ case 0:
+ return x;
+ case 1:
+ return y;
+ case 2:
+ return z;
+ default:
+ assert(false && "Size3 index out of range");
+ }
+ }
+
+ bool Size3::operator==(const Size3& rhs) const
+ {
+ return x == rhs.x && y == rhs.y && z == rhs.z;
+ }
+
+ ostream& operator<<(ostream& stream, const Size3& size)
+ {
+ stream << dec << "("
+ << size.x << ","
+ << size.y << ","
+ << size.z << ")";
+ return stream;
+ }
+
+ double TypedValue::getFloat(unsigned index) const
+ {
+ switch (size)
+ {
+ case 4:
+ return ((float*)data)[index];
+ case 8:
+ return ((double*)data)[index];
+ default:
+ FATAL_ERROR("Unsupported float size: %u bytes", size);
+ }
+ }
+
+ size_t TypedValue::getPointer(unsigned index) const
+ {
+ if (size != sizeof(size_t))
+ {
+ FATAL_ERROR("Unsupported pointer size: %u bytes", size);
+ }
+ return ((size_t*)data)[index];
+ }
+
+ int64_t TypedValue::getSInt(unsigned index) const
+ {
+ switch (size)
+ {
+ case 1:
+ return ((int8_t*)data)[index];
+ case 2:
+ return ((int16_t*)data)[index];
+ case 4:
+ return ((int32_t*)data)[index];
+ case 8:
+ return ((int64_t*)data)[index];
+ default:
+ FATAL_ERROR("Unsupported signed int size: %u bytes", size);
+ }
+ }
+
+ uint64_t TypedValue::getUInt(unsigned index) const
+ {
+ switch (size)
+ {
+ case 1:
+ return ((uint8_t*)data)[index];
+ case 2:
+ return ((uint16_t*)data)[index];
+ case 4:
+ return ((uint32_t*)data)[index];
+ case 8:
+ return ((uint64_t*)data)[index];
+ default:
+ FATAL_ERROR("Unsupported unsigned int size: %u bytes", size);
+ }
+ }
+
+ void TypedValue::setFloat(double value, unsigned index)
+ {
+ switch (size)
+ {
+ case 4:
+ ((float*)data)[index] = value;
+ break;
+ case 8:
+ ((double*)data)[index] = value;
+ break;
+ default:
+ FATAL_ERROR("Unsupported float size: %u bytes", size);
+ }
+ }
+
+ void TypedValue::setPointer(size_t value, unsigned index)
+ {
+ if (size != sizeof(size_t))
+ {
+ FATAL_ERROR("Unsupported pointer size: %u bytes", size);
+ }
+ ((size_t*)data)[index] = value;
+ }
+
+ void TypedValue::setSInt(int64_t value, unsigned index)
+ {
+ switch (size)
+ {
+ case 1:
+ ((int8_t*)data)[index] = value;
+ break;
+ case 2:
+ ((int16_t*)data)[index] = value;
+ break;
+ case 4:
+ ((int32_t*)data)[index] = value;
+ break;
+ case 8:
+ ((int64_t*)data)[index] = value;
+ break;
+ default:
+ FATAL_ERROR("Unsupported signed int size: %u bytes", size);
+ }
+ }
+
+ void TypedValue::setUInt(uint64_t value, unsigned index)
+ {
+ switch (size)
+ {
+ case 1:
+ ((uint8_t*)data)[index] = value;
+ break;
+ case 2:
+ ((uint16_t*)data)[index] = value;
+ break;
+ case 4:
+ ((uint32_t*)data)[index] = value;
+ break;
+ case 8:
+ ((uint64_t*)data)[index] = value;
+ break;
+ default:
+ FATAL_ERROR("Unsupported unsigned int size: %u bytes", size);
+ }
+ }
+
+ TypedValue TypedValue::clone() const
+ {
+ TypedValue result;
+ result.size = size;
+ result.num = num;
+ result.data = new unsigned char[size*num];
+ memcpy(result.data, data, size*num);
+ return result;
+ }
+
+ bool checkEnv(const char *var)
+ {
+ const char *value = getenv(var);
+ return (value && !strcmp(value, "1"));
+ }
+
+ void dumpInstruction(ostream& out, const llvm::Instruction *instruction)
+ {
+ llvm::raw_os_ostream stream(out);
+ instruction->print(stream);
+ }
+
+ const char* getAddressSpaceName(unsigned addrSpace)
+ {
+ switch (addrSpace)
+ {
+ case AddrSpacePrivate:
+ return "private";
+ case AddrSpaceGlobal:
+ return "global";
+ case AddrSpaceConstant:
+ return "constant";
+ case AddrSpaceLocal:
+ return "local";
+ default:
+ return "(unknown)";
+ }
+ }
+
+ void getConstantData(unsigned char *data, const llvm::Constant *constant)
+ {
+ if (constant->getValueID() == llvm::Value::UndefValueVal)
+ {
+ return;
+ }
+
+ const llvm::Type *type = constant->getType();
+ unsigned size = getTypeSize(type);
+ switch (type->getTypeID())
+ {
+ case llvm::Type::IntegerTyID:
+ memcpy(data,
+ ((llvm::ConstantInt*)constant)->getValue().getRawData(),
+ size);
+ break;
+ case llvm::Type::FloatTyID:
+ {
+ *(float*)data =
+ ((llvm::ConstantFP*)constant)->getValueAPF().convertToFloat();
+ break;
+ }
+ case llvm::Type::DoubleTyID:
+ {
+ *(double*)data =
+ ((llvm::ConstantFP*)constant)->getValueAPF().convertToDouble();
+ break;
+ }
+ case llvm::Type::VectorTyID:
+ {
+ unsigned num = type->getVectorNumElements();
+ const llvm::Type *elemType = type->getVectorElementType();
+ unsigned elemSize = getTypeSize(elemType);
+ for (unsigned i = 0; i < num; i++)
+ {
+ getConstantData(data + i*elemSize, constant->getAggregateElement(i));
+ }
+ break;
+ }
+ case llvm::Type::ArrayTyID:
+ {
+ unsigned num = type->getArrayNumElements();
+ const llvm::Type *elemType = type->getArrayElementType();
+ unsigned elemSize = getTypeSize(elemType);
+ for (unsigned i = 0; i < num; i++)
+ {
+ getConstantData(data + i*elemSize, constant->getAggregateElement(i));
+ }
+ break;
+ }
+ case llvm::Type::PointerTyID:
+ {
+ if (constant->getValueID() != llvm::Value::ConstantPointerNullVal)
+ {
+ FATAL_ERROR("Unsupported constant pointer value: %d",
+ constant->getValueID());
+ }
+ *(size_t*)data = 0;
+ break;
+ }
+ case llvm::Type::StructTyID:
+ {
+ unsigned num = type->getStructNumElements();
+ for (unsigned i = 0; i < num; i++)
+ {
+ unsigned offset =
+ getStructMemberOffset((const llvm::StructType*)type, i);
+ getConstantData(data + offset, constant->getAggregateElement(i));
+ }
+ break;
+ }
+ default:
+ FATAL_ERROR("Unsupported constant type: %d", type->getTypeID());
+ }
+ }
+
+ const llvm::Instruction* getConstExprAsInstruction(
+ const llvm::ConstantExpr *expr)
+ {
+ // Get operands
+ unsigned numOperands = expr->getNumOperands();
+ llvm::Value **valueOperands = new llvm::Value*[numOperands];
+ for (unsigned i = 0; i < numOperands; i++)
+ {
+ valueOperands[i] = expr->getOperand(i);
+ }
+ llvm::ArrayRef<llvm::Value*> operands(valueOperands, numOperands);
+
+ // Create instruction
+ unsigned opcode = expr->getOpcode();
+ switch (opcode)
+ {
+ case llvm::Instruction::Trunc:
+ case llvm::Instruction::ZExt:
+ case llvm::Instruction::SExt:
+ case llvm::Instruction::FPTrunc:
+ case llvm::Instruction::FPExt:
+ case llvm::Instruction::UIToFP:
+ case llvm::Instruction::SIToFP:
+ case llvm::Instruction::FPToUI:
+ case llvm::Instruction::FPToSI:
+ case llvm::Instruction::PtrToInt:
+ case llvm::Instruction::IntToPtr:
+ case llvm::Instruction::BitCast:
+ return llvm::CastInst::Create((llvm::Instruction::CastOps)opcode,
+ operands[0], expr->getType());
+ case llvm::Instruction::Select:
+ return llvm::SelectInst::Create(operands[0], operands[1], operands[2]);
+ case llvm::Instruction::InsertElement:
+ return llvm::InsertElementInst::Create(operands[0], operands[1],
+ operands[2]);
+ case llvm::Instruction::ExtractElement:
+ return llvm::ExtractElementInst::Create(operands[0], operands[1]);
+ case llvm::Instruction::InsertValue:
+ return llvm::InsertValueInst::Create(operands[0], operands[1],
+ expr->getIndices());
+ case llvm::Instruction::ExtractValue:
+ return llvm::ExtractValueInst::Create(operands[0], expr->getIndices());
+ case llvm::Instruction::ShuffleVector:
+ return new llvm::ShuffleVectorInst(operands[0], operands[1],
+ operands[2]);
+ case llvm::Instruction::GetElementPtr:
+ if (((const llvm::GEPOperator*)expr)->isInBounds())
+ {
+ return llvm::GetElementPtrInst::CreateInBounds(operands[0],
+ operands.slice(1));
+ }
+ else
+ {
+#if LLVM_VERSION > 36
+ return llvm::GetElementPtrInst::Create(expr->getType(),
+ operands[0], operands.slice(1));
+#else
+ return llvm::GetElementPtrInst::Create(operands[0], operands.slice(1));
+#endif
+
+ }
+ case llvm::Instruction::ICmp:
+ case llvm::Instruction::FCmp:
+ return llvm::CmpInst::Create((llvm::Instruction::OtherOps)opcode,
+ expr->getPredicate(),
+ operands[0], operands[1]);
+ default:
+ assert(expr->getNumOperands() == 2 && "Must be binary operator?");
+
+ llvm::BinaryOperator *binaryOp =
+ llvm::BinaryOperator::Create((llvm::Instruction::BinaryOps)opcode,
+ operands[0], operands[1]);
+
+ // Check for overflowing operator
+ if (opcode == llvm::Instruction::Add ||
+ opcode == llvm::Instruction::Mul ||
+ opcode == llvm::Instruction::Shl ||
+ opcode == llvm::Instruction::Sub)
+ {
+ binaryOp->setHasNoUnsignedWrap(
+ expr->getRawSubclassOptionalData() &
+ llvm::OverflowingBinaryOperator::NoUnsignedWrap);
+ binaryOp->setHasNoSignedWrap(
+ expr->getRawSubclassOptionalData() &
+ llvm::OverflowingBinaryOperator::NoSignedWrap);
+ }
+
+ // Check for possibly exact operator
+ if (opcode == llvm::Instruction::AShr ||
+ opcode == llvm::Instruction::LShr ||
+ opcode == llvm::Instruction::SDiv ||
+ opcode == llvm::Instruction::UDiv)
+ {
+ binaryOp->setIsExact(expr->getRawSubclassOptionalData() &
+ llvm::PossiblyExactOperator::IsExact);
+ }
+
+ return binaryOp;
+ }
+ }
+
+ const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op)
+ {
+ llvm::Metadata *md = op.get();
+ llvm::ConstantAsMetadata *cam =
+ llvm::dyn_cast<llvm::ConstantAsMetadata>(md);
+ if (!cam)
+ return NULL;
+ return llvm::dyn_cast<llvm::ConstantInt>(cam->getValue());
+ }
+
+ unsigned getStructMemberOffset(const llvm::StructType *type, unsigned index)
+ {
+ bool packed = ((llvm::StructType*)type)->isPacked();
+
+ unsigned offset = 0;
+ for (unsigned i = 0; i <= index; i++)
+ {
+ // Get member size and alignment
+ const llvm::Type *elemType = type->getStructElementType(i);
+ unsigned size = getTypeSize(elemType);
+ unsigned align = getTypeAlignment(elemType);
+
+ // Add padding if necessary
+ if (!packed && offset % align)
+ {
+ offset += (align - (offset%align));
+ }
+
+ if (i == index)
+ {
+ return offset;
+ }
+ offset += size;
+ }
+
+ // Unreachable
+ assert(false);
+ }
+
+ unsigned getTypeSize(const llvm::Type *type)
+ {
+ if (type->isArrayTy())
+ {
+ unsigned num = type->getArrayNumElements();
+ unsigned sz = getTypeSize(type->getArrayElementType());
+ return num*sz;
+ }
+ else if (type->isStructTy())
+ {
+ bool packed = ((llvm::StructType*)type)->isPacked();
+
+ unsigned size = 0;
+ unsigned alignment = 1;
+ for (unsigned i = 0; i < type->getStructNumElements(); i++)
+ {
+ // Get member size and alignment
+ const llvm::Type *elemType = type->getStructElementType(i);
+ unsigned sz = getTypeSize(elemType);
+ unsigned align = getTypeAlignment(elemType);
+
+ // Add padding if necessary
+ if (!packed && size % align)
+ {
+ size += (align - (size%align));
+ }
+
+ size += sz;
+
+ alignment = max(alignment, align);
+ }
+
+ // Alignment of struct should match member with largest alignment
+ if (!packed && size % alignment)
+ {
+ size += (alignment - (size%alignment));
+ }
+
+ return size;
+ }
+ else if (type->isVectorTy())
+ {
+ unsigned num = type->getVectorNumElements();
+ unsigned sz = getTypeSize(type->getVectorElementType());
+ if (num == 3) num = 4; // Hack for 3-element vectors
+ return num*sz;
+ }
+ else if (type->isPointerTy())
+ {
+ return sizeof(size_t);
+ }
+ else
+ {
+ // For some reason, getScalarSizeInBits is not const
+ llvm::Type* nonConstTy = const_cast<llvm::Type*>(type);
+
+ // Round up for types that have a bit size not multiple of 8
+ // like "bool".
+ unsigned ret = nonConstTy->getScalarSizeInBits() / 8;
+ if (nonConstTy->getScalarSizeInBits() % 8)
+ ret++;
+
+ return ret;
+ }
+ }
+
+ /// Returns the byte alignment of this type
+ unsigned getTypeAlignment(const llvm::Type* type)
+ {
+ using namespace llvm;
+ // Array types are aligned to their element type
+ if (const ArrayType* psAT = dyn_cast<ArrayType>(type))
+ {
+ return getTypeAlignment(psAT->getElementType());
+ }
+
+ // Struct alignment is the size of its largest contained type
+ if (const StructType* structT = dyn_cast<StructType>(type))
+ {
+ if (structT->isPacked())
+ return 1;
+ StructType* nonConstTy = const_cast<StructType*>(structT);
+ unsigned uAlign = 0, uMaxAlign = 1;
+ unsigned uCount = structT->getNumElements();
+ for (unsigned i = 0; i < uCount; i++)
+ {
+ const Type* psElemType = nonConstTy->getTypeAtIndex(i);
+ uAlign = getTypeAlignment(psElemType);
+
+ if (uAlign > uMaxAlign)
+ uMaxAlign = uAlign;
+ }
+
+ return uMaxAlign;
+ }
+
+ return getTypeSize(type);
+ }
+
+ pair<unsigned,unsigned> getValueSize(const llvm::Value *value)
+ {
+ unsigned bits, numElements;
+ const llvm::Type *type = value->getType();
+
+ if (type->isVectorTy())
+ {
+ bits = type->getVectorElementType()->getPrimitiveSizeInBits();
+ numElements = type->getVectorNumElements();
+ }
+ else if (type->isAggregateType())
+ {
+ bits = getTypeSize(type)<<3;
+ numElements = 1;
+ }
+ else
+ {
+ bits = type->getPrimitiveSizeInBits();
+ numElements = 1;
+ }
+
+ unsigned elemSize = bits >> 3;
+
+ // Special case for pointer types
+ if (type->isPointerTy())
+ {
+ elemSize = sizeof(size_t);
+ }
+
+ // Special case for boolean results
+ if (bits == 1)
+ {
+ elemSize = sizeof(bool);
+ }
+
+ return pair<unsigned,unsigned>(elemSize,numElements);
+ }
+
+ bool isConstantOperand(const llvm::Value *operand)
+ {
+ unsigned id = operand->getValueID();
+ return (id >= llvm::Value::ConstantFirstVal &&
+ id <= llvm::Value::ConstantLastVal);
+ }
+
+ bool isVector3(const llvm::Value *value)
+ {
+ return (value->getType()->isVectorTy() &&
+ value->getType()->getVectorNumElements() == 3);
+ }
+
+ double now()
+ {
+#if defined(_WIN32) && !defined(__MINGW32__)
+ return time(NULL)*1e9;
+#else
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return tv.tv_usec*1e3 + tv.tv_sec*1e9;
+#endif
+ }
+
+ void printTypedData(const llvm::Type *type, const unsigned char *data)
+ {
+ // TODO: Interpret other types (array, struct)
+ unsigned size = getTypeSize(type);
+ switch (type->getTypeID())
+ {
+ case llvm::Type::FloatTyID:
+ cout << *(float*)data;
+ break;
+ case llvm::Type::DoubleTyID:
+ cout << *(double*)data;
+ break;
+ case llvm::Type::IntegerTyID:
+ switch (size)
+ {
+ case 1:
+ cout << (int)*(char*)data;
+ break;
+ case 2:
+ cout << *(short*)data;
+ break;
+ case 4:
+ cout << *(int*)data;
+ break;
+ case 8:
+ cout << *(long*)data;
+ break;
+ default:
+ cout << "(invalid integer size)";
+ break;
+ }
+ break;
+ case llvm::Type::VectorTyID:
+ {
+ const llvm::Type *elemType = type->getVectorElementType();
+ cout << "(";
+ for (unsigned i = 0; i < type->getVectorNumElements(); i++)
+ {
+ if (i > 0)
+ {
+ cout << ",";
+ }
+ printTypedData(elemType, data+i*getTypeSize(elemType));
+ }
+ cout << ")";
+ break;
+ }
+ case llvm::Type::PointerTyID:
+ cout << "0x" << hex << *(size_t*)data;
+ break;
+ default:
+ cout << "(raw) 0x" << hex << uppercase << setfill('0');
+ for (unsigned i = 0; i < size; i++)
+ {
+ cout << setw(2) << (int)data[i];
+ }
+ }
+ }
+
+ FatalError::FatalError(const string& msg, const string& file, size_t line)
+ : std::runtime_error(msg)
+ {
+ m_file = file;
+ m_line = line;
+ }
+
+ FatalError::~FatalError() throw()
+ {
+ }
+
+ const string& FatalError::getFile() const
+ {
+ return m_file;
+ }
+
+ size_t FatalError::getLine() const
+ {
+ return m_line;
+ }
+
+ const char* FatalError::what() const throw()
+ {
+ return runtime_error::what();
+ }
+}
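
For readers following the alignment logic in getTypeSize and getStructMemberOffset above, here is a small worked example (editor's sketch, assuming the usual 4-byte int and 2-byte short) of how a non-packed struct is laid out under those rules:

    struct example
    {
      char  a;  /* size 1, align 1 -> offset 0                       */
      int   b;  /* size 4, align 4 -> padded from 1 to 4, offset 4   */
      short c;  /* size 2, align 2 -> offset 8                       */
    };
    /* Members end at byte 10; the struct alignment is the largest
       member alignment (4), so the total size is padded up to 12.   */
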
diff --git a/src/core/common.h b/src/core/common.h
new file mode 100644
index 0000000..d908ffa
--- /dev/null
+++ b/src/core/common.h
@@ -0,0 +1,203 @@
+// common.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#ifndef __common_h_
+#define __common_h_
+
+#include "config.h"
+#include "CL/cl.h"
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <map>
+#include <memory>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <stack>
+#include <stdexcept>
+#include <stdint.h>
+#include <unordered_map>
+#include <vector>
+
+#define BIG_SEPARATOR "================================"
+#define SMALL_SEPARATOR "--------------------------------"
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#define snprintf _snprintf
+#undef ERROR
+#endif
+
+namespace llvm
+{
+ class Constant;
+ class ConstantExpr;
+ class ConstantInt;
+ class Instruction;
+ class MDOperand;
+ class StructType;
+ class Type;
+ class Value;
+}
+
+namespace oclgrind
+{
+ class Kernel;
+
+ // Enumeration for address spaces
+ enum AddressSpace
+ {
+ AddrSpacePrivate = 0,
+ AddrSpaceGlobal = 1,
+ AddrSpaceConstant = 2,
+ AddrSpaceLocal = 3,
+ };
+
+ enum AtomicOp
+ {
+ AtomicAdd,
+ AtomicAnd,
+ AtomicCmpXchg,
+ AtomicDec,
+ AtomicInc,
+ AtomicMax,
+ AtomicMin,
+ AtomicOr,
+ AtomicSub,
+ AtomicXchg,
+ AtomicXor,
+ };
+
+ // Enumeration for different log message types
+ enum MessageType
+ {
+ DEBUG,
+ INFO,
+ WARNING,
+ ERROR,
+ };
+
+ // 3-dimensional size
+ typedef struct _Size3_
+ {
+ size_t x, y, z;
+ _Size3_();
+ _Size3_(size_t x, size_t y, size_t z);
+ _Size3_(size_t linear, _Size3_ dimensions);
+ size_t& operator[](unsigned i);
+ const size_t& operator[](unsigned i) const;
+ bool operator==(const _Size3_& rhs) const;
+ friend std::ostream& operator<<(std::ostream& stream, const _Size3_& sz);
+ } Size3;
+
+ // Structure for a value with a size/type
+ struct _TypedValue_
+ {
+ unsigned size;
+ unsigned num;
+ unsigned char *data;
+
+ struct _TypedValue_ clone() const;
+
+ double getFloat(unsigned index = 0) const;
+ size_t getPointer(unsigned index = 0) const;
+ int64_t getSInt(unsigned index = 0) const;
+ uint64_t getUInt(unsigned index = 0) const;
+ void setFloat(double value, unsigned index = 0);
+ void setPointer(size_t value, unsigned index = 0);
+ void setSInt(int64_t value, unsigned index = 0);
+ void setUInt(uint64_t value, unsigned index = 0);
+
+ };
+ typedef _TypedValue_ TypedValue;
+
+ // Private memory map type
+ typedef std::map<const llvm::Value*,TypedValue> TypedValueMap;
+
+ // Image object
+ typedef struct
+ {
+ size_t address;
+ cl_image_format format;
+ cl_image_desc desc;
+ } Image;
+
+ // Check if an environment variable is set to 1
+ bool checkEnv(const char *var);
+
+ // Output an instruction in human-readable format
+ void dumpInstruction(std::ostream& out, const llvm::Instruction *instruction);
+
+ // Get the human-readable name of an address space
+ const char* getAddressSpaceName(unsigned addrSpace);
+
+ // Retrieve the raw data for a constant
+ void getConstantData(unsigned char *data, const llvm::Constant *constant);
+
+ // Creates an instruction from a constant expression
+ const llvm::Instruction* getConstExprAsInstruction(
+ const llvm::ConstantExpr *expr);
+
+ // Get the ConstantInt object for an MDOperand
+ const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op);
+
+ // Get the byte offset of a struct member
+ unsigned getStructMemberOffset(const llvm::StructType *type, unsigned index);
+
+ // Returns the size of a type
+ unsigned getTypeSize(const llvm::Type *type);
+
+ /// Returns the alignment requirements of this type
+ unsigned getTypeAlignment(const llvm::Type* type);
+
+ // Returns the size of a value
+ std::pair<unsigned,unsigned> getValueSize(const llvm::Value *value);
+
+ // Returns true if the operand is a constant value
+ bool isConstantOperand(const llvm::Value *operand);
+
+ // Returns true if the value is a 3-element vector
+ bool isVector3(const llvm::Value *value);
+
+ // Return the current time in nanoseconds since the epoch
+ double now();
+
+ // Print data in a human readable format (according to its type)
+ void printTypedData(const llvm::Type *type, const unsigned char *data);
+
+ // Exception class for raising fatal errors
+ class FatalError : std::runtime_error
+ {
+ public:
+ FatalError(const std::string& msg, const std::string& file, size_t line);
+ ~FatalError() throw();
+ virtual const std::string& getFile() const;
+ virtual size_t getLine() const;
+ virtual const char* what() const throw();
+ protected:
+ std::string m_file;
+ size_t m_line;
+ };
+
+ // Utility macro for raising an exception with a sprintf-based message
+ #define FATAL_ERROR(format, ...) \
+ { \
+ int sz = snprintf(NULL, 0, format, ##__VA_ARGS__); \
+ char *str = new char[sz+1]; \
+ sprintf(str, format, ##__VA_ARGS__); \
+ string msg = str; \
+ delete[] str; \
+ throw FatalError(msg, __FILE__, __LINE__); \
+ }
+}
+
+#endif // __common_h_
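For illustration, a hypothetical call site for the FATAL_ERROR macro above might look like the sketch below; the variable names and message are invented for this example and do not appear in the source:

    // Inside code using the oclgrind namespace: the macro formats the
    // message with snprintf-style arguments, then throws a FatalError
    // carrying __FILE__ and __LINE__.
    if (requestedSize % elementSize != 0)
    {
      FATAL_ERROR("Size %u is not a multiple of element size %u",
                  requestedSize, elementSize);
    }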
diff --git a/src/core/gen_clc_h.cmake b/src/core/gen_clc_h.cmake
new file mode 100644
index 0000000..43b4fa5
--- /dev/null
+++ b/src/core/gen_clc_h.cmake
@@ -0,0 +1,11 @@
+set(OUTPUT src/core/clc_h.cpp)
+
+file(WRITE ${OUTPUT} "extern const char CLC_H_DATA[] = \n\"")
+
+file(READ ${SOURCE_FILE} CLC_H)
+string(REGEX REPLACE "\\\\" "\\\\\\\\" CLC_H "${CLC_H}")
+string(REGEX REPLACE "\"" "\\\\\"" CLC_H "${CLC_H}")
+string(REGEX REPLACE "\n" "\\\\n\"\n\"" CLC_H "${CLC_H}")
+file(APPEND ${OUTPUT} "${CLC_H}")
+
+file(APPEND ${OUTPUT} "\";")
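This script reads the header named by the SOURCE_FILE variable and writes src/core/clc_h.cpp (relative to the working directory) containing the header text as a C string. A plausible standalone invocation, with illustrative paths, would be:

    cmake -DSOURCE_FILE=src/core/clc.h -P src/core/gen_clc_h.cmake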
diff --git a/src/core/gen_clc_h.sh b/src/core/gen_clc_h.sh
new file mode 100755
index 0000000..e9ce2b1
--- /dev/null
+++ b/src/core/gen_clc_h.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+if [ $# -ne 2 ]
+then
+ echo "Usage: gen_clc_h.sh INPUT OUTPUT"
+ exit 1
+fi
+
+IN=$1
+OUT=$2
+
+echo "extern const char CLC_H_DATA[] =" >$OUT
+sed -e 's/\\/\\\\/g;s/"/\\"/g;s/^/"/;s/$/\\n"/' $IN >>$OUT
+if [ $? -ne 0 ]
+then
+ exit 1
+fi
+echo ";" >>$OUT
diff --git a/src/core/half.h b/src/core/half.h
new file mode 100644
index 0000000..58afcf1
--- /dev/null
+++ b/src/core/half.h
@@ -0,0 +1,160 @@
+// half.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "common.h"
+
+static float halfToFloat(uint16_t half)
+{
+ uint16_t h_sign, h_exponent, h_mantissa;
+ uint32_t f_sign, f_exponent, f_mantissa;
+
+ h_sign = half & 0x8000; // 1000 0000 0000 0000
+ h_exponent = half & 0x7C00; // 0111 1100 0000 0000
+ h_mantissa = half & 0x03FF; // 0000 0011 1111 1111
+
+ f_sign = ((uint32_t)h_sign) << 16;
+
+ if (h_exponent == 0)
+ {
+ if (h_mantissa == 0)
+ {
+ // Zero
+ f_exponent = 0;
+ f_mantissa = 0;
+ }
+ else
+ {
+ // Denorm - convert to normalized float
+ int e = -1;
+ do
+ {
+ e++;
+ h_mantissa <<= 1;
+ }
+ while((h_mantissa & 0x0400) == 0);
+
+ f_exponent = (-15 + 127 - e) << 23;
+ f_mantissa = ((uint32_t)(h_mantissa & 0x03FF)) << 13;
+ }
+ }
+ else if (h_exponent == 0x7C00)
+ {
+ // Inf or NaN
+ f_exponent = 0xFF << 23;
+ f_mantissa = h_mantissa;
+ }
+ else
+ {
+ // Normalized
+ f_exponent = (((int32_t)(h_exponent >> 10)) - 15 + 127) << 23;
+ f_mantissa = ((uint32_t)h_mantissa) << 13;
+ }
+
+ uint32_t result = f_sign | f_exponent | f_mantissa;
+ return *(float*)&result;
+}
+
+enum HalfRoundMode
+{
+ // Towards negative infinity
+ Half_RTN,
+ // Towards zero
+ Half_RTZ,
+ // Towards positive infinity
+ Half_RTP,
+ // Towards nearest even
+ Half_RTE
+};
+
+static uint16_t floatToHalf(float sp, HalfRoundMode round = Half_RTZ)
+{
+ uint16_t h_sign, h_exponent, h_mantissa;
+ uint32_t f_sign, f_exponent, f_mantissa;
+
+ union
+ {
+ float f;
+ uint32_t ui;
+ } FtoUI;
+ FtoUI.f = sp;
+ uint32_t f = FtoUI.ui;
+ f_sign = f & 0x80000000; // 1000 0000 0000 0000 0000 0000 0000 0000
+ f_exponent = f & 0x7F800000; // 0111 1111 1000 0000 0000 0000 0000 0000
+ f_mantissa = f & 0x007FFFFF; // 0000 0000 0111 1111 1111 1111 1111 1111
+
+ h_sign = f_sign >> 16;
+
+ if (f_exponent == 0)
+ {
+ // Zero
+ h_exponent = 0;
+ h_mantissa = 0;
+ }
+ else if (f_exponent == 0x7F800000)
+ {
+ // Inf or NaN
+ h_exponent = 0x7C00;
+ h_mantissa = f_mantissa;
+ }
+ else
+ {
+ int e = (((int32_t)(f_exponent >> 23)) - 127 + 15);
+ if (e >= 0x1F)
+ {
+ // Value will overflow
+ h_exponent = 0x7C00;
+ h_mantissa = 0;
+ }
+ else if (e <= 0)
+ {
+ // Value will underflow
+ h_exponent = 0;
+ if (14 - e > 24)
+ {
+ // Too small - flush to zero
+ h_mantissa = 0;
+ }
+ else
+ {
+ // Convert to denorm
+ f_mantissa |= 0x800000;
+ h_mantissa = (f_mantissa >> (14-e));
+ if ((f_mantissa >> (13 - e)) & 0x1)
+ {
+ h_mantissa += 0x1;
+ }
+ }
+ }
+ else
+ {
+ // Normalized
+ h_exponent = e << 10;
+ h_mantissa = f_mantissa >> 13;
+ // The current f_mantissa is done in RTZ
+ if (round == Half_RTE && (f & 0x00001000) != 0)
+ {
+ if ((f & 0x00002FFF) != 0)
+ h_mantissa += 1;
+ }
+ else if (round == Half_RTP)
+ {
+ FtoUI.ui &= 0xFFFFE000;
+ if (FtoUI.f < sp)
+ h_mantissa += 1;
+ }
+ else if (round == Half_RTN)
+ {
+ FtoUI.ui &= 0xFFFFE000;
+ if (sp < FtoUI.f)
+ h_mantissa += 1;
+ }
+ }
+ }
+
+ return h_sign + h_exponent + h_mantissa;
+}
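A minimal sketch exercising the two conversions above, assuming the Oclgrind source tree is on the include path; the test value is arbitrary:

    #include <cstdio>
    #include "half.h"

    int main()
    {
      // Round-trip a value through the 16-bit half representation.
      // 0.1f is not exactly representable, so expect a small error.
      uint16_t h = floatToHalf(0.1f, Half_RTE);
      float back = halfToFloat(h);
      printf("0.1f -> 0x%04x -> %f\n", (unsigned)h, back);
      return 0;
    }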
diff --git a/src/install/INSTALL.darwin b/src/install/INSTALL.darwin
new file mode 100644
index 0000000..b3292d5
--- /dev/null
+++ b/src/install/INSTALL.darwin
@@ -0,0 +1,17 @@
+To install Oclgrind, simply copy the bin, lib and include directories
+to (for example) /usr/local/, ensuring that file modification times
+are preserved. The easiest way to do this is with the following
+command:
+
+ sudo cp -rp {bin,lib,include} /usr/local
+
+Alternatively, Oclgrind can be used from a non-system directory. To do
+so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
+$OCLGRIND_ROOT/lib to your DYLD_LIBRARY_PATH environment variable
+(where $OCLGRIND_ROOT is the directory containing this file). If
+copying Oclgrind to a new location, pass the -p flag to cp so that
+file modification times are preserved.
+
+Information about using Oclgrind can be found on the GitHub wiki page:
+
+ http://github.com/jrprice/Oclgrind/wiki
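For the non-system set-up described above, the environment might be configured as follows (a sketch only; the install location is illustrative):

    export OCLGRIND_ROOT=$HOME/oclgrind   # wherever Oclgrind was copied
    export PATH="$OCLGRIND_ROOT/bin:$PATH"
    export DYLD_LIBRARY_PATH="$OCLGRIND_ROOT/lib:$DYLD_LIBRARY_PATH"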
diff --git a/src/install/INSTALL.linux b/src/install/INSTALL.linux
new file mode 100644
index 0000000..cf81cf9
--- /dev/null
+++ b/src/install/INSTALL.linux
@@ -0,0 +1,20 @@
+To install Oclgrind, simply copy the bin, lib and include directories
+to (for example) /usr/local/, ensuring that file modification times
+are preserved. The easiest way to do this is with the following
+command:
+
+ sudo cp -rp {bin,lib,include} /usr/local
+
+Alternatively, Oclgrind can be used from a non-system directory. To do
+so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
+$OCLGRIND_ROOT/lib to your LD_LIBRARY_PATH environment variable (where
+$OCLGRIND_ROOT is the directory containing this file). If copying
+Oclgrind to a new location, pass the -p flag to cp so that file
+modification times are preserved.
+
+To use Oclgrind with the OpenCL ICD loader (optional), copy
+oclgrind.icd to /etc/OpenCL/vendors/.
+
+Information about using Oclgrind can be found on the GitHub wiki page:
+
+ http://github.com/jrprice/Oclgrind/wiki
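A corresponding sketch for Linux, including the optional ICD registration; the install location is illustrative and oclgrind.icd is assumed to be in the current directory:

    export OCLGRIND_ROOT=$HOME/oclgrind   # wherever Oclgrind was copied
    export PATH="$OCLGRIND_ROOT/bin:$PATH"
    export LD_LIBRARY_PATH="$OCLGRIND_ROOT/lib:$LD_LIBRARY_PATH"

    # Optional: register Oclgrind with the OpenCL ICD loader
    sudo cp oclgrind.icd /etc/OpenCL/vendors/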
diff --git a/src/install/INSTALL.windows b/src/install/INSTALL.windows
new file mode 100644
index 0000000..2b02636
--- /dev/null
+++ b/src/install/INSTALL.windows
@@ -0,0 +1,8 @@
+To install Oclgrind, run 'install.bat' as an Administrator. This will
+install Oclgrind to 'C:\Program Files\Oclgrind' and create a registry
+entry for the OpenCL ICD loader. Oclgrind can be uninstalled by
+running 'uninstall.bat' as an Administrator.
+
+Alternatively, Oclgrind can be run from any other directory. You will
+need to manually create OpenCL ICD loading points by editing the
+registry (see oclgrind-icd.reg).
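For example, from an elevated Command Prompt in the directory containing the unpacked files (the path is illustrative):

    cd C:\oclgrind
    install.bat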
diff --git a/src/install/install.bat b/src/install/install.bat
new file mode 100644
index 0000000..cea2457
--- /dev/null
+++ b/src/install/install.bat
@@ -0,0 +1,23 @@
+@ECHO OFF
+
+cd %~dp0
+
+set "ROOT=%programfiles%\Oclgrind"
+
+mkdir "%ROOT%" || goto :error
+
+xcopy include "%ROOT%\include" /S /Y /I || goto :error
+xcopy x86 "%ROOT%\x86" /S /Y /I || goto :error
+xcopy x64 "%ROOT%\x64" /S /Y /I || goto :error
+xcopy uninstall.bat "%ROOT%\" /Y || goto :error
+
+regedit /S oclgrind-icd.reg || goto :error
+
+goto :EOF
+
+
+:error
+echo INSTALLATION FAILED
+echo Did you run as Administrator?
+pause
+
diff --git a/src/install/oclgrind-icd.reg b/src/install/oclgrind-icd.reg
new file mode 100644
index 0000000..89af7a6
Binary files /dev/null and b/src/install/oclgrind-icd.reg differ
diff --git a/src/install/uninstall.bat b/src/install/uninstall.bat
new file mode 100644
index 0000000..660e8d3
--- /dev/null
+++ b/src/install/uninstall.bat
@@ -0,0 +1 @@
+start /B "" cmd /C rmdir "%programfiles%\Oclgrind" /S /Q
diff --git a/src/kernel/Simulation.cpp b/src/kernel/Simulation.cpp
new file mode 100644
index 0000000..208ed77
--- /dev/null
+++ b/src/kernel/Simulation.cpp
@@ -0,0 +1,764 @@
+// Simulation.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "config.h"
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "kernel/Simulation.h"
+
+using namespace oclgrind;
+using namespace std;
+
+#define PARSING(parsing) m_parsing = parsing;
+
+// Convert an integer to char/uchar, checking if the value is valid
+#define INT_TO_CHAR(intval, result) \
+ result = intval; \
+ if (result != intval) \
+ { \
+ throw "Invalid char value"; \
+ }
+
+// Utility to read a typed value from a stream
+template<typename T> T readValue(istream& stream);
+
+Simulation::Simulation()
+{
+ m_context = new Context();
+ m_kernel = NULL;
+ m_program = NULL;
+}
+
+Simulation::~Simulation()
+{
+ delete m_kernel;
+ delete m_program;
+ delete m_context;
+}
+
+template<typename T>
+void Simulation::dumpArgument(DumpArg& arg)
+{
+ size_t num = arg.size / sizeof(T);
+ T *data = new T[num];
+ m_context->getGlobalMemory()->load((uint8_t*)data, arg.address, arg.size);
+
+ for (size_t i = 0; i < num; i++)
+ {
+ cout << " " << arg.name << "[" << i << "] = ";
+ if (sizeof(T) == 1)
+ cout << (int)data[i];
+ else
+ cout << data[i];
+ cout << endl;
+ }
+ cout << endl;
+
+ delete[] data;
+}
+
+template<typename T>
+void Simulation::get(T& result)
+{
+ do
+ {
+ // Check if line buffer has content
+ streampos pos = m_lineBuffer.tellg();
+ string token;
+ m_lineBuffer >> token;
+ if (!m_lineBuffer.fail())
+ {
+ // Line has content, rewind line buffer
+ m_lineBuffer.clear();
+ m_lineBuffer.seekg(pos);
+
+ // Read value from line buffer
+ m_lineBuffer >> result;
+ if (m_lineBuffer.fail())
+ {
+ throw ifstream::failbit;
+ }
+
+ return;
+ }
+
+ // Get next line
+ string line;
+ getline(m_simfile, line);
+ m_lineNumber++;
+
+ // Remove comments
+ size_t comment = line.find_first_of('#');
+ if (comment != string::npos)
+ {
+ line = line.substr(0, comment);
+ }
+
+ // Update line buffer
+ m_lineBuffer.clear();
+ m_lineBuffer.str(line);
+ }
+ while (m_simfile.good());
+
+ // Couldn't read data from file, throw exception
+ throw m_simfile.eof() ? ifstream::eofbit : ifstream::failbit;
+}
+
+bool Simulation::load(const char *filename)
+{
+ // Open simulator file
+ m_lineNumber = 0;
+ m_lineBuffer.setstate(ios_base::eofbit);
+ m_simfile.open(filename);
+ if (m_simfile.fail())
+ {
+ cerr << "Unable to open simulator file." << endl;
+ return false;
+ }
+
+ try
+ {
+ // Read simulation parameters
+ string progFileName;
+ string kernelName;
+ PARSING("program file");
+ get(progFileName);
+ PARSING("kernel");
+ get(kernelName);
+ PARSING("NDRange");
+ get(m_ndrange.x);
+ get(m_ndrange.y);
+ get(m_ndrange.z);
+ PARSING("work-group size");
+ get(m_wgsize.x);
+ get(m_wgsize.y);
+ get(m_wgsize.z);
+
+ // Ensure work-group size exactly divides NDRange
+ if (m_ndrange.x % m_wgsize.x ||
+ m_ndrange.y % m_wgsize.y ||
+ m_ndrange.z % m_wgsize.z)
+ {
+ cerr << "Work-group size must divide NDRange exactly." << endl;
+ return false;
+ }
+
+ // Open program file
+ ifstream progFile;
+ progFile.open(progFileName.c_str(), ios_base::in | ios_base::binary);
+ if (!progFile.good())
+ {
+ cerr << "Unable to open " << progFileName << endl;
+ return false;
+ }
+
+ // Check for LLVM bitcode magic numbers
+ char magic[2] = {0,0};
+ progFile.read(magic, 2);
+ if (magic[0] == 0x42 && magic[1] == 0x43)
+ {
+ // Load bitcode
+ progFile.close();
+ m_program = Program::createFromBitcodeFile(m_context, progFileName);
+ if (!m_program)
+ {
+ cerr << "Failed to load bitcode from " << progFileName << endl;
+ return false;
+ }
+ }
+ else
+ {
+ // Get size of file
+ progFile.seekg(0, ios_base::end);
+ size_t sz = progFile.tellg();
+ progFile.seekg(0, ios_base::beg);
+
+ // Load source
+ char *data = new char[sz + 1];
+ progFile.read(data, sz+1);
+ progFile.close();
+ data[sz] = '\0';
+ m_program = new Program(m_context, data);
+ delete[] data;
+
+ // Build program
+ if (!m_program->build(""))
+ {
+ cerr << "Build failure:" << endl << m_program->getBuildLog() << endl;
+ return false;
+ }
+ }
+
+ // Get kernel
+ m_kernel = m_program->createKernel(kernelName);
+ if (!m_kernel)
+ {
+ cerr << "Failed to create kernel " << kernelName << endl;
+ return false;
+ }
+
+ // Clear global memory
+ Memory *globalMemory = m_context->getGlobalMemory();
+ globalMemory->clear();
+
+ // Parse kernel arguments
+ m_dumpArguments.clear();
+ for (unsigned index = 0; index < m_kernel->getNumArguments(); index++)
+ {
+ parseArgument(index);
+ }
+
+ // Make sure there is no more input
+ string next;
+ m_simfile >> next;
+ if (m_simfile.good() || !m_simfile.eof())
+ {
+ cerr << "Unexpected token '" << next << "' (expected EOF)" << endl;
+ return false;
+ }
+ }
+ catch (const char *err)
+ {
+ cerr << "Line " << m_lineNumber << ": " << err
+ << " (" << m_parsing << ")" << endl;
+ return false;
+ }
+ catch (ifstream::iostate e)
+ {
+ if (e == ifstream::eofbit)
+ {
+ cerr << "Unexpected EOF when parsing " << m_parsing << endl;
+ return false;
+ }
+ else if (e == ifstream::failbit)
+ {
+ cerr << "Line " << m_lineNumber
+ << ": Failed to parse " << m_parsing << endl;
+ return false;
+ }
+ else
+ {
+ throw e;
+ }
+ }
+
+ return true;
+}
+
+void Simulation::parseArgument(size_t index)
+{
+ // Argument parsing parameters
+ size_t size = -1;
+ cl_mem_flags flags = 0;
+ ArgDataType type = TYPE_NONE;
+ size_t typeSize = 0;
+ bool null = false;
+ bool dump = false;
+ string fill = "";
+ string range = "";
+ string name = m_kernel->getArgumentName(index).str();
+
+ // Set meaningful parsing status for error messages
+ ostringstream stringstream;
+ stringstream << "argument " << index << ": " << name;
+ string formatted = stringstream.str();
+ PARSING(formatted.c_str());
+
+ // Get argument info
+ size_t argSize = m_kernel->getArgumentSize(index);
+ unsigned int addrSpace = m_kernel->getArgumentAddressQualifier(index);
+ const llvm::StringRef argType = m_kernel->getArgumentTypeName(index);
+
+ // Ensure we have an argument header
+ char c;
+ get(c);
+ if (c != '<')
+ {
+ throw "Expected argument header <...>";
+ }
+
+ // Get header
+ streampos startpos = m_lineBuffer.tellg();
+ string headerStr;
+ getline(m_lineBuffer, headerStr);
+ size_t end = headerStr.find_last_of('>');
+ if (end == string::npos)
+ {
+ throw "Missing '>' at end of argument header";
+ }
+ headerStr = headerStr.substr(0, end);
+
+ // Move line buffer to end of header
+ m_lineBuffer.clear();
+ m_lineBuffer.seekg((int)startpos + headerStr.size() + 1);
+
+ // Save format flags
+ ios_base::fmtflags previousFormat = m_lineBuffer.flags();
+
+ // Parse header
+ istringstream header(headerStr);
+ while (!header.eof())
+ {
+ // Get next header token
+ string token;
+ header >> token;
+ if (header.fail())
+ {
+ break;
+ }
+
+#define MATCH_TYPE(str, value, sz) \
+ else if (token == str) \
+ { \
+ if (type != TYPE_NONE) \
+ { \
+ throw "Argument type defined multiple times"; \
+ } \
+ type = value; \
+ typeSize = sz; \
+ }
+
+ // Parse token
+ if (false);
+ MATCH_TYPE("char", TYPE_CHAR, 1)
+ MATCH_TYPE("uchar", TYPE_UCHAR, 1)
+ MATCH_TYPE("short", TYPE_SHORT, 2)
+ MATCH_TYPE("ushort", TYPE_USHORT, 2)
+ MATCH_TYPE("int", TYPE_INT, 4)
+ MATCH_TYPE("uint", TYPE_UINT, 4)
+ MATCH_TYPE("long", TYPE_LONG, 8)
+ MATCH_TYPE("ulong", TYPE_ULONG, 8)
+ MATCH_TYPE("float", TYPE_FLOAT, 4)
+ MATCH_TYPE("double", TYPE_DOUBLE, 8)
+ else if (token.compare(0, 4, "dump") == 0)
+ {
+ dump = true;
+ }
+ else if (token.compare(0, 4, "fill") == 0)
+ {
+ if (token.size() < 6 || token[4] != '=')
+ {
+ throw "Expected =VALUE after 'fill'";
+ }
+ fill = token.substr(5);
+ }
+ else if (token == "hex")
+ {
+ m_lineBuffer.setf(ios_base::hex);
+ m_lineBuffer.unsetf(ios_base::dec | ios_base::oct);
+ }
+ else if (token == "null")
+ {
+ if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
+ addrSpace != CL_KERNEL_ARG_ADDRESS_CONSTANT)
+ {
+ throw "'null' only valid for buffer arguments";
+ }
+ null = true;
+ }
+ else if (token.compare(0, 5, "range") == 0)
+ {
+ if (token.size() < 7 || token[5] != '=')
+ {
+ throw "Expected =START:INC:END after 'range'";
+ }
+ range = token.substr(6);
+ }
+ else if (token == "ro")
+ {
+ if (flags & CL_MEM_WRITE_ONLY)
+ {
+ throw "'ro' and 'wo' are mutually exclusive";
+ }
+ if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL)
+ {
+ throw "'ro' only valid for global memory buffers";
+ }
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else if (token.compare(0, 4, "size") == 0)
+ {
+ istringstream value(token.substr(4));
+ char equals = 0;
+ value >> equals;
+ if (equals != '=')
+ {
+ throw "Expected = after 'size'";
+ }
+
+ value >> dec >> size;
+ if (value.fail() || !value.eof())
+ {
+ throw "Invalid value for 'size'";
+ }
+ }
+ else if (token == "wo")
+ {
+ if (flags & CL_MEM_READ_ONLY)
+ {
+ throw "'ro' and 'wo' are mutually exclusive";
+ }
+ if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL)
+ {
+ throw "'wo' only valid for global memory buffers";
+ }
+ flags |= CL_MEM_WRITE_ONLY;
+ }
+ else
+ {
+ string err = "Unrecognised header token '";
+ err += token;
+ err += "'";
+ throw err.c_str();
+ }
+ }
+
+ // Ensure size given
+ if (null)
+ {
+ if (size != -1 || !fill.empty() || !range.empty())
+ {
+ throw "'null' not valid with other argument descriptors";
+ }
+ size = 0;
+ }
+ else if (size == -1)
+ {
+ throw "size required";
+ }
+
+ if (type == TYPE_NONE)
+ {
+#define MATCH_TYPE_PREFIX(str, value, sz) \
+ else if (argType.startswith(str)) \
+ { \
+ type = value; \
+ typeSize = sz; \
+ }
+
+ // Set default type using kernel introspection
+ if (false);
+ MATCH_TYPE_PREFIX("char", TYPE_CHAR, 1)
+ MATCH_TYPE_PREFIX("uchar", TYPE_UCHAR, 1)
+ MATCH_TYPE_PREFIX("short", TYPE_SHORT, 2)
+ MATCH_TYPE_PREFIX("ushort", TYPE_USHORT, 2)
+ MATCH_TYPE_PREFIX("int", TYPE_INT, 4)
+ MATCH_TYPE_PREFIX("uint", TYPE_UINT, 4)
+ MATCH_TYPE_PREFIX("long", TYPE_LONG, 8)
+ MATCH_TYPE_PREFIX("ulong", TYPE_ULONG, 8)
+ MATCH_TYPE_PREFIX("float", TYPE_FLOAT, 4)
+ MATCH_TYPE_PREFIX("double", TYPE_DOUBLE, 8)
+ MATCH_TYPE_PREFIX("void*", TYPE_UCHAR, 1)
+ else
+ {
+ throw "Invalid default kernel argument type";
+ }
+ }
+
+ // Ensure argument data size is a multiple of format type size
+ if (size % typeSize)
+ {
+ throw "Initialiser type must exactly divide argument size";
+ }
+
+ // Ensure 'dump' only used with non-null buffers
+ if (dump)
+ {
+ if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
+ addrSpace != CL_KERNEL_ARG_ADDRESS_CONSTANT)
+ {
+ throw "'dump' only valid for memory objects";
+ }
+ if (null)
+ {
+ throw "'dump' not valid with 'null' specifier";
+ }
+ }
+
+ // Generate argument data
+ TypedValue value;
+ value.size = argSize;
+ value.num = 1;
+ if (addrSpace == CL_KERNEL_ARG_ADDRESS_LOCAL)
+ {
+ value.size = size;
+ value.data = NULL;
+ }
+ else if (null)
+ {
+ value.data = new unsigned char[value.size];
+ memset(value.data, 0, value.size);
+ }
+ else
+ {
+ // Parse argument data
+ unsigned char *data = new unsigned char[size];
+ if (!fill.empty())
+ {
+ istringstream fillStream(fill);
+ fillStream.copyfmt(m_lineBuffer);
+
+ #define FILL_TYPE(type, T) \
+ case type: \
+ parseFill<T>(data, size, fillStream); \
+ break;
+
+ switch (type)
+ {
+ FILL_TYPE(TYPE_CHAR, int8_t);
+ FILL_TYPE(TYPE_UCHAR, uint8_t);
+ FILL_TYPE(TYPE_SHORT, int16_t);
+ FILL_TYPE(TYPE_USHORT, uint16_t);
+ FILL_TYPE(TYPE_INT, int32_t);
+ FILL_TYPE(TYPE_UINT, uint32_t);
+ FILL_TYPE(TYPE_LONG, int64_t);
+ FILL_TYPE(TYPE_ULONG, uint64_t);
+ FILL_TYPE(TYPE_FLOAT, float);
+ FILL_TYPE(TYPE_DOUBLE, double);
+ default:
+ throw "Invalid argument data type";
+ }
+ }
+ else if (!range.empty())
+ {
+ istringstream rangeStream(range);
+ rangeStream.copyfmt(m_lineBuffer);
+
+ #define RANGE_TYPE(type, T) \
+ case type: \
+ parseRange<T>(data, size, rangeStream); \
+ break;
+
+ switch (type)
+ {
+ RANGE_TYPE(TYPE_CHAR, int8_t);
+ RANGE_TYPE(TYPE_UCHAR, uint8_t);
+ RANGE_TYPE(TYPE_SHORT, int16_t);
+ RANGE_TYPE(TYPE_USHORT, uint16_t);
+ RANGE_TYPE(TYPE_INT, int32_t);
+ RANGE_TYPE(TYPE_UINT, uint32_t);
+ RANGE_TYPE(TYPE_LONG, int64_t);
+ RANGE_TYPE(TYPE_ULONG, uint64_t);
+ RANGE_TYPE(TYPE_FLOAT, float);
+ RANGE_TYPE(TYPE_DOUBLE, double);
+ default:
+ throw "Invalid argument data type";
+ }
+ }
+ else if (addrSpace != CL_KERNEL_ARG_ADDRESS_LOCAL)
+ {
+ #define PARSE_TYPE(type, T) \
+ case type: \
+ parseArgumentData<T>(data, size); \
+ break;
+
+ switch (type)
+ {
+ PARSE_TYPE(TYPE_CHAR, int8_t);
+ PARSE_TYPE(TYPE_UCHAR, uint8_t);
+ PARSE_TYPE(TYPE_SHORT, int16_t);
+ PARSE_TYPE(TYPE_USHORT, uint16_t);
+ PARSE_TYPE(TYPE_INT, int32_t);
+ PARSE_TYPE(TYPE_UINT, uint32_t);
+ PARSE_TYPE(TYPE_LONG, int64_t);
+ PARSE_TYPE(TYPE_ULONG, uint64_t);
+ PARSE_TYPE(TYPE_FLOAT, float);
+ PARSE_TYPE(TYPE_DOUBLE, double);
+ default:
+ throw "Invalid argument data type";
+ }
+ }
+
+ if (addrSpace == CL_KERNEL_ARG_ADDRESS_PRIVATE)
+ {
+ value.data = data;
+ }
+ else
+ {
+ // Allocate buffer and store content
+ Memory *globalMemory = m_context->getGlobalMemory();
+ size_t address = globalMemory->allocateBuffer(size, flags);
+ globalMemory->store((unsigned char*)&data[0], address, size);
+ value.data = new unsigned char[value.size];
+ value.setPointer(address);
+ delete[] data;
+
+ if (dump)
+ {
+ DumpArg dump =
+ {
+ address,
+ size,
+ type,
+ name,
+ };
+ m_dumpArguments.push_back(dump);
+ }
+ }
+ }
+
+ // Set argument value
+ m_kernel->setArgument(index, value);
+ if (value.data)
+ {
+ delete[] value.data;
+ }
+
+ // Reset parsing format
+ m_lineBuffer.flags(previousFormat);
+}
+
+template<typename T>
+void Simulation::parseArgumentData(unsigned char *result, size_t size)
+{
+ vector<T> data;
+ for (int i = 0; i < size / sizeof(T); i++)
+ {
+ T value;
+ if (sizeof(T) == 1)
+ {
+ int intval;
+ get(intval);
+ INT_TO_CHAR(intval, value);
+ }
+ else
+ {
+ get(value);
+ }
+ data.push_back(value);
+ }
+ memcpy(result, &data[0], size);
+}
+
+template<typename T>
+void Simulation::parseFill(unsigned char *result, size_t size,
+ istringstream& fill)
+{
+ T value = readValue<T>(fill);
+ for (int i = 0; i < size/sizeof(T); i++)
+ {
+ ((T*)result)[i] = value;
+ }
+
+ if (fill.fail() || !fill.eof())
+ {
+ throw "Invalid fill value";
+ }
+}
+
+template<typename T>
+void Simulation::parseRange(unsigned char *result, size_t size,
+ istringstream& range)
+{
+ // Parse range format
+ T values[3];
+ for (int i = 0; i < 3; i++)
+ {
+ values[i] = readValue<T>(range);
+ if (i < 2)
+ {
+ char colon = 0;
+ range >> colon;
+ if (range.fail() || colon != ':')
+ {
+ throw "Invalid range format";
+ }
+ }
+ }
+ if (range.fail() || !range.eof())
+ {
+ throw "Invalid range format";
+ }
+
+ // Ensure range is valid
+ double num = (values[2] - values[0] + values[1]) / (double)values[1];
+ if (ceil(num) != num || num*sizeof(T) != size)
+ {
+ throw "Range doesn't produce correct buffer size";
+ }
+
+ // Produce range values
+ T value = values[0];
+ for (size_t i = 0; i < num; i++)
+ {
+ ((T*)result)[i] = value;
+ value += values[1];
+ }
+}
+
+void Simulation::run(bool dumpGlobalMemory)
+{
+ assert(m_kernel && m_program);
+ assert(m_kernel->allArgumentsSet());
+
+ Size3 offset(0, 0, 0);
+ KernelInvocation::run(m_context, m_kernel, 3, offset, m_ndrange, m_wgsize);
+
+ // Dump individual arguments
+ cout << dec;
+ list<DumpArg>::iterator itr;
+ for (itr = m_dumpArguments.begin(); itr != m_dumpArguments.end(); itr++)
+ {
+ cout << endl
+ << "Argument '" << itr->name << "': "
+ << itr->size << " bytes" << endl;
+
+#define DUMP_TYPE(type, T) \
+ case type: \
+ dumpArgument<T>(*itr); \
+ break;
+
+ switch (itr->type)
+ {
+ DUMP_TYPE(TYPE_CHAR, char);
+ DUMP_TYPE(TYPE_UCHAR, uint8_t);
+ DUMP_TYPE(TYPE_SHORT, int16_t);
+ DUMP_TYPE(TYPE_USHORT, uint16_t);
+ DUMP_TYPE(TYPE_INT, int32_t);
+ DUMP_TYPE(TYPE_UINT, uint32_t);
+ DUMP_TYPE(TYPE_LONG, int64_t);
+ DUMP_TYPE(TYPE_ULONG, uint64_t);
+ DUMP_TYPE(TYPE_FLOAT, float);
+ DUMP_TYPE(TYPE_DOUBLE, double);
+ default:
+ throw "Invalid argument data type";
+ }
+ }
+
+ // Dump global memory if required
+ if (dumpGlobalMemory)
+ {
+ cout << endl << "Global Memory:" << endl;
+ m_context->getGlobalMemory()->dump();
+ }
+}
+
+template<typename T>
+T readValue(istream& stream)
+{
+ T value;
+ if (sizeof(T) == 1)
+ {
+ int intval;
+ stream >> intval;
+ INT_TO_CHAR(intval, value);
+ }
+ else
+ {
+ stream >> value;
+ }
+ return value;
+}
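Based on the parsing code above, a minimal simulation file might look like the sketch below: the program file, kernel name, NDRange and work-group size come first, followed by one <...> header (plus any inline data) per kernel argument. The kernel, file name and values here are hypothetical.

    # vecadd.sim - assumes a kernel:
    #   kernel void vecadd(global float *a, global float *b, global float *c)
    vecadd.cl                 # program file (OpenCL C source or LLVM bitcode)
    vecadd                    # kernel name

    16 1 1                    # NDRange
    4 1 1                     # work-group size

    <size=64 float ro>        # a: 16 input values follow
    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    <size=64 float fill=1>    # b: every element set to 1.0
    <size=64 float fill=0 wo dump>  # c: output, dumped after execution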
diff --git a/src/kernel/Simulation.h b/src/kernel/Simulation.h
new file mode 100644
index 0000000..19b6e9b
--- /dev/null
+++ b/src/kernel/Simulation.h
@@ -0,0 +1,82 @@
+// Simulation.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include <fstream>
+#include <list>
+#include <sstream>
+#include <string>
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+ class Program;
+};
+
+class Simulation
+{
+ enum ArgDataType
+ {
+ TYPE_NONE,
+ TYPE_CHAR,
+ TYPE_UCHAR,
+ TYPE_SHORT,
+ TYPE_USHORT,
+ TYPE_INT,
+ TYPE_UINT,
+ TYPE_LONG,
+ TYPE_ULONG,
+ TYPE_FLOAT,
+ TYPE_DOUBLE,
+ };
+
+ public:
+ Simulation();
+ virtual ~Simulation();
+
+ bool load(const char *filename);
+ void run(bool dumpGlobalMemory=false);
+
+ private:
+ oclgrind::Context *m_context;
+ oclgrind::Kernel *m_kernel;
+ oclgrind::Program *m_program;
+
+ oclgrind::Size3 m_ndrange;
+ oclgrind::Size3 m_wgsize;
+
+ std::ifstream m_simfile;
+ std::string m_parsing;
+ size_t m_lineNumber;
+ std::istringstream m_lineBuffer;
+
+ typedef struct
+ {
+ size_t address;
+ size_t size;
+ ArgDataType type;
+ std::string name;
+ } DumpArg;
+ std::list<DumpArg> m_dumpArguments;
+
+ template<typename T>
+ void dumpArgument(DumpArg& arg);
+ template<typename T>
+ void get(T& result);
+ void parseArgument(size_t index);
+ template<typename T>
+ void parseArgumentData(unsigned char *result, size_t size);
+ template<typename T>
+ void parseFill(unsigned char *result, size_t size,
+ std::istringstream& fill);
+ template<typename T>
+ void parseRange(unsigned char *result, size_t size,
+ std::istringstream& range);
+};
diff --git a/src/kernel/oclgrind-kernel.cpp b/src/kernel/oclgrind-kernel.cpp
new file mode 100644
index 0000000..4ac1d31
--- /dev/null
+++ b/src/kernel/oclgrind-kernel.cpp
@@ -0,0 +1,233 @@
+// main.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "config.h"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+
+#include "kernel/Simulation.h"
+
+using namespace oclgrind;
+using namespace std;
+
+static bool outputGlobalMemory = false;
+static const char *simfile = NULL;
+
+static bool parseArguments(int argc, char *argv[]);
+static void printUsage();
+static void setEnvironment(const char *name, const char *value);
+
+int main(int argc, char *argv[])
+{
+ // Parse arguments
+ if (!parseArguments(argc, argv))
+ {
+ return 1;
+ }
+
+ // Initialise simulation
+ Simulation simulation;
+ if (!simulation.load(simfile))
+ {
+ return 1;
+ }
+
+ // Run simulation
+ simulation.run(outputGlobalMemory);
+}
+
+static bool parseArguments(int argc, char *argv[])
+{
+ for (int i = 1; i < argc; i++)
+ {
+ if (!strcmp(argv[i], "--build-options"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --build-options" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_BUILD_OPTIONS", argv[i]);
+ }
+ else if (!strcmp(argv[i], "--data-races"))
+ {
+ setEnvironment("OCLGRIND_DATA_RACES", "1");
+ }
+ else if (!strcmp(argv[i], "--disable-pch"))
+ {
+ setEnvironment("OCLGRIND_DISABLE_PCH", "1");
+ }
+ else if (!strcmp(argv[i], "--dump-spir"))
+ {
+ setEnvironment("OCLGRIND_DUMP_SPIR", "1");
+ }
+ else if (!strcmp(argv[i], "-g") || !strcmp(argv[i], "--global-mem"))
+ {
+ outputGlobalMemory = true;
+ }
+ else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
+ {
+ printUsage();
+ exit(0);
+ }
+ else if (!strcmp(argv[i], "--inst-counts"))
+ {
+ setEnvironment("OCLGRIND_INST_COUNTS", "1");
+ }
+ else if (!strcmp(argv[i], "-i") || !strcmp(argv[i], "--interactive"))
+ {
+ setEnvironment("OCLGRIND_INTERACTIVE", "1");
+ }
+ else if (!strcmp(argv[i], "--log"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --log" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_LOG", argv[i]);
+ }
+ else if (!strcmp(argv[i], "--max-errors"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --max-errors" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_MAX_ERRORS", argv[i]);
+ }
+ else if (!strcmp(argv[i], "--num-threads"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --num-threads" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_NUM_THREADS", argv[i]);
+ }
+ else if (!strcmp(argv[i], "--pch-dir"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --pch-dir" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_PCH_DIR", argv[i]);
+ }
+ else if (!strcmp(argv[i], "--plugins"))
+ {
+ if (++i >= argc)
+ {
+ cerr << "Missing argument to --plugins" << endl;
+ return false;
+ }
+ setEnvironment("OCLGRIND_PLUGINS", argv[i]);
+ }
+ else if (!strcmp(argv[i], "-q") || !strcmp(argv[i], "--quick"))
+ {
+ setEnvironment("OCLGRIND_QUICK", "1");
+ }
+ else if (!strcmp(argv[i], "--uniform-writes"))
+ {
+ setEnvironment("OCLGRIND_UNIFORM_WRITES", "1");
+ }
+ else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version"))
+ {
+ cout << endl;
+ cout << "Oclgrind " PACKAGE_VERSION << endl;
+ cout << endl;
+ cout << "Copyright (c) 2013-2015" << endl;
+ cout << "James Price and Simon McIntosh-Smith, University of Bristol"
+ << endl;
+ cout << "https://github.com/jrprice/Oclgrind" << endl;
+ cout << endl;
+ exit(0);
+ }
+ else if (argv[i][0] == '-')
+ {
+ cerr << "Unrecognised option '" << argv[i] << "'" << endl;
+ return false;
+ }
+ else
+ {
+ if (simfile == NULL)
+ {
+ simfile = argv[i];
+ }
+ else
+ {
+ cerr << "Unexpected positional argument '" << argv[i] << "'" << endl;
+ return false;
+ }
+ }
+ }
+
+ if (simfile == NULL)
+ {
+ printUsage();
+ return false;
+ }
+
+ return true;
+}
+
+static void printUsage()
+{
+ cout
+ << "Usage: oclgrind-kernel [OPTIONS] simfile" << endl
+ << " oclgrind-kernel [--help | --version]" << endl
+ << endl
+ << "Options:" << endl
+ << " --build-options OPTIONS "
+ "Additional options to pass to the OpenCL compiler" << endl
+ << " --data-races "
+ "Enable data-race detection" << endl
+ << " --disable-pch "
+ "Don't use precompiled headers" << endl
+ << " --dump-spir "
+ "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
+ << " -g --global-mem "
+ "Output global memory at exit" << endl
+ << " -h --help "
+ "Display usage information" << endl
+ << " --inst-counts "
+ "Output histograms of instructions executed" << endl
+ << " -i --interactive "
+ "Enable interactive mode" << endl
+ << " --log LOGFILE "
+ "Redirect log/error messages to a file" << endl
+ << " --max-errors NUM "
+ "Limit the number of error/warning messages" << endl
+ << " --num-threads NUM "
+ "Set the number of worker threads to use" << endl
+ << " --pch-dir DIR "
+ "Override directory containing precompiled headers" << endl
+ << " --plugins PLUGINS "
+ "Load colon-separated list of plugin libraries" << endl
+ << " -q --quick "
+ "Only run first and last work-group" << endl
+ << " --uniform-writes "
+ "Don't suppress uniform write-write data-races" << endl
+ << " -v --version "
+ "Display version information" << endl
+ << endl
+ << "For more information, please visit the Oclgrind wiki page:" << endl
+ << "-> https://github.com/jrprice/Oclgrind/wiki" << endl
+ << endl;
+}
+
+static void setEnvironment(const char *name, const char *value)
+{
+#if defined(_WIN32) && !defined(__MINGW32__)
+ _putenv_s(name, value);
+#else
+ setenv(name, value, 1);
+#endif
+}
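Putting the options above together, typical invocations might look like the following (the simulation file name is hypothetical):

    # Run with data-race detection and dump global memory at exit
    oclgrind-kernel --data-races -g vecadd.sim

    # Step through the kernel interactively, logging messages to a file
    oclgrind-kernel -i --log oclgrind.log vecadd.sim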
diff --git a/src/plugins/InstructionCounter.cpp b/src/plugins/InstructionCounter.cpp
new file mode 100644
index 0000000..ce680f4
--- /dev/null
+++ b/src/plugins/InstructionCounter.cpp
@@ -0,0 +1,184 @@
+// InstructionCounter.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include <sstream>
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+
+#include "InstructionCounter.h"
+
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+
+using namespace oclgrind;
+using namespace std;
+
+#define COUNTED_LOAD_BASE (llvm::Instruction::OtherOpsEnd + 4)
+#define COUNTED_STORE_BASE (COUNTED_LOAD_BASE + 8)
+#define COUNTED_CALL_BASE (COUNTED_STORE_BASE + 8)
+
+static bool compareNamedCount(pair<string,size_t> a, pair<string,size_t> b)
+{
+ return a.second > b.second;
+}
+
+string InstructionCounter::getOpcodeName(unsigned opcode) const
+{
+ if (opcode >= COUNTED_CALL_BASE)
+ {
+ // Get function name
+ unsigned index = opcode - COUNTED_CALL_BASE;
+ assert(index < m_functions.size());
+ return "call " + m_functions[index]->getName().str() + "()";
+ }
+ else if (opcode >= COUNTED_LOAD_BASE)
+ {
+ // Create stream using default locale
+ ostringstream name;
+ locale defaultLocale("");
+ name.imbue(defaultLocale);
+
+ // Get number of bytes
+ size_t bytes = m_memopBytes[opcode-COUNTED_LOAD_BASE];
+
+ // Get name of operation
+ if (opcode >= COUNTED_STORE_BASE)
+ {
+ opcode -= COUNTED_STORE_BASE;
+ name << "store";
+ }
+ else
+ {
+ opcode -= COUNTED_LOAD_BASE;
+ name << "load";
+ }
+
+ // Add address space to name
+ name << " " << getAddressSpaceName(opcode);
+
+ // Add number of bytes to name
+ name << " (" << bytes << " bytes)";
+
+ return name.str();
+ }
+
+ return llvm::Instruction::getOpcodeName(opcode);
+}
+
+void InstructionCounter::instructionExecuted(
+ const WorkItem *workItem, const llvm::Instruction *instruction,
+ const TypedValue& result)
+{
+ unsigned opcode = instruction->getOpcode();
+
+ // Check for loads and stores
+ if (opcode == llvm::Instruction::Load || opcode == llvm::Instruction::Store)
+ {
+ // Track operations in separate address spaces
+ bool load = (opcode == llvm::Instruction::Load);
+ const llvm::Type *type = instruction->getOperand(load?0:1)->getType();
+ unsigned addrSpace = type->getPointerAddressSpace();
+ opcode = (load ? COUNTED_LOAD_BASE : COUNTED_STORE_BASE) + addrSpace;
+
+ // Count total number of bytes loaded/stored
+ unsigned bytes = getTypeSize(type->getPointerElementType());
+ m_memopBytes[opcode-COUNTED_LOAD_BASE] += bytes;
+ }
+ else if (opcode == llvm::Instruction::Call)
+ {
+ // Track distinct function calls
+ const llvm::CallInst *callInst = (const llvm::CallInst*)instruction;
+ const llvm::Function *function = callInst->getCalledFunction();
+ if (function)
+ {
+ vector<const llvm::Function*>::iterator itr =
+ find(m_functions.begin(), m_functions.end(), function);
+ if (itr == m_functions.end())
+ {
+ opcode = COUNTED_CALL_BASE + m_functions.size();
+ m_functions.push_back(function);
+ }
+ else
+ {
+ opcode = COUNTED_CALL_BASE + (itr - m_functions.begin());
+ }
+ }
+ }
+
+ if (opcode >= m_instructionCounts.size())
+ {
+ m_instructionCounts.resize(opcode+1);
+ }
+ m_instructionCounts[opcode]++;
+}
+
+bool InstructionCounter::isThreadSafe() const
+{
+ return false;
+}
+
+void InstructionCounter::kernelBegin(const KernelInvocation *kernelInvocation)
+{
+ m_instructionCounts.clear();
+ m_instructionCounts.resize(COUNTED_CALL_BASE);
+
+ m_memopBytes.clear();
+ m_memopBytes.resize(16);
+
+ m_functions.clear();
+}
+
+void InstructionCounter::kernelEnd(const KernelInvocation *kernelInvocation)
+{
+ // Load default locale
+ locale previousLocale = cout.getloc();
+ locale defaultLocale("");
+ cout.imbue(defaultLocale);
+
+ cout << "Instructions executed for kernel '"
+ << kernelInvocation->getKernel()->getName() << "':";
+ cout << endl;
+
+ // Generate a list of named instructions and their counts
+ vector< pair<string,size_t> > namedCounts;
+ for (unsigned i = 0; i < m_instructionCounts.size(); i++)
+ {
+ if (m_instructionCounts[i] == 0)
+ {
+ continue;
+ }
+
+ string name = getOpcodeName(i);
+ if (name.compare(0, 14, "call llvm.dbg.") == 0)
+ {
+ continue;
+ }
+
+ namedCounts.push_back(make_pair(name, m_instructionCounts[i]));
+ }
+
+ // Sort named counts
+ sort(namedCounts.begin(), namedCounts.end(), compareNamedCount);
+
+ // Output sorted instruction counts
+ for (unsigned i = 0; i < namedCounts.size(); i++)
+ {
+ cout << setw(16) << dec << namedCounts[i].second << " - "
+ << namedCounts[i].first << endl;
+ }
+
+ cout << endl;
+
+ // Restore locale
+ cout.imbue(previousLocale);
+}
diff --git a/src/plugins/InstructionCounter.h b/src/plugins/InstructionCounter.h
new file mode 100644
index 0000000..f12c33a
--- /dev/null
+++ b/src/plugins/InstructionCounter.h
@@ -0,0 +1,38 @@
+// InstructionCounter.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+
+namespace llvm
+{
+ class Function;
+}
+
+namespace oclgrind
+{
+ class InstructionCounter : public Plugin
+ {
+ public:
+ InstructionCounter(const Context *context) : Plugin(context){};
+
+ virtual void instructionExecuted(const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ const TypedValue& result) override;
+ virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+ virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+
+ virtual bool isThreadSafe() const override;
+
+ private:
+ std::vector<size_t> m_instructionCounts;
+ std::vector<size_t> m_memopBytes;
+ std::vector<const llvm::Function*> m_functions;
+
+ std::string getOpcodeName(unsigned opcode) const;
+ };
+}
diff --git a/src/plugins/InteractiveDebugger.cpp b/src/plugins/InteractiveDebugger.cpp
new file mode 100644
index 0000000..a088338
--- /dev/null
+++ b/src/plugins/InteractiveDebugger.cpp
@@ -0,0 +1,1024 @@
+// InteractiveDebugger.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include <iterator>
+#include <sstream>
+
+#if !defined(_WIN32) || defined(__MINGW32__)
+#include <signal.h>
+#endif
+
+#if HAVE_READLINE
+#include <readline/readline.h>
+#include <readline/history.h>
+#endif
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+
+#include "InteractiveDebugger.h"
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "core/WorkGroup.h"
+#include "core/WorkItem.h"
+
+using namespace oclgrind;
+using namespace std;
+
+#define LIST_LENGTH 10
+
+static bool sigintBreak = false;
+#if !defined(_WIN32) || defined(__MINGW32__)
+static struct sigaction m_oldSignalHandler;
+void handleSignal(int s)
+{
+ if (s == SIGINT)
+ sigintBreak = true;
+}
+#endif
+
+InteractiveDebugger::InteractiveDebugger(const Context *context)
+ : Plugin(context)
+{
+ m_running = true;
+ m_forceBreak = false;
+ m_nextBreakpoint = 1;
+ m_program = NULL;
+ m_kernelInvocation = NULL;
+
+ // Set-up commands
+#define ADD_CMD(name, sname, func) \
+ m_commands[name] = &InteractiveDebugger::func; \
+ m_commands[sname] = &InteractiveDebugger::func;
+ ADD_CMD("backtrace", "bt", backtrace);
+ ADD_CMD("break", "b", brk);
+ ADD_CMD("continue", "c", cont);
+ ADD_CMD("delete", "d", del);
+ ADD_CMD("gmem", "gm", mem);
+ ADD_CMD("help", "h", help);
+ ADD_CMD("info", "i", info);
+ ADD_CMD("list", "l", list);
+ ADD_CMD("lmem", "lm", mem);
+ ADD_CMD("next", "n", next);
+ ADD_CMD("pmem", "pm", mem);
+ ADD_CMD("print", "p", print);
+ ADD_CMD("quit", "q", quit);
+ ADD_CMD("step", "s", step);
+ ADD_CMD("workitem", "wi", workitem);
+}
+
+void InteractiveDebugger::instructionExecuted(
+ const WorkItem *workItem, const llvm::Instruction *instruction,
+ const TypedValue& result)
+{
+ if (!shouldShowPrompt(workItem))
+ return;
+
+#if !defined(_WIN32) || defined(__MINGW32__)
+ // Restore old signal handler
+ sigaction(SIGINT, &m_oldSignalHandler, NULL);
+#endif
+
+ m_forceBreak = false;
+ sigintBreak = false;
+
+ // Print function if changed
+ if (m_previousDepth != workItem->getCallStack().size() &&
+ workItem->getState() != WorkItem::FINISHED)
+ {
+ cout << "In function ";
+ printFunction(workItem->getCurrentInstruction());
+ }
+
+ printCurrentLine();
+
+ m_listPosition = 0;
+ m_continue = false;
+ m_next = false;
+
+ while (true)
+ {
+ // Prompt for command
+ bool eof = false;
+ string cmd;
+ #if HAVE_READLINE
+ char *line = readline("(oclgrind) ");
+ if (line)
+ {
+ cmd = line;
+ free(line);
+ }
+ else
+ {
+ eof = true;
+ }
+ #else
+ cout << "(oclgrind) " << flush;
+ getline(cin, cmd);
+ eof = cin.eof();
+ #endif
+
+ // Quit on EOF
+ if (eof)
+ {
+ cout << "(quit)" << endl;
+ quit(vector<string>());
+ return;
+ }
+
+ // Split command into tokens
+ vector<string> tokens;
+ istringstream iss(cmd);
+ copy(istream_iterator<string>(iss),
+ istream_iterator<string>(),
+ back_inserter< vector<string> >(tokens));
+
+ // Skip empty lines
+ if (tokens.size() == 0)
+ {
+ continue;
+ }
+
+ #if HAVE_READLINE
+ add_history(cmd.c_str());
+ #endif
+
+ // Find command in map and execute
+ map<string,Command>::iterator itr = m_commands.find(tokens[0]);
+ if (itr != m_commands.end())
+ {
+ if ((this->*itr->second)(tokens))
+ break;
+ }
+ else
+ {
+ cout << "Unrecognized command '" << tokens[0] << "'" << endl;
+ }
+ }
+}
+
+bool InteractiveDebugger::isThreadSafe() const
+{
+ return false;
+}
+
+void InteractiveDebugger::kernelBegin(const KernelInvocation *kernelInvocation)
+{
+ m_continue = false;
+ m_lastBreakLine = 0;
+ m_listPosition = 0;
+ m_next = false;
+ m_previousDepth = 0;
+ m_previousLine = 0;
+
+ m_kernelInvocation = kernelInvocation;
+ m_program = kernelInvocation->getKernel()->getProgram();
+}
+
+void InteractiveDebugger::kernelEnd(const KernelInvocation *kernelInvocation)
+{
+ m_kernelInvocation = NULL;
+
+#if !defined(_WIN32) || defined(__MINGW32__)
+ // Restore old signal handler
+ sigaction(SIGINT, &m_oldSignalHandler, NULL);
+#endif
+}
+
+void InteractiveDebugger::log(MessageType type, const char *message)
+{
+ if (type == ERROR)
+ m_forceBreak = true;
+}
+
+///////////////////////////
+//// Utility Functions ////
+///////////////////////////
+
+size_t InteractiveDebugger::getCurrentLineNumber() const
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem || workItem->getState() == WorkItem::FINISHED)
+ {
+ return 0;
+ }
+
+ return getLineNumber(workItem->getCurrentInstruction());
+}
+
+size_t InteractiveDebugger::getLineNumber(
+ const llvm::Instruction *instruction) const
+{
+ llvm::MDNode *md = instruction->getMetadata("dbg");
+ if (md)
+ {
+#if LLVM_VERSION > 36
+ llvm::DILocation *loc = (llvm::DILocation*)md;
+ return loc->getLine();
+#else
+ llvm::DILocation loc((llvm::MDLocation*)md);
+ return loc.getLineNumber();
+#endif
+ }
+ return 0;
+}
+
+bool InteractiveDebugger::hasHitBreakpoint()
+{
+ if (m_breakpoints.empty())
+ return false;
+
+ // Check if we have passed over the previous breakpoint
+ if (m_lastBreakLine)
+ {
+ if (getCurrentLineNumber() != m_lastBreakLine)
+ m_lastBreakLine = 0;
+ else
+ return false;
+ }
+
+ // Check if we're at a breakpoint
+ size_t line = getCurrentLineNumber();
+ map<size_t, size_t>::iterator itr;
+ for (itr = m_breakpoints[m_program].begin();
+ itr != m_breakpoints[m_program].end(); itr++)
+ {
+ if (itr->second == line)
+ {
+ cout << "Breakpoint " << itr->first
+ << " hit at line " << itr->second
+ << " by work-item "
+ << m_kernelInvocation->getCurrentWorkItem()->getGlobalID()
+ << endl;
+ m_lastBreakLine = line;
+ m_listPosition = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
+void InteractiveDebugger::printCurrentLine() const
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem || workItem->getState() == WorkItem::FINISHED)
+ {
+ return;
+ }
+
+ size_t lineNum = getCurrentLineNumber();
+ if (m_program->getNumSourceLines() && lineNum > 0)
+ {
+ printSourceLine(lineNum);
+ }
+ else
+ {
+ cout << "Source line not available." << endl;
+ dumpInstruction(cout, workItem->getCurrentInstruction());
+ cout << endl;
+ }
+}
+
+void InteractiveDebugger::printFunction(
+ const llvm::Instruction *instruction) const
+{
+ // Get function
+ const llvm::Function *function = instruction->getParent()->getParent();
+ cout << function->getName().str() << "(";
+
+ // Print arguments
+ llvm::Function::const_arg_iterator argItr;
+ for (argItr = function->arg_begin();
+ argItr != function->arg_end(); argItr++)
+ {
+ if (argItr != function->arg_begin())
+ {
+ cout << ", ";
+ }
+ cout << argItr->getName().str() << "=";
+ m_kernelInvocation->getCurrentWorkItem()->printValue(argItr);
+ }
+
+ cout << ") at line " << dec << getLineNumber(instruction) << endl;
+}
+
+void InteractiveDebugger::printSourceLine(size_t lineNum) const
+{
+ const char *line = m_program->getSourceLine(lineNum);
+ if (line)
+ {
+ cout << dec << lineNum << "\t" << line << endl;
+ }
+ else
+ {
+ cout << "Invalid line number: " << lineNum << endl;
+ }
+}
+
+bool InteractiveDebugger::shouldShowPrompt(const WorkItem *workItem)
+{
+ if (!m_running)
+ return false;
+
+ if (m_forceBreak || sigintBreak)
+ return true;
+
+ if (hasHitBreakpoint())
+ return true;
+
+ if (m_continue)
+ return false;
+
+ if (workItem->getState() == WorkItem::BARRIER)
+ return true;
+ if (workItem->getState() == WorkItem::FINISHED)
+ return true;
+
+ if (!m_program->getNumSourceLines())
+ return true;
+
+ size_t line = getCurrentLineNumber();
+ if (m_next && workItem->getCallStack().size() > m_previousDepth)
+ return false;
+ if (!line || line == m_previousLine)
+ return false;
+
+ return true;
+}
+
+//////////////////////////////
+//// Interactive Commands ////
+//////////////////////////////
+
+bool InteractiveDebugger::backtrace(vector<string> args)
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem || workItem->getState() == WorkItem::FINISHED)
+ {
+ return false;
+ }
+
+ stack<const llvm::Instruction*> callStack = workItem->getCallStack();
+
+ // Print current instruction
+ cout << "#" << callStack.size() << " ";
+ printFunction(workItem->getCurrentInstruction());
+
+ // Print call stack
+ while (!callStack.empty())
+ {
+ cout << "#" << (callStack.size()-1) << " ";
+ printFunction(callStack.top());
+ callStack.pop();
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::brk(vector<string> args)
+{
+ if (!m_program->getNumSourceLines())
+ {
+ cout << "Breakpoints only valid when source is available." << endl;
+ return false;
+ }
+
+ size_t lineNum = getCurrentLineNumber();
+ if (args.size() > 1)
+ {
+ // Parse argument as a target line number
+ istringstream ss(args[1]);
+ ss >> lineNum;
+ if (!ss.eof() || !lineNum || lineNum > m_program->getNumSourceLines()+1)
+ {
+ cout << "Invalid line number." << endl;
+ return false;
+ }
+ }
+
+ if (lineNum)
+ {
+ m_breakpoints[m_program][m_nextBreakpoint++] = lineNum;
+ }
+ else
+ {
+ cout << "Not currently on a line." << endl;
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::cont(vector<string> args)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+ // Register a signal handler to catch interrupts
+ struct sigaction sigHandler;
+ sigHandler.sa_handler = handleSignal;
+ sigemptyset(&sigHandler.sa_mask);
+ sigHandler.sa_flags = 0;
+ sigaction(SIGINT, &sigHandler, &m_oldSignalHandler);
+#endif
+
+ m_continue = true;
+ return true;
+}
+
+bool InteractiveDebugger::del(vector<string> args)
+{
+ if (args.size() > 1)
+ {
+ // Parse argument as a target breakpoint
+ size_t bpNum = 0;
+ istringstream ss(args[1]);
+ ss >> bpNum;
+ if (!ss.eof())
+ {
+ cout << "Invalid breakpoint number." << endl;
+ return false;
+ }
+
+ // Ensure breakpoint exists
+ if (!m_breakpoints[m_program].count(bpNum))
+ {
+ cout << "Breakpoint not found." << endl;
+ return false;
+ }
+ m_breakpoints[m_program].erase(bpNum);
+ }
+ else
+ {
+ // Prompt for confirmation
+ string confirm;
+ cout << "Delete all breakpoints? (y/n) " << flush;
+ cin >> confirm;
+ cin.ignore();
+ if (confirm == "y")
+ {
+ m_breakpoints.clear();
+ }
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::help(vector<string> args)
+{
+ if (args.size() < 2)
+ {
+ cout << "Command list:" << endl;
+ cout << " backtrace (bt)" << endl;
+ cout << " break (b)" << endl;
+ cout << " continue (c)" << endl;
+ cout << " delete (d)" << endl;
+ cout << " gmem (gm)" << endl;
+ cout << " help (h)" << endl;
+ cout << " info (i)" << endl;
+ cout << " list (l)" << endl;
+ cout << " next (n)" << endl;
+ cout << " lmem (lm)" << endl;
+ cout << " pmem (pm)" << endl;
+ cout << " print (p)" << endl;
+ cout << " quit (q)" << endl;
+ cout << " step (s)" << endl;
+ cout << " workitem (wi)" << endl;
+ cout << "(type 'help command' for more information)" << endl;
+ return false;
+ }
+
+ if (args[1] == "backtrace" || args[1] == "bt")
+ {
+ cout << "Print function call stack." << endl;
+ }
+ else if (args[1] == "break" || args[1] == "b")
+ {
+ cout << "Set a breakpoint"
+ << " (only functional when source is available)." << endl
+ << "With no arguments, sets a breakpoint at the current line." << endl
+ << "Use a numeric argument to set a breakpoint at a specific line."
+ << endl;
+ }
+ else if (args[1] == "continue" || args[1] == "c")
+ {
+ cout << "Continue kernel execution until next breakpoint." << endl;
+ }
+ else if (args[1] == "delete" || args[1] == "d")
+ {
+ cout << "Delete a breakpoint." << endl
+ << "With no arguments, deletes all breakpoints." << endl;
+ }
+ else if (args[1] == "help" || args[1] == "h")
+ {
+ cout << "Display usage information for a command." << endl;
+ }
+ else if (args[1] == "info" || args[1] == "i")
+ {
+ cout << "Display information about current debugging context." << endl
+ << "With no arguments, displays general information." << endl
+ << "'info break' lists breakpoints."
+ << endl;
+ }
+ else if (args[1] == "list" || args[1] == "l")
+ {
+ cout << "List source lines." << endl
+ << "With no argument, lists " << LIST_LENGTH
+ << " lines after previous listing." << endl
+ << "Use - to list " << LIST_LENGTH
+ << " lines before the previous listing." << endl
+ << "Use a numeric argument to list around a specific line number."
+ << endl;
+ }
+ else if (args[1] == "gmem" || args[1] == "lmem" || args[1] == "pmem" ||
+ args[1] == "gm" || args[1] == "lm" || args[1] == "pm")
+ {
+ cout << "Examine contents of ";
+ if (args[1] == "gmem") cout << "global";
+ if (args[1] == "lmem") cout << "local";
+ if (args[1] == "pmem") cout << "private";
+ cout << " memory." << endl
+ << "With no arguments, dumps entire contents of memory." << endl
+ << "'" << args[1] << " address [size]'" << endl
+ << "address is hexadecimal and 4-byte aligned." << endl;
+ }
+ else if (args[1] == "next" || args[1] == "n")
+ {
+ cout << "Step forward,"
+ << " treating function calls as a single instruction." << endl;
+ }
+ else if (args[1] == "print" || args[1] == "p")
+ {
+ cout << "Print the values of one or more variables." << endl
+ << "'print x y' prints the values of x and y" << endl
+ << "'print foo[i]' prints a value at a constant array index" << endl;
+ }
+ else if (args[1] == "quit" || args[1] == "q")
+ {
+ cout << "Quit interactive debugger." << endl;
+ }
+ else if (args[1] == "step" || args[1] == "s")
+ {
+ cout << "Step forward a single source line,"
+ << " or an instruction if no source available." << endl;
+ }
+ else if (args[1] == "workitem" || args[1] == "wi")
+ {
+ cout << "Switch to a different work-item." << endl
+ << "Up to three (space separated) arguments allowed,"
+ << " specifying the global ID of the work-item." << endl;
+ }
+ else
+ {
+ cout << "Unrecognized command '" << args[1] << "'" << endl;
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::info(vector<string> args)
+{
+ if (args.size() > 1)
+ {
+ if (args[1] == "break")
+ {
+ // List breakpoints
+ map<size_t, size_t>::iterator itr;
+ for (itr = m_breakpoints[m_program].begin();
+ itr != m_breakpoints[m_program].end(); itr++)
+ {
+ cout << "Breakpoint " << itr->first << ": Line " << itr->second << endl;
+ }
+ }
+ else
+ {
+ cout << "Invalid info command: " << args[1] << endl;
+ }
+ return false;
+ }
+
+ // Kernel invocation information
+ cout
+ << dec
+ << "Running kernel '" << m_kernelInvocation->getKernel()->getName() << "'"
+ << endl
+ << "-> Global work size: " << m_kernelInvocation->getGlobalSize()
+ << endl
+ << "-> Global work offset: " << m_kernelInvocation->getGlobalOffset()
+ << endl
+ << "-> Local work size: " << m_kernelInvocation->getLocalSize()
+ << endl;
+
+ // Current work-item
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (workItem)
+ {
+ cout << endl << "Current work-item: " << workItem->getGlobalID() << endl;
+ if (workItem->getState() == WorkItem::FINISHED)
+ {
+ cout << "Work-item has finished." << endl;
+ }
+ else
+ {
+ cout << "In function ";
+ printFunction(workItem->getCurrentInstruction());
+ printCurrentLine();
+ }
+ }
+ else
+ {
+ cout << "All work-items finished." << endl;
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::list(vector<string> args)
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem)
+ {
+ cout << "All work-items finished." << endl;
+ return false;
+ }
+ if (!m_program->getNumSourceLines())
+ {
+ cout << "No source code available." << endl;
+ return false;
+ }
+
+ // Check for an argument
+ size_t start = 0;
+ bool forwards = true;
+ if (args.size() > 1)
+ {
+ if (args[1] == "-")
+ {
+ forwards = false;
+ }
+ else
+ {
+ // Parse argument as a target line number
+ istringstream ss(args[1]);
+ ss >> start;
+ if (!ss.eof())
+ {
+ cout << "Invalid line number." << endl;
+ return false;
+ }
+ start = start > LIST_LENGTH/2 ? start - LIST_LENGTH/2 : 1;
+ }
+ }
+
+ if (!start)
+ {
+ if (forwards)
+ {
+ // Starting position is the previous list position + LIST_LENGTH
+ start = m_listPosition ?
+ m_listPosition + LIST_LENGTH : getCurrentLineNumber() + 1;
+ if (start >= m_program->getNumSourceLines() + 1)
+ {
+ m_listPosition = m_program->getNumSourceLines() + 1;
+ return false;
+ }
+ }
+ else
+ {
+ // Starting position is the previous list position - LIST_LENGTH
+ start = m_listPosition ? m_listPosition : getCurrentLineNumber();
+ start = start > LIST_LENGTH ? start - LIST_LENGTH : 1;
+ }
+ }
+
+ // Display lines
+ for (int i = 0; i < LIST_LENGTH; i++)
+ {
+ if (start + i >= m_program->getNumSourceLines() + 1)
+ {
+ break;
+ }
+ printSourceLine(start + i);
+ }
+
+ m_listPosition = start;
+
+ return false;
+}
+
+bool InteractiveDebugger::mem(vector<string> args)
+{
+ // Get target memory object
+ Memory *memory = NULL;
+ if (args[0][0] == 'g')
+ {
+ memory = m_context->getGlobalMemory();
+ }
+ else if (args[0][0] == 'l')
+ {
+ memory = m_kernelInvocation->getCurrentWorkGroup()->getLocalMemory();
+ }
+ else if (args[0][0] == 'p')
+ {
+ memory = m_kernelInvocation->getCurrentWorkItem()->getPrivateMemory();
+ }
+
+ // If no arguments, dump memory
+ if (args.size() == 1)
+ {
+ memory->dump();
+ return false;
+ }
+ else if (args.size() > 3)
+ {
+ cout << "Invalid number of arguments." << endl;
+ return false;
+ }
+
+ // Get target address
+ size_t address;
+ stringstream ss(args[1]);
+ ss >> hex >> address;
+ if (!ss.eof() || address%4 != 0)
+ {
+ cout << "Invalid address." << endl;
+ return false;
+ }
+
+ // Get optional size
+ size_t size = 8;
+ if (args.size() == 3)
+ {
+ stringstream ss(args[2]);
+ ss >> dec >> size;
+ if (!ss.eof() || !size)
+ {
+ cout << "Invalid size" << endl;
+ return false;
+ }
+ }
+
+ // Check address is valid
+ if (!memory->isAddressValid(address, size))
+ {
+ cout << "Invalid memory address." << endl;
+ return false;
+ }
+
+ // Output data
+ unsigned char *data = (unsigned char*)memory->getPointer(address);
+ for (unsigned i = 0; i < size; i++)
+ {
+ if (i%4 == 0)
+ {
+ cout << endl << hex << uppercase
+ << setw(16) << setfill(' ') << right
+ << (address + i) << ":";
+ }
+ cout << " " << hex << uppercase << setw(2) << setfill('0') << (int)data[i];
+ }
+ cout << endl << endl;
+
+ return false;
+}
+
+bool InteractiveDebugger::next(vector<string> args)
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem)
+ {
+ cout << "All work-items finished." << endl;
+ return false;
+ }
+
+ if (workItem->getState() == WorkItem::FINISHED)
+ {
+ cout << "Work-item has finished." << endl;
+ return false;
+ }
+ else if (workItem->getState() == WorkItem::BARRIER)
+ {
+ cout << "Work-item is at barrier." << endl;
+ return false;
+ }
+
+ // Step until we return to the same depth
+ m_previousDepth = workItem->getCallStack().size();
+ m_previousLine = getCurrentLineNumber();
+ m_next = true;
+
+ return true;
+}
+
+bool InteractiveDebugger::print(vector<string> args)
+{
+ if (args.size() < 2)
+ {
+ cout << "Variable name(s) required." << endl;
+ return false;
+ }
+
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ for (unsigned i = 1; i < args.size(); i++)
+ {
+ cout << args[i] << " = ";
+
+ // Check for subscript operator
+ size_t start = args[i].find("[");
+ if (start != string::npos)
+ {
+ // Find end of subscript
+ size_t end = args[i].find(']');
+ if (end == string::npos)
+ {
+ cout << "missing ']'" << endl;
+ return false;
+ }
+ if (end != args[i].length() - 1)
+ {
+ cout << "invalid variable" << endl;
+ return false;
+ }
+
+ // Parse index value
+ size_t index = 0;
+ string var = args[i].substr(0, start);
+ stringstream ss(args[i].substr(start+1, end-start-1));
+ ss >> index;
+ if (!ss.eof())
+ {
+ cout << "invalid index" << endl;
+ return false;
+ }
+
+ // Get variable value and type
+ const llvm::Value *ptr = workItem->getVariable(var);
+ if (!ptr)
+ {
+ cout << "not found" << endl;
+ return false;
+ }
+ const llvm::Type *ptrType = ptr->getType();
+
+ // Check for alloca instruction, in which case look at allocated type
+ bool alloca = false;
+ if (ptr->getValueID() >= llvm::Value::InstructionVal &&
+ ((llvm::Instruction*)ptr)->getOpcode() == llvm::Instruction::Alloca)
+ {
+ ptrType = ((const llvm::AllocaInst*)ptr)->getAllocatedType();
+ alloca = true;
+ }
+
+ // Ensure type is a pointer
+ if (!ptrType->isPointerTy())
+ {
+ cout << "not a pointer" << endl;
+ return false;
+ }
+
+ // Get base address
+ size_t base = *(size_t*)workItem->getValueData(ptr);
+ if (alloca)
+ {
+ // Load base address from private memory
+ workItem->getPrivateMemory()->load((unsigned char*)&base,
+ base, sizeof(size_t));
+ }
+
+ // Get target memory object
+ Memory *memory = NULL;
+ switch (ptrType->getPointerAddressSpace())
+ {
+ case AddrSpacePrivate:
+ memory = workItem->getPrivateMemory();
+ break;
+ case AddrSpaceGlobal:
+ case AddrSpaceConstant:
+ memory = m_context->getGlobalMemory();
+ break;
+ case AddrSpaceLocal:
+ memory = m_kernelInvocation->getCurrentWorkGroup()->getLocalMemory();
+ break;
+ default:
+ cout << "invalid address space" << endl;
+ return false;
+ }
+
+ // Get element type
+ const llvm::Type *elemType = ptrType->getPointerElementType();
+ unsigned elemSize = getTypeSize(elemType);
+
+ // Load data
+ if (!memory->isAddressValid(base + index*elemSize, elemSize))
+ {
+ cout << "invalid memory address" << endl;
+ }
+ else
+ {
+ // Print data
+ void *data = (void*)memory->getPointer(base+index*elemSize);
+ printTypedData(elemType, (unsigned char*)data);
+ cout << endl;
+ }
+ }
+ else
+ {
+ if (!workItem->printVariable(args[i]))
+ {
+ cout << "not found";
+ }
+ cout << endl;
+ }
+ }
+
+ return false;
+}
+
+bool InteractiveDebugger::quit(vector<string> args)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+ // Restore old signal handler
+ sigaction(SIGINT, &m_oldSignalHandler, NULL);
+#endif
+
+ m_running = false;
+ return true;
+}
+
+bool InteractiveDebugger::step(vector<string> args)
+{
+ const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+ if (!workItem)
+ {
+ cout << "All work-items finished." << endl;
+ return false;
+ }
+
+ if (workItem->getState() == WorkItem::FINISHED)
+ {
+ cout << "Work-item has finished." << endl;
+ return false;
+ }
+ else if (workItem->getState() == WorkItem::BARRIER)
+ {
+ cout << "Work-item is at barrier." << endl;
+ return false;
+ }
+
+ // Save current position
+ m_previousDepth = workItem->getCallStack().size();
+ m_previousLine = getCurrentLineNumber();
+
+ return true;
+}
+
+bool InteractiveDebugger::workitem(vector<string> args)
+{
+ // TODO: Take offsets into account?
+ Size3 gid(0,0,0);
+ for (unsigned i = 1; i < args.size(); i++)
+ {
+    // Parse argument as a global ID component
+ istringstream ss(args[i]);
+ ss >> gid[i-1];
+ if (!ss.eof() || gid[i-1] >= m_kernelInvocation->getGlobalSize()[i-1])
+ {
+ cout << "Invalid global ID." << endl;
+ return false;
+ }
+ }
+
+ // Ugly const_cast since this operation actually changes something about
+ // the simulation. This goes against the idea that plugins are entirely
+ // passive.
+ if (!const_cast<KernelInvocation*>(m_kernelInvocation)->switchWorkItem(gid))
+ {
+ cout << "Work-item has already finished, unable to load state." << endl;
+ return false;
+ }
+
+ // Print new WI id
+ cout << "Switched to work-item: (" << gid[0] << ","
+ << gid[1] << ","
+ << gid[2] << ")" << endl;
+ if (m_kernelInvocation->getCurrentWorkItem()->getState() ==
+ WorkItem::FINISHED)
+ {
+ cout << "Work-item has finished execution." << endl;
+ }
+ else
+ {
+ printCurrentLine();
+ }
+
+ return false;
+}
diff --git a/src/plugins/InteractiveDebugger.h b/src/plugins/InteractiveDebugger.h
new file mode 100644
index 0000000..2b5db65
--- /dev/null
+++ b/src/plugins/InteractiveDebugger.h
@@ -0,0 +1,72 @@
+// InteractiveDebugger.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+
+namespace oclgrind
+{
+ class Program;
+
+ class InteractiveDebugger : public Plugin
+ {
+ public:
+ InteractiveDebugger(const Context *context);
+
+ virtual void instructionExecuted(const WorkItem *workItem,
+ const llvm::Instruction *instruction,
+ const TypedValue& result) override;
+ virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+ virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+ virtual void log(MessageType type, const char *message) override;
+
+ virtual bool isThreadSafe() const override;
+
+ private:
+
+ bool m_continue;
+ bool m_running;
+ bool m_forceBreak;
+ size_t m_listPosition;
+ bool m_next;
+ size_t m_lastBreakLine;
+ size_t m_nextBreakpoint;
+ size_t m_previousDepth;
+ size_t m_previousLine;
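+    // Breakpoints for each program: breakpoint number -> source line number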
+ std::map<const Program*, std::map<size_t, size_t> > m_breakpoints;
+ const Program *m_program;
+ const KernelInvocation *m_kernelInvocation;
+
+ size_t getCurrentLineNumber() const;
+ size_t getLineNumber(const llvm::Instruction *instruction) const;
+ bool hasHitBreakpoint();
+ void printCurrentLine() const;
+ void printFunction(const llvm::Instruction *instruction) const;
+ void printSourceLine(size_t lineNum) const;
+ bool shouldShowPrompt(const WorkItem *workItem);
+
+ // Interactive commands
+ typedef bool (InteractiveDebugger::*Command)(std::vector<std::string>);
+ std::map<std::string, Command> m_commands;
+#define CMD(name) bool name(std::vector<std::string> args);
+ CMD(backtrace);
+ CMD(brk);
+ CMD(cont);
+ CMD(del);
+ CMD(help);
+ CMD(info);
+ CMD(list);
+ CMD(mem);
+ CMD(next);
+ CMD(print);
+ CMD(quit);
+ CMD(step);
+ CMD(workitem);
+#undef CMD
+
+ };
+}
diff --git a/src/plugins/Logger.cpp b/src/plugins/Logger.cpp
new file mode 100644
index 0000000..7b73296
--- /dev/null
+++ b/src/plugins/Logger.cpp
@@ -0,0 +1,81 @@
+// Logger.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include <fstream>
+#include <mutex>
+
+#include "Logger.h"
+
+using namespace oclgrind;
+using namespace std;
+
+#define DEFAULT_MAX_ERRORS 1000
+
+unsigned Logger::m_numErrors = 0;
+
+static mutex logMutex;
+
+Logger::Logger(const Context *context)
+ : Plugin(context)
+{
+ m_log = &cerr;
+
+ const char *logfile = getenv("OCLGRIND_LOG");
+ if (logfile)
+ {
+ m_log = new ofstream(logfile);
+ if (!m_log->good())
+ {
+ cerr << "Oclgrind: Unable to open log file '" << logfile << "'" << endl;
+ m_log = &cerr;
+ }
+ }
+
+ m_maxErrors = DEFAULT_MAX_ERRORS;
+ const char *maxErrors = getenv("OCLGRIND_MAX_ERRORS");
+ if (maxErrors)
+ {
+ char *next;
+ m_maxErrors = strtoul(maxErrors, &next, 10);
+ if (strlen(next))
+ {
+ cerr << "Oclgrind: Invalid value for OCLGRIND_MAX_ERRORS" << endl;
+ }
+ }
+}
+
+Logger::~Logger()
+{
+ if (m_log != &cerr)
+ {
+ ((ofstream*)m_log)->close();
+ delete m_log;
+ }
+}
+
+void Logger::log(MessageType type, const char *message)
+{
+ lock_guard<mutex> lock(logMutex);
+
+  // Limit the number of errors/warnings printed
+ if (type == ERROR || type == WARNING)
+ {
+ if (m_numErrors == m_maxErrors)
+ {
+ *m_log << endl << "Oclgrind: "
+ << m_numErrors << " errors generated - suppressing further errors"
+ << endl << endl;
+ }
+ if (m_numErrors++ >= m_maxErrors)
+ return;
+ }
+
+ *m_log << endl << message << endl;
+}
diff --git a/src/plugins/Logger.h b/src/plugins/Logger.h
new file mode 100644
index 0000000..294bc67
--- /dev/null
+++ b/src/plugins/Logger.h
@@ -0,0 +1,27 @@
+// Logger.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+
+namespace oclgrind
+{
+ class Logger : public Plugin
+ {
+ public:
+ Logger(const Context *context);
+ virtual ~Logger();
+
+ virtual void log(MessageType type, const char *message) override;
+
+ private:
+ std::ostream *m_log;
+
+ unsigned m_maxErrors;
+ static unsigned m_numErrors;
+ };
+}
diff --git a/src/plugins/MemCheck.cpp b/src/plugins/MemCheck.cpp
new file mode 100644
index 0000000..fb04e57
--- /dev/null
+++ b/src/plugins/MemCheck.cpp
@@ -0,0 +1,107 @@
+// MemCheck.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include "core/Context.h"
+#include "core/Memory.h"
+
+#include "MemCheck.h"
+
+using namespace oclgrind;
+using namespace std;
+
+MemCheck::MemCheck(const Context *context)
+ : Plugin(context)
+{
+}
+
+void MemCheck::memoryAtomicLoad(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size)
+{
+ checkLoad(memory, address, size);
+}
+
+void MemCheck::memoryAtomicStore(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size)
+{
+ checkStore(memory, address, size);
+}
+
+void MemCheck::memoryLoad(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size)
+{
+ checkLoad(memory, address, size);
+}
+
+void MemCheck::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size)
+{
+ checkLoad(memory, address, size);
+}
+
+void MemCheck::memoryStore(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size,
+ const uint8_t *storeData)
+{
+ checkStore(memory, address, size);
+}
+
+void MemCheck::memoryStore(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData)
+{
+ checkStore(memory, address, size);
+}
+
+void MemCheck::checkLoad(const Memory *memory,
+ size_t address, size_t size) const
+{
+ if (!memory->isAddressValid(address, size))
+ {
+ logInvalidAccess(true, memory->getAddressSpace(), address, size);
+ return;
+ }
+
+ if (memory->getBuffer(address)->flags & CL_MEM_WRITE_ONLY)
+ {
+ m_context->logError("Invalid read from write-only buffer");
+ }
+}
+
+void MemCheck::checkStore(const Memory *memory,
+ size_t address, size_t size) const
+{
+ if (!memory->isAddressValid(address, size))
+ {
+ logInvalidAccess(false, memory->getAddressSpace(), address, size);
+ return;
+ }
+
+ if (memory->getBuffer(address)->flags & CL_MEM_READ_ONLY)
+ {
+ m_context->logError("Invalid write to read-only buffer");
+ }
+}
+
+void MemCheck::logInvalidAccess(bool read, unsigned addrSpace,
+ size_t address, size_t size) const
+{
+ Context::Message msg(ERROR, m_context);
+ msg << "Invalid " << (read ? "read" : "write")
+ << " of size " << size
+ << " at " << getAddressSpaceName(addrSpace)
+ << " memory address 0x" << hex << address << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << "Entity: " << msg.CURRENT_ENTITY << endl
+ << msg.CURRENT_LOCATION << endl;
+ msg.send();
+}
\ No newline at end of file
diff --git a/src/plugins/MemCheck.h b/src/plugins/MemCheck.h
new file mode 100644
index 0000000..9e685bf
--- /dev/null
+++ b/src/plugins/MemCheck.h
@@ -0,0 +1,43 @@
+// MemCheck.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+
+namespace oclgrind
+{
+ class MemCheck : public Plugin
+ {
+ public:
+ MemCheck(const Context *context);
+
+ virtual void memoryAtomicLoad(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op,
+ size_t address, size_t size) override;
+ virtual void memoryAtomicStore(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op,
+ size_t address, size_t size) override;
+ virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size) override;
+ virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size) override;
+ virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size,
+ const uint8_t *storeData) override;
+ virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData) override;
+
+ private:
+ void checkLoad(const Memory *memory, size_t address, size_t size) const;
+ void checkStore(const Memory *memory, size_t address, size_t size) const;
+ void logInvalidAccess(bool read, unsigned addrSpace,
+ size_t address, size_t size) const;
+ };
+}
diff --git a/src/plugins/RaceDetector.cpp b/src/plugins/RaceDetector.cpp
new file mode 100644
index 0000000..10f417e
--- /dev/null
+++ b/src/plugins/RaceDetector.cpp
@@ -0,0 +1,336 @@
+// RaceDetector.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/common.h"
+
+#include "core/Context.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/WorkGroup.h"
+#include "core/WorkItem.h"
+
+#include "RaceDetector.h"
+
+using namespace oclgrind;
+using namespace std;
+
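+// Key for the per-allocation state map: the memory object paired with the
+// buffer number extracted from the address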
+#define KEY(memory,address) make_pair(memory, EXTRACT_BUFFER(address))
+
+RaceDetector::RaceDetector(const Context *context)
+ : Plugin(context)
+{
+ m_kernelInvocation = NULL;
+
+ m_allowUniformWrites = !checkEnv("OCLGRIND_UNIFORM_WRITES");
+}
+
+bool RaceDetector::isThreadSafe() const
+{
+ // TODO: Improve DRD efficiency for multi-threaded case instead.
+ return false;
+}
+
+void RaceDetector::kernelBegin(const KernelInvocation *kernelInvocation)
+{
+ m_kernelInvocation = kernelInvocation;
+}
+
+void RaceDetector::kernelEnd(const KernelInvocation *kernelInvocation)
+{
+ synchronize(m_context->getGlobalMemory(), false);
+
+ m_kernelInvocation = NULL;
+}
+
+void RaceDetector::memoryAllocated(const Memory *memory, size_t address,
+ size_t size, cl_mem_flags flags)
+{
+ if (memory->getAddressSpace() == AddrSpacePrivate ||
+ memory->getAddressSpace() == AddrSpaceConstant)
+ return;
+
+ m_state[KEY(memory,address)] = make_pair(new State[size], size);
+}
+
+void RaceDetector::memoryAtomicLoad(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size)
+{
+ registerAtomic(memory, workItem, address, size, false);
+}
+
+void RaceDetector::memoryAtomicStore(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op, size_t address, size_t size)
+{
+ registerAtomic(memory, workItem, address, size, true);
+}
+
+void RaceDetector::memoryDeallocated(const Memory *memory, size_t address)
+{
+ if (memory->getAddressSpace() == AddrSpacePrivate ||
+ memory->getAddressSpace() == AddrSpaceConstant)
+ return;
+
+ delete[] m_state[KEY(memory,address)].first;
+ m_state.erase(KEY(memory,address));
+}
+
+void RaceDetector::memoryLoad(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size)
+{
+ registerLoadStore(memory, workItem, workItem->getWorkGroup(),
+ address, size, NULL);
+}
+
+void RaceDetector::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size)
+{
+ registerLoadStore(memory, NULL, workGroup, address, size, NULL);
+}
+
+void RaceDetector::memoryStore(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size,
+ const uint8_t *storeData)
+{
+ registerLoadStore(memory, workItem, workItem->getWorkGroup(),
+ address, size, storeData);
+}
+
+void RaceDetector::memoryStore(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData)
+{
+ registerLoadStore(memory, NULL, workGroup, address, size, storeData);
+}
+
+void RaceDetector::logRace(DataRaceType type,
+ unsigned int addrSpace,
+ size_t address,
+ size_t lastWorkGroup,
+ size_t lastWorkItem,
+ const llvm::Instruction *lastInstruction) const
+{
+ const char *raceType = NULL;
+ switch (type)
+ {
+ case ReadWriteRace:
+ raceType = "Read-write";
+ break;
+ case WriteWriteRace:
+ raceType = "Write-write";
+ break;
+ }
+
+ Context::Message msg(ERROR, m_context);
+ msg << raceType << " data race at "
+ << getAddressSpaceName(addrSpace)
+ << " memory address 0x" << hex << address << endl
+ << msg.INDENT
+ << "Kernel: " << msg.CURRENT_KERNEL << endl
+ << endl
+ << "First entity: " << msg.CURRENT_ENTITY << endl
+ << msg.CURRENT_LOCATION << endl
+ << endl
+ << "Second entity: ";
+
+ // Show details of other entity involved in race
+ if (lastWorkItem != -1)
+ {
+ Size3 global(lastWorkItem, m_kernelInvocation->getGlobalSize());
+ Size3 local, group;
+ local.x = global.x % m_kernelInvocation->getLocalSize().x;
+ local.y = global.y % m_kernelInvocation->getLocalSize().y;
+ local.z = global.z % m_kernelInvocation->getLocalSize().z;
+ group.x = global.x / m_kernelInvocation->getLocalSize().x;
+ group.y = global.y / m_kernelInvocation->getLocalSize().y;
+ group.z = global.z / m_kernelInvocation->getLocalSize().z;
+ msg << "Global" << global << " Local" << local << " Group" << group;
+ }
+ else if (lastWorkGroup != -1)
+ {
+ msg << "Group"
+ << Size3(lastWorkGroup, m_kernelInvocation->getNumGroups());
+ }
+ else
+ {
+ msg << "(unknown)";
+ }
+ msg << endl
+ << lastInstruction << endl;
+ msg.send();
+}
+
+void RaceDetector::registerAtomic(const Memory *memory,
+ const WorkItem *workItem,
+ size_t address, size_t size,
+ bool store)
+{
+ if (!memory->isAddressValid(address, size))
+ return;
+
+ State *state = m_state[KEY(memory,address)].first + EXTRACT_OFFSET(address);
+
+ // Get work-item index
+ size_t workItemIndex = workItem->getGlobalIndex();
+
+ bool race = false;
+ for (size_t offset = 0; offset < size; offset++, state++)
+ {
+ // Check for races with non-atomic operations
+ bool conflict = store ? !state->canAtomicStore : !state->canAtomicLoad;
+ if (!race && conflict && workItemIndex != state->workItem)
+ {
+ logRace(ReadWriteRace,
+ memory->getAddressSpace(),
+ address,
+              state->workGroup,
+              state->workItem,
+ state->instruction);
+ race = true;
+ }
+
+ // Update state
+ if (store)
+ state->canLoad = false;
+ state->canStore = false;
+ if (!state->wasWorkItem)
+ {
+ state->instruction = workItem->getCurrentInstruction();
+ state->workItem = workItemIndex;
+ state->wasWorkItem = true;
+ }
+ }
+}
+
+void RaceDetector::registerLoadStore(const Memory *memory,
+ const WorkItem *workItem,
+ const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData)
+{
+ if (!m_kernelInvocation)
+ return;
+ if (memory->getAddressSpace() == AddrSpacePrivate ||
+ memory->getAddressSpace() == AddrSpaceConstant)
+ return;
+ if (!memory->isAddressValid(address, size))
+ return;
+
+ bool load = !storeData;
+  bool store = (storeData != NULL);
+
+ // Get index of work-item and work-group performing access
+ size_t workItemIndex = -1, workGroupIndex = -1;
+ if (workItem)
+ {
+ workItemIndex = workItem->getGlobalIndex();
+ }
+ if (workGroup)
+ {
+ workGroupIndex = workGroup->getGroupIndex();
+ }
+
+ bool race = false;
+ size_t base = EXTRACT_OFFSET(address);
+ State *state = m_state[KEY(memory, address)].first + base;
+
+ for (size_t offset = 0; offset < size; offset++, state++)
+ {
+ bool conflict = store ? !state->canStore : !state->canLoad;
+ if (m_allowUniformWrites && storeData)
+ {
+ uint8_t *ptr = (uint8_t*)(memory->getPointer(address));
+ conflict &= (ptr[offset] != storeData[offset]);
+ }
+
+ if (!race && conflict &&
+ (state->wasWorkItem ? // If state set by work-item,
+ state->workItem != workItemIndex : // must be same work-item,
+ state->workGroup != workGroupIndex) // otherwise must be same group
+ )
+ {
+ // Report data-race
+ DataRaceType type = load|state->canLoad ? ReadWriteRace : WriteWriteRace;
+ logRace(type, memory->getAddressSpace(),
+ address + offset,
+              state->workGroup,
+              state->workItem,
+ state->instruction);
+ race = true;
+ }
+ else
+ {
+ // Only update WI info if this operation is stronger than previous one
+ bool updateWI = store || (load && state->canStore);
+
+ // Update state
+ if (store)
+ state->canAtomicLoad = false;
+ state->canAtomicStore = false;
+ state->canLoad &= load;
+ state->canStore = false;
+ if (updateWI)
+ {
+ state->workGroup = workGroupIndex;
+ if (workItem)
+ {
+ state->instruction = workItem->getCurrentInstruction();
+ state->workItem = workItemIndex;
+ state->wasWorkItem = true;
+ }
+ }
+ }
+ }
+}
+
+void RaceDetector::synchronize(const Memory *memory, bool workGroup)
+{
+ StateMap::iterator itr;
+ for (itr = m_state.begin(); itr != m_state.end(); itr++)
+ {
+ if (itr->first.first != memory)
+ continue;
+
+ pair<State*,size_t> obj = itr->second;
+ for (State *state = obj.first; state < obj.first+obj.second; state++)
+ {
+ // TODO: atomic_intergroup_race test failure
+ state->canAtomicLoad = true;
+ state->canAtomicStore = true;
+ state->workItem = -1;
+ state->wasWorkItem = false;
+ if (!workGroup)
+ {
+ state->workGroup = -1;
+ state->canLoad = true;
+ state->canStore = true;
+ }
+ }
+ }
+}
+
+void RaceDetector::workGroupBarrier(const WorkGroup *workGroup, uint32_t flags)
+{
+ if (flags & CLK_LOCAL_MEM_FENCE)
+ synchronize(workGroup->getLocalMemory(), false);
+ if (flags & CLK_GLOBAL_MEM_FENCE)
+ synchronize(m_context->getGlobalMemory(), true);
+}
+
+RaceDetector::State::State()
+{
+ instruction = NULL;
+ workItem = -1;
+ workGroup = -1;
+ canAtomicLoad = true;
+ canAtomicStore = true;
+ canLoad = true;
+ canStore = true;
+ wasWorkItem = false;
+}
diff --git a/src/plugins/RaceDetector.h b/src/plugins/RaceDetector.h
new file mode 100644
index 0000000..2442b56
--- /dev/null
+++ b/src/plugins/RaceDetector.h
@@ -0,0 +1,94 @@
+// RaceDetector.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "core/Plugin.h"
+
+namespace oclgrind
+{
+ class RaceDetector : public Plugin
+ {
+ public:
+ RaceDetector(const Context *context);
+
+ virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+ virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+ virtual void memoryAllocated(const Memory *memory, size_t address,
+ size_t size, cl_mem_flags flags) override;
+ virtual void memoryAtomicLoad(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op,
+ size_t address, size_t size) override;
+ virtual void memoryAtomicStore(const Memory *memory,
+ const WorkItem *workItem,
+ AtomicOp op,
+ size_t address, size_t size) override;
+ virtual void memoryDeallocated(const Memory *memory, size_t address);
+ virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+      cout << "Invalid size." << endl;
+ virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size) override;
+ virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+ size_t address, size_t size,
+ const uint8_t *storeData) override;
+ virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData) override;
+ virtual void workGroupBarrier(const WorkGroup *workGroup,
+ uint32_t flags) override;
+
+ virtual bool isThreadSafe() const override;
+
+ private:
+ struct State
+ {
+ const llvm::Instruction *instruction;
+ size_t workItem;
+ size_t workGroup;
+ bool canAtomicLoad;
+ bool canAtomicStore;
+ bool canLoad;
+ bool canStore;
+ bool wasWorkItem;
+
+ State();
+ };
+
+ // Enumeration for types of data-race
+ enum DataRaceType
+ {
+ ReadWriteRace,
+ WriteWriteRace
+ };
+
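+    // Maps each tracked allocation to an array of per-byte access state,
+    // paired with the size of the allocation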
+ typedef std::map<
+ std::pair<const Memory*, size_t>,
+ std::pair<State*, size_t>
+ > StateMap;
+ StateMap m_state;
+
+ bool m_allowUniformWrites;
+ const KernelInvocation *m_kernelInvocation;
+
+ void logRace(DataRaceType type,
+ unsigned int addrSpace,
+ size_t address,
+ size_t lastWorkGroup,
+ size_t lastWorkItem,
+ const llvm::Instruction *lastInstruction) const;
+ void registerAtomic(const Memory *memory,
+ const WorkItem *workItem,
+ size_t address, size_t size,
+ bool store);
+ void registerLoadStore(const Memory *memory,
+ const WorkItem *workItem,
+ const WorkGroup *workGroup,
+ size_t address, size_t size,
+ const uint8_t *storeData);
+ void synchronize(const Memory *memory, bool workGroup);
+ };
+}
diff --git a/src/runtime/async_queue.cpp b/src/runtime/async_queue.cpp
new file mode 100644
index 0000000..cc5f41c
--- /dev/null
+++ b/src/runtime/async_queue.cpp
@@ -0,0 +1,136 @@
+// async_queue.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "async_queue.h"
+
+#include <cassert>
+#include <iostream>
+#include <list>
+#include <map>
+
+#include "core/Kernel.h"
+#include "core/Queue.h"
+
+using namespace oclgrind;
+using namespace std;
+
+// Maps to keep track of retained objects
+static map< Queue::Command*, list<cl_mem> > memObjectMap;
+static map< Queue::Command*, cl_kernel > kernelMap;
+static map< Queue::Command*, cl_event > eventMap;
+static map< Queue::Command*, list<cl_event> > waitListMap;
+
+void asyncEnqueue(cl_command_queue queue,
+ cl_command_type type,
+ Queue::Command *cmd,
+ cl_uint numEvents,
+ const cl_event *waitList,
+ cl_event *eventOut)
+{
+ // Add event wait list to command
+ for (unsigned i = 0; i < numEvents; i++)
+ {
+ cmd->waitList.push_back(waitList[i]->event);
+ waitListMap[cmd].push_back(waitList[i]);
+ clRetainEvent(waitList[i]);
+ }
+
+ // Enqueue command
+ Event *event = queue->queue->enqueue(cmd);
+
+ // Create event objects
+ cl_event _event = new _cl_event;
+ _event->dispatch = m_dispatchTable;
+ _event->context = queue->context;
+ _event->queue = queue;
+ _event->type = type;
+ _event->event = event;
+ _event->refCount = 1;
+
+ // Add event to map
+ eventMap[cmd] = _event;
+
+ // Pass event as output and retain (if required)
+ if (eventOut)
+ {
+ clRetainEvent(_event);
+ *eventOut = _event;
+ }
+}
+
+void asyncQueueRetain(Queue::Command *cmd, cl_mem mem)
+{
+ // Retain object and add to map
+ clRetainMemObject(mem);
+ memObjectMap[cmd].push_back(mem);
+}
+
+void asyncQueueRetain(Queue::Command *cmd, cl_kernel kernel)
+{
+ assert(kernelMap.find(cmd) == kernelMap.end());
+
+ // Retain kernel and add to map
+ clRetainKernel(kernel);
+ kernelMap[cmd] = kernel;
+
+  // Retain memory object arguments
+ map<cl_uint,cl_mem>::const_iterator itr;
+ for (itr = kernel->memArgs.begin(); itr != kernel->memArgs.end(); itr++)
+ {
+ asyncQueueRetain(cmd, itr->second);
+ }
+}
+
+void asyncQueueRelease(Queue::Command *cmd)
+{
+ // Release memory objects
+ if (memObjectMap.find(cmd) != memObjectMap.end())
+ {
+ list<cl_mem> memObjects = memObjectMap[cmd];
+ while (!memObjects.empty())
+ {
+ clReleaseMemObject(memObjects.front());
+ memObjects.pop_front();
+ }
+ memObjectMap.erase(cmd);
+ }
+
+ // Release kernel
+ if (cmd->type == Queue::KERNEL)
+ {
+ assert(kernelMap.find(cmd) != kernelMap.end());
+ clReleaseKernel(kernelMap[cmd]);
+ kernelMap.erase(cmd);
+ delete ((Queue::KernelCommand*)cmd)->kernel;
+ }
+
+ // Remove event from map
+ cl_event event = eventMap[cmd];
+ eventMap.erase(cmd);
+
+ // Perform callbacks
+ list< pair<void (CL_CALLBACK *)(cl_event, cl_int, void *),
+ void*> >::iterator callItr;
+ for (callItr = event->callbacks.begin();
+ callItr != event->callbacks.end();
+ callItr++)
+ {
+ callItr->first(event, event->event->state, callItr->second);
+ }
+
+ // Release events
+ list<cl_event>::iterator waitItr;
+ for (waitItr = waitListMap[cmd].begin();
+ waitItr != waitListMap[cmd].end();
+ waitItr++)
+ {
+ clReleaseEvent(*waitItr);
+ }
+ waitListMap.erase(cmd);
+ clReleaseEvent(event);
+}
diff --git a/src/runtime/async_queue.h b/src/runtime/async_queue.h
new file mode 100644
index 0000000..5ff4f4a
--- /dev/null
+++ b/src/runtime/async_queue.h
@@ -0,0 +1,21 @@
+// async_queue.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include "icd.h"
+
+#include "core/Queue.h"
+
+extern void asyncEnqueue(cl_command_queue queue,
+ cl_command_type type,
+ oclgrind::Queue::Command *cmd,
+ cl_uint numEvents,
+ const cl_event *waitList,
+ cl_event *eventOut);
+extern void asyncQueueRetain(oclgrind::Queue::Command *cmd, cl_mem mem);
+extern void asyncQueueRetain(oclgrind::Queue::Command *cmd, cl_kernel);
+extern void asyncQueueRelease(oclgrind::Queue::Command *cmd);
diff --git a/src/runtime/icd.def b/src/runtime/icd.def
new file mode 100644
index 0000000..7e017c6
--- /dev/null
+++ b/src/runtime/icd.def
@@ -0,0 +1,5 @@
+EXPORTS
+
+; Make ICD initialisation functions visible
+clGetExtensionFunctionAddress
+clIcdGetPlatformIDsKHR
diff --git a/src/runtime/icd.h b/src/runtime/icd.h
new file mode 100644
index 0000000..7059cf9
--- /dev/null
+++ b/src/runtime/icd.h
@@ -0,0 +1,235 @@
+// icd.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#ifndef _ICD_H_
+#define _ICD_H_
+
+// Rename OpenCL API functions to avoid clashes with ICD library
+#ifdef OCLGRIND_ICD
+#define clGetPlatformIDs _clGetPlatformIDs
+#define clGetPlatformInfo _clGetPlatformInfo
+#define clGetDeviceIDs _clGetDeviceIDs
+#define clGetDeviceInfo _clGetDeviceInfo
+#define clCreateSubDevices _clCreateSubDevices
+#define clRetainDevice _clRetainDevice
+#define clReleaseDevice _clReleaseDevice
+#define clCreateContext _clCreateContext
+#define clCreateContextFromType _clCreateContextFromType
+#define clRetainContext _clRetainContext
+#define clReleaseContext _clReleaseContext
+#define clGetContextInfo _clGetContextInfo
+#define clCreateCommandQueue _clCreateCommandQueue
+#define clSetCommandQueueProperty _clSetCommandQueueProperty
+#define clRetainCommandQueue _clRetainCommandQueue
+#define clReleaseCommandQueue _clReleaseCommandQueue
+#define clGetCommandQueueInfo _clGetCommandQueueInfo
+#define clCreateBuffer _clCreateBuffer
+#define clCreateSubBuffer _clCreateSubBuffer
+#define clCreateImage _clCreateImage
+#define clCreateImage2D _clCreateImage2D
+#define clCreateImage3D _clCreateImage3D
+#define clRetainMemObject _clRetainMemObject
+#define clReleaseMemObject _clReleaseMemObject
+#define clGetSupportedImageFormats _clGetSupportedImageFormats
+#define clGetMemObjectInfo _clGetMemObjectInfo
+#define clGetImageInfo _clGetImageInfo
+#define clSetMemObjectDestructorCallback _clSetMemObjectDestructorCallback
+#define clCreateSampler _clCreateSampler
+#define clRetainSampler _clRetainSampler
+#define clReleaseSampler _clReleaseSampler
+#define clGetSamplerInfo _clGetSamplerInfo
+#define clCreateProgramWithSource _clCreateProgramWithSource
+#define clCreateProgramWithBinary _clCreateProgramWithBinary
+#define clCreateProgramWithBuiltInKernels _clCreateProgramWithBuiltInKernels
+#define clRetainProgram _clRetainProgram
+#define clReleaseProgram _clReleaseProgram
+#define clBuildProgram _clBuildProgram
+#define clUnloadCompiler _clUnloadCompiler
+#define clCompileProgram _clCompileProgram
+#define clLinkProgram _clLinkProgram
+#define clUnloadPlatformCompiler _clUnloadPlatformCompiler
+#define clGetProgramInfo _clGetProgramInfo
+#define clGetProgramBuildInfo _clGetProgramBuildInfo
+#define clCreateKernel _clCreateKernel
+#define clCreateKernelsInProgram _clCreateKernelsInProgram
+#define clRetainKernel _clRetainKernel
+#define clReleaseKernel _clReleaseKernel
+#define clSetKernelArg _clSetKernelArg
+#define clGetKernelInfo _clGetKernelInfo
+#define clGetKernelArgInfo _clGetKernelArgInfo
+#define clGetKernelWorkGroupInfo _clGetKernelWorkGroupInfo
+#define clWaitForEvents _clWaitForEvents
+#define clGetEventInfo _clGetEventInfo
+#define clCreateUserEvent _clCreateUserEvent
+#define clRetainEvent _clRetainEvent
+#define clReleaseEvent _clReleaseEvent
+#define clSetUserEventStatus _clSetUserEventStatus
+#define clSetEventCallback _clSetEventCallback
+#define clGetEventProfilingInfo _clGetEventProfilingInfo
+#define clFlush _clFlush
+#define clFinish _clFinish
+#define clEnqueueReadBuffer _clEnqueueReadBuffer
+#define clEnqueueReadBufferRect _clEnqueueReadBufferRect
+#define clEnqueueWriteBuffer _clEnqueueWriteBuffer
+#define clEnqueueWriteBufferRect _clEnqueueWriteBufferRect
+#define clEnqueueCopyBuffer _clEnqueueCopyBuffer
+#define clEnqueueCopyBufferRect _clEnqueueCopyBufferRect
+#define clEnqueueFillBuffer _clEnqueueFillBuffer
+#define clEnqueueFillImage _clEnqueueFillImage
+#define clEnqueueReadImage _clEnqueueReadImage
+#define clEnqueueWriteImage _clEnqueueWriteImage
+#define clEnqueueCopyImage _clEnqueueCopyImage
+#define clEnqueueCopyImageToBuffer _clEnqueueCopyImageToBuffer
+#define clEnqueueCopyBufferToImage _clEnqueueCopyBufferToImage
+#define clEnqueueMapBuffer _clEnqueueMapBuffer
+#define clEnqueueMapImage _clEnqueueMapImage
+#define clEnqueueUnmapMemObject _clEnqueueUnmapMemObject
+#define clEnqueueMigrateMemObjects _clEnqueueMigrateMemObjects
+#define clEnqueueNDRangeKernel _clEnqueueNDRangeKernel
+#define clEnqueueTask _clEnqueueTask
+#define clEnqueueNativeKernel _clEnqueueNativeKernel
+#define clGetExtensionFunctionAddressForPlatform _clGetExtensionFunctionAddressForPlatform
+#define clEnqueueMarkerWithWaitList _clEnqueueMarkerWithWaitList
+#define clEnqueueBarrierWithWaitList _clEnqueueBarrierWithWaitList
+#define clSetPrintfCallback _clSetPrintfCallback
+#define clEnqueueMarker _clEnqueueMarker
+#define clEnqueueWaitForEvents _clEnqueueWaitForEvents
+#define clEnqueueBarrier _clEnqueueBarrier
+#define clCreateFromGLBuffer _clCreateFromGLBuffer
+#define clCreateFromGLTexture _clCreateFromGLTexture
+#define clCreateFromGLTexture2D _clCreateFromGLTexture2D
+#define clCreateFromGLTexture3D _clCreateFromGLTexture3D
+#define clCreateFromGLRenderbuffer _clCreateFromGLRenderbuffer
+#define clGetGLObjectInfo _clGetGLObjectInfo
+#define clGetGLTextureInfo _clGetGLTextureInfo
+#define clEnqueueAcquireGLObjects _clEnqueueAcquireGLObjects
+#define clEnqueueReleaseGLObjects _clEnqueueReleaseGLObjects
+#define clGetGLContextInfoKHR _clGetGLContextInfoKHR
+#define clCreateEventFromGLsyncKHR _clCreateEventFromGLsyncKHR
+#endif // OCLGRIND_ICD
+
+#include <list>
+#include <map>
+#include <stack>
+#include <stdint.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_gl.h"
+#include "CL/cl_gl_ext.h"
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include "CL/cl_d3d11.h"
+#include "CL/cl_d3d10.h"
+#include "CL/cl_dx9_media_sharing.h"
+#endif
+
+namespace oclgrind
+{
+ class Context;
+ class Kernel;
+ class Program;
+ class Queue;
+ struct Event;
+}
+
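+// Definitions of the opaque handle types exposed through the OpenCL API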
+struct _cl_platform_id
+{
+ void *dispatch;
+};
+
+struct _cl_device_id
+{
+ void **dispatch;
+};
+
+struct _cl_context
+{
+ void *dispatch;
+ oclgrind::Context *context;
+ void (CL_CALLBACK *notify)(const char *, const void *, size_t, void *);
+ void *data;
+ cl_context_properties *properties;
+ size_t szProperties;
+ unsigned int refCount;
+};
+
+struct _cl_command_queue
+{
+ void *dispatch;
+ cl_command_queue_properties properties;
+ cl_context context;
+ oclgrind::Queue *queue;
+ unsigned int refCount;
+};
+
+struct _cl_mem
+{
+ void *dispatch;
+ cl_context context;
+ cl_mem parent;
+ size_t address;
+ size_t size;
+ size_t offset;
+ cl_mem_flags flags;
+ bool isImage;
+ void *hostPtr;
+ std::stack< std::pair<void (CL_CALLBACK*)(cl_mem, void *), void*> > callbacks;
+ unsigned int refCount;
+};
+
+struct cl_image : _cl_mem
+{
+ cl_image_format format;
+ cl_image_desc desc;
+};
+
+struct _cl_program
+{
+ void *dispatch;
+ oclgrind::Program *program;
+ cl_context context;
+ unsigned int refCount;
+};
+
+struct _cl_kernel
+{
+ void *dispatch;
+ oclgrind::Kernel *kernel;
+ cl_program program;
+ std::map<cl_uint, cl_mem> memArgs;
+ unsigned int refCount;
+};
+
+struct _cl_event
+{
+ void *dispatch;
+ cl_context context;
+ cl_command_queue queue;
+ cl_command_type type;
+ oclgrind::Event *event;
+ std::list< std::pair<void (CL_CALLBACK*)(cl_event, cl_int, void*), void*> > callbacks;
+ unsigned int refCount;
+};
+
+struct _cl_sampler
+{
+ void *dispatch;
+ cl_context context;
+ cl_bool normCoords;
+ cl_addressing_mode addressMode;
+ cl_filter_mode filterMode;
+ uint32_t sampler;
+ unsigned int refCount;
+};
+
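+// Shared ICD dispatch table installed in the objects created by this
+// implementation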
+extern void *m_dispatchTable[256];
+
+#endif // _ICD_H_
diff --git a/src/runtime/oclgrind b/src/runtime/oclgrind
new file mode 100755
index 0000000..4925be4
--- /dev/null
+++ b/src/runtime/oclgrind
@@ -0,0 +1,145 @@
+#!/bin/bash
+# oclgrind (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+function usage
+{
+ echo "Usage: "
+ echo " oclgrind [OPTIONS] COMMAND"
+ echo " oclgrind [--help | --version]"
+ echo
+ echo "Options:"
+ echo -n " --build-options OPTIONS "
+ echo "Additional options to pass to the OpenCL compiler"
+ echo -n " --check-api "
+  echo "Report errors on API calls"
+ echo -n " --data-races "
+ echo "Enable data-race detection"
+ echo -n " --disable-pch "
+ echo "Don't use precompiled headers"
+ echo -n " --dump-spir "
+ echo "Dump SPIR to /tmp/oclgrind_*.{ll,bc}"
+ echo -n " -h --help "
+ echo "Display usage information"
+ echo -n " --inst-counts "
+ echo "Output histograms of instructions executed"
+ echo -n " -i --interactive "
+ echo "Enable interactive mode"
+ echo -n " --log LOGFILE "
+ echo "Redirect log/error messages to a file"
+ echo -n " --max-errors NUM "
+ echo "Limit the number of error/warning messages"
+ echo -n " --num-threads NUM "
+ echo "Set the number of worker threads to use"
+ echo -n " --pch-dir DIR "
+ echo "Override directory containing precompiled headers"
+ echo -n " --plugins PLUGINS "
+  echo "Load colon-separated list of plugin libraries"
+ echo -n " -q --quick "
+ echo "Only run first and last work-group"
+ echo -n " --uniform-writes "
+ echo "Don't suppress uniform write-write data-races"
+ echo -n " -v --version "
+ echo "Display version information"
+ echo
+ echo "For more information, please visit the Oclgrind wiki page:"
+ echo "-> https://github.com/jrprice/Oclgrind/wiki"
+ echo
+}
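+
+# Example invocations (illustrative only; "./myapp" is a placeholder for any
+# OpenCL host program):
+#   oclgrind ./myapp                 # run the program on the simulated device
+#   oclgrind --data-races ./myapp    # additionally enable data-race detection
+#   oclgrind -i ./myapp              # run under the interactive debugger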
+
+# Parse arguments
+while [ $# -gt 0 -a "${1:0:1}" == "-" ]
+do
+ if [ "$1" == "--build-options" ]
+ then
+ shift
+ export OCLGRIND_BUILD_OPTIONS="$1"
+ elif [ "$1" == "--check-api" ]
+ then
+ export OCLGRIND_CHECK_API=1
+ elif [ "$1" == "--data-races" ]
+ then
+ export OCLGRIND_DATA_RACES=1
+ elif [ "$1" == "--disable-pch" ]
+ then
+ export OCLGRIND_DISABLE_PCH=1
+ elif [ "$1" == "--dump-spir" ]
+ then
+ export OCLGRIND_DUMP_SPIR=1
+ elif [ "$1" == "-h" -o "$1" == "--help" ]
+ then
+ usage
+ exit 0
+ elif [ "$1" == "--inst-counts" ]
+ then
+ export OCLGRIND_INST_COUNTS=1
+ elif [ "$1" == "-i" -o "$1" == "--interactive" ]
+ then
+ export OCLGRIND_INTERACTIVE=1
+ elif [ "$1" == "--log" ]
+ then
+ shift
+ export OCLGRIND_LOG="$1"
+ elif [ "$1" == "--max-errors" ]
+ then
+ shift
+ export OCLGRIND_MAX_ERRORS="$1"
+ elif [ "$1" == "--num-threads" ]
+ then
+ shift
+ export OCLGRIND_NUM_THREADS="$1"
+ elif [ "$1" == "--pch-dir" ]
+ then
+ shift
+ export OCLGRIND_PCH_DIR="$1"
+ elif [ "$1" == "--plugins" ]
+ then
+ shift
+ export OCLGRIND_PLUGINS="$1"
+ elif [ "$1" == "-q" -o "$1" == "--quick" ]
+ then
+ export OCLGRIND_QUICK=1
+ elif [ "$1" == "--uniform-writes" ]
+ then
+ export OCLGRIND_UNIFORM_WRITES=1
+ elif [ "$1" == "-v" -o "$1" == "--version" ]
+ then
+ echo
+ echo "Oclgrind __VERSION__"
+ echo
+ echo "Copyright (c) 2013-2015"
+ echo "James Price and Simon McIntosh-Smith, University of Bristol"
+ echo "https://github.com/jrprice/Oclgrind"
+ echo
+ exit 0
+ else
+ echo "Unrecognized argument '$1'"
+ usage
+ exit 1
+ fi
+ shift
+done
+
+# Ensure target command supplied
+if [ $# -lt 1 ]
+then
+ usage
+ exit 1
+fi
+
+# Inject liboclgrind.{so,dylib} and run command
+LIBDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../lib"
+if [ "$(uname -s)" == "Darwin" ]
+then
+ DYLD_LIBRARY_PATH=$LIBDIR:$DYLD_LIBRARY_PATH \
+ DYLD_INSERT_LIBRARIES=$LIBDIR/liboclgrind-rt.dylib \
+ DYLD_FORCE_FLAT_NAMESPACE=1 "$@"
+else
+ LD_LIBRARY_PATH=$LIBDIR:$LD_LIBRARY_PATH \
+ LD_PRELOAD=$LIBDIR/liboclgrind-rt.so "$@"
+fi
diff --git a/src/runtime/runtime.cpp b/src/runtime/runtime.cpp
new file mode 100644
index 0000000..1cf7338
--- /dev/null
+++ b/src/runtime/runtime.cpp
@@ -0,0 +1,5594 @@
+// runtime.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+//
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+#include "async_queue.h"
+#include "icd.h"
+
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/half.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "core/Queue.h"
+
+using namespace std;
+
+#define MAX_GLOBAL_MEM_SIZE (128 * 1048576)
+#define MAX_CONSTANT_BUFFER_SIZE (1048576)
+#define MAX_LOCAL_MEM_SIZE (32768)
+#define MAX_WI_SIZE (65536)
+
+#define PLATFORM_NAME "Oclgrind"
+#define PLATFORM_VENDOR "University of Bristol"
+#define PLATFORM_VERSION "OpenCL 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define PLATFORM_PROFILE "FULL_PROFILE"
+#define PLATFORM_SUFFIX "oclg"
+#define PLATFORM_EXTENSIONS "cl_khr_icd"
+
+#define DEVICE_NAME "Oclgrind Simulator"
+#define DEVICE_VENDOR "University of Bristol"
+#define DEVICE_VENDOR_ID 0x0042
+#define DEVICE_VERSION "OpenCL 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define DEVICE_LANG_VERSION "OpenCL C 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define DRIVER_VERSION "Oclgrind " PACKAGE_VERSION
+#define DEVICE_PROFILE "FULL_PROFILE"
+#define DEVICE_SPIR_VERSIONS "1.2"
+#define DEVICE_EXTENSIONS " \
+ cl_khr_spir \
+ cl_khr_3d_image_writes \
+ cl_khr_global_int32_base_atomics \
+ cl_khr_global_int32_extended_atomics \
+ cl_khr_local_int32_base_atomics \
+ cl_khr_local_int32_extended_atomics \
+ cl_khr_byte_addressable_store \
+ cl_khr_fp64"
+
+
+namespace
+{
+#define CASE(X) case X: return #X;
+ const char* CLErrorToString(cl_int err)
+ {
+ switch (err)
+ {
+ CASE(CL_SUCCESS)
+ CASE(CL_DEVICE_NOT_FOUND)
+ CASE(CL_DEVICE_NOT_AVAILABLE)
+ CASE(CL_COMPILER_NOT_AVAILABLE)
+ CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE)
+ CASE(CL_OUT_OF_RESOURCES)
+ CASE(CL_OUT_OF_HOST_MEMORY)
+ CASE(CL_PROFILING_INFO_NOT_AVAILABLE)
+ CASE(CL_MEM_COPY_OVERLAP)
+ CASE(CL_IMAGE_FORMAT_MISMATCH)
+ CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED)
+ CASE(CL_BUILD_PROGRAM_FAILURE)
+ CASE(CL_MAP_FAILURE)
+ CASE(CL_MISALIGNED_SUB_BUFFER_OFFSET)
+ CASE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
+ CASE(CL_COMPILE_PROGRAM_FAILURE)
+ CASE(CL_LINKER_NOT_AVAILABLE)
+ CASE(CL_LINK_PROGRAM_FAILURE)
+ CASE(CL_DEVICE_PARTITION_FAILED)
+ CASE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
+ CASE(CL_INVALID_VALUE)
+ CASE(CL_INVALID_DEVICE_TYPE)
+ CASE(CL_INVALID_PLATFORM)
+ CASE(CL_INVALID_DEVICE)
+ CASE(CL_INVALID_CONTEXT)
+ CASE(CL_INVALID_QUEUE_PROPERTIES)
+ CASE(CL_INVALID_COMMAND_QUEUE)
+ CASE(CL_INVALID_HOST_PTR)
+ CASE(CL_INVALID_MEM_OBJECT)
+ CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
+ CASE(CL_INVALID_IMAGE_SIZE)
+ CASE(CL_INVALID_SAMPLER)
+ CASE(CL_INVALID_BINARY)
+ CASE(CL_INVALID_BUILD_OPTIONS)
+ CASE(CL_INVALID_PROGRAM)
+ CASE(CL_INVALID_PROGRAM_EXECUTABLE)
+ CASE(CL_INVALID_KERNEL_NAME)
+ CASE(CL_INVALID_KERNEL_DEFINITION)
+ CASE(CL_INVALID_KERNEL)
+ CASE(CL_INVALID_ARG_INDEX)
+ CASE(CL_INVALID_ARG_VALUE)
+ CASE(CL_INVALID_ARG_SIZE)
+ CASE(CL_INVALID_KERNEL_ARGS)
+ CASE(CL_INVALID_WORK_DIMENSION)
+ CASE(CL_INVALID_WORK_GROUP_SIZE)
+ CASE(CL_INVALID_WORK_ITEM_SIZE)
+ CASE(CL_INVALID_GLOBAL_OFFSET)
+ CASE(CL_INVALID_EVENT_WAIT_LIST)
+ CASE(CL_INVALID_EVENT)
+ CASE(CL_INVALID_OPERATION)
+ CASE(CL_INVALID_GL_OBJECT)
+ CASE(CL_INVALID_BUFFER_SIZE)
+ CASE(CL_INVALID_MIP_LEVEL)
+ CASE(CL_INVALID_GLOBAL_WORK_SIZE)
+ CASE(CL_INVALID_PROPERTY)
+ CASE(CL_INVALID_IMAGE_DESCRIPTOR)
+ CASE(CL_INVALID_COMPILER_OPTIONS)
+ CASE(CL_INVALID_LINKER_OPTIONS)
+ CASE(CL_INVALID_DEVICE_PARTITION_COUNT)
+ }
+ return "Unknown";
+ }
+#undef CASE
+
+ void notifyAPIError(cl_context context, cl_int err,
+ const char* function, string info = "")
+ {
+ // Remove leading underscore from function name if necessary
+ if (!strncmp(function, "_cl", 3))
+ {
+ function++;
+ }
+
+ // Build error message
+ ostringstream oss;
+ oss << endl
+ << "Oclgrind - OpenCL runtime error detected" << endl
+ << "\tFunction: " << function << endl
+ << "\tError: " << CLErrorToString(err) << endl;
+ if (!info.empty())
+ {
+ oss << "\t" << info << endl;
+ }
+ string error = oss.str();
+
+ // Output message to stderr if required
+ const char *checkAPI = getenv("OCLGRIND_CHECK_API");
+ if (checkAPI && strcmp(checkAPI, "1") == 0)
+ {
+ cerr << error << endl;
+ }
+
+ // Fire context callback if set
+ if (context && context->notify)
+ {
+      context->notify(error.c_str(), NULL, 0, context->data);
+ }
+ }
+}
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#define __func__ __FUNCTION__
+#endif
+
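+// Helper macros: report an API error via notifyAPIError, then either return
+// the error code (ReturnError*) or store it in errcode_ret (SetError*)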
+#define ReturnErrorInfo(context, err, info) \
+{ \
+ ostringstream oss; \
+ oss << info; \
+ notifyAPIError(context, err, __func__, oss.str()); \
+ return err; \
+}
+#define ReturnErrorArg(context, err, arg) \
+ ReturnErrorInfo(context, err, "For argument '" #arg "'")
+#define ReturnError(context, err) \
+ ReturnErrorInfo(context, err, "")
+
+#define SetErrorInfo(context, err, info) \
+ if (err != CL_SUCCESS) \
+ { \
+ ostringstream oss; \
+ oss << info; \
+ notifyAPIError(context, err, __func__, oss.str()); \
+ } \
+ if (errcode_ret) \
+ { \
+ *errcode_ret = err; \
+ }
+#define SetErrorArg(context, err, arg) \
+ SetErrorInfo(context, err, "For argument '" #arg "'")
+#define SetError(context, err) \
+ SetErrorInfo(context, err, "")
+
+#define ParamValueSizeTooSmall \
+ "param_value_size is " << param_value_size << \
+ ", but result requires " << result_size << " bytes"
+
+
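+// The single platform and device instance exposed by this implementation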
+static struct _cl_platform_id *m_platform = NULL;
+static struct _cl_device_id *m_device = NULL;
+
+CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR
+(
+ cl_uint num_entries,
+ cl_platform_id *platforms,
+ cl_uint *num_platforms
+)
+{
+ if (!m_platform)
+ {
+ m_platform = new _cl_platform_id;
+ m_platform->dispatch = m_dispatchTable;
+
+ m_device = new _cl_device_id;
+ m_device->dispatch = m_dispatchTable;
+ }
+
+ if (num_entries > 0)
+ {
+ platforms[0] = m_platform;
+ }
+
+ if (num_platforms)
+ {
+ *num_platforms = 1;
+ }
+
+ return CL_SUCCESS;
+}
+
+////////////////////////////////////
+// OpenCL Runtime API Definitions //
+////////////////////////////////////
+
+#ifndef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#endif
+
+#ifndef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#endif
+
+CL_API_ENTRY void* CL_API_CALL
+clGetExtensionFunctionAddress
+(
+ const char * funcname
+) CL_API_SUFFIX__VERSION_1_2
+{
+ if (strcmp(funcname, "clIcdGetPlatformIDsKHR") == 0)
+ {
+ return (void*)clIcdGetPlatformIDsKHR;
+ }
+ else
+ {
+ return NULL;
+ }
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs
+(
+ cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms
+) CL_API_SUFFIX__VERSION_1_0
+{
+ return clIcdGetPlatformIDsKHR(num_entries, platforms, num_platforms);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo
+(
+ cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Select platform info string
+ const char *result = NULL;
+ switch(param_name)
+ {
+ case CL_PLATFORM_PROFILE:
+ result = PLATFORM_PROFILE;
+ break;
+ case CL_PLATFORM_VERSION:
+ result = PLATFORM_VERSION;
+ break;
+ case CL_PLATFORM_NAME:
+ result = PLATFORM_NAME;
+ break;
+ case CL_PLATFORM_VENDOR:
+ result = PLATFORM_VENDOR;
+ break;
+ case CL_PLATFORM_EXTENSIONS:
+ result = PLATFORM_EXTENSIONS;
+ break;
+ case CL_PLATFORM_ICD_SUFFIX_KHR:
+ result = PLATFORM_SUFFIX;
+ break;
+ default:
+ ReturnErrorArg(NULL, CL_INVALID_VALUE, param_name);
+ }
+
+ // Compute size of result
+ size_t result_size = strlen(result) + 1;
+ if (param_value_size_ret)
+ {
+ *param_value_size_ret = result_size;
+ }
+
+ // Return result
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(NULL, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, result, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs
+(
+ cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (devices && num_entries < 1)
+ {
+ ReturnError(NULL, CL_INVALID_VALUE);
+ }
+
+ if (device_type != CL_DEVICE_TYPE_CPU &&
+ device_type != CL_DEVICE_TYPE_DEFAULT &&
+ device_type != CL_DEVICE_TYPE_ALL)
+ {
+ ReturnError(NULL, CL_DEVICE_NOT_FOUND);
+ }
+
+ if (devices)
+ {
+ *devices = m_device;
+ }
+
+ if (num_devices)
+ {
+ *num_devices = 1;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo
+(
+ cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check device is valid
+ if (device != m_device)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_DEVICE, device);
+ }
+
+ size_t dummy;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ // All possible return types
+ union
+ {
+ cl_uint cluint;
+ size_t sizet;
+ size_t sizet3[3];
+ cl_ulong clulong;
+ cl_bool clbool;
+ cl_device_id cldeviceid;
+ cl_device_type cldevicetype;
+ cl_device_fp_config devicefpconfig;
+ cl_device_mem_cache_type devicememcachetype;
+ cl_device_local_mem_type devicelocalmemtype;
+ cl_device_exec_capabilities cldevexeccap;
+ cl_command_queue_properties clcmdqprop;
+ cl_platform_id clplatid;
+ cl_device_partition_property cldevpartprop;
+ cl_device_affinity_domain cldevaffdom;
+ } result_data;
+ // The result is actually a string that needs copying
+ const char* str = 0;
+
+ switch (param_name)
+ {
+ case CL_DEVICE_TYPE:
+ result_size = sizeof(cl_device_type);
+ result_data.cldevicetype = CL_DEVICE_TYPE_CPU;
+ break;
+ case CL_DEVICE_VENDOR_ID:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = DEVICE_VENDOR_ID;
+ break;
+ case CL_DEVICE_MAX_COMPUTE_UNITS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 3;
+ break;
+ case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = MAX_WI_SIZE;
+ break;
+ case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+ result_size = 3*sizeof(size_t);
+ result_data.sizet3[0] = MAX_WI_SIZE;
+ result_data.sizet3[1] = MAX_WI_SIZE;
+ result_data.sizet3[2] = MAX_WI_SIZE;
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_ADDRESS_BITS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = sizeof(size_t)<<3;
+ break;
+ case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 128;
+ break;
+ case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 8;
+ break;
+ case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+ break;
+ case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+ case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+ result_size = sizeof(size_t);
+ result_data.sizet = 8192;
+ break;
+ case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+ case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+ case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+ result_size = sizeof(size_t);
+ result_data.sizet = 2048;
+ break;
+ case CL_DEVICE_IMAGE_SUPPORT:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_MAX_PARAMETER_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = 1024;
+ break;
+ case CL_DEVICE_MAX_SAMPLERS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 16;
+ break;
+ case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = sizeof(cl_long16)<<3;
+ break;
+ case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_SINGLE_FP_CONFIG:
+ result_size = sizeof(cl_device_fp_config);
+ result_data.devicefpconfig =
+ CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN | CL_FP_DENORM;
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+ result_size = sizeof(cl_device_mem_cache_type);
+ result_data.devicememcachetype = CL_NONE;
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = 0;
+ break;
+ case CL_DEVICE_GLOBAL_MEM_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+ break;
+ case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = MAX_CONSTANT_BUFFER_SIZE;
+ break;
+ case CL_DEVICE_MAX_CONSTANT_ARGS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1024;
+ break;
+ case CL_DEVICE_LOCAL_MEM_TYPE:
+ result_size = sizeof(cl_device_local_mem_type);
+ result_data.devicelocalmemtype = CL_LOCAL;
+ break;
+ case CL_DEVICE_LOCAL_MEM_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = MAX_LOCAL_MEM_SIZE;
+ break;
+ case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_FALSE;
+ break;
+ case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+ result_size = sizeof(size_t);
+ result_data.sizet = 1000;
+ break;
+ case CL_DEVICE_ENDIAN_LITTLE:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_AVAILABLE:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_COMPILER_AVAILABLE:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_EXECUTION_CAPABILITIES:
+ result_size = sizeof(cl_device_exec_capabilities);
+ result_data.cldevexeccap = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;
+ break;
+ case CL_DEVICE_QUEUE_PROPERTIES:
+ result_size = sizeof(cl_command_queue_properties);
+ result_data.clcmdqprop = CL_QUEUE_PROFILING_ENABLE;
+ break;
+ case CL_DEVICE_NAME:
+ result_size = sizeof(DEVICE_NAME);
+ str = DEVICE_NAME;
+ break;
+ case CL_DEVICE_VENDOR:
+ result_size = sizeof(DEVICE_VENDOR);
+ str = DEVICE_VENDOR;
+ break;
+ case CL_DRIVER_VERSION:
+ result_size = sizeof(DRIVER_VERSION);
+ str = DRIVER_VERSION;
+ break;
+ case CL_DEVICE_PROFILE:
+ result_size = sizeof(DEVICE_PROFILE);
+ str = DEVICE_PROFILE;
+ break;
+ case CL_DEVICE_VERSION:
+ result_size = sizeof(DEVICE_VERSION);
+ str = DEVICE_VERSION;
+ break;
+ case CL_DEVICE_EXTENSIONS:
+ result_size = sizeof(DEVICE_EXTENSIONS);
+ str = DEVICE_EXTENSIONS;
+ break;
+ case CL_DEVICE_PLATFORM:
+ result_size = sizeof(cl_platform_id);
+ result_data.clplatid = m_platform;
+ break;
+ case CL_DEVICE_DOUBLE_FP_CONFIG:
+ result_size = sizeof(cl_device_fp_config);
+ result_data.devicefpconfig =
+ CL_FP_FMA | CL_FP_ROUND_TO_NEAREST |
+ CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF |
+ CL_FP_INF_NAN | CL_FP_DENORM;
+ break;
+ case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_DEVICE_HOST_UNIFIED_MEMORY:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_FALSE;
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_DEVICE_OPENCL_C_VERSION:
+ result_size = sizeof(DEVICE_LANG_VERSION);
+ str = DEVICE_LANG_VERSION;
+ break;
+ case CL_DEVICE_LINKER_AVAILABLE:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_BUILT_IN_KERNELS:
+ result_size = 1;
+ str = "";
+ break;
+ case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = 65536;
+ break;
+ case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = 2048;
+ break;
+ case CL_DEVICE_PARENT_DEVICE:
+ result_size = sizeof(cl_device_id);
+ result_data.cldeviceid = NULL;
+ break;
+ case CL_DEVICE_PARTITION_MAX_SUB_DEVICES:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_DEVICE_PARTITION_PROPERTIES:
+ case CL_DEVICE_PARTITION_TYPE:
+ result_size = sizeof(cl_device_partition_property);
+ result_data.cldevpartprop = 0;
+ break;
+ case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
+ result_size = sizeof(cl_device_affinity_domain);
+ result_data.cldevaffdom = 0;
+ break;
+ case CL_DEVICE_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = CL_TRUE;
+ break;
+ case CL_DEVICE_PRINTF_BUFFER_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = 1024;
+ break;
+ case CL_DEVICE_SPIR_VERSIONS:
+ result_size = sizeof(DEVICE_SPIR_VERSIONS);
+ str = DEVICE_SPIR_VERSIONS;
+ break;
+ default:
+ ReturnErrorArg(NULL, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(NULL, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ const void* src = str ? (const void*)str : (const void*)&result_data;
+ memcpy(param_value, src, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices
+(
+ cl_device_id in_device,
+ const cl_device_partition_property * properties,
+ cl_uint num_entries,
+ cl_device_id * out_devices,
+ cl_uint * num_devices
+) CL_API_SUFFIX__VERSION_1_2
+{
+ ReturnErrorInfo(NULL, CL_INVALID_VALUE, "Not yet implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice
+(
+ cl_device_id device
+) CL_API_SUFFIX__VERSION_1_2
+{
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice
+(
+ cl_device_id device
+) CL_API_SUFFIX__VERSION_1_2
+{
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext
+(
+ const cl_context_properties * properties,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ void (CL_CALLBACK * pfn_notify)(const char *,
+ const void *,
+ size_t,
+ void *),
+ void * user_data,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (num_devices != 1)
+ {
+ SetErrorArg(NULL, CL_INVALID_VALUE, num_devices);
+ return NULL;
+ }
+ if (!devices)
+ {
+ SetErrorArg(NULL, CL_INVALID_VALUE, devices);
+ return NULL;
+ }
+ if (devices[0] != m_device)
+ {
+ SetError(NULL, CL_INVALID_DEVICE);
+ return NULL;
+ }
+ if (!pfn_notify && user_data)
+ {
+ SetErrorInfo(NULL, CL_INVALID_VALUE,
+ "pfn_notify NULL but user_data non-NULL");
+ return NULL;
+ }
+
+ // Create context object
+ cl_context context = new _cl_context;
+ context->dispatch = m_dispatchTable;
+ context->context = new oclgrind::Context();
+ context->notify = pfn_notify;
+ context->data = user_data;
+ context->properties = NULL;
+ context->szProperties = 0;
+ context->refCount = 1;
+
+ if (properties)
+ {
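+ // properties is a zero-terminated list; copy it verbatim, including the
+ // terminator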
+ int num = 0;
+ while (properties[num])
+ {
+ num++;
+ }
+ size_t sz = (num+1)*sizeof(cl_context_properties);
+ context->szProperties = sz;
+ context->properties = (cl_context_properties*)malloc(sz);
+ memcpy(context->properties, properties, sz);
+ }
+
+ SetError(NULL, CL_SUCCESS);
+ return context;
+}
+
+CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType
+(
+ const cl_context_properties * properties,
+ cl_device_type device_type,
+ void (CL_CALLBACK * pfn_notify)(const char *,
+ const void *,
+ size_t,
+ void *),
+ void * user_data,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!pfn_notify && user_data)
+ {
+ SetErrorInfo(NULL, CL_INVALID_VALUE,
+ "pfn_notify NULL but user_data non-NULL");
+ return NULL;
+ }
+ if (device_type != CL_DEVICE_TYPE_CPU &&
+ device_type != CL_DEVICE_TYPE_DEFAULT &&
+ device_type != CL_DEVICE_TYPE_ALL)
+ {
+ SetErrorArg(NULL, CL_DEVICE_NOT_FOUND, device_type);
+ return NULL;
+ }
+
+ // Create context object
+ cl_context context = new _cl_context;
+ context->dispatch = m_dispatchTable;
+ context->context = new oclgrind::Context();
+ context->notify = pfn_notify;
+ context->data = user_data;
+ context->properties = NULL;
+ context->szProperties = 0;
+ context->refCount = 1;
+
+ if (properties)
+ {
+ int num = 0;
+ while (properties[num])
+ {
+ num++;
+ }
+ size_t sz = (num+1)*sizeof(cl_context_properties);
+ context->szProperties = sz;
+ context->properties = (cl_context_properties*)malloc(sz);
+ memcpy(context->properties, properties, sz);
+ }
+
+ SetError(NULL, CL_SUCCESS);
+ return context;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext
+(
+ cl_context context
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!context)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ }
+
+ context->refCount++;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext
+(
+ cl_context context
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!context)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ }
+
+ if (--context->refCount == 0)
+ {
+ delete context->context;
+ free(context->properties);
+ delete context;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo
+(
+ cl_context context,
+ cl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check context is valid
+ if (!context)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+
+ union
+ {
+ cl_uint cluint;
+ cl_device_id cldevid;
+ } result_data;
+ cl_context_properties* properties = NULL;
+
+ switch (param_name)
+ {
+ case CL_CONTEXT_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = context->refCount;
+ break;
+ case CL_CONTEXT_NUM_DEVICES:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 1;
+ break;
+ case CL_CONTEXT_DEVICES:
+ result_size = sizeof(cl_device_id);
+ result_data.cldevid = m_device;
+ break;
+ case CL_CONTEXT_PROPERTIES:
+ result_size = context->szProperties;
+ properties = context->properties;
+ break;
+ default:
+ ReturnErrorArg(context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ if (properties)
+ memcpy(param_value, properties, result_size);
+ else
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue
+(
+ cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (device != m_device)
+ {
+ SetErrorArg(context, CL_INVALID_DEVICE, device);
+ return NULL;
+ }
+ if (properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+ {
+ SetErrorInfo(context, CL_INVALID_QUEUE_PROPERTIES,
+ "Out-of-order command queues not supported");
+ return NULL;
+ }
+
+ // Create command-queue object
+ cl_command_queue queue;
+ queue = new _cl_command_queue;
+ queue->queue = new oclgrind::Queue(context->context);
+ queue->dispatch = m_dispatchTable;
+ queue->properties = properties;
+ queue->context = context;
+ queue->refCount = 1;
+
+ clRetainContext(context);
+
+ SetError(context, CL_SUCCESS);
+ return queue;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetCommandQueueProperty
+(
+ cl_command_queue command_queue,
+ cl_command_queue_properties properties,
+ cl_bool enable,
+ cl_command_queue_properties * old_properties
+)
+{
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue
+(
+ cl_command_queue command_queue
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ command_queue->refCount++;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue
+(
+ cl_command_queue command_queue
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ if (--command_queue->refCount == 0)
+ {
+ // TODO: Retain/release queue from async thread
+ // TODO: Spec states that this function performs an implicit flush,
+ // so maybe we are OK to delete queue here?
+ clFinish(command_queue);
+ delete command_queue->queue;
+ clReleaseContext(command_queue->context);
+ delete command_queue;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo
+(
+ cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check queue is valid
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+
+ union
+ {
+ cl_uint cluint;
+ cl_context context;
+ cl_device_id cldevid;
+ cl_command_queue_properties properties;
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_QUEUE_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data.context = command_queue->context;
+ break;
+ case CL_QUEUE_DEVICE:
+ result_size = sizeof(cl_device_id);
+ result_data.cldevid = m_device;
+ break;
+ case CL_QUEUE_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = command_queue->refCount;
+ break;
+ case CL_QUEUE_PROPERTIES:
+ result_size = sizeof(cl_command_queue_properties);
+ result_data.properties = command_queue->properties;
+ break;
+ default:
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void * host_ptr,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (size == 0)
+ {
+ SetErrorArg(context, CL_INVALID_BUFFER_SIZE, size);
+ return NULL;
+ }
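+ // host_ptr must be non-NULL exactly when CL_MEM_COPY_HOST_PTR or
+ // CL_MEM_USE_HOST_PTR is specified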
+ if ((host_ptr == NULL) ==
+ ((flags & CL_MEM_COPY_HOST_PTR) ||
+ flags & CL_MEM_USE_HOST_PTR))
+ {
+ SetErrorInfo(context, CL_INVALID_HOST_PTR,
+ "host_ptr NULL but CL_MEM_{COPY,USE}_HOST_PTR used");
+ return NULL;
+ }
+ if ((flags & CL_MEM_USE_HOST_PTR) &&
+ (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)))
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE,
+ "CL_MEM_USE_HOST_PTR cannot be used with "
+ "CL_MEM_{COPY,ALLOC}_HOST_PTR");
+ return NULL;
+ }
+
+ // Create memory object
+ oclgrind::Memory *globalMemory = context->context->getGlobalMemory();
+ cl_mem mem = new _cl_mem;
+ mem->dispatch = m_dispatchTable;
+ mem->context = context;
+ mem->parent = NULL;
+ mem->size = size;
+ mem->offset = 0;
+ mem->flags = flags;
+ mem->isImage = false;
+ mem->refCount = 1;
+ if (flags & CL_MEM_USE_HOST_PTR)
+ {
+ mem->address = globalMemory->createHostBuffer(size, host_ptr, flags);
+ mem->hostPtr = host_ptr;
+ }
+ else
+ {
+ mem->address = globalMemory->allocateBuffer(size, flags);
+ mem->hostPtr = NULL;
+ }
+ if (!mem->address)
+ {
+ SetError(context, CL_MEM_OBJECT_ALLOCATION_FAILURE);
+ delete mem;
+ return NULL;
+ }
+ clRetainContext(context);
+
+ if (flags & CL_MEM_COPY_HOST_PTR)
+ {
+ context->context->getGlobalMemory()->store((const unsigned char*)host_ptr,
+ mem->address, size);
+ }
+
+ SetError(context, CL_SUCCESS);
+ return mem;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer
+(
+ cl_mem buffer,
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!buffer)
+ {
+ SetErrorArg(NULL, CL_INVALID_MEM_OBJECT, buffer);
+ return NULL;
+ }
+ if (buffer->parent)
+ {
+ SetErrorInfo(buffer->context, CL_INVALID_MEM_OBJECT,
+ "Parent buffer cannot be a sub-buffer");
+ return NULL;
+ }
+ if (buffer_create_type != CL_BUFFER_CREATE_TYPE_REGION)
+ {
+ SetErrorArg(buffer->context, CL_INVALID_VALUE, buffer_create_type);
+ return NULL;
+ }
+ if (!buffer_create_info)
+ {
+ SetErrorArg(buffer->context, CL_INVALID_VALUE, buffer_create_info);
+ return NULL;
+ }
+
+ _cl_buffer_region region = *(_cl_buffer_region*)buffer_create_info;
+ if (region.origin + region.size > buffer->size)
+ {
+ SetErrorInfo(buffer->context, CL_INVALID_VALUE,
+ "Region doesn't fit inside parent buffer");
+ return NULL;
+ }
+ if (region.size == 0)
+ {
+ SetErrorInfo(buffer->context, CL_INVALID_VALUE, "Region size cannot be 0");
+ return NULL;
+ }
+
+ // Inherit flags from parent where appropriate
+ cl_mem_flags memFlags = 0;
+ cl_mem_flags rwFlags = CL_MEM_READ_ONLY | CL_MEM_READ_WRITE |
+ CL_MEM_WRITE_ONLY;
+ cl_mem_flags hostAccess = CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY |
+ CL_MEM_HOST_WRITE_ONLY;
+ cl_mem_flags hostPtr = CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+ CL_MEM_COPY_HOST_PTR;
+ if ((flags & rwFlags) == 0)
+ {
+ memFlags |= buffer->flags & rwFlags;
+ }
+ else
+ {
+ memFlags |= flags & rwFlags;
+ }
+ if ((flags & hostAccess) == 0)
+ {
+ memFlags |= buffer->flags & hostAccess;
+ }
+ else
+ {
+ memFlags |= flags & hostAccess;
+ }
+ memFlags |= buffer->flags & hostPtr;
+
+ // Create memory object
+ cl_mem mem = new _cl_mem;
+ mem->dispatch = m_dispatchTable;
+ mem->context = buffer->context;
+ mem->parent = buffer;
+ mem->size = region.size;
+ mem->offset = region.origin;
+ mem->isImage = false;
+ mem->flags = memFlags;
+ mem->hostPtr = (unsigned char*)buffer->hostPtr + region.origin;
+ mem->refCount = 1;
+ mem->address = buffer->address + region.origin;
+ clRetainMemObject(buffer);
+
+ SetError(buffer->context, CL_SUCCESS);
+ return mem;
+}
+
+// Utility function for getting number of dimensions in image
+size_t getNumDimensions(cl_mem_object_type type)
+{
+ switch (type)
+ {
+ case CL_MEM_OBJECT_IMAGE1D:
+ case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+ case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+ return 1;
+ case CL_MEM_OBJECT_IMAGE2D:
+ case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+ return 2;
+ case CL_MEM_OBJECT_IMAGE3D:
+ return 3;
+ default:
+ return 0;
+ }
+}
+
+// Utility function for getting number of channels in an image
+size_t getNumChannels(const cl_image_format *format)
+{
+ switch (format->image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_A:
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ return 1;
+ case CL_RG:
+ case CL_RGx:
+ case CL_RA:
+ return 2;
+ case CL_RGB:
+ case CL_RGBx:
+ return 3;
+ case CL_RGBA:
+ case CL_ARGB:
+ case CL_BGRA:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+// Utility function for computing an image format's pixel size (in bytes)
+size_t getPixelSize(const cl_image_format *format)
+{
+ // Get number of channels
+ size_t numChannels = getNumChannels(format);
+
+ // Get size of each pixel (in bytes)
+ switch (format->image_channel_data_type)
+ {
+ case CL_SNORM_INT8:
+ case CL_UNORM_INT8:
+ case CL_SIGNED_INT8:
+ case CL_UNSIGNED_INT8:
+ return numChannels;
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT16:
+ case CL_SIGNED_INT16:
+ case CL_UNSIGNED_INT16:
+ case CL_HALF_FLOAT:
+ return 2*numChannels;
+ case CL_SIGNED_INT32:
+ case CL_UNSIGNED_INT32:
+ case CL_FLOAT:
+ return 4*numChannels;
+ case CL_UNORM_SHORT_565:
+ case CL_UNORM_SHORT_555:
+ return 2;
+ case CL_UNORM_INT_101010:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+bool isImageArray(cl_mem_object_type type)
+{
+ if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+ type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ {
+ return true;
+ }
+ return false;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage
+(
+ cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ const cl_image_desc * image_desc,
+ void * host_ptr,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (!image_format)
+ {
+ SetErrorArg(context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, image_format);
+ return NULL;
+ }
+ if (!image_desc)
+ {
+ SetErrorArg(context, CL_INVALID_IMAGE_DESCRIPTOR, image_desc);
+ return NULL;
+ }
+
+ // Get size of each pixel (in bytes)
+ size_t pixelSize = getPixelSize(image_format);
+ if (!pixelSize)
+ {
+ SetErrorArg(context, CL_INVALID_VALUE, image_format);
+ return NULL;
+ }
+
+ // Get image dimensions
+ size_t dims = getNumDimensions(image_desc->image_type);
+ size_t width = image_desc->image_width;
+ size_t height = 1, depth = 1;
+ size_t arraySize = 1;
+ if (dims > 1)
+ {
+ height = image_desc->image_height;
+ }
+ if (dims > 2)
+ {
+ depth = image_desc->image_depth;
+ }
+ if (isImageArray(image_desc->image_type))
+ {
+ arraySize = image_desc->image_array_size;
+ }
+
+ // Calculate total size of image
+ size_t size = width * height * depth * arraySize * pixelSize;
+
+ cl_mem mem;
+
+ if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+ {
+ // Use existing buffer
+ if (!image_desc->buffer)
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE,
+ "image_desc->buffer cannot be NULL "
+ "when using CL_MEM_OBJECT_IMAGE1D_BUFFER");
+ return NULL;
+ }
+ mem = image_desc->buffer;
+ clRetainMemObject(image_desc->buffer);
+ }
+ else
+ {
+ // Create buffer
+ // TODO: Use pitches
+ mem = clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
+ if (!mem)
+ {
+ return NULL;
+ }
+ }
+
+ // Create image object wrapper
+ cl_image *image = new cl_image;
+ *(cl_mem)image = *mem;
+ image->isImage = true;
+ image->format = *image_format;
+ image->desc = *image_desc;
+ image->desc.image_width = width;
+ image->desc.image_height = height;
+ image->desc.image_depth = depth;
+ image->desc.image_array_size = arraySize;
+ image->refCount = 1;
+ if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
+ {
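+ // The buffer's state was copied into the image object above, so the
+ // temporary cl_mem wrapper can be freed; the underlying allocation is
+ // still owned by the image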
+ delete mem;
+ }
+
+ SetError(context, CL_SUCCESS);
+ return image;
+}
+
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage2D
+(
+ cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_row_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ image_width,
+ image_height,
+ 1,
+ 1,
+ image_row_pitch,
+ 0,
+ 0,
+ 0,
+ NULL
+ };
+ return clCreateImage(context, flags,
+ image_format, &desc,
+ host_ptr, errcode_ret);
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage3D
+(
+ cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_depth,
+ size_t image_row_pitch,
+ size_t image_slice_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE3D,
+ image_width,
+ image_height,
+ image_depth,
+ 1,
+ image_row_pitch,
+ image_slice_pitch,
+ 0,
+ 0,
+ NULL
+ };
+ return clCreateImage(context, flags,
+ image_format, &desc,
+ host_ptr, errcode_ret);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject
+(
+ cl_mem memobj
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!memobj)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+ }
+
+ memobj->refCount++;
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject
+(
+ cl_mem memobj
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!memobj)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+ }
+
+ if (--memobj->refCount == 0)
+ {
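+ // 1D image buffers just drop their reference to the underlying buffer;
+ // sub-buffers release their parent, top-level buffers free the global
+ // memory allocation, and destructor callbacks fire for both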
+ if (memobj->isImage &&
+ ((cl_image*)memobj)->desc.image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+ {
+ clReleaseMemObject(((cl_image*)memobj)->desc.buffer);
+ }
+ else
+ {
+ if (memobj->parent)
+ {
+ clReleaseMemObject(memobj->parent);
+ }
+ else
+ {
+ memobj->context->context->getGlobalMemory()->deallocateBuffer(
+ memobj->address);
+ clReleaseContext(memobj->context);
+ }
+
+ while (!memobj->callbacks.empty())
+ {
+ pair<void (CL_CALLBACK *)(cl_mem, void *), void*> callback =
+ memobj->callbacks.top();
+ callback.first(memobj, callback.second);
+ memobj->callbacks.pop();
+ }
+ }
+
+ delete memobj;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format * image_formats,
+ cl_uint * num_image_formats
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ }
+ if (num_entries == 0 && image_formats)
+ {
+ ReturnErrorInfo(context, CL_INVALID_VALUE,
+ "num_entries should be >0 if image_formats non-NULL");
+ }
+
+ // TODO: Add support for packed image types
+
+ // Channel orders
+ const cl_channel_order ordersAll[] =
+ {
+ CL_R, CL_Rx, CL_A,
+ CL_RG, CL_RGx, CL_RA,
+ CL_RGBA,
+ };
+ const cl_channel_order ordersNormalized[] = {CL_INTENSITY, CL_LUMINANCE};
+ const cl_channel_order ordersByte[] = {CL_ARGB, CL_BGRA};
+ const cl_channel_order ordersPacked[] = {CL_RGB, CL_RGBx};
+ const cl_channel_order *orders[] =
+ {
+ ordersAll, ordersNormalized, ordersByte //, ordersPacked
+ };
+ const size_t numOrders[] =
+ {
+ sizeof(ordersAll) / sizeof(cl_channel_order),
+ sizeof(ordersNormalized) / sizeof(cl_channel_order),
+ sizeof(ordersByte) / sizeof(cl_channel_order),
+ //sizeof(ordersPacked) / sizeof(cl_channel_order),
+ };
+
+ // Channel types
+ const cl_channel_type typesAll[] =
+ {
+ CL_SNORM_INT8, CL_SNORM_INT16,
+ CL_UNORM_INT8, CL_UNORM_INT16,
+ CL_SIGNED_INT8, CL_SIGNED_INT16, CL_SIGNED_INT32,
+ CL_UNSIGNED_INT8, CL_UNSIGNED_INT16, CL_UNSIGNED_INT32,
+ CL_FLOAT, CL_HALF_FLOAT,
+ };
+ const cl_channel_type typesNormalized[] =
+ {
+ CL_SNORM_INT8, CL_SNORM_INT16,
+ CL_UNORM_INT8, CL_UNORM_INT16,
+ CL_FLOAT, CL_HALF_FLOAT,
+ };
+ const cl_channel_type typesByte[] =
+ {
+ CL_SNORM_INT8, CL_UNORM_INT8,
+ CL_SIGNED_INT8, CL_UNSIGNED_INT8,
+ };
+ const cl_channel_type typesPacked[] =
+ {
+ CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, CL_UNORM_INT_101010
+ };
+ const cl_channel_type *types[] =
+ {
+ typesAll, typesNormalized, typesByte //, typesPacked,
+ };
+ const size_t numTypes[] =
+ {
+ sizeof(typesAll) / sizeof(cl_channel_type),
+ sizeof(typesNormalized) / sizeof(cl_channel_type),
+ sizeof(typesByte) / sizeof(cl_channel_type),
+ //sizeof(typesPacked) / sizeof(cl_channel_type),
+ };
+
+ // Calculate total number of formats
+ size_t numCategories = sizeof(orders)/sizeof(cl_channel_order*);
+ size_t numFormats = 0;
+ for (size_t c = 0; c < numCategories; c++)
+ {
+ numFormats += numOrders[c] * numTypes[c];
+ }
+ if (num_image_formats)
+ {
+ *num_image_formats = numFormats;
+ }
+
+ // Generate list of all valid order/type combinations
+ if (image_formats)
+ {
+ unsigned i = 0;
+ for (size_t c = 0; c < numCategories; c++)
+ {
+ for (size_t o = 0; o < numOrders[c]; o++)
+ {
+ for (size_t t = 0; t < numTypes[c]; t++)
+ {
+ if (i >= num_entries)
+ {
+ return CL_SUCCESS;
+ }
+
+ cl_image_format format = {orders[c][o], types[c][t]};
+ image_formats[i++] = format;
+ }
+ }
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo
+(
+ cl_mem memobj,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check mem object is valid
+ if (!memobj)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_mem_object_type clmemobjty;
+ cl_mem_flags clmemflags;
+ cl_context context;
+ cl_mem clmem;
+ size_t sizet;
+ cl_uint cluint;
+ void* ptr;
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_MEM_TYPE:
+ result_size = sizeof(cl_mem_object_type);
+ result_data.clmemobjty = memobj->isImage ?
+ ((cl_image*)memobj)->desc.image_type : CL_MEM_OBJECT_BUFFER;
+ break;
+ case CL_MEM_FLAGS:
+ result_size = sizeof(cl_mem_flags);
+ result_data.clmemflags = memobj->flags;
+ break;
+ case CL_MEM_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = memobj->size;
+ break;
+ case CL_MEM_HOST_PTR:
+ result_size = sizeof(void*);
+ result_data.ptr = memobj->hostPtr;
+ break;
+ case CL_MEM_MAP_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_MEM_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = memobj->refCount;
+ break;
+ case CL_MEM_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data.context = memobj->context;
+ break;
+ case CL_MEM_ASSOCIATED_MEMOBJECT:
+ result_size = sizeof(cl_mem);
+ result_data.clmem = memobj->parent;
+ break;
+ case CL_MEM_OFFSET:
+ result_size = sizeof(size_t);
+ result_data.sizet = memobj->offset;
+ break;
+ default:
+ ReturnErrorArg(memobj->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(memobj->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo
+(
+ cl_mem image,
+ cl_image_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check mem object is valid
+ if (!image)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, image);
+ }
+ cl_image *img = (cl_image*)image;
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_image_format climgfmt;
+ size_t sizet;
+ cl_mem clmem;
+ cl_uint cluint;
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_IMAGE_FORMAT:
+ result_size = sizeof(cl_image_format);
+ result_data.climgfmt = img->format;
+ break;
+ case CL_IMAGE_ELEMENT_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet = getPixelSize(&img->format);
+ break;
+ case CL_IMAGE_ROW_PITCH:
+ result_size = sizeof(size_t);
+ result_data.sizet = img->desc.image_row_pitch;
+ break;
+ case CL_IMAGE_SLICE_PITCH:
+ result_size = sizeof(size_t);
+ result_data.sizet = img->desc.image_slice_pitch;
+ break;
+ case CL_IMAGE_WIDTH:
+ result_size = sizeof(size_t);
+ result_data.sizet = img->desc.image_width;
+ break;
+ case CL_IMAGE_HEIGHT:
+ result_size = sizeof(size_t);
+ result_data.sizet =
+ getNumDimensions(img->desc.image_type) > 1 ? img->desc.image_height : 0;
+ break;
+ case CL_IMAGE_DEPTH:
+ result_size = sizeof(size_t);
+ result_data.sizet =
+ getNumDimensions(img->desc.image_type) > 2 ? img->desc.image_depth : 0;
+ break;
+ case CL_IMAGE_ARRAY_SIZE:
+ result_size = sizeof(size_t);
+ result_data.sizet =
+ isImageArray(img->desc.image_type) ? img->desc.image_array_size : 0;
+ break;
+ case CL_IMAGE_BUFFER:
+ result_size = sizeof(cl_mem);
+ result_data.clmem = img->desc.buffer;
+ break;
+ case CL_IMAGE_NUM_MIP_LEVELS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ case CL_IMAGE_NUM_SAMPLES:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = 0;
+ break;
+ default:
+ ReturnErrorArg(image->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(image->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback
+(
+ cl_mem memobj,
+ void (CL_CALLBACK * pfn_notify)(cl_mem, void*),
+ void * user_data
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!memobj)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+ }
+ if (!pfn_notify)
+ {
+ ReturnErrorArg(memobj->context, CL_INVALID_VALUE, pfn_notify);
+ }
+
+ memobj->callbacks.push(make_pair(pfn_notify, user_data));
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler
+(
+ cl_context context,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+
+ // Create sampler bitfield
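+ // (bit 0 = normalized coordinates, bits 1-3 = addressing mode,
+ // bits 4-5 = filter mode; clSetKernelArg later copies this value into
+ // sampler_t kernel arguments)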
+ uint32_t bitfield = 0;
+
+ if (normalized_coords)
+ {
+ bitfield |= 0x0001;
+ }
+
+ switch (addressing_mode)
+ {
+ case CL_ADDRESS_NONE:
+ break;
+ case CL_ADDRESS_CLAMP_TO_EDGE:
+ bitfield |= 0x0002;
+ break;
+ case CL_ADDRESS_CLAMP:
+ bitfield |= 0x0004;
+ break;
+ case CL_ADDRESS_REPEAT:
+ bitfield |= 0x0006;
+ break;
+ case CL_ADDRESS_MIRRORED_REPEAT:
+ bitfield |= 0x0008;
+ break;
+ default:
+ SetErrorArg(context, CL_INVALID_VALUE, addressing_mode);
+ return NULL;
+ }
+
+ switch (filter_mode)
+ {
+ case CL_FILTER_NEAREST:
+ bitfield |= 0x0010;
+ break;
+ case CL_FILTER_LINEAR:
+ bitfield |= 0x0020;
+ break;
+ default:
+ SetErrorArg(context, CL_INVALID_VALUE, filter_mode);
+ return NULL;
+ }
+
+ // Create sampler
+ cl_sampler sampler = new _cl_sampler;
+ sampler->dispatch = m_dispatchTable;
+ sampler->context = context;
+ sampler->normCoords = normalized_coords;
+ sampler->addressMode = addressing_mode;
+ sampler->filterMode = filter_mode;
+ sampler->sampler = bitfield;
+ sampler->refCount = 1;
+
+ SetError(context, CL_SUCCESS);
+ return sampler;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler
+(
+ cl_sampler sampler
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!sampler)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+ }
+
+ sampler->refCount++;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler
+(
+ cl_sampler sampler
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!sampler)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+ }
+
+ if (--sampler->refCount == 0)
+ {
+ delete sampler;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo
+(
+ cl_sampler sampler,
+ cl_sampler_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check sampler is valid
+ if (!sampler)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_uint cluint;
+ cl_context clcontext;
+ cl_bool clbool;
+ cl_addressing_mode claddrmode;
+ cl_filter_mode clfiltmode;
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_SAMPLER_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = sampler->refCount;
+ break;
+ case CL_SAMPLER_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data.clcontext = sampler->context;
+ break;
+ case CL_SAMPLER_NORMALIZED_COORDS:
+ result_size = sizeof(cl_bool);
+ result_data.clbool = sampler->normCoords;
+ break;
+ case CL_SAMPLER_ADDRESSING_MODE:
+ result_size = sizeof(cl_addressing_mode);
+ result_data.claddrmode = sampler->addressMode;
+ break;
+ case CL_SAMPLER_FILTER_MODE:
+ result_size = sizeof(cl_filter_mode);
+ result_data.clfiltmode = sampler->filterMode;
+ break;
+ default:
+ ReturnErrorArg(sampler->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(sampler->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource
+(
+ cl_context context,
+ cl_uint count,
+ const char ** strings,
+ const size_t * lengths,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (count == 0)
+ {
+ SetErrorArg(context, CL_INVALID_VALUE, count);
+ return NULL;
+ }
+ if (!strings || !strings[0])
+ {
+ SetErrorArg(context, CL_INVALID_VALUE, strings);
+ return NULL;
+ }
+
+ // Concatenate sources into a single string
+ std::string source;
+ for (unsigned i = 0; i < count; i++)
+ {
+ size_t length = (lengths && lengths[i]) ? lengths[i] : strlen(strings[i]);
+ source.append(strings[i], length);
+ }
+
+ // Create program object
+ cl_program prog = new _cl_program;
+ prog->dispatch = m_dispatchTable;
+ prog->program = new oclgrind::Program(context->context, source);
+ prog->context = context;
+ prog->refCount = 1;
+ if (!prog->program)
+ {
+ SetError(context, CL_OUT_OF_HOST_MEMORY);
+ delete prog;
+ return NULL;
+ }
+
+ clRetainContext(context);
+
+ SetError(context, CL_SUCCESS);
+ return prog;
+}
+
+CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary
+(
+ cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const size_t * lengths,
+ const unsigned char ** binaries,
+ cl_int * binary_status,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (num_devices != 1 || !device_list)
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE, "Invalid device list");
+ return NULL;
+ }
+ if (!lengths)
+ {
+ SetErrorArg(context, CL_INVALID_VALUE, lengths);
+ return NULL;
+ }
+ if (!binaries)
+ {
+ SetErrorArg(context, CL_INVALID_VALUE, binaries);
+ return NULL;
+ }
+ if (device_list[0] != m_device)
+ {
+ SetErrorArg(context, CL_INVALID_DEVICE, device_list);
+ return NULL;
+ }
+
+ // Create program object
+ cl_program prog = new _cl_program;
+ prog->dispatch = m_dispatchTable;
+ prog->program = oclgrind::Program::createFromBitcode(context->context,
+ binaries[0], lengths[0]);
+ prog->context = context;
+ prog->refCount = 1;
+ if (!prog->program)
+ {
+ SetError(context, CL_INVALID_BINARY);
+ if (binary_status)
+ {
+ binary_status[0] = CL_INVALID_BINARY;
+ }
+ delete prog;
+ return NULL;
+ }
+ if (binary_status)
+ {
+ binary_status[0] = CL_SUCCESS;
+ }
+
+ clRetainContext(context);
+
+ SetError(context, CL_SUCCESS);
+ return prog;
+}
+
+CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels
+(
+ cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * kernel_names,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ if (!context)
+ {
+ SetError(NULL, CL_INVALID_CONTEXT);
+ return NULL;
+ }
+
+ SetErrorInfo(context, CL_INVALID_VALUE, "No built-in kernels available");
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram
+(
+ cl_program program
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+
+ program->refCount++;
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram
+(
+ cl_program program
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+
+ if (--program->refCount == 0)
+ {
+ delete program->program;
+ clReleaseContext(program->context);
+ delete program;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ void (CL_CALLBACK * pfn_notify)(cl_program, void*),
+ void * user_data
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!program || !program->program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+ if (num_devices > 0 && !device_list)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "num_devices >0 but device_list is NULL");
+ }
+ if (num_devices == 0 && device_list)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "num_devices == 0 but device_list non-NULL");
+ }
+ if (!pfn_notify && user_data)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "pfn_notify NULL but user_data non-NULL");
+ }
+ if (device_list && !device_list[0])
+ {
+ ReturnErrorArg(program->context, CL_INVALID_DEVICE, device_list);
+ }
+
+ // Build program
+ if (!program->program->build(options))
+ {
+ ReturnError(program->context, CL_BUILD_PROGRAM_FAILURE);
+ }
+
+ // Fire callback
+ if (pfn_notify)
+ {
+ pfn_notify(program, user_data);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clUnloadCompiler
+(
+ void
+) CL_API_SUFFIX__VERSION_1_0
+{
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram
+(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ cl_uint num_input_headers,
+ const cl_program * input_headers,
+ const char ** header_include_names,
+ void (CL_CALLBACK * pfn_notify)(cl_program, void*),
+ void * user_data
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+ if (num_devices > 0 && !device_list)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "num_devices >0 but device_list is NULL");
+ }
+ if (num_devices == 0 && device_list)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "num_devices == 0 but device_list non-NULL");
+ }
+ if (!pfn_notify && user_data)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "pfn_notify NULL but user_data non-NULL");
+ }
+ if (device_list && !device_list[0])
+ {
+ ReturnErrorArg(program->context, CL_INVALID_DEVICE, device_list);
+ }
+
+ // Prepare headers
+ list<oclgrind::Program::Header> headers;
+ for (unsigned i = 0; i < num_input_headers; i++)
+ {
+ headers.push_back(make_pair(header_include_names[i],
+ input_headers[i]->program));
+ }
+
+ // Build program
+ if (!program->program->build(options, headers))
+ {
+ ReturnError(program->context, CL_BUILD_PROGRAM_FAILURE);
+ }
+
+ // Fire callback
+ if (pfn_notify)
+ {
+ pfn_notify(program, user_data);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram
+(
+ cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ cl_uint num_input_programs,
+ const cl_program * input_programs,
+ void (CL_CALLBACK * pfn_notify)(cl_program, void*),
+ void * user_data,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+ if (num_devices > 0 && !device_list)
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE,
+ "num_devices >0 but device_list is NULL");
+ return NULL;
+ }
+ if (num_devices == 0 && device_list)
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE,
+ "num_devices == 0 but device_list non-NULL");
+ return NULL;
+ }
+ if (!pfn_notify && user_data)
+ {
+ SetErrorInfo(context, CL_INVALID_VALUE,
+ "pfn_notify NULL but user_data non-NULL");
+ return NULL;
+ }
+ if (device_list && !device_list[0])
+ {
+ SetErrorArg(context, CL_INVALID_DEVICE, device_list);
+ return NULL;
+ }
+
+ // Prepare programs
+ list<const oclgrind::Program*> programs;
+ for (unsigned i = 0; i < num_input_programs; i++)
+ {
+ programs.push_back(input_programs[i]->program);
+ }
+
+ // Create program object
+ cl_program prog = new _cl_program;
+ prog->dispatch = m_dispatchTable;
+ prog->program = oclgrind::Program::createFromPrograms(context->context,
+ programs);
+ prog->context = context;
+ prog->refCount = 1;
+ if (!prog->program)
+ {
+ SetError(context, CL_INVALID_BINARY);
+ delete prog;
+ return NULL;
+ }
+
+ // Fire callback
+ if (pfn_notify)
+ {
+ pfn_notify(prog, user_data);
+ }
+
+ clRetainContext(context);
+
+ SetError(context, CL_SUCCESS);
+ return prog;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler
+(
+ cl_platform_id platform
+) CL_API_SUFFIX__VERSION_1_2
+{
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo
+(
+ cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ size_t result_size = 0;
+ void *result_data = NULL;
+
+ // Check program is valid
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+ if ((param_name == CL_PROGRAM_NUM_KERNELS ||
+ param_name == CL_PROGRAM_KERNEL_NAMES) &&
+ program->program->getBuildStatus() != CL_BUILD_SUCCESS)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_PROGRAM_EXECUTABLE,
+ "Program not successfully built");
+ }
+
+ switch (param_name)
+ {
+ case CL_PROGRAM_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data = malloc(result_size);
+ *(cl_uint*)result_data = program->refCount;
+ break;
+ case CL_PROGRAM_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data = malloc(result_size);
+ *(cl_context*)result_data = program->context;
+ break;
+ case CL_PROGRAM_NUM_DEVICES:
+ result_size = sizeof(cl_uint);
+ result_data = malloc(result_size);
+ *(cl_uint*)result_data = 1;
+ break;
+ case CL_PROGRAM_DEVICES:
+ result_size = sizeof(cl_device_id);
+ result_data = malloc(result_size);
+ *(cl_device_id*)result_data = m_device;
+ break;
+ case CL_PROGRAM_SOURCE:
+ result_size = strlen(program->program->getSource().c_str()) + 1;
+ result_data = malloc(result_size);
+ strcpy((char*)result_data, program->program->getSource().c_str());
+ break;
+ case CL_PROGRAM_BINARY_SIZES:
+ result_size = sizeof(size_t);
+ result_data = malloc(result_size);
+ *(size_t*)result_data = program->program->getBinarySize();
+ break;
+ case CL_PROGRAM_BINARIES:
+ result_size = sizeof(unsigned char*);
+ result_data = program->program->getBinary();
+ break;
+ case CL_PROGRAM_NUM_KERNELS:
+ result_size = sizeof(size_t);
+ result_data = malloc(result_size);
+ *(size_t*)result_data = program->program->getNumKernels();
+ break;
+ case CL_PROGRAM_KERNEL_NAMES:
+ {
+ list<string> names = program->program->getKernelNames();
+ string ret;
+ for (list<string>::iterator itr = names.begin(); itr != names.end(); itr++)
+ {
+ ret += *itr;
+ ret += ";";
+ }
+ if (!ret.empty())
+ {
+ ret.erase(ret.length()-1);
+ }
+ result_size = strlen(ret.c_str()) + 1;
+ result_data = malloc(result_size);
+ strcpy((char*)result_data, ret.c_str());
+ break;
+ }
+ default:
+ ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
+ }
+
+ cl_int return_value = CL_SUCCESS;
+ if (param_value)
+ {
+ if (param_name == CL_PROGRAM_BINARIES)
+ {
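+ // For CL_PROGRAM_BINARIES, param_value is an array of pointers to
+ // caller-allocated buffers, so copy the binary into the first entry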
+ memcpy(((unsigned char**)param_value)[0],
+ result_data, program->program->getBinarySize());
+ }
+ else
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ // TODO: Use API error reporting mechanism
+ return_value = CL_INVALID_VALUE;
+ }
+ else
+ {
+ memcpy(param_value, result_data, result_size);
+ }
+ }
+ }
+
+ if (param_value_size_ret)
+ {
+ *param_value_size_ret = result_size;
+ }
+
+ free(result_data);
+
+ return return_value;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo
+(
+ cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check program is valid
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+
+ size_t dummy;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_build_status status;
+ cl_program_binary_type type;
+ } result_data;
+ const char* str = 0;
+
+ switch (param_name)
+ {
+ case CL_PROGRAM_BUILD_STATUS:
+ result_size = sizeof(cl_build_status);
+ result_data.status = program->program->getBuildStatus();
+ break;
+ case CL_PROGRAM_BUILD_OPTIONS:
+ str = program->program->getBuildOptions().c_str();
+ result_size = strlen(str) + 1;
+ break;
+ case CL_PROGRAM_BUILD_LOG:
+ str = program->program->getBuildLog().c_str();
+ result_size = strlen(str) + 1;
+ break;
+ case CL_PROGRAM_BINARY_TYPE:
+ result_size = sizeof(cl_program_binary_type);
+ result_data.type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+ break;
+ default:
+ ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ if (str)
+ memcpy(param_value, str, result_size);
+ else
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel
+(
+ cl_program program,
+ const char * kernel_name,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!program || program->dispatch != m_dispatchTable)
+ {
+ SetError(NULL, CL_INVALID_PROGRAM);
+ return NULL;
+ }
+ if (!kernel_name)
+ {
+ SetErrorArg(program->context, CL_INVALID_VALUE, kernel_name);
+ return NULL;
+ }
+
+ // Create kernel object
+ cl_kernel kernel = new _cl_kernel;
+ kernel->dispatch = m_dispatchTable;
+ kernel->kernel = program->program->createKernel(kernel_name);
+ kernel->program = program;
+ kernel->refCount = 1;
+ if (!kernel->kernel)
+ {
+ SetErrorInfo(program->context, CL_INVALID_KERNEL_NAME,
+ "Kernel '" << kernel_name << "' not found");
+ delete kernel;
+ return NULL;
+ }
+
+ clRetainProgram(program);
+
+ SetError(program->context, CL_SUCCESS);
+ return kernel;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram
+(
+ cl_program program,
+ cl_uint num_kernels,
+ cl_kernel * kernels,
+ cl_uint * num_kernels_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!program)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+ }
+ if (program->program->getBuildStatus() != CL_BUILD_SUCCESS)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_PROGRAM_EXECUTABLE,
+ "Program not built");
+ }
+
+ unsigned int num = program->program->getNumKernels();
+ if (kernels && num_kernels < num)
+ {
+ ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+ "num_kernels is " << num_kernels <<
+ ", but " << num << " kernels found");
+ }
+
+ if (kernels)
+ {
+ int i = 0;
+ list<string> names = program->program->getKernelNames();
+ for (list<string>::iterator itr = names.begin(); itr != names.end(); itr++)
+ {
+ cl_kernel kernel = new _cl_kernel;
+ kernel->dispatch = m_dispatchTable;
+ kernel->kernel = program->program->createKernel(*itr);
+ kernel->program = program;
+ kernel->refCount = 1;
+ kernels[i++] = kernel;
+
+ clRetainProgram(program);
+ }
+ }
+
+ if (num_kernels_ret)
+ {
+ *num_kernels_ret = num;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel
+(
+ cl_kernel kernel
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!kernel)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+ }
+
+ kernel->refCount++;
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel
+(
+ cl_kernel kernel
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!kernel)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+ }
+
+ if (--kernel->refCount == 0)
+ {
+ delete kernel->kernel;
+
+ clReleaseProgram(kernel->program);
+
+ delete kernel;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg
+(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void * arg_value
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (arg_index >= kernel->kernel->getNumArguments())
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_INDEX,
+ "arg_index is " << arg_index <<
+ ", but kernel has " << kernel->kernel->getNumArguments()
+ << " arguments");
+ }
+
+ unsigned int addr = kernel->kernel->getArgumentAddressQualifier(arg_index);
+ bool isSampler =
+ kernel->kernel->getArgumentTypeName(arg_index) == "sampler_t";
+
+ if (kernel->kernel->getArgumentSize(arg_index) != arg_size
+ && !isSampler
+ && addr != CL_KERNEL_ARG_ADDRESS_LOCAL)
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_SIZE,
+ "arg_size is " << arg_size << ", but argument should be "
+ << kernel->kernel->getArgumentSize(arg_index) << " bytes");
+ }
+
+ // Prepare argument value
+ oclgrind::TypedValue value;
+ value.data = new unsigned char[arg_size];
+ value.size = arg_size;
+ value.num = 1;
+ switch (addr)
+ {
+ case CL_KERNEL_ARG_ADDRESS_PRIVATE:
+ if (isSampler)
+ {
+ memcpy(value.data, &(*(cl_sampler*)arg_value)->sampler, 4);
+ }
+ else
+ {
+ memcpy(value.data, arg_value, arg_size);
+ }
+ break;
+ case CL_KERNEL_ARG_ADDRESS_LOCAL:
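+ // Local arguments carry only a size; no data is copied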
+ delete[] value.data;
+ value.data = NULL;
+ break;
+ case CL_KERNEL_ARG_ADDRESS_GLOBAL:
+ case CL_KERNEL_ARG_ADDRESS_CONSTANT:
+ if (arg_value && *(cl_mem*)arg_value)
+ {
+ cl_mem mem = *(cl_mem*)arg_value;
+
+ if (mem->isImage)
+ {
+ // Create Image struct
+ oclgrind::Image *image = new oclgrind::Image;
+ image->address = mem->address;
+ image->format = ((cl_image*)mem)->format;
+ image->desc = ((cl_image*)mem)->desc;
+ *(oclgrind::Image**)value.data = image;
+ }
+ else
+ {
+ memcpy(value.data, &mem->address, arg_size);
+ }
+
+ kernel->memArgs[arg_index] = mem;
+ }
+ else
+ {
+ value.setPointer(0);
+ kernel->memArgs.erase(arg_index);
+ }
+ break;
+ default:
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_VALUE,
+ "Unsupported address space");
+ }
+
+ // Set argument
+ kernel->kernel->setArgument(arg_index, value);
+ delete[] value.data;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo
+(
+ cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check kernel is valid
+ if (!kernel)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+ }
+
+ size_t dummy;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_uint cluint;
+ cl_context context;
+ cl_program program;
+ } result_data;
+ const char* str = 0;
+
+ switch (param_name)
+ {
+ case CL_KERNEL_FUNCTION_NAME:
+ result_size = kernel->kernel->getName().size() + 1;
+ str = kernel->kernel->getName().c_str();
+ break;
+ case CL_KERNEL_NUM_ARGS:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = kernel->kernel->getNumArguments();
+ break;
+ case CL_KERNEL_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = kernel->refCount;
+ break;
+ case CL_KERNEL_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data.context = kernel->program->context;
+ break;
+ case CL_KERNEL_PROGRAM:
+ result_size = sizeof(cl_program);
+ result_data.program = kernel->program;
+ break;
+ case CL_KERNEL_ATTRIBUTES:
+ result_size = kernel->kernel->getAttributes().size() + 1;
+ str = kernel->kernel->getAttributes().c_str();
+ break;
+ default:
+ ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ if (str)
+ memcpy(param_value, str, result_size);
+ else
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo
+(
+ cl_kernel kernel,
+ cl_uint arg_indx,
+ cl_kernel_arg_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters are valid
+ if (!kernel)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+ }
+ if (arg_indx >= kernel->kernel->getNumArguments())
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_INDEX,
+ "arg_indx is " << arg_indx <<
+ ", but kernel has " << kernel->kernel->getNumArguments()
+ << " arguments");
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_kernel_arg_address_qualifier addressQual;
+ cl_kernel_arg_access_qualifier accessQual;
+ cl_kernel_arg_type_qualifier typeQual;
+ } result_data;
+
+ std::string str_data;
+
+ switch (param_name)
+ {
+ case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+ result_size = sizeof(cl_kernel_arg_address_qualifier);
+ result_data.addressQual =
+ kernel->kernel->getArgumentAddressQualifier(arg_indx);
+ break;
+ case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+ result_size = sizeof(cl_kernel_arg_access_qualifier);
+ result_data.accessQual =
+ kernel->kernel->getArgumentAccessQualifier(arg_indx);
+ break;
+ case CL_KERNEL_ARG_TYPE_NAME:
+ str_data = kernel->kernel->getArgumentTypeName(arg_indx).str();
+ result_size = str_data.size() + 1;
+ break;
+ case CL_KERNEL_ARG_TYPE_QUALIFIER:
+ result_size = sizeof(cl_kernel_arg_type_qualifier);
+ result_data.typeQual = kernel->kernel->getArgumentTypeQualifier(arg_indx);
+ break;
+ case CL_KERNEL_ARG_NAME:
+ str_data = kernel->kernel->getArgumentName(arg_indx).str();
+ result_size = str_data.size() + 1;
+ break;
+ default:
+ ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+
+ if (str_data.size())
+ memcpy(param_value, str_data.c_str(), result_size);
+ else
+ memcpy(param_value, &result_data, result_size);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo
+(
+ cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters are valid
+ if (!kernel)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+ }
+ if (!device || device != m_device)
+ {
+ ReturnErrorArg(kernel->program->context, CL_INVALID_DEVICE, device);
+ }
+
+ size_t dummy;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ size_t sizet;
+ size_t sizet3[3];
+ cl_ulong clulong;
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_KERNEL_GLOBAL_WORK_SIZE:
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+ "CL_KERNEL_GLOBAL_SIZE only valid on custom devices");
+ case CL_KERNEL_WORK_GROUP_SIZE:
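+ // Report the simulator's work-group size limit (MAX_WI_SIZE)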
+ result_size = sizeof(size_t);
+ result_data.sizet = MAX_WI_SIZE;
+ break;
+ case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+ result_size = sizeof(size_t[3]);
+ kernel->kernel->getRequiredWorkGroupSize(result_data.sizet3);
+ break;
+ case CL_KERNEL_LOCAL_MEM_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = kernel->kernel->getLocalMemorySize();
+ break;
+ case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+ result_size = sizeof(size_t);
+ result_data.sizet = 1;
+ break;
+ case CL_KERNEL_PRIVATE_MEM_SIZE:
+ result_size = sizeof(cl_ulong);
+ result_data.clulong = 0;
+ break;
+ default:
+ ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+ ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+/* Event Object APIs */
+
+namespace
+{
+ // Utility to check if an event has completed (or terminated)
+ inline bool isComplete(cl_event event)
+ {
+ return (event->event->state == CL_COMPLETE || event->event->state < 0);
+ }
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents
+(
+ cl_uint num_events,
+ const cl_event * event_list
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!num_events)
+ {
+ ReturnErrorInfo(NULL, CL_INVALID_VALUE, "num_events cannot be 0");
+ }
+ if (!event_list)
+ {
+ ReturnErrorInfo(NULL, CL_INVALID_VALUE, "event_list cannot be NULL");
+ }
+
+ // Loop until all events complete
+ bool complete = false;
+ while (!complete)
+ {
+ complete = true;
+ for (unsigned i = 0; i < num_events; i++)
+ {
+ // Skip event if already complete
+ if (isComplete(event_list[i]))
+ {
+ continue;
+ }
+
+ // If it's not a user event, update the queue
+ if (event_list[i]->queue)
+ {
+ oclgrind::Queue::Command *cmd = event_list[i]->queue->queue->update();
+ if (cmd)
+ {
+ asyncQueueRelease(cmd);
+ delete cmd;
+ }
+
+ // If it's still not complete, update flag
+ if (!isComplete(event_list[i]))
+ {
+ complete = false;
+ }
+ }
+ else
+ {
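+ // User events have no queue; they only complete via clSetUserEventStatus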
+ complete = false;
+ }
+ }
+ }
+
+ // Check if any command terminated unsuccessfully
+ for (unsigned i = 0; i < num_events; i++)
+ {
+ if (event_list[i]->event->state < 0)
+ {
+ ReturnErrorInfo(event_list[i]->context,
+ CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST,
+ "Event " << i <<
+ " terminated with error " << event_list[i]->event->state);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo
+(
+ cl_event event,
+ cl_event_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check event is valid
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+
+ size_t dummy;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ union
+ {
+ cl_command_queue queue;
+ cl_context context;
+ cl_command_type type;
+ cl_int clint;
+ cl_uint cluint;
+ size_t sizet;
+ size_t sizet3[3];
+ } result_data;
+
+ switch (param_name)
+ {
+ case CL_EVENT_COMMAND_QUEUE:
+ result_size = sizeof(cl_command_queue);
+ result_data.queue = event->queue;
+ break;
+ case CL_EVENT_CONTEXT:
+ result_size = sizeof(cl_context);
+ result_data.context = event->context;
+ break;
+ case CL_EVENT_COMMAND_TYPE:
+ result_size = sizeof(cl_command_type);
+ result_data.type = event->type;
+ break;
+ case CL_EVENT_COMMAND_EXECUTION_STATUS:
+ result_size = sizeof(cl_int);
+ result_data.clint = event->event->state;
+ break;
+ case CL_EVENT_REFERENCE_COUNT:
+ result_size = sizeof(cl_uint);
+ result_data.cluint = event->refCount;
+ break;
+ default:
+ ReturnErrorArg(event->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(event->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ memcpy(param_value, &result_data, result_size);
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent
+(
+ cl_context context,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!context)
+ {
+ SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+ return NULL;
+ }
+
+ // Create event object
+ cl_event event = new _cl_event;
+ event->dispatch = m_dispatchTable;
+ event->context = context;
+ event->queue = 0;
+ event->type = CL_COMMAND_USER;
+ event->event = new oclgrind::Event();
+ event->event->state = CL_SUBMITTED;
+ event->refCount = 1;
+
+ SetError(context, CL_SUCCESS);
+ return event;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent
+(
+ cl_event event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+
+ event->refCount++;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent
+(
+ cl_event event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+
+ if (--event->refCount == 0)
+ {
+ if (event->event)
+ {
+ delete event->event;
+ }
+ delete event;
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus
+(
+ cl_event event,
+ cl_int execution_status
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+ if (event->queue)
+ {
+ ReturnErrorInfo(event->context, CL_INVALID_EVENT, "Not a user event");
+ }
+ if (execution_status != CL_COMPLETE && execution_status >= 0)
+ {
+ ReturnErrorArg(event->context, CL_INVALID_VALUE, execution_status);
+ }
+ if (event->event->state == CL_COMPLETE || event->event->state < 0)
+ {
+ ReturnErrorInfo(event->context, CL_INVALID_OPERATION,
+ "Event status already set");
+ }
+
+ event->event->state = execution_status;
+
+ // Perform callbacks
+ list< pair<void (CL_CALLBACK *)(cl_event, cl_int, void *), void*> >::iterator itr;
+ for (itr = event->callbacks.begin(); itr != event->callbacks.end(); itr++)
+ {
+ itr->first(event, execution_status, itr->second);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback
+(
+ cl_event event,
+ cl_int command_exec_callback_type,
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void*),
+ void * user_data
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+ if (!pfn_notify)
+ {
+ ReturnErrorArg(event->context, CL_INVALID_VALUE, pfn_notify);
+ }
+ if (command_exec_callback_type != CL_COMPLETE &&
+ command_exec_callback_type != CL_SUBMITTED &&
+ command_exec_callback_type != CL_RUNNING)
+ {
+ ReturnErrorArg(event->context, CL_INVALID_VALUE,
+ command_exec_callback_type);
+ }
+
+ event->callbacks.push_back(make_pair(pfn_notify, user_data));
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo
+(
+ cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check event is valid
+ if (!event)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+ }
+ if (!event->queue)
+ {
+ ReturnError(event->context, CL_PROFILING_INFO_NOT_AVAILABLE);
+ }
+
+ size_t dummy = 0;
+ size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+ cl_ulong result;
+
+ switch (param_name)
+ {
+ case CL_PROFILING_COMMAND_QUEUED:
+ result_size = sizeof(cl_ulong);
+ result = event->event->queueTime;
+ break;
+ case CL_PROFILING_COMMAND_SUBMIT:
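+ // Submission time is not tracked separately; report the start time instead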
+ result_size = sizeof(cl_ulong);
+ result = event->event->startTime;
+ break;
+ case CL_PROFILING_COMMAND_START:
+ result_size = sizeof(cl_ulong);
+ result = event->event->startTime;
+ break;
+ case CL_PROFILING_COMMAND_END:
+ result_size = sizeof(cl_ulong);
+ result = event->event->endTime;
+ break;
+ default:
+ ReturnErrorArg(event->context, CL_INVALID_VALUE, param_name);
+ }
+
+ if (param_value)
+ {
+ // Check destination is large enough
+ if (param_value_size < result_size)
+ {
+ ReturnErrorInfo(event->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+ }
+ else
+ {
+ *(cl_ulong*)param_value = result;
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clFlush
+(
+ cl_command_queue command_queue
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ // TODO: Implement properly?
+ clFinish(command_queue);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clFinish
+(
+ cl_command_queue command_queue
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
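+ // Drain the queue on the calling thread, releasing each command as it completes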
+ while (!command_queue->queue->isEmpty())
+ {
+ // TODO: Move this update to async thread?
+ oclgrind::Queue::Command *cmd = command_queue->queue->update();
+ if (cmd)
+ {
+ asyncQueueRelease(cmd);
+ delete cmd;
+ }
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t cb,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ }
+ if (!ptr)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+ }
+ if (offset + cb > buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "offset + cb (" << offset << " + " << cb <<
+ ") exceeds buffer size (" << buffer->size << " bytes)");
+ }
+ if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not read data");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::BufferCommand *cmd =
+ new oclgrind::Queue::BufferCommand(oclgrind::Queue::READ);
+ cmd->ptr = (unsigned char*)ptr;
+ cmd->address = buffer->address + offset;
+ cmd->size = cb;
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_READ_BUFFER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ if (blocking_read)
+ {
+ return clFinish(command_queue);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ const size_t * buffer_origin,
+ const size_t * host_origin,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ }
+ if (!ptr)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+ }
+ if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not read data");
+ }
+
+ // Compute pitches if necessary
+ if (buffer_row_pitch == 0)
+ {
+ buffer_row_pitch = region[0];
+ }
+ if (buffer_slice_pitch == 0)
+ {
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+ }
+ if (host_row_pitch == 0)
+ {
+ host_row_pitch = region[0];
+ }
+ if (host_slice_pitch == 0)
+ {
+ host_slice_pitch = region[1] * host_row_pitch;
+ }
+
+ // Compute origin offsets
+ size_t buffer_offset =
+ buffer_origin[2] * buffer_slice_pitch +
+ buffer_origin[1] * buffer_row_pitch +
+ buffer_origin[0];
+ size_t host_offset =
+ host_origin[2] * host_slice_pitch +
+ host_origin[1] * host_row_pitch +
+ host_origin[0];
+
+ // Ensure buffer region valid
+ size_t end =
+ buffer_offset + region[0] +
+ (region[1]-1) * buffer_row_pitch +
+ (region[2]-1) * buffer_slice_pitch;
+ if (end > buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Region exceeds buffer size (" <<
+ buffer->size << " bytes)");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::BufferRectCommand *cmd =
+ new oclgrind::Queue::BufferRectCommand(oclgrind::Queue::READ_RECT);
+ cmd->ptr = (unsigned char*)ptr;
+ cmd->address = buffer->address;
+ cmd->buffer_offset[0] = buffer_offset;
+ cmd->buffer_offset[1] = buffer_row_pitch;
+ cmd->buffer_offset[2] = buffer_slice_pitch;
+ cmd->host_offset[0] = host_offset;
+ cmd->host_offset[1] = host_row_pitch;
+ cmd->host_offset[2] = host_slice_pitch;
+ memcpy(cmd->region, region, 3*sizeof(size_t));
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_READ_BUFFER_RECT, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ if (blocking_read)
+ {
+ return clFinish(command_queue);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t cb,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ }
+ if (!ptr)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+ }
+ if (offset + cb > buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "offset + cb (" << offset << " + " << cb <<
+ ") exceeds buffer size (" << buffer->size << " bytes)");
+ }
+ if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not write data");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::BufferCommand *cmd =
+ new oclgrind::Queue::BufferCommand(oclgrind::Queue::WRITE);
+ cmd->ptr = (unsigned char*)ptr;
+ cmd->address = buffer->address + offset;
+ cmd->size = cb;
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_WRITE_BUFFER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ if (blocking_write)
+ {
+ return clFinish(command_queue);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ const size_t * buffer_origin,
+ const size_t * host_origin,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ }
+ if (!ptr)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+ }
+ if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not write data");
+ }
+
+ // Compute pitches if necessary
+ if (buffer_row_pitch == 0)
+ {
+ buffer_row_pitch = region[0];
+ }
+ if (buffer_slice_pitch == 0)
+ {
+ buffer_slice_pitch = region[1] * buffer_row_pitch;
+ }
+ if (host_row_pitch == 0)
+ {
+ host_row_pitch = region[0];
+ }
+ if (host_slice_pitch == 0)
+ {
+ host_slice_pitch = region[1] * host_row_pitch;
+ }
+
+ // Compute origin offsets
+ size_t buffer_offset =
+ buffer_origin[2] * buffer_slice_pitch +
+ buffer_origin[1] * buffer_row_pitch +
+ buffer_origin[0];
+ size_t host_offset =
+ host_origin[2] * host_slice_pitch +
+ host_origin[1] * host_row_pitch +
+ host_origin[0];
+
+ // Ensure buffer region valid
+ size_t end =
+ buffer_offset + region[0] +
+ (region[1]-1) * buffer_row_pitch +
+ (region[2]-1) * buffer_slice_pitch;
+ if (end > buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Region exceeds buffer size (" <<
+ buffer->size << " bytes)");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::BufferRectCommand *cmd =
+ new oclgrind::Queue::BufferRectCommand(oclgrind::Queue::WRITE_RECT);
+ cmd->ptr = (unsigned char*)ptr;
+ cmd->address = buffer->address;
+ cmd->buffer_offset[0] = buffer_offset;
+ cmd->buffer_offset[1] = buffer_row_pitch;
+ cmd->buffer_offset[2] = buffer_slice_pitch;
+ cmd->host_offset[0] = host_offset;
+ cmd->host_offset[1] = host_row_pitch;
+ cmd->host_offset[2] = host_slice_pitch;
+ memcpy(cmd->region, region, 3*sizeof(size_t));
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_WRITE_BUFFER_RECT, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ if (blocking_write)
+ {
+ return clFinish(command_queue);
+ }
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!src_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+ }
+ if (!dst_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+ }
+ if (dst_offset + cb > dst_buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "dst_offset + cb (" << dst_offset << " + " << cb <<
+ ") exceeds buffer size (" << dst_buffer->size << " bytes)");
+ }
+ if (src_offset + cb > src_buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "src_offset + cb (" << src_offset << " + " << cb <<
+ ") exceeds buffer size (" << src_buffer->size << " bytes)");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::CopyCommand *cmd = new oclgrind::Queue::CopyCommand();
+ cmd->dst = dst_buffer->address + dst_offset;
+ cmd->src = src_buffer->address + src_offset;
+ cmd->size = cb;
+ asyncQueueRetain(cmd, src_buffer);
+ asyncQueueRetain(cmd, dst_buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_COPY_BUFFER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect
+(
+ cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ const size_t * src_origin,
+ const size_t * dst_origin,
+ const size_t * region,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_1
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!src_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+ }
+ if (!dst_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+ }
+
+ // Compute pitches if necessary
+ if (src_row_pitch == 0)
+ {
+ src_row_pitch = region[0];
+ }
+ if (src_slice_pitch == 0)
+ {
+ src_slice_pitch = region[1] * src_row_pitch;
+ }
+ if (dst_row_pitch == 0)
+ {
+ dst_row_pitch = region[0];
+ }
+ if (dst_slice_pitch == 0)
+ {
+ dst_slice_pitch = region[1] * dst_row_pitch;
+ }
+
+ // Compute origin offsets
+ size_t src_offset =
+ src_origin[2] * src_slice_pitch +
+ src_origin[1] * src_row_pitch +
+ src_origin[0];
+ size_t dst_offset =
+ dst_origin[2] * dst_slice_pitch +
+ dst_origin[1] * dst_row_pitch +
+ dst_origin[0];
+
+ // Ensure buffer region valid
+ size_t src_end =
+ src_offset + region[0] +
+ (region[1]-1) * src_row_pitch +
+ (region[2]-1) * src_slice_pitch;
+ size_t dst_end =
+ dst_offset + region[0] +
+ (region[1]-1) * dst_row_pitch +
+ (region[2]-1) * dst_slice_pitch;
+ if (src_end > src_buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Region exceeds source buffer size (" <<
+ src_buffer->size << " bytes)");
+ }
+ if (dst_end > dst_buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Region exceeds destination buffer size (" <<
+ dst_buffer->size << " bytes)");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::CopyRectCommand *cmd = new oclgrind::Queue::CopyRectCommand();
+ cmd->src = src_buffer->address;
+ cmd->dst = dst_buffer->address;
+ cmd->src_offset[0] = src_offset;
+ cmd->src_offset[1] = src_row_pitch;
+ cmd->src_offset[2] = src_slice_pitch;
+ cmd->dst_offset[0] = dst_offset;
+ cmd->dst_offset[1] = dst_row_pitch;
+ cmd->dst_offset[2] = dst_slice_pitch;
+ memcpy(cmd->region, region, 3*sizeof(size_t));
+ asyncQueueRetain(cmd, src_buffer);
+ asyncQueueRetain(cmd, dst_buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_COPY_BUFFER_RECT, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ const void * pattern,
+ size_t pattern_size,
+ size_t offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ }
+ if (offset + cb > buffer->size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "offset + cb (" << offset << " + " << cb <<
+ ") exceeds buffer size (" << buffer->size << " bytes)");
+ }
+ if (!pattern)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, pattern);
+ }
+ if (pattern_size == 0)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, pattern_size);
+ }
+ if (offset%pattern_size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "offset (" << offset << ")" <<
+ " not a multiple of pattern_size (" << pattern_size << ")");
+ }
+ if (cb%pattern_size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "cb (" << cb << ")" <<
+ " not a multiple of pattern_size (" << pattern_size << ")");
+ }
+
+ // Enqueue command
+ oclgrind::Queue::FillBufferCommand *cmd =
+ new oclgrind::Queue::FillBufferCommand((const unsigned char*)pattern,
+ pattern_size);
+ cmd->address = buffer->address + offset;
+ cmd->size = cb;
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_FILL_BUFFER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage
+(
+ cl_command_queue command_queue,
+ cl_mem image,
+ const void * fill_color,
+ const size_t * origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+ }
+ if (!fill_color)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, fill_color);
+ }
+ if (!region[0] || !region[1] || !region[2])
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Values in region cannot be 0");
+ }
+
+ // Get image dimensions
+ cl_image *img = (cl_image*)image;
+ size_t width = img->desc.image_width;
+ size_t height = img->desc.image_height;
+ size_t depth = img->desc.image_depth;
+ size_t arraySize = img->desc.image_array_size;
+ size_t pixelSize = getPixelSize(&img->format);
+ size_t row_pitch = width * pixelSize;
+ size_t slice_pitch = height * row_pitch;
+
+ if (img->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ height = arraySize;
+ if (img->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ depth = arraySize;
+
+ // Ensure region is within image bounds
+ if (origin[0] + region[0] > width)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[0] + region[0] > width ("
+ << origin[0] << " + " << region[0] << " > " << width
+ << " )");
+ }
+ if (origin[1] + region[1] > height)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[1] + region[1] > height ("
+ << origin[1] << " + " << region[1] << " > " << height
+ << " )");
+ }
+ if (origin[2] + region[2] > depth)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[2] + region[2] > depth ("
+ << origin[2] << " + " << region[2] << " > " << depth
+ << " )");
+ }
+
+ // Generate color data with correct order and data type
+ unsigned char *color = new unsigned char[pixelSize];
+ for (unsigned output = 0; output < getNumChannels(&img->format); output++)
+ {
+ // Get input channel index
+ int input = output;
+ switch (img->format.image_channel_order)
+ {
+ case CL_R:
+ case CL_Rx:
+ case CL_RG:
+ case CL_RGx:
+ case CL_RGB:
+ case CL_RGBx:
+ case CL_RGBA:
+ break;
+ case CL_BGRA:
+ if (output == 0) input = 2;
+ if (output == 2) input = 0;
+ break;
+ case CL_ARGB:
+ if (output == 0) input = 3;
+ if (output == 1) input = 0;
+ if (output == 2) input = 1;
+ if (output == 3) input = 2;
+ break;
+ case CL_A:
+ if (output == 0) input = 3;
+ break;
+ case CL_RA:
+ if (output == 1) input = 3;
+ break;
+ case CL_INTENSITY:
+ case CL_LUMINANCE:
+ input = 0;
+ break;
+ default:
+ ReturnError(command_queue->context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+ }
+
+ // Interpret data
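+ // Normalized formats scale and clamp the float fill colour; integer formats take the integer fill colour directly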
+ switch (img->format.image_channel_data_type)
+ {
+ case CL_SNORM_INT8:
+ ((int8_t*)color)[output] =
+ rint(min(max(((float*)fill_color)[input]*127.f, -128.f), 127.f));
+ break;
+ case CL_UNORM_INT8:
+ ((uint8_t*)color)[output] =
+ rint(min(max(((float*)fill_color)[input]*255.f, 0.f), 255.f));
+ break;
+ case CL_SNORM_INT16:
+ ((int16_t*)color)[output] =
+ rint(min(max(((float*)fill_color)[input]*32767.f, -32768.f), 32767.f));
+ break;
+ case CL_UNORM_INT16:
+ ((uint16_t*)color)[output] =
+ rint(min(max(((float*)fill_color)[input]*65535.f, 0.f), 65535.f));
+ break;
+ case CL_FLOAT:
+ ((float*)color)[output] = ((float*)fill_color)[input];
+ break;
+ case CL_HALF_FLOAT:
+ ((uint16_t*)color)[output] = floatToHalf(((float*)fill_color)[input]);
+ break;
+ case CL_SIGNED_INT8:
+ ((int8_t*)color)[output] = ((int32_t*)fill_color)[input];
+ break;
+ case CL_SIGNED_INT16:
+ ((int16_t*)color)[output] = ((int32_t*)fill_color)[input];
+ break;
+ case CL_SIGNED_INT32:
+ ((int32_t*)color)[output] = ((int32_t*)fill_color)[input];
+ break;
+ case CL_UNSIGNED_INT8:
+ ((uint8_t*)color)[output] = ((uint32_t*)fill_color)[input];
+ break;
+ case CL_UNSIGNED_INT16:
+ ((uint16_t*)color)[output] = ((uint32_t*)fill_color)[input];
+ break;
+ case CL_UNSIGNED_INT32:
+ ((uint32_t*)color)[output] = ((uint32_t*)fill_color)[input];
+ break;
+ default:
+ ReturnError(command_queue->context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+ }
+ }
+
+ // Enqueue command
+ oclgrind::Queue::FillImageCommand *cmd =
+ new oclgrind::Queue::FillImageCommand(image->address, origin, region,
+ row_pitch, slice_pitch,
+ pixelSize, color);
+ asyncQueueRetain(cmd, image);
+ asyncEnqueue(command_queue, CL_COMMAND_FILL_IMAGE, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+ delete[] color;
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage
+(
+ cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_read,
+ const size_t * origin,
+ const size_t * region,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+ }
+
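+ // Image reads are implemented as rectangular buffer reads over the image's backing memory, with the x origin and region converted to bytes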
+ cl_image *img = (cl_image*)image;
+
+ size_t pixelSize = getPixelSize(&img->format);
+ size_t buffer_origin[3] = {origin[0]*pixelSize, origin[1], origin[2]};
+ size_t pixel_region[3] = {region[0]*pixelSize, region[1], region[2]};
+ size_t host_origin[3] = {0, 0, 0};
+
+ size_t img_row_pitch = img->desc.image_width * pixelSize;
+ size_t img_slice_pitch = img->desc.image_height * img_row_pitch;
+ if (row_pitch == 0)
+ {
+ row_pitch = pixel_region[0];
+ }
+ if (slice_pitch == 0)
+ {
+ slice_pitch = pixel_region[1] * row_pitch;
+ }
+
+ // Enqueue read
+ cl_int ret = clEnqueueReadBufferRect(
+ command_queue, image, blocking_read,
+ buffer_origin, host_origin, pixel_region,
+ img_row_pitch, img_slice_pitch, row_pitch, slice_pitch,
+ ptr, num_events_in_wait_list, event_wait_list, event);
+ if (event)
+ {
+ (*event)->type = CL_COMMAND_READ_IMAGE;
+ }
+ return ret;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage
+(
+ cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_write,
+ const size_t * origin,
+ const size_t * region,
+ size_t input_row_pitch,
+ size_t input_slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+ }
+
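+ // Image writes reuse the rectangular buffer write path, with the x origin and region converted to bytes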
+ cl_image *img = (cl_image*)image;
+
+ size_t pixelSize = getPixelSize(&img->format);
+ size_t buffer_origin[3] = {origin[0]*pixelSize, origin[1], origin[2]};
+ size_t pixel_region[3] = {region[0]*pixelSize, region[1], region[2]};
+ size_t host_origin[3] = {0, 0, 0};
+
+ size_t img_row_pitch = img->desc.image_width * pixelSize;
+ size_t img_slice_pitch = img->desc.image_height * img_row_pitch;
+ if (input_row_pitch == 0)
+ {
+ input_row_pitch = pixel_region[0];
+ }
+ if (input_slice_pitch == 0)
+ {
+ input_slice_pitch = pixel_region[1] * input_row_pitch;
+ }
+
+ // Enqueue write
+ cl_int ret = clEnqueueWriteBufferRect(
+ command_queue, image, blocking_write,
+ buffer_origin, host_origin, pixel_region,
+ img_row_pitch, img_slice_pitch, input_row_pitch, input_slice_pitch,
+ ptr, num_events_in_wait_list, event_wait_list, event);
+ if (event)
+ {
+ (*event)->type = CL_COMMAND_WRITE_IMAGE;
+ }
+ return ret;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage
+(
+ cl_command_queue command_queue,
+ cl_mem src_image,
+ cl_mem dst_image,
+ const size_t * src_origin,
+ const size_t * dst_origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!src_image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_image);
+ }
+ if (!dst_image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_image);
+ }
+
+ cl_image *src = (cl_image*)src_image;
+ cl_image *dst = (cl_image*)dst_image;
+ if (src->format.image_channel_order != dst->format.image_channel_order)
+ {
+ ReturnErrorInfo(command_queue->context, CL_IMAGE_FORMAT_MISMATCH,
+ "Channel orders do not match");
+ }
+ if (src->format.image_channel_data_type != dst->format.image_channel_data_type)
+ {
+ ReturnErrorInfo(command_queue->context, CL_IMAGE_FORMAT_MISMATCH,
+ "Channel data types do no match");
+ }
+
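+ // The copy is performed as a rectangular buffer copy between the two images' backing memories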
+ size_t srcPixelSize = getPixelSize(&src->format);
+ size_t dstPixelSize = getPixelSize(&dst->format);
+
+ size_t src_pixel_origin[3] = {src_origin[0]*srcPixelSize,
+ src_origin[1], src_origin[2]};
+ size_t dst_pixel_origin[3] = {dst_origin[0]*dstPixelSize,
+ dst_origin[1], dst_origin[2]};
+ size_t pixel_region[3] = {region[0]*srcPixelSize, region[1], region[2]};
+
+ size_t src_row_pitch = src->desc.image_width * srcPixelSize;
+ size_t src_slice_pitch = src->desc.image_height * src_row_pitch;
+ size_t dst_row_pitch = dst->desc.image_width * dstPixelSize;
+ size_t dst_slice_pitch = dst->desc.image_height * dst_row_pitch;
+
+ // Enqueue copy
+ cl_int ret = clEnqueueCopyBufferRect(
+ command_queue, src_image, dst_image,
+ src_pixel_origin, dst_pixel_origin, pixel_region,
+ src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+ num_events_in_wait_list, event_wait_list, event);
+ if (event)
+ {
+ (*event)->type = CL_COMMAND_COPY_IMAGE;
+ }
+ return ret;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem src_image,
+ cl_mem dst_buffer,
+ const size_t * src_origin,
+ const size_t * region,
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!src_image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_image);
+ }
+ if (!dst_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+ }
+
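+ // Reuse the rectangular copy path, addressing the destination buffer as a tightly-packed region starting at dst_offset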
+ cl_image *src = (cl_image*)src_image;
+ size_t pixel_size = getPixelSize(&src->format);
+ size_t src_pixel_origin[3] = {src_origin[0]*pixel_size,
+ src_origin[1], src_origin[2]};
+ size_t src_row_pitch = src->desc.image_width * pixel_size;
+ size_t src_slice_pitch = src->desc.image_height * src_row_pitch;
+
+ size_t pixel_region[3] = {region[0]*pixel_size, region[1], region[2]};
+ size_t dst_origin[3] = {dst_offset, 0, 0};
+
+ // Enqueue copy
+ cl_int ret = clEnqueueCopyBufferRect(
+ command_queue, src_image, dst_buffer,
+ src_pixel_origin, dst_origin, pixel_region,
+ src_row_pitch, src_slice_pitch, 0, 0,
+ num_events_in_wait_list, event_wait_list, event);
+ if (event)
+ {
+ (*event)->type = CL_COMMAND_COPY_IMAGE_TO_BUFFER;
+ }
+ return ret;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage
+(
+ cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_image,
+ size_t src_offset,
+ const size_t * dst_origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!src_buffer)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+ }
+ if (!dst_image)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_image);
+ }
+
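+ // Reuse the rectangular copy path, addressing the source buffer as a tightly-packed region starting at src_offset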
+ cl_image *dst = (cl_image*)dst_image;
+ size_t pixel_size = getPixelSize(&dst->format);
+ size_t dst_pixel_origin[3] = {dst_origin[0]*pixel_size,
+ dst_origin[1], dst_origin[2]};
+ size_t dst_row_pitch = dst->desc.image_width * pixel_size;
+ size_t dst_slice_pitch = dst->desc.image_height * dst_row_pitch;
+
+ size_t pixel_region[3] = {region[0]*pixel_size, region[1], region[2]};
+ size_t src_origin[3] = {src_offset, 0, 0};
+
+ // Enqueue copy
+ cl_int ret = clEnqueueCopyBufferRect(
+ command_queue, src_buffer, dst_image,
+ src_origin, dst_pixel_origin, pixel_region,
+ 0, 0, dst_row_pitch, dst_slice_pitch,
+ num_events_in_wait_list, event_wait_list, event);
+ if (event)
+ {
+ (*event)->type = CL_COMMAND_COPY_BUFFER_TO_IMAGE;
+ }
+ return ret;
+}
+
+CL_API_ENTRY void* CL_API_CALL
+clEnqueueMapBuffer
+(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t cb,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ SetErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ return NULL;
+ }
+ if (!buffer)
+ {
+ SetErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+ return NULL;
+ }
+ if (map_flags & CL_MAP_WRITE &&
+ buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not write data");
+ return NULL;
+ }
+ if (map_flags & CL_MAP_READ &&
+ buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Buffer flags specify host will not read data");
+ return NULL;
+ }
+
+ // Check map region
+ if (offset + cb > buffer->size)
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "offset + cb (" << offset << " + " << cb <<
+ ") exceeds buffer size (" << buffer->size << " bytes)");
+ return NULL;
+ }
+
+ // Map buffer
+ void *ptr = buffer->context->context->getGlobalMemory()->mapBuffer(
+ buffer->address, offset, cb);
+ if (ptr == NULL)
+ {
+ SetError(command_queue->context, CL_INVALID_VALUE);
+ return NULL;
+ }
+
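+ // The mapping itself happens synchronously above; the queued command only provides event and ordering semantics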
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncQueueRetain(cmd, buffer);
+ asyncEnqueue(command_queue, CL_COMMAND_MAP_BUFFER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ SetError(command_queue->context, CL_SUCCESS);
+ if (blocking_map)
+ {
+ SetError(command_queue->context, clFinish(command_queue));
+ }
+
+ return ptr;
+}
+
+CL_API_ENTRY void* CL_API_CALL
+clEnqueueMapImage
+(
+ cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ const size_t * origin,
+ const size_t * region,
+ size_t * image_row_pitch,
+ size_t * image_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ SetErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ return NULL;
+ }
+ if (!image)
+ {
+ SetErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+ return NULL;
+ }
+ if (!image_row_pitch)
+ {
+ SetErrorArg(command_queue->context, CL_INVALID_VALUE, image_row_pitch);
+ return NULL;
+ }
+ if (map_flags & CL_MAP_WRITE &&
+ image->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Image flags specify host will not write data");
+ return NULL;
+ }
+ if (map_flags & CL_MAP_READ &&
+ image->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+ "Image flags specify host will not read data");
+ return NULL;
+ }
+ if (!region[0] || !region[1] || !region[2])
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "Values in region cannot be 0");
+ return NULL;
+ }
+
+ // Get image dimensions
+ cl_image *img = (cl_image*)image;
+ size_t width = img->desc.image_width;
+ size_t height = img->desc.image_height;
+ size_t depth = img->desc.image_depth;
+ size_t arraySize = img->desc.image_array_size;
+ size_t pixelSize = getPixelSize(&img->format);
+ size_t row_pitch = width * pixelSize;
+ size_t slice_pitch = height * row_pitch;
+
+ if (img->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+ height = arraySize;
+ if (img->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+ depth = arraySize;
+
+ // Ensure region is within image bounds
+ if (origin[0] + region[0] > width)
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[0] + region[0] > width ("
+ << origin[0] << " + " << region[0] << " > " << width
+ << " )");
+ return NULL;
+ }
+ if (origin[1] + region[1] > height)
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[1] + region[1] > height ("
+ << origin[1] << " + " << region[1] << " > " << height
+ << " )");
+ return NULL;
+ }
+ if (origin[2] + region[2] > depth)
+ {
+ SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "origin[2] + region[2] > depth ("
+ << origin[2] << " + " << region[2] << " > " << depth
+ << " )");
+ return NULL;
+ }
+
+ // Compute byte offset and size
+ size_t offset = origin[0] * pixelSize
+ + origin[1] * row_pitch
+ + origin[2] * slice_pitch;
+ size_t size = region[0] * pixelSize
+ + (region[1]-1) * row_pitch
+ + (region[2]-1) * slice_pitch;
+
+ // Map image
+ void *ptr = image->context->context->getGlobalMemory()->mapBuffer(
+ image->address, offset, size);
+ if (ptr == NULL)
+ {
+ SetError(command_queue->context, CL_INVALID_VALUE);
+ return NULL;
+ }
+
+ *image_row_pitch = row_pitch;
+ if (image_slice_pitch)
+ {
+ *image_slice_pitch = slice_pitch;
+ }
+
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncQueueRetain(cmd, image);
+ asyncEnqueue(command_queue, CL_COMMAND_MAP_IMAGE, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ SetError(command_queue->context, CL_SUCCESS);
+ if (blocking_map)
+ {
+ SetError(command_queue->context, clFinish(command_queue));
+ }
+
+ return ptr;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject
+(
+ cl_command_queue command_queue,
+ cl_mem memobj,
+ void * mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!memobj)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, memobj);
+ }
+
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncQueueRetain(cmd, memobj);
+ asyncEnqueue(command_queue, CL_COMMAND_UNMAP_MEM_OBJECT, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects
+(
+ cl_command_queue command_queue,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_objects,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncEnqueue(command_queue, CL_COMMAND_MIGRATE_MEM_OBJECTS, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel
+(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t * global_work_offset,
+ const size_t * global_work_size,
+ const size_t * local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (work_dim < 1 || work_dim > 3)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_DIMENSION,
+ "Kernels must be 1, 2 or 3 dimensional (work_dim = "
+ << work_dim << ")");
+ }
+ if (!global_work_size)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_GLOBAL_WORK_SIZE,
+ "global_work_size cannot be NULL");
+ }
+
+ // Check global and local sizes are valid
+ for (unsigned i = 0; i < work_dim; i++)
+ {
+ if (!global_work_size[i])
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_GLOBAL_WORK_SIZE,
+ "global_work_size[" << i << "] = 0");
+ }
+ if (local_work_size && global_work_size[i] % local_work_size[i])
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
+ "Dimension " << i <<
+ ": local_work_size (" << local_work_size[i] <<
+ ") does not divide global_work_size (" <<
+ global_work_size[i] << ")");
+ }
+ }
+
+ // Ensure all arguments have been set
+ if (!kernel->kernel->allArgumentsSet())
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_KERNEL_ARGS,
+ "Not all kernel arguments set");
+ }
+
+ // Set-up offsets and sizes
+ oclgrind::Queue::KernelCommand *cmd = new oclgrind::Queue::KernelCommand();
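+ // Clone the kernel so later clSetKernelArg calls do not affect this enqueued launch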
+ cmd->kernel = new oclgrind::Kernel(*kernel->kernel);
+ cmd->work_dim = work_dim;
+ cmd->globalSize = oclgrind::Size3(1, 1, 1);
+ cmd->globalOffset = oclgrind::Size3(0, 0, 0);
+ cmd->localSize = oclgrind::Size3(1, 1, 1);
+ memcpy(&cmd->globalSize, global_work_size, work_dim*sizeof(size_t));
+ if (global_work_offset)
+ {
+ memcpy(&cmd->globalOffset, global_work_offset, work_dim*sizeof(size_t));
+ }
+ if (local_work_size)
+ {
+ memcpy(&cmd->localSize, local_work_size, work_dim*sizeof(size_t));
+ }
+
+ // Enqueue command
+ asyncQueueRetain(cmd, kernel);
+ asyncEnqueue(command_queue, CL_COMMAND_NDRANGE_KERNEL, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask
+(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ size_t work = 1;
+ return clEnqueueNDRangeKernel(command_queue, kernel, 1,
+ NULL, &work, &work,
+ num_events_in_wait_list,
+ event_wait_list,
+ event);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel
+(
+ cl_command_queue command_queue,
+ void (CL_CALLBACK *user_func)(void *),
+ void * args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_list,
+ const void ** args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+ if (!user_func)
+ {
+ ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, user_func);
+ }
+ if (!args && (cb_args > 0 || num_mem_objects > 0))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "args is NULL but cb_args|num_mem_objects >0");
+ }
+ if (args && cb_args == 0)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "args is non-NULL but cb_args is 0");
+ }
+ if (num_mem_objects > 0 && (!mem_list || !args_mem_loc))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "num_mem_objects >0 but mem_list|args_mem_loc is NULL");
+ }
+ if (num_mem_objects == 0 && (mem_list || args_mem_loc))
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+ "num_mem_objects is 0 but mem_list|args_mem_loc not NULL");
+ }
+
+ // Replace mem objects with real pointers
+ oclgrind::Memory *memory = command_queue->context->context->getGlobalMemory();
+ for (unsigned i = 0; i < num_mem_objects; i++)
+ {
+ if (!mem_list[i])
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_MEM_OBJECT,
+ "Memory object " << i << " is NULL");
+ }
+
+ void *addr = memory->getPointer(mem_list[i]->address);
+ if (addr == NULL)
+ {
+ ReturnErrorInfo(command_queue->context, CL_INVALID_MEM_OBJECT,
+ "Memory object " << i << " not valid");
+ }
+ memcpy((void*)args_mem_loc[i], &addr, sizeof(void*));
+ }
+
+ // Create command
+ oclgrind::Queue::NativeKernelCommand *cmd =
+ new oclgrind::Queue::NativeKernelCommand(user_func, args, cb_args);
+
+ // Retain memory objects
+ for (unsigned i = 0; i < num_mem_objects; i++)
+ {
+ asyncQueueRetain(cmd, mem_list[i]);
+ }
+
+ // Enqueue commands
+ asyncEnqueue(command_queue, CL_COMMAND_NATIVE_KERNEL, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY void* CL_API_CALL
+clGetExtensionFunctionAddressForPlatform
+(
+ cl_platform_id platform,
+ const char * func_name
+) CL_API_SUFFIX__VERSION_1_2
+{
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList
+(
+ cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncEnqueue(command_queue, CL_COMMAND_MARKER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList
+(
+ cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ // Check parameters
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncEnqueue(command_queue, CL_COMMAND_BARRIER, cmd,
+ num_events_in_wait_list, event_wait_list, event);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetPrintfCallback
+(
+ cl_context context,
+ void (CL_CALLBACK * pfn_notify)(cl_context, cl_uint, char*, void*),
+ void * user_data
+) CL_API_SUFFIX__VERSION_1_2
+{
+ ReturnError(NULL, CL_INVALID_OPERATION);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarker
+(
+ cl_command_queue command_queue,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ return clEnqueueMarkerWithWaitList(command_queue, 0, NULL, event);
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWaitForEvents
+(
+ cl_command_queue command_queue,
+ cl_uint num_events,
+ const cl_event * event_list
+) CL_API_SUFFIX__VERSION_1_0
+{
+ if (!command_queue)
+ {
+ ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+ }
+
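+ // Implemented as a barrier that waits on the supplied event list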
+ // Enqueue command
+ oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+ asyncEnqueue(command_queue, CL_COMMAND_BARRIER, cmd,
+ num_events, event_list, NULL);
+
+ return CL_SUCCESS;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrier
+(
+ cl_command_queue command_queue
+) CL_API_SUFFIX__VERSION_1_0
+{
+ return clEnqueueBarrierWithWaitList(command_queue, 0, NULL, NULL);
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_GLuint bufret_mem,
+ int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texture,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture2D
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texture,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture3D
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texture,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_GLuint renderbuffer,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo
+(
+ cl_mem memobj,
+ cl_gl_object_type * gl_object_type,
+ cl_GLuint * gl_object_name
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_MEM_OBJECT, "CL/GL interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo
+(
+ cl_mem memobj,
+ cl_gl_texture_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_MEM_OBJECT, "CL/GL interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR
+(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/GL interop not implemented");
+}
+
+CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR
+(
+ cl_context context,
+ cl_GLsync cl_GLsync,
+ cl_int * errcode_ret
+) CL_EXT_SUFFIX__VERSION_1_1
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+ return NULL;
+}
+
+#if defined(_WIN32) && !defined(__MINGW32__) // DX extension functions
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromD3D10KHR
+(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D10BufferKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Buffer * resource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D10Texture2DKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D10Texture3DKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireD3D10ObjectsKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseD3D10ObjectsKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromD3D11KHR
+(
+ cl_platform_id platform,
+ cl_d3d11_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d11_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D11BufferKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Buffer * resource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D11Texture2DKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromD3D11Texture3DKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_0
+{
+ SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireD3D11ObjectsKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseD3D11ObjectsKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_0
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromDX9MediaAdapterKHR
+(
+ cl_platform_id platform,
+ cl_uint num_media_adapters,
+ cl_dx9_media_adapter_type_khr * media_adapter_type,
+ void * media_adapters,
+ cl_dx9_media_adapter_set_khr media_adapter_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices
+) CL_API_SUFFIX__VERSION_1_2
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromDX9MediaSurfaceKHR
+(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_dx9_media_adapter_type_khr adapter_type,
+ void * surface_info,
+ cl_uint plane,
+ cl_int * errcode_ret
+) CL_API_SUFFIX__VERSION_1_2
+{
+ SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+ return NULL;
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireDX9MediaSurfacesKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseDX9MediaSurfacesKHR
+(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event
+) CL_API_SUFFIX__VERSION_1_2
+{
+ ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+}
+
+#endif // DX extension functions
+
+////////////////////
+// Dispatch Table //
+////////////////////
+
+#define _NULL_ NULL
+#define DISPATCH_TABLE_ENTRY(FUNCTION) (void*)(FUNCTION)
+void *m_dispatchTable[] =
+{
+ DISPATCH_TABLE_ENTRY(clGetPlatformIDs),
+ DISPATCH_TABLE_ENTRY(clGetPlatformInfo),
+ DISPATCH_TABLE_ENTRY(clGetDeviceIDs),
+ DISPATCH_TABLE_ENTRY(clGetDeviceInfo),
+ DISPATCH_TABLE_ENTRY(clCreateContext),
+ DISPATCH_TABLE_ENTRY(clCreateContextFromType),
+ DISPATCH_TABLE_ENTRY(clRetainContext),
+ DISPATCH_TABLE_ENTRY(clReleaseContext),
+ DISPATCH_TABLE_ENTRY(clGetContextInfo),
+ DISPATCH_TABLE_ENTRY(clCreateCommandQueue),
+ DISPATCH_TABLE_ENTRY(clRetainCommandQueue),
+ DISPATCH_TABLE_ENTRY(clReleaseCommandQueue),
+ DISPATCH_TABLE_ENTRY(clGetCommandQueueInfo),
+ DISPATCH_TABLE_ENTRY(clSetCommandQueueProperty),
+ DISPATCH_TABLE_ENTRY(clCreateBuffer),
+ DISPATCH_TABLE_ENTRY(clCreateImage2D),
+ DISPATCH_TABLE_ENTRY(clCreateImage3D),
+ DISPATCH_TABLE_ENTRY(clRetainMemObject),
+ DISPATCH_TABLE_ENTRY(clReleaseMemObject),
+ DISPATCH_TABLE_ENTRY(clGetSupportedImageFormats),
+ DISPATCH_TABLE_ENTRY(clGetMemObjectInfo),
+ DISPATCH_TABLE_ENTRY(clGetImageInfo),
+ DISPATCH_TABLE_ENTRY(clCreateSampler),
+ DISPATCH_TABLE_ENTRY(clRetainSampler),
+ DISPATCH_TABLE_ENTRY(clReleaseSampler),
+ DISPATCH_TABLE_ENTRY(clGetSamplerInfo),
+ DISPATCH_TABLE_ENTRY(clCreateProgramWithSource),
+ DISPATCH_TABLE_ENTRY(clCreateProgramWithBinary),
+ DISPATCH_TABLE_ENTRY(clRetainProgram),
+ DISPATCH_TABLE_ENTRY(clReleaseProgram),
+ DISPATCH_TABLE_ENTRY(clBuildProgram),
+ DISPATCH_TABLE_ENTRY(clUnloadCompiler),
+ DISPATCH_TABLE_ENTRY(clGetProgramInfo),
+ DISPATCH_TABLE_ENTRY(clGetProgramBuildInfo),
+ DISPATCH_TABLE_ENTRY(clCreateKernel),
+ DISPATCH_TABLE_ENTRY(clCreateKernelsInProgram),
+ DISPATCH_TABLE_ENTRY(clRetainKernel),
+ DISPATCH_TABLE_ENTRY(clReleaseKernel),
+ DISPATCH_TABLE_ENTRY(clSetKernelArg),
+ DISPATCH_TABLE_ENTRY(clGetKernelInfo),
+ DISPATCH_TABLE_ENTRY(clGetKernelWorkGroupInfo),
+ DISPATCH_TABLE_ENTRY(clWaitForEvents),
+ DISPATCH_TABLE_ENTRY(clGetEventInfo),
+ DISPATCH_TABLE_ENTRY(clRetainEvent),
+ DISPATCH_TABLE_ENTRY(clReleaseEvent),
+ DISPATCH_TABLE_ENTRY(clGetEventProfilingInfo),
+ DISPATCH_TABLE_ENTRY(clFlush),
+ DISPATCH_TABLE_ENTRY(clFinish),
+ DISPATCH_TABLE_ENTRY(clEnqueueReadBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueWriteBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueCopyBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueReadImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueWriteImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueCopyImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueCopyImageToBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueCopyBufferToImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueMapBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueMapImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueUnmapMemObject),
+ DISPATCH_TABLE_ENTRY(clEnqueueNDRangeKernel),
+ DISPATCH_TABLE_ENTRY(clEnqueueTask),
+ DISPATCH_TABLE_ENTRY(clEnqueueNativeKernel),
+ DISPATCH_TABLE_ENTRY(clEnqueueMarker),
+ DISPATCH_TABLE_ENTRY(clEnqueueWaitForEvents),
+ DISPATCH_TABLE_ENTRY(clEnqueueBarrier),
+ DISPATCH_TABLE_ENTRY(clGetExtensionFunctionAddress),
+ DISPATCH_TABLE_ENTRY(clCreateFromGLBuffer),
+ DISPATCH_TABLE_ENTRY(clCreateFromGLTexture2D),
+ DISPATCH_TABLE_ENTRY(clCreateFromGLTexture3D),
+ DISPATCH_TABLE_ENTRY(clCreateFromGLRenderbuffer),
+ DISPATCH_TABLE_ENTRY(clGetGLObjectInfo),
+ DISPATCH_TABLE_ENTRY(clGetGLTextureInfo),
+ DISPATCH_TABLE_ENTRY(clEnqueueAcquireGLObjects),
+ DISPATCH_TABLE_ENTRY(clEnqueueReleaseGLObjects),
+
+ DISPATCH_TABLE_ENTRY(clGetGLContextInfoKHR),
+
+#if defined(_WIN32) && !defined(__MINGW32__) // DX extension functions
+ DISPATCH_TABLE_ENTRY(clGetDeviceIDsFromD3D10KHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D10BufferKHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D10Texture2DKHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D10Texture3DKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueAcquireD3D10ObjectsKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueReleaseD3D10ObjectsKHR),
+#else
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+#endif
+
+ // OpenCL 1.1
+ DISPATCH_TABLE_ENTRY(clSetEventCallback),
+ DISPATCH_TABLE_ENTRY(clCreateSubBuffer),
+ DISPATCH_TABLE_ENTRY(clSetMemObjectDestructorCallback),
+ DISPATCH_TABLE_ENTRY(clCreateUserEvent),
+ DISPATCH_TABLE_ENTRY(clSetUserEventStatus),
+ DISPATCH_TABLE_ENTRY(clEnqueueReadBufferRect),
+ DISPATCH_TABLE_ENTRY(clEnqueueWriteBufferRect),
+ DISPATCH_TABLE_ENTRY(clEnqueueCopyBufferRect),
+
+ DISPATCH_TABLE_ENTRY(NULL), // clCreateSubDevicesEXT
+ DISPATCH_TABLE_ENTRY(NULL), // clRetainDeviceEXT
+ DISPATCH_TABLE_ENTRY(NULL), // clReleaseDeviceEXT
+
+ DISPATCH_TABLE_ENTRY(clCreateEventFromGLsyncKHR),
+
+ // OpenCL 1.2
+ DISPATCH_TABLE_ENTRY(clCreateSubDevices),
+ DISPATCH_TABLE_ENTRY(clRetainDevice),
+ DISPATCH_TABLE_ENTRY(clReleaseDevice),
+ DISPATCH_TABLE_ENTRY(clCreateImage),
+ DISPATCH_TABLE_ENTRY(clCreateProgramWithBuiltInKernels),
+ DISPATCH_TABLE_ENTRY(clCompileProgram),
+ DISPATCH_TABLE_ENTRY(clLinkProgram),
+ DISPATCH_TABLE_ENTRY(clUnloadPlatformCompiler),
+ DISPATCH_TABLE_ENTRY(clGetKernelArgInfo),
+ DISPATCH_TABLE_ENTRY(clEnqueueFillBuffer),
+ DISPATCH_TABLE_ENTRY(clEnqueueFillImage),
+ DISPATCH_TABLE_ENTRY(clEnqueueMigrateMemObjects),
+ DISPATCH_TABLE_ENTRY(clEnqueueMarkerWithWaitList),
+ DISPATCH_TABLE_ENTRY(clEnqueueBarrierWithWaitList),
+ DISPATCH_TABLE_ENTRY(clGetExtensionFunctionAddressForPlatform),
+ DISPATCH_TABLE_ENTRY(clCreateFromGLTexture),
+
+#if defined(_WIN32) && !defined(__MINGW32__) // DX extension functions
+ DISPATCH_TABLE_ENTRY(clGetDeviceIDsFromD3D11KHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D11BufferKHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D11Texture2DKHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromD3D11Texture3DKHR),
+ DISPATCH_TABLE_ENTRY(clCreateFromDX9MediaSurfaceKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueAcquireD3D11ObjectsKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueReleaseD3D11ObjectsKHR),
+ DISPATCH_TABLE_ENTRY(clGetDeviceIDsFromDX9MediaAdapterKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueAcquireDX9MediaSurfacesKHR),
+ DISPATCH_TABLE_ENTRY(clEnqueueReleaseDX9MediaSurfacesKHR),
+#else
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+ DISPATCH_TABLE_ENTRY(NULL),
+#endif
+};
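
For context, m_dispatchTable above is the table that the Khronos ICD loader
calls through: every handle the implementation returns begins with a pointer
to this table, and the loader forwards each cl* entry point via the matching
slot. A minimal sketch of that forwarding, using a hypothetical loader-side
wrapper (illustrative only, not code from this patch):

#include <CL/cl.h>

/* Hypothetical handle layout: in the ICD scheme the first member of every
   object is a pointer to the vendor's dispatch table (m_dispatchTable). */
struct icd_object { void **dispatch; };

/* Hypothetical loader-side wrapper: slot 1 of the table above is
   clGetPlatformInfo, so the loader just calls through that slot. */
static cl_int loader_clGetPlatformInfo(cl_platform_id platform,
                                       cl_platform_info param_name,
                                       size_t param_value_size,
                                       void *param_value,
                                       size_t *param_value_size_ret)
{
  typedef cl_int (CL_API_CALL *getPlatformInfo_fn)(
    cl_platform_id, cl_platform_info, size_t, void *, size_t *);
  void **table = ((struct icd_object *)platform)->dispatch;
  return ((getPlatformInfo_fn)table[1])(platform, param_name,
                                        param_value_size, param_value,
                                        param_value_size_ret);
}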
diff --git a/src/runtime/runtime.def b/src/runtime/runtime.def
new file mode 100644
index 0000000..77992cf
--- /dev/null
+++ b/src/runtime/runtime.def
@@ -0,0 +1,119 @@
+EXPORTS
+
+; Make runtime functions visible
+clGetPlatformIDs
+clGetPlatformInfo
+clGetDeviceIDs
+clGetDeviceInfo
+clCreateContext
+clCreateContextFromType
+clRetainContext
+clReleaseContext
+clGetContextInfo
+clCreateCommandQueue
+clRetainCommandQueue
+clReleaseCommandQueue
+clGetCommandQueueInfo
+clSetCommandQueueProperty
+clCreateBuffer
+clCreateImage2D
+clCreateImage3D
+clRetainMemObject
+clReleaseMemObject
+clGetSupportedImageFormats
+clGetMemObjectInfo
+clGetImageInfo
+clCreateSampler
+clRetainSampler
+clReleaseSampler
+clGetSamplerInfo
+clCreateProgramWithSource
+clCreateProgramWithBinary
+clRetainProgram
+clReleaseProgram
+clBuildProgram
+clUnloadCompiler
+clGetProgramInfo
+clGetProgramBuildInfo
+clCreateKernel
+clCreateKernelsInProgram
+clRetainKernel
+clReleaseKernel
+clSetKernelArg
+clGetKernelInfo
+clGetKernelWorkGroupInfo
+clWaitForEvents
+clGetEventInfo
+clRetainEvent
+clReleaseEvent
+clGetEventProfilingInfo
+clFlush
+clFinish
+clEnqueueReadBuffer
+clEnqueueWriteBuffer
+clEnqueueCopyBuffer
+clEnqueueReadImage
+clEnqueueWriteImage
+clEnqueueCopyImage
+clEnqueueCopyImageToBuffer
+clEnqueueCopyBufferToImage
+clEnqueueMapBuffer
+clEnqueueMapImage
+clEnqueueUnmapMemObject
+clEnqueueNDRangeKernel
+clEnqueueTask
+clEnqueueNativeKernel
+clEnqueueMarker
+clEnqueueWaitForEvents
+clEnqueueBarrier
+clGetExtensionFunctionAddress
+clCreateFromGLBuffer
+clCreateFromGLTexture2D
+clCreateFromGLTexture3D
+clCreateFromGLRenderbuffer
+clGetGLObjectInfo
+clGetGLTextureInfo
+clEnqueueAcquireGLObjects
+clEnqueueReleaseGLObjects
+clGetGLContextInfoKHR
+clGetDeviceIDsFromD3D10KHR
+clCreateFromD3D10BufferKHR
+clCreateFromD3D10Texture2DKHR
+clCreateFromD3D10Texture3DKHR
+clEnqueueAcquireD3D10ObjectsKHR
+clEnqueueReleaseD3D10ObjectsKHR
+clSetEventCallback
+clCreateSubBuffer
+clSetMemObjectDestructorCallback
+clCreateUserEvent
+clSetUserEventStatus
+clEnqueueReadBufferRect
+clEnqueueWriteBufferRect
+clEnqueueCopyBufferRect
+clCreateEventFromGLsyncKHR
+clCreateSubDevices
+clRetainDevice
+clReleaseDevice
+clCreateImage
+clCreateProgramWithBuiltInKernels
+clCompileProgram
+clLinkProgram
+clUnloadPlatformCompiler
+clGetKernelArgInfo
+clEnqueueFillBuffer
+clEnqueueFillImage
+clEnqueueMigrateMemObjects
+clEnqueueMarkerWithWaitList
+clEnqueueBarrierWithWaitList
+clGetExtensionFunctionAddressForPlatform
+clCreateFromGLTexture
+clGetDeviceIDsFromD3D11KHR
+clCreateFromD3D11BufferKHR
+clCreateFromD3D11Texture2DKHR
+clCreateFromD3D11Texture3DKHR
+clCreateFromDX9MediaSurfaceKHR
+clEnqueueAcquireD3D11ObjectsKHR
+clEnqueueReleaseD3D11ObjectsKHR
+clGetDeviceIDsFromDX9MediaAdapterKHR
+clEnqueueAcquireDX9MediaSurfacesKHR
+clEnqueueReleaseDX9MediaSurfacesKHR
diff --git a/tests/apps/CMakeLists.txt b/tests/apps/CMakeLists.txt
new file mode 100644
index 0000000..0dff241
--- /dev/null
+++ b/tests/apps/CMakeLists.txt
@@ -0,0 +1,33 @@
+# CMakeLists.txt (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+# Add app tests
+foreach(test
+ vecadd)
+
+ add_executable(${test} ${test}/${test}.c)
+ target_link_libraries(${test} oclgrind-rt)
+
+ # Generate test binaries in same dir as Oclgrind libraries on Windows
+ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ add_test(app_${test} "${CMAKE_BINARY_DIR}/${test}")
+ set_target_properties(${test} PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+ else()
+ add_test(app_${test} "${test}/${test}")
+ set_target_properties(${test} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${test}")
+ set_target_properties(${test} PROPERTIES LINKER_LANGUAGE CXX)
+ endif()
+
+ set_tests_properties(app_${test} PROPERTIES DEPENDS ${test})
+
+ # Set PCH directory
+ set_tests_properties(app_${test} PROPERTIES
+ ENVIRONMENT "OCLGRIND_PCH_DIR=${CMAKE_BINARY_DIR}/include/oclgrind")
+
+endforeach(${test})
diff --git a/tests/apps/vecadd/vecadd.c b/tests/apps/vecadd/vecadd.c
new file mode 100644
index 0000000..22d55ed
--- /dev/null
+++ b/tests/apps/vecadd/vecadd.c
@@ -0,0 +1,190 @@
+#include <CL/cl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define TOL 1e-8
+#define MAX_ERRORS 8
+#define MAX_PLATFORMS 8
+
+const char *KERNEL_SOURCE =
+"kernel void vecadd(global float *a, \n"
+" global float *b, \n"
+" global float *c) \n"
+"{ \n"
+" int i = get_global_id(0); \n"
+" c[i] = a[i] + b[i]; \n"
+"} \n"
+;
+
+void checkError(cl_int err, const char *operation);
+
+int main(int argc, char *argv[])
+{
+ cl_int err;
+ cl_platform_id platform;
+ cl_device_id device;
+ cl_context context;
+ cl_command_queue queue;
+ cl_program program;
+ cl_kernel kernel;
+ cl_mem d_a, d_b, d_c;
+ float *h_a, *h_b, *h_c;
+
+ size_t N = 1024;
+ if (argc > 1)
+ {
+ N = atoi(argv[1]);
+ }
+
+ size_t global = N;
+ if (argc > 2)
+ {
+ global = atoi(argv[2]);
+ }
+
+ if (!N || !global)
+ {
+ printf("Usage: ./vecadd N [GLOBAL_SIZE]\n");
+ exit(1);
+ }
+
+ // Get list of platforms
+ cl_uint numPlatforms = 0;
+ cl_platform_id platforms[MAX_PLATFORMS];
+ err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms);
+ checkError(err, "getting platforms");
+
+ // Find Oclgrind
+ platform = NULL;
+ for (int i = 0; i < numPlatforms; i++)
+ {
+ char name[256];
+ err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 256, name, NULL);
+ checkError(err, "getting platform name");
+ if (!strcmp(name, "Oclgrind"))
+ {
+ platform = platforms[i];
+ break;
+ }
+ }
+ if (!platform)
+ {
+ fprintf(stderr, "Unable to find Oclgrind platform\n");
+ exit(1);
+ }
+
+ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
+ checkError(err, "getting device");
+
+ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+ checkError(err, "creating context");
+
+ queue = clCreateCommandQueue(context, device, 0, &err);
+ checkError(err, "creating command queue");
+
+ program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err);
+ checkError(err, "creating program");
+
+ err = clBuildProgram(program, 1, &device, "", NULL, NULL);
+ if (err == CL_BUILD_PROGRAM_FAILURE)
+ {
+ size_t sz;
+ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+ sizeof(size_t), NULL, &sz);
+ char *buildLog = malloc(++sz);
+ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+ sz, buildLog, NULL);
+ fprintf(stderr, "%s\n", buildLog);
+ }
+ checkError(err, "building program");
+
+ kernel = clCreateKernel(program, "vecadd", &err);
+ checkError(err, "creating kernel");
+
+ size_t dataSize = N*sizeof(cl_float);
+
+ // Initialise host data
+ srand(0);
+ h_a = malloc(dataSize);
+ h_b = malloc(dataSize);
+ h_c = malloc(dataSize);
+ for (int i = 0; i < N; i++)
+ {
+ h_a[i] = rand()/(float)RAND_MAX;
+ h_b[i] = rand()/(float)RAND_MAX;
+ h_c[i] = 0;
+ }
+
+ d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+ checkError(err, "creating d_a buffer");
+ d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+ checkError(err, "creating d_b buffer");
+ d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
+ checkError(err, "creating d_c buffer");
+
+ err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE,
+ 0, dataSize, h_a, 0, NULL, NULL);
+ checkError(err, "writing d_a data");
+ err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE,
+ 0, dataSize, h_b, 0, NULL, NULL);
+ checkError(err, "writing d_b data");
+ err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE,
+ 0, dataSize, h_c, 0, NULL, NULL);
+ checkError(err, "writing d_c data");
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+ err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+ err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+ checkError(err, "setting kernel args");
+
+ err = clEnqueueNDRangeKernel(queue, kernel,
+ 1, NULL, &global, NULL, 0, NULL, NULL);
+ checkError(err, "enqueuing kernel");
+
+ err = clFinish(queue);
+ checkError(err, "running kernel");
+
+ err = clEnqueueReadBuffer(queue, d_c, CL_TRUE,
+ 0, dataSize, h_c, 0, NULL, NULL);
+ checkError(err, "reading d_c data");
+
+ // Check results
+ int errors = 0;
+ for (int i = 0; i < N; i++)
+ {
+ float ref = h_a[i] + h_b[i];
+ if (fabs(ref - h_c[i]) > TOL)
+ {
+ if (errors < MAX_ERRORS)
+ {
+ fprintf(stderr, "%4d: %.4f != %.4f\n", i, h_c[i], ref);
+ }
+ errors++;
+ }
+ }
+ printf("%d errors detected\n", errors);
+
+ free(h_a);
+ free(h_b);
+ free(h_c);
+ clReleaseMemObject(d_a);
+ clReleaseMemObject(d_b);
+ clReleaseMemObject(d_c);
+ clReleaseKernel(kernel);
+ clReleaseProgram(program);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(context);
+
+ return (errors != 0);
+}
+
+void checkError(cl_int err, const char *operation)
+{
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
+ exit(1);
+ }
+}
diff --git a/tests/kernels/TESTS b/tests/kernels/TESTS
new file mode 100644
index 0000000..2ac8723
--- /dev/null
+++ b/tests/kernels/TESTS
@@ -0,0 +1,56 @@
+alignment/packed
+alignment/unaligned
+async_copy/async_copy
+async_copy/async_copy_divergent
+async_copy/async_copy_global_race
+async_copy/async_copy_local_race
+async_copy/async_copy_loop
+async_copy/async_copy_loop_divergent
+async_copy/async_copy_single_wi
+async_copy/async_copy_unwaited
+atomics/atomic_cmpxchg_false_race
+atomics/atomic_cmpxchg_read_race
+atomics/atomic_cmpxchg_write_race
+atomics/atomic_global_fence
+atomics/atomic_global_fence_race
+atomics/atomic_increment
+atomics/atomic_intergroup_race
+atomics/atomic_local_fence
+atomics/atomic_race_after
+atomics/atomic_race_before
+atomics/atomic_same_workitem
+barrier/barrier_different_instructions
+barrier/barrier_divergence
+bugs/gvn_arbitrary_integers
+bugs/kernel_struct_argument
+bugs/many_alloca
+bugs/multidim_array_in_struct
+bugs/null_argument
+bugs/sroa_addrspace_cast
+data-race/broadcast
+data-race/global_fence
+data-race/global_only_fence
+data-race/global_read_write_race
+data-race/global_write_write_race
+data-race/increment
+data-race/intergroup_hidden_race
+data-race/intragroup_hidden_race
+data-race/intergroup_race
+data-race/local_only_fence
+data-race/local_read_write_race
+data-race/local_write_write_race
+data-race/uniform_write_race
+memcheck/async_copy_out_of_bounds
+memcheck/atomic_out_of_bounds
+memcheck/dereference_null
+memcheck/read_out_of_bounds
+memcheck/read_write_only_memory
+memcheck/write_out_of_bounds
+memcheck/write_read_only_memory
+misc/array
+misc/reduce
+misc/vecadd
+wait_event/wait_event_chained
+wait_event/wait_event_divergent
+wait_event/wait_event_duplicates
+wait_event/wait_event_invalid
\ No newline at end of file
diff --git a/tests/kernels/alignment/packed.cl b/tests/kernels/alignment/packed.cl
new file mode 100644
index 0000000..f6d5391
--- /dev/null
+++ b/tests/kernels/alignment/packed.cl
@@ -0,0 +1,10 @@
+struct __attribute__((packed)) Foo
+{
+ char a;
+ int b;
+};
+
+kernel void packed(struct Foo x, global int *out)
+{
+ *out = x.b;
+}
diff --git a/tests/kernels/alignment/packed.ref b/tests/kernels/alignment/packed.ref
new file mode 100644
index 0000000..df23fc9
--- /dev/null
+++ b/tests/kernels/alignment/packed.ref
@@ -0,0 +1,4 @@
+
+Argument 'out': 4 bytes
+ out[0] = 2
+
diff --git a/tests/kernels/alignment/packed.sim b/tests/kernels/alignment/packed.sim
new file mode 100644
index 0000000..46e9090
--- /dev/null
+++ b/tests/kernels/alignment/packed.sim
@@ -0,0 +1,10 @@
+packed.cl
+packed
+1 1 1
+1 1 1
+
+<size=5 char hex>
+0x01
+0x02 0x00 0x00 0x00
+
+<size=4 fill=0 dump>
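
Read informally from the tests in this patch, a .sim file names the kernel
source file and the kernel entry point, gives the global and then the local
NDRange size, and then describes each kernel argument in declaration order.
The <size=N ...> descriptors may carry explicit data, a fill=<value>, or a
range=start:step:end initialiser, and 'dump' requests that the buffer be
printed after execution, which is what the matching .ref files record. An
annotated copy of packed.sim follows; the trailing '#' annotations are added
here for exposition and are not claimed to be part of the format:

packed.cl               # kernel source file
packed                  # kernel entry point
1 1 1                   # global work size (x y z)
1 1 1                   # work-group size (x y z)

<size=5 char hex>       # argument 0: 5 bytes, listed below as hex values
0x01
0x02 0x00 0x00 0x00

<size=4 fill=0 dump>    # argument 1: 4 bytes, zero-filled, printed after the run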
diff --git a/tests/kernels/alignment/unaligned.cl b/tests/kernels/alignment/unaligned.cl
new file mode 100644
index 0000000..140607e
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.cl
@@ -0,0 +1,6 @@
+kernel void unaligned(global int *in, global int *out)
+{
+ global char *char_ptr = (global char*)in + 2;
+ global int *address = (global int*)char_ptr;
+ *out = *address;
+}
diff --git a/tests/kernels/alignment/unaligned.ref b/tests/kernels/alignment/unaligned.ref
new file mode 100644
index 0000000..1114e03
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'out': 4 bytes
+ out[0] = 2752512
+
diff --git a/tests/kernels/alignment/unaligned.sim b/tests/kernels/alignment/unaligned.sim
new file mode 100644
index 0000000..70211e3
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.sim
@@ -0,0 +1,7 @@
+unaligned.cl
+unaligned
+1 1 1
+1 1 1
+
+<size=8 fill=42>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/async_copy/async_copy.cl b/tests/kernels/async_copy/async_copy.cl
new file mode 100644
index 0000000..a5280ce
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.cl
@@ -0,0 +1,8 @@
+kernel void async_copy(global int *data, local int *scratch)
+{
+ event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+ wait_group_events(1, &event);
+
+ int i = get_local_id(0);
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy.ref b/tests/kernels/async_copy/async_copy.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy.sim b/tests/kernels/async_copy/async_copy.sim
new file mode 100644
index 0000000..58ec323
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.sim
@@ -0,0 +1,7 @@
+async_copy.cl
+async_copy
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_divergent.cl b/tests/kernels/async_copy/async_copy_divergent.cl
new file mode 100644
index 0000000..f428a10
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.cl
@@ -0,0 +1,14 @@
+kernel void async_copy_divergent(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+ size_t size = get_local_size(0);
+ if (i == size-1)
+ {
+ size = 1;
+ }
+
+ event_t event = async_work_group_copy(scratch, data, size, 0);
+ wait_group_events(1, &event);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_divergent.ref b/tests/kernels/async_copy/async_copy_divergent.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_divergent.sim b/tests/kernels/async_copy/async_copy_divergent.sim
new file mode 100644
index 0000000..ef59d1a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.sim
@@ -0,0 +1,7 @@
+async_copy_divergent.cl
+async_copy_divergent
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_global_race.cl b/tests/kernels/async_copy/async_copy_global_race.cl
new file mode 100644
index 0000000..bf2684a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.cl
@@ -0,0 +1,11 @@
+kernel void async_copy_global_race(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+ scratch[i] = i;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ data[i] = 0;
+
+ event_t event = async_work_group_copy(data, scratch, get_local_size(0), 0);
+ wait_group_events(1, &event);
+}
diff --git a/tests/kernels/async_copy/async_copy_global_race.ref b/tests/kernels/async_copy/async_copy_global_race.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 0
+ data[1] = 1
+ data[2] = 2
+ data[3] = 3
+
diff --git a/tests/kernels/async_copy/async_copy_global_race.sim b/tests/kernels/async_copy/async_copy_global_race.sim
new file mode 100644
index 0000000..9ff8835
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.sim
@@ -0,0 +1,7 @@
+async_copy_global_race.cl
+async_copy_global_race
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_local_race.cl b/tests/kernels/async_copy/async_copy_local_race.cl
new file mode 100644
index 0000000..02fd84e
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.cl
@@ -0,0 +1,10 @@
+kernel void async_copy_local_race(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+
+ scratch[i] = 0;
+ event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+ wait_group_events(1, &event);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_local_race.ref b/tests/kernels/async_copy/async_copy_local_race.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_local_race.sim b/tests/kernels/async_copy/async_copy_local_race.sim
new file mode 100644
index 0000000..5506a1a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.sim
@@ -0,0 +1,7 @@
+async_copy_local_race.cl
+async_copy_local_race
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_loop.cl b/tests/kernels/async_copy/async_copy_loop.cl
new file mode 100644
index 0000000..caff0c3
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.cl
@@ -0,0 +1,14 @@
+kernel void async_copy_loop(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+
+ event_t event = 0;
+ for (int j = 0; j < get_local_size(0); j++)
+ {
+ int offset = j;
+ event = async_work_group_copy(scratch+offset, data+offset, 1, event);
+ }
+ wait_group_events(1, &event);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_loop.ref b/tests/kernels/async_copy/async_copy_loop.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_loop.sim b/tests/kernels/async_copy/async_copy_loop.sim
new file mode 100644
index 0000000..7f4fbd2
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.sim
@@ -0,0 +1,7 @@
+async_copy_loop.cl
+async_copy_loop
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.cl b/tests/kernels/async_copy/async_copy_loop_divergent.cl
new file mode 100644
index 0000000..5d7f399
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.cl
@@ -0,0 +1,19 @@
+kernel void async_copy_loop_divergent(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+
+ event_t event = 0;
+ for (int j = 0; j < get_local_size(0); j++)
+ {
+ int offset = j;
+ if (i == 2 && j == 2)
+ {
+ offset = 0;
+ }
+
+ event = async_work_group_copy(scratch+offset, data+offset, 1, event);
+ }
+ wait_group_events(1, &event);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.ref b/tests/kernels/async_copy/async_copy_loop_divergent.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.sim b/tests/kernels/async_copy/async_copy_loop_divergent.sim
new file mode 100644
index 0000000..6c2da0e
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.sim
@@ -0,0 +1,7 @@
+async_copy_loop_divergent.cl
+async_copy_loop_divergent
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_single_wi.cl b/tests/kernels/async_copy/async_copy_single_wi.cl
new file mode 100644
index 0000000..40cac34
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.cl
@@ -0,0 +1,13 @@
+kernel void async_copy_single_wi(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+ event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+ if (i == 0)
+ {
+ // An extra copy that will only be registered by one work-item
+ event = async_work_group_copy(scratch, data, 1, event);
+ }
+ wait_group_events(1, &event);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_single_wi.ref b/tests/kernels/async_copy/async_copy_single_wi.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_single_wi.sim b/tests/kernels/async_copy/async_copy_single_wi.sim
new file mode 100644
index 0000000..fc4c50a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.sim
@@ -0,0 +1,7 @@
+async_copy_single_wi.cl
+async_copy_single_wi
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/async_copy/async_copy_unwaited.cl b/tests/kernels/async_copy/async_copy_unwaited.cl
new file mode 100644
index 0000000..5c64771
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.cl
@@ -0,0 +1,7 @@
+kernel void async_copy_unwaited(global int *data, local int *scratch)
+{
+ event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+
+ int i = get_local_id(0);
+ data[get_local_size(0)-i-1] = i;
+}
diff --git a/tests/kernels/async_copy/async_copy_unwaited.ref b/tests/kernels/async_copy/async_copy_unwaited.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/async_copy/async_copy_unwaited.sim b/tests/kernels/async_copy/async_copy_unwaited.sim
new file mode 100644
index 0000000..698f053
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.sim
@@ -0,0 +1,7 @@
+async_copy_unwaited.cl
+async_copy_unwaited
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.cl b/tests/kernels/atomics/atomic_cmpxchg_false_race.cl
new file mode 100644
index 0000000..dda8dde
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.cl
@@ -0,0 +1,36 @@
+kernel void atomic_cmpxchg_false_race(global int *data, local int *scratch)
+{
+ int l = get_local_id(0);
+ if (l == 0)
+ {
+ scratch[0] = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ bool done = false;
+ int before, old;
+ int result;
+ for (int i = 0; i < get_local_size(0); i++)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ before = scratch[0];
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (!done)
+ {
+ old = atomic_cmpxchg(scratch, before, before+1);
+ if (old == before)
+ {
+ done = true;
+ result = scratch[0];
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (l == 0)
+ {
+ *data = *scratch;
+ }
+ data[l+1] = result;
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.ref b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
new file mode 100644
index 0000000..fe14281
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
@@ -0,0 +1,8 @@
+
+Argument 'data': 20 bytes
+ data[0] = 4
+ data[1] = 1
+ data[2] = 2
+ data[3] = 3
+ data[4] = 4
+
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.sim b/tests/kernels/atomics/atomic_cmpxchg_false_race.sim
new file mode 100644
index 0000000..f926a6e
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.sim
@@ -0,0 +1,7 @@
+atomic_cmpxchg_false_race.cl
+atomic_cmpxchg_false_race
+4 1 1
+4 1 1
+
+<size=20 fill=0 dump>
+<size=4>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.cl b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
new file mode 100644
index 0000000..9be3a88
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
@@ -0,0 +1,12 @@
+kernel void atomic_cmpxchg_read_race(global int *data)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ {
+ *data = 0;
+ }
+ else
+ {
+ atomic_cmpxchg(data, 0, i);
+ }
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
new file mode 100644
index 0000000..b398c6c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 1
+
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.sim b/tests/kernels/atomics/atomic_cmpxchg_read_race.sim
new file mode 100644
index 0000000..daa580c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.sim
@@ -0,0 +1,7 @@
+atomic_cmpxchg_read_race.cl
+atomic_cmpxchg_read_race
+2 1 1
+2 1 1
+
+<size=4 dump>
+-1
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.cl b/tests/kernels/atomics/atomic_cmpxchg_write_race.cl
new file mode 100644
index 0000000..b78cc6d
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.cl
@@ -0,0 +1,9 @@
+kernel void atomic_cmpxchg_write_race(global int *data)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ {
+ *data = 0;
+ }
+ atomic_cmpxchg(data, i, 42);
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.ref b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
new file mode 100644
index 0000000..af96d9b
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 42
+
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.sim b/tests/kernels/atomics/atomic_cmpxchg_write_race.sim
new file mode 100644
index 0000000..74591bf
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.sim
@@ -0,0 +1,7 @@
+atomic_cmpxchg_write_race.cl
+atomic_cmpxchg_write_race
+2 1 1
+2 1 1
+
+<size=4 dump>
+-1
diff --git a/tests/kernels/atomics/atomic_global_fence.cl b/tests/kernels/atomics/atomic_global_fence.cl
new file mode 100644
index 0000000..a4edf11
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.cl
@@ -0,0 +1,17 @@
+kernel void atomic_global_fence(global int *data, global int *scratch)
+{
+ int i = get_global_id(0);
+ int l = get_local_id(0);
+ int g = get_group_id(0);
+ if (l == 0)
+ {
+ scratch[g] = 0;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ atomic_add(scratch+g, i);
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ if (l == 0)
+ {
+ data[g] = scratch[g];
+ }
+}
diff --git a/tests/kernels/atomics/atomic_global_fence.ref b/tests/kernels/atomics/atomic_global_fence.ref
new file mode 100644
index 0000000..a7bf48a
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.ref
@@ -0,0 +1,5 @@
+
+Argument 'data': 8 bytes
+ data[0] = 6
+ data[1] = 22
+
diff --git a/tests/kernels/atomics/atomic_global_fence.sim b/tests/kernels/atomics/atomic_global_fence.sim
new file mode 100644
index 0000000..76f685c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.sim
@@ -0,0 +1,7 @@
+atomic_global_fence.cl
+atomic_global_fence
+8 1 1
+4 1 1
+
+<size=8 fill=0 dump>
+<size=8 fill=-1>
diff --git a/tests/kernels/atomics/atomic_global_fence_race.cl b/tests/kernels/atomics/atomic_global_fence_race.cl
new file mode 100644
index 0000000..a84cbb7
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.cl
@@ -0,0 +1,12 @@
+kernel void atomic_global_fence_race(global int *data, global int *scratch)
+{
+ int i = get_global_id(0);
+ int l = get_local_id(0);
+ int g = get_group_id(0);
+ atomic_add(scratch, i);
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ if (l == 0)
+ {
+ data[g] = *scratch;
+ }
+}
diff --git a/tests/kernels/atomics/atomic_global_fence_race.ref b/tests/kernels/atomics/atomic_global_fence_race.ref
new file mode 100644
index 0000000..4920bcf
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.ref
@@ -0,0 +1,6 @@
+ERROR EXPECTED
+
+Argument 'data': 8 bytes
+ data[0] = 6
+ data[1] = 28
+
diff --git a/tests/kernels/atomics/atomic_global_fence_race.sim b/tests/kernels/atomics/atomic_global_fence_race.sim
new file mode 100644
index 0000000..af77d6c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.sim
@@ -0,0 +1,7 @@
+atomic_global_fence_race.cl
+atomic_global_fence_race
+8 1 1
+4 1 1
+
+<size=8 fill=0 dump>
+<size=4 fill=0>
diff --git a/tests/kernels/atomics/atomic_increment.cl b/tests/kernels/atomics/atomic_increment.cl
new file mode 100644
index 0000000..e9a11fa
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.cl
@@ -0,0 +1,4 @@
+kernel void atomic_increment(global int *data)
+{
+ atomic_inc(data);
+}
diff --git a/tests/kernels/atomics/atomic_increment.ref b/tests/kernels/atomics/atomic_increment.ref
new file mode 100644
index 0000000..f61189d
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.ref
@@ -0,0 +1,4 @@
+
+Argument 'data': 4 bytes
+ data[0] = 4
+
diff --git a/tests/kernels/atomics/atomic_increment.sim b/tests/kernels/atomics/atomic_increment.sim
new file mode 100644
index 0000000..38e2866
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.sim
@@ -0,0 +1,6 @@
+atomic_increment.cl
+atomic_increment
+4 1 1
+1 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_intergroup_race.cl b/tests/kernels/atomics/atomic_intergroup_race.cl
new file mode 100644
index 0000000..b8d70f6
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.cl
@@ -0,0 +1,10 @@
+kernel void atomic_intergroup_race(global int *data)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ {
+ *data = 0;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ atomic_inc(data);
+}
diff --git a/tests/kernels/atomics/atomic_intergroup_race.ref b/tests/kernels/atomics/atomic_intergroup_race.ref
new file mode 100644
index 0000000..cab3430
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 8
+
diff --git a/tests/kernels/atomics/atomic_intergroup_race.sim b/tests/kernels/atomics/atomic_intergroup_race.sim
new file mode 100644
index 0000000..2516334
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.sim
@@ -0,0 +1,6 @@
+atomic_intergroup_race.cl
+atomic_intergroup_race
+8 1 1
+4 1 1
+
+<size=4 fill=-1 dump>
diff --git a/tests/kernels/atomics/atomic_local_fence.cl b/tests/kernels/atomics/atomic_local_fence.cl
new file mode 100644
index 0000000..e9227a5
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.cl
@@ -0,0 +1,17 @@
+kernel void atomic_local_fence(global int *data, local int *scratch)
+{
+ int i = get_global_id(0);
+ int l = get_local_id(0);
+ int g = get_group_id(0);
+ if (l == 0)
+ {
+ *scratch = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ atomic_add(scratch, i);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (l == 0)
+ {
+ data[g] = *scratch;
+ }
+}
diff --git a/tests/kernels/atomics/atomic_local_fence.ref b/tests/kernels/atomics/atomic_local_fence.ref
new file mode 100644
index 0000000..a7bf48a
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.ref
@@ -0,0 +1,5 @@
+
+Argument 'data': 8 bytes
+ data[0] = 6
+ data[1] = 22
+
diff --git a/tests/kernels/atomics/atomic_local_fence.sim b/tests/kernels/atomics/atomic_local_fence.sim
new file mode 100644
index 0000000..6abffee
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.sim
@@ -0,0 +1,7 @@
+atomic_local_fence.cl
+atomic_local_fence
+8 1 1
+4 1 1
+
+<size=8 fill=0 dump>
+<size=4>
diff --git a/tests/kernels/atomics/atomic_race_after.cl b/tests/kernels/atomics/atomic_race_after.cl
new file mode 100644
index 0000000..d168053
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.cl
@@ -0,0 +1,8 @@
+kernel void atomic_race_after(global int *data)
+{
+ atomic_inc(data);
+ if (get_global_id(0) == get_global_size(0)-1)
+ {
+ (*data)++;
+ }
+}
diff --git a/tests/kernels/atomics/atomic_race_after.ref b/tests/kernels/atomics/atomic_race_after.ref
new file mode 100644
index 0000000..bc902a8
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 5
+
diff --git a/tests/kernels/atomics/atomic_race_after.sim b/tests/kernels/atomics/atomic_race_after.sim
new file mode 100644
index 0000000..d182089
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.sim
@@ -0,0 +1,6 @@
+atomic_race_after.cl
+atomic_race_after
+4 1 1
+4 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_race_before.cl b/tests/kernels/atomics/atomic_race_before.cl
new file mode 100644
index 0000000..53db050
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.cl
@@ -0,0 +1,8 @@
+kernel void atomic_race_before(global int *data)
+{
+ if (get_global_id(0) == 0)
+ {
+ *data = 0;
+ }
+ atomic_inc(data);
+}
diff --git a/tests/kernels/atomics/atomic_race_before.ref b/tests/kernels/atomics/atomic_race_before.ref
new file mode 100644
index 0000000..6ecedc3
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 4
+
diff --git a/tests/kernels/atomics/atomic_race_before.sim b/tests/kernels/atomics/atomic_race_before.sim
new file mode 100644
index 0000000..109c204
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.sim
@@ -0,0 +1,6 @@
+atomic_race_before.cl
+atomic_race_before
+4 1 1
+4 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_same_workitem.cl b/tests/kernels/atomics/atomic_same_workitem.cl
new file mode 100644
index 0000000..735c5e9
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.cl
@@ -0,0 +1,14 @@
+kernel void atomic_same_workitem(global int *data)
+{
+ int i = get_global_id(0);
+ if ((i % 2) == 0)
+ {
+ data[i] = 0;
+ atomic_inc(data+i);
+ }
+ else
+ {
+ atomic_inc(data+i);
+ data[i] = data[i] + 1;
+ }
+}
diff --git a/tests/kernels/atomics/atomic_same_workitem.ref b/tests/kernels/atomics/atomic_same_workitem.ref
new file mode 100644
index 0000000..3ef3ca7
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 1
+ data[1] = 2
+ data[2] = 1
+ data[3] = 2
+
diff --git a/tests/kernels/atomics/atomic_same_workitem.sim b/tests/kernels/atomics/atomic_same_workitem.sim
new file mode 100644
index 0000000..2e3f210
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.sim
@@ -0,0 +1,6 @@
+atomic_same_workitem.cl
+atomic_same_workitem
+4 1 1
+4 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/barrier/barrier_different_instructions.cl b/tests/kernels/barrier/barrier_different_instructions.cl
new file mode 100644
index 0000000..bd4d00a
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.cl
@@ -0,0 +1,14 @@
+kernel void barrier_different_instructions(global int *data)
+{
+ int i = get_global_id(0);
+ if (i == 0)
+ {
+ data[0] = 42;
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ }
+ else
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ data[i] = i + data[0];
+ }
+}
diff --git a/tests/kernels/barrier/barrier_different_instructions.ref b/tests/kernels/barrier/barrier_different_instructions.ref
new file mode 100644
index 0000000..3ffaa5a
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 42
+ data[1] = 43
+ data[2] = 44
+ data[3] = 45
+
diff --git a/tests/kernels/barrier/barrier_different_instructions.sim b/tests/kernels/barrier/barrier_different_instructions.sim
new file mode 100644
index 0000000..96afa08
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.sim
@@ -0,0 +1,6 @@
+barrier_different_instructions.cl
+barrier_different_instructions
+4 1 1
+4 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/barrier/barrier_divergence.cl b/tests/kernels/barrier/barrier_divergence.cl
new file mode 100644
index 0000000..c64a7c4
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.cl
@@ -0,0 +1,9 @@
+kernel void barrier_divergence(global int *data)
+{
+ int i = get_global_id(0);
+ if (i != 0)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ }
+ data[i] = i;
+}
diff --git a/tests/kernels/barrier/barrier_divergence.ref b/tests/kernels/barrier/barrier_divergence.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 0
+ data[1] = 1
+ data[2] = 2
+ data[3] = 3
+
diff --git a/tests/kernels/barrier/barrier_divergence.sim b/tests/kernels/barrier/barrier_divergence.sim
new file mode 100644
index 0000000..aa68728
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.sim
@@ -0,0 +1,6 @@
+barrier_divergence.cl
+barrier_divergence
+4 1 1
+4 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.cl b/tests/kernels/bugs/gvn_arbitrary_integers.cl
new file mode 100644
index 0000000..38c6f52
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.cl
@@ -0,0 +1,8 @@
+__kernel void gvn_arbitrary_integers(__global int *source,
+ __global int *dest)
+{
+ size_t i = get_global_id(0);
+ int3 tmp = 0;
+ tmp.S2 = source[i];
+ vstore3(tmp, 0, dest);
+}
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.ref b/tests/kernels/bugs/gvn_arbitrary_integers.ref
new file mode 100644
index 0000000..fafe2ec
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.ref
@@ -0,0 +1,6 @@
+
+Argument 'dest': 12 bytes
+ dest[0] = 0
+ dest[1] = 0
+ dest[2] = 42
+
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.sim b/tests/kernels/bugs/gvn_arbitrary_integers.sim
new file mode 100644
index 0000000..064c5b4
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.sim
@@ -0,0 +1,7 @@
+gvn_arbitrary_integers.cl
+gvn_arbitrary_integers
+1 1 1
+1 1 1
+
+<size=4 fill=42>
+<size=12 fill=0 dump>
diff --git a/tests/kernels/bugs/kernel_struct_argument.cl b/tests/kernels/bugs/kernel_struct_argument.cl
new file mode 100644
index 0000000..8b8af8b
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.cl
@@ -0,0 +1,11 @@
+typedef struct
+{
+ float a;
+ float b;
+ float c;
+} Structure;
+
+kernel void kernel_struct_argument(Structure x, global float *out)
+{
+ *out = x.a * x.b + x.c;
+}
diff --git a/tests/kernels/bugs/kernel_struct_argument.ref b/tests/kernels/bugs/kernel_struct_argument.ref
new file mode 100644
index 0000000..b8c7e51
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.ref
@@ -0,0 +1,4 @@
+
+Argument 'out': 4 bytes
+ out[0] = 144
+
diff --git a/tests/kernels/bugs/kernel_struct_argument.sim b/tests/kernels/bugs/kernel_struct_argument.sim
new file mode 100644
index 0000000..4ff650d
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.sim
@@ -0,0 +1,11 @@
+kernel_struct_argument.cl
+kernel_struct_argument
+1 1 1
+1 1 1
+
+<size=12 float>
+42
+3
+18
+
+<size=4 dump fill=0>
diff --git a/tests/kernels/bugs/many_alloca.cl b/tests/kernels/bugs/many_alloca.cl
new file mode 100644
index 0000000..00d9fd1
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.cl
@@ -0,0 +1,21 @@
+void bar(int *x)
+{
+ *x += 1;
+}
+
+int foo()
+{
+ int x = 0;
+ bar(&x);
+ return x;
+}
+
+kernel void many_alloca(global int *data, int n)
+{
+ int x = 0;
+ for (int i = 0; i < n; i++)
+ {
+ x += foo();
+ }
+ data[get_global_id(0)] = x;
+}
diff --git a/tests/kernels/bugs/many_alloca.ref b/tests/kernels/bugs/many_alloca.ref
new file mode 100644
index 0000000..201d55d
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.ref
@@ -0,0 +1,4 @@
+
+Argument 'data': 4 bytes
+ data[0] = 100000
+
diff --git a/tests/kernels/bugs/many_alloca.sim b/tests/kernels/bugs/many_alloca.sim
new file mode 100644
index 0000000..3df81c9
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.sim
@@ -0,0 +1,9 @@
+many_alloca.cl
+many_alloca
+1 1 1
+1 1 1
+
+<size=4 fill=0 dump>
+
+<size=4>
+100000
diff --git a/tests/kernels/bugs/multidim_array_in_struct.cl b/tests/kernels/bugs/multidim_array_in_struct.cl
new file mode 100644
index 0000000..11ecf21
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.cl
@@ -0,0 +1,40 @@
+//
+// Issue #64 on GitHub:
+// https://github.com/jrprice/Oclgrind/issues/64
+//
+// Required alignment for multi-dimensional arrays was incorrect.
+//
+
+struct S0
+{
+ uchar a;
+ ulong b[2][3][1];
+};
+
+kernel void multidim_array_in_struct(global ulong *output)
+{
+ struct S0 s =
+ {
+ 1UL,
+ {
+ {
+ {1L},
+ {1L},
+ {1L}
+ },
+ {
+ {1L},
+ {1L},
+ {1L}
+ }
+ },
+ };
+
+ ulong c = 0UL;
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 3; j++)
+ for (int k = 0; k < 1; k++)
+ c += s.b[i][j][k];
+
+ *output = c;
+}
diff --git a/tests/kernels/bugs/multidim_array_in_struct.ref b/tests/kernels/bugs/multidim_array_in_struct.ref
new file mode 100644
index 0000000..f9606f2
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.ref
@@ -0,0 +1,4 @@
+
+Argument 'output': 8 bytes
+ output[0] = 6
+
diff --git a/tests/kernels/bugs/multidim_array_in_struct.sim b/tests/kernels/bugs/multidim_array_in_struct.sim
new file mode 100644
index 0000000..07443f7
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.sim
@@ -0,0 +1,13 @@
+#
+# Issue #64 on GitHub:
+# https://github.com/jrprice/Oclgrind/issues/64
+#
+# Required alignment for multi-dimensional arrays was incorrect.
+#
+
+multidim_array_in_struct.cl
+multidim_array_in_struct
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/bugs/null_argument.cl b/tests/kernels/bugs/null_argument.cl
new file mode 100644
index 0000000..d987861
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.cl
@@ -0,0 +1,9 @@
+ulong func_1(ulong * p_1)
+{
+ return 1;
+}
+
+kernel void null_argument(global ulong *output)
+{
+ *output = func_1((void*)0);
+}
diff --git a/tests/kernels/bugs/null_argument.ref b/tests/kernels/bugs/null_argument.ref
new file mode 100644
index 0000000..dcf81cb
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.ref
@@ -0,0 +1,4 @@
+
+Argument 'output': 8 bytes
+ output[0] = 1
+
diff --git a/tests/kernels/bugs/null_argument.sim b/tests/kernels/bugs/null_argument.sim
new file mode 100644
index 0000000..eb55985
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.sim
@@ -0,0 +1,6 @@
+null_argument.cl
+null_argument
+1 1 1
+1 1 1
+
+<size=8 fill=0 dump>
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.cl b/tests/kernels/bugs/sroa_addrspace_cast.cl
new file mode 100644
index 0000000..1eac32c
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.cl
@@ -0,0 +1,12 @@
+typedef struct
+{
+ float x;
+} DataStruct;
+
+__kernel void sroa_addrspace_cast(__global DataStruct *input,
+ __global float *output)
+{
+ size_t i = get_global_id(0);
+ DataStruct s = input[i];
+ output[i] = s.x;
+}
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.ref b/tests/kernels/bugs/sroa_addrspace_cast.ref
new file mode 100644
index 0000000..2fff44c
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.ref
@@ -0,0 +1,4 @@
+
+Argument 'output': 4 bytes
+ output[0] = 42.24
+
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.sim b/tests/kernels/bugs/sroa_addrspace_cast.sim
new file mode 100644
index 0000000..5d26265
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.sim
@@ -0,0 +1,7 @@
+sroa_addrspace_cast.cl
+sroa_addrspace_cast
+1 1 1
+1 1 1
+
+<size=4 float fill=42.24>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/data-race/broadcast.cl b/tests/kernels/data-race/broadcast.cl
new file mode 100644
index 0000000..674f9f3
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.cl
@@ -0,0 +1,5 @@
+kernel void broadcast(global int *value, global int *output)
+{
+ int i = get_global_id(0);
+ output[i] = value[0];
+}
diff --git a/tests/kernels/data-race/broadcast.ref b/tests/kernels/data-race/broadcast.ref
new file mode 100644
index 0000000..69790f7
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.ref
@@ -0,0 +1,7 @@
+
+Argument 'output': 16 bytes
+ output[0] = 42
+ output[1] = 42
+ output[2] = 42
+ output[3] = 42
+
diff --git a/tests/kernels/data-race/broadcast.sim b/tests/kernels/data-race/broadcast.sim
new file mode 100644
index 0000000..7663c4f
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.sim
@@ -0,0 +1,9 @@
+broadcast.cl
+broadcast
+4 1 1
+1 1 1
+
+<size=4>
+42
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_fence.cl b/tests/kernels/data-race/global_fence.cl
new file mode 100644
index 0000000..ed175f7
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.cl
@@ -0,0 +1,16 @@
+kernel void global_fence(global int *scratch, global int *output)
+{
+ int i = get_global_id(0);
+ int g = get_group_id(0);
+ scratch[i] = i;
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ if (get_local_id(0) == 0)
+ {
+ int x = 0;
+ for (int l = 0; l < get_local_size(0); l++)
+ {
+ x += scratch[get_local_size(0)*g + l];
+ }
+ output[g] = x;
+ }
+}
diff --git a/tests/kernels/data-race/global_fence.ref b/tests/kernels/data-race/global_fence.ref
new file mode 100644
index 0000000..342c29a
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.ref
@@ -0,0 +1,7 @@
+
+Argument 'output': 16 bytes
+ output[0] = 6
+ output[1] = 22
+ output[2] = 38
+ output[3] = 54
+
diff --git a/tests/kernels/data-race/global_fence.sim b/tests/kernels/data-race/global_fence.sim
new file mode 100644
index 0000000..088170f
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.sim
@@ -0,0 +1,7 @@
+global_fence.cl
+global_fence
+16 1 1
+4 1 1
+
+<size=64 fill=0>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_only_fence.cl b/tests/kernels/data-race/global_only_fence.cl
new file mode 100644
index 0000000..c1f83cb
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.cl
@@ -0,0 +1,16 @@
+kernel void global_only_fence(local int *scratch, global int *output)
+{
+ int l = get_local_id(0);
+ int g = get_group_id(0);
+ scratch[l] = l;
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ if (get_local_id(0) == 0)
+ {
+ int x = 0;
+ for (int i = 0; i < get_local_size(0); i++)
+ {
+ x += scratch[i];
+ }
+ output[g] = x;
+ }
+}
diff --git a/tests/kernels/data-race/global_only_fence.ref b/tests/kernels/data-race/global_only_fence.ref
new file mode 100644
index 0000000..5b62861
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'output': 16 bytes
+ output[0] = 6
+ output[1] = 0
+ output[2] = 0
+ output[3] = 0
+
diff --git a/tests/kernels/data-race/global_only_fence.sim b/tests/kernels/data-race/global_only_fence.sim
new file mode 100644
index 0000000..7bc05c6
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.sim
@@ -0,0 +1,7 @@
+global_only_fence.cl
+global_only_fence
+4 1 1
+4 1 1
+
+<size=16>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_read_write_race.cl b/tests/kernels/data-race/global_read_write_race.cl
new file mode 100644
index 0000000..7463e22
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.cl
@@ -0,0 +1,8 @@
+kernel void global_read_write_race(global int *data)
+{
+ int i = get_global_id(0);
+ if (i > 0)
+ {
+ data[i] = data[i-1];
+ }
+}
diff --git a/tests/kernels/data-race/global_read_write_race.ref b/tests/kernels/data-race/global_read_write_race.ref
new file mode 100644
index 0000000..7e1c317
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 0
+ data[1] = 0
+ data[2] = 0
+ data[3] = 0
+
diff --git a/tests/kernels/data-race/global_read_write_race.sim b/tests/kernels/data-race/global_read_write_race.sim
new file mode 100644
index 0000000..11077ab
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.sim
@@ -0,0 +1,6 @@
+global_read_write_race.cl
+global_read_write_race
+4 1 1
+1 1 1
+
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/data-race/global_write_write_race.cl b/tests/kernels/data-race/global_write_write_race.cl
new file mode 100644
index 0000000..53b5d9c
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.cl
@@ -0,0 +1,4 @@
+kernel void global_write_write_race(global int *data)
+{
+ data[0] = get_global_id(0);
+}
diff --git a/tests/kernels/data-race/global_write_write_race.ref b/tests/kernels/data-race/global_write_write_race.ref
new file mode 100644
index 0000000..0b31b65
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 3
+
diff --git a/tests/kernels/data-race/global_write_write_race.sim b/tests/kernels/data-race/global_write_write_race.sim
new file mode 100644
index 0000000..236990b
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.sim
@@ -0,0 +1,6 @@
+global_write_write_race.cl
+global_write_write_race
+4 1 1
+1 1 1
+
+<size=4 fill=0 dump>
diff --git a/tests/kernels/data-race/increment.cl b/tests/kernels/data-race/increment.cl
new file mode 100644
index 0000000..d00f274
--- /dev/null
+++ b/tests/kernels/data-race/increment.cl
@@ -0,0 +1,5 @@
+kernel void increment(global int *data)
+{
+ int i = get_global_id(0);
+ data[i] = data[i] + 1;
+}
diff --git a/tests/kernels/data-race/increment.ref b/tests/kernels/data-race/increment.ref
new file mode 100644
index 0000000..11a20e6
--- /dev/null
+++ b/tests/kernels/data-race/increment.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 1
+ data[1] = 2
+ data[2] = 3
+ data[3] = 4
+
diff --git a/tests/kernels/data-race/increment.sim b/tests/kernels/data-race/increment.sim
new file mode 100644
index 0000000..fc44402
--- /dev/null
+++ b/tests/kernels/data-race/increment.sim
@@ -0,0 +1,6 @@
+increment.cl
+increment
+4 1 1
+1 1 1
+
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/data-race/intergroup_hidden_race.cl b/tests/kernels/data-race/intergroup_hidden_race.cl
new file mode 100644
index 0000000..5ac0b99
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.cl
@@ -0,0 +1,9 @@
+kernel void intergroup_hidden_race(global int *data, global int *output)
+{
+ int group = get_group_id(0);
+ output[group] = data[0];
+ if (group == 1)
+ {
+ data[0] = group;
+ }
+}
diff --git a/tests/kernels/data-race/intergroup_hidden_race.ref b/tests/kernels/data-race/intergroup_hidden_race.ref
new file mode 100644
index 0000000..9390c4c
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.ref
@@ -0,0 +1,6 @@
+ERROR EXPECTED
+
+Argument 'output': 8 bytes
+ output[0] = 0
+ output[1] = 0
+
diff --git a/tests/kernels/data-race/intergroup_hidden_race.sim b/tests/kernels/data-race/intergroup_hidden_race.sim
new file mode 100644
index 0000000..b31145c
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.sim
@@ -0,0 +1,7 @@
+intergroup_hidden_race.cl
+intergroup_hidden_race
+2 1 1
+1 1 1
+
+<size=4 fill=0>
+<size=8 fill=0 dump>
diff --git a/tests/kernels/data-race/intergroup_race.cl b/tests/kernels/data-race/intergroup_race.cl
new file mode 100644
index 0000000..0a9784b
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.cl
@@ -0,0 +1,19 @@
+kernel void intergroup_race(global int *data)
+{
+ int g = get_group_id(0);
+ if (get_local_id(0) == 0)
+ {
+ data[g] = g;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ if (get_global_id(0) == 0)
+ {
+ int x = 0;
+ for (int i = 0; i < get_num_groups(0); i++)
+ {
+ x += data[i];
+ }
+ data[0] = x;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/tests/kernels/data-race/intergroup_race.ref b/tests/kernels/data-race/intergroup_race.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 0
+ data[1] = 1
+ data[2] = 2
+ data[3] = 3
+
diff --git a/tests/kernels/data-race/intergroup_race.sim b/tests/kernels/data-race/intergroup_race.sim
new file mode 100644
index 0000000..4e60c87
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.sim
@@ -0,0 +1,6 @@
+intergroup_race.cl
+intergroup_race
+16 1 1
+4 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/intragroup_hidden_race.cl b/tests/kernels/data-race/intragroup_hidden_race.cl
new file mode 100644
index 0000000..b101a41
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.cl
@@ -0,0 +1,10 @@
+kernel void intragroup_hidden_race(global int *data, global int *output)
+{
+ int id = get_local_id(0);
+ output[id] = data[0];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (id == 0)
+ {
+ data[0] = -1;
+ }
+}
diff --git a/tests/kernels/data-race/intragroup_hidden_race.ref b/tests/kernels/data-race/intragroup_hidden_race.ref
new file mode 100644
index 0000000..7ff022b
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.ref
@@ -0,0 +1,6 @@
+ERROR EXPECTED
+
+Argument 'output': 8 bytes
+ output[0] = 42
+ output[1] = 42
+
diff --git a/tests/kernels/data-race/intragroup_hidden_race.sim b/tests/kernels/data-race/intragroup_hidden_race.sim
new file mode 100644
index 0000000..16479a5
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.sim
@@ -0,0 +1,7 @@
+intragroup_hidden_race.cl
+intragroup_hidden_race
+2 1 1
+2 1 1
+
+<size=4 fill=42>
+<size=8 fill=0 dump>
diff --git a/tests/kernels/data-race/local_only_fence.cl b/tests/kernels/data-race/local_only_fence.cl
new file mode 100644
index 0000000..7b28012
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.cl
@@ -0,0 +1,16 @@
+kernel void local_only_fence(global int *scratch, global int *output)
+{
+ int i = get_global_id(0);
+ int g = get_group_id(0);
+ scratch[i] = i;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (get_local_id(0) == 0)
+ {
+ int x = 0;
+ for (int l = 0; l < get_local_size(0); l++)
+ {
+ x += scratch[get_local_size(0)*g + l];
+ }
+ output[g] = x;
+ }
+}
diff --git a/tests/kernels/data-race/local_only_fence.ref b/tests/kernels/data-race/local_only_fence.ref
new file mode 100644
index 0000000..b6b7f00
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'output': 16 bytes
+ output[0] = 6
+ output[1] = 22
+ output[2] = 38
+ output[3] = 54
+
diff --git a/tests/kernels/data-race/local_only_fence.sim b/tests/kernels/data-race/local_only_fence.sim
new file mode 100644
index 0000000..1662f3e
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.sim
@@ -0,0 +1,7 @@
+local_only_fence.cl
+local_only_fence
+16 1 1
+4 1 1
+
+<size=64 fill=0>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/local_read_write_race.cl b/tests/kernels/data-race/local_read_write_race.cl
new file mode 100644
index 0000000..bcc3ff8
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.cl
@@ -0,0 +1,14 @@
+kernel void local_read_write_race(global int *data, local int *scratch)
+{
+ int l = get_local_id(0);
+ scratch[l] = l;
+ if (l == 0)
+ {
+ int x = 0;
+ for (int i = 0; i < get_local_size(0); i++)
+ {
+ x += scratch[i];
+ }
+ *data = x;
+ }
+}
diff --git a/tests/kernels/data-race/local_read_write_race.ref b/tests/kernels/data-race/local_read_write_race.ref
new file mode 100644
index 0000000..0943b15
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'data': 4 bytes
+ data[0] = 0
+
diff --git a/tests/kernels/data-race/local_read_write_race.sim b/tests/kernels/data-race/local_read_write_race.sim
new file mode 100644
index 0000000..b3c4fbb
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.sim
@@ -0,0 +1,7 @@
+local_read_write_race.cl
+local_read_write_race
+4 1 1
+4 1 1
+
+<size=4 fill=0 dump>
+<size=16>
diff --git a/tests/kernels/data-race/local_write_write_race.cl b/tests/kernels/data-race/local_write_write_race.cl
new file mode 100644
index 0000000..26a96c1
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.cl
@@ -0,0 +1,7 @@
+kernel void local_write_write_race(global int *data, local int *scratch)
+{
+ int i = get_global_id(0);
+ *scratch = i;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ data[i] = *scratch;
+}
diff --git a/tests/kernels/data-race/local_write_write_race.ref b/tests/kernels/data-race/local_write_write_race.ref
new file mode 100644
index 0000000..3fe4e95
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 3
+ data[2] = 3
+ data[3] = 3
+
diff --git a/tests/kernels/data-race/local_write_write_race.sim b/tests/kernels/data-race/local_write_write_race.sim
new file mode 100644
index 0000000..43106c7
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.sim
@@ -0,0 +1,7 @@
+local_write_write_race.cl
+local_write_write_race
+4 1 1
+4 1 1
+
+<size=16 fill=0 dump>
+<size=4>
diff --git a/tests/kernels/data-race/uniform_write_race.cl b/tests/kernels/data-race/uniform_write_race.cl
new file mode 100644
index 0000000..ec13b59
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.cl
@@ -0,0 +1,4 @@
+kernel void uniform_write_race(global int *data)
+{
+ *data = 0;
+}
diff --git a/tests/kernels/data-race/uniform_write_race.ref b/tests/kernels/data-race/uniform_write_race.ref
new file mode 100644
index 0000000..b688113
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.ref
@@ -0,0 +1,4 @@
+
+Argument 'data': 4 bytes
+ data[0] = 0
+
diff --git a/tests/kernels/data-race/uniform_write_race.sim b/tests/kernels/data-race/uniform_write_race.sim
new file mode 100644
index 0000000..d08df5f
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.sim
@@ -0,0 +1,6 @@
+uniform_write_race.cl
+uniform_write_race
+4 1 1
+4 1 1
+
+<size=4 fill=-1 dump>
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.cl b/tests/kernels/memcheck/async_copy_out_of_bounds.cl
new file mode 100644
index 0000000..9c38a91
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.cl
@@ -0,0 +1,8 @@
+kernel void async_copy_out_of_bounds(local int *src, global int *dst)
+{
+ int l = get_local_id(0);
+ src[l] = l;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ event_t event = async_work_group_copy(dst+1, src, get_local_size(0), 0);
+ wait_group_events(1, &event);
+}
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.ref b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
new file mode 100644
index 0000000..9a8cb35
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'dst': 16 bytes
+ dst[0] = 0
+ dst[1] = 0
+ dst[2] = 1
+ dst[3] = 2
+
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.sim b/tests/kernels/memcheck/async_copy_out_of_bounds.sim
new file mode 100644
index 0000000..fd6d8de
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.sim
@@ -0,0 +1,7 @@
+async_copy_out_of_bounds.cl
+async_copy_out_of_bounds
+4 1 1
+4 1 1
+
+<size=16>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.cl b/tests/kernels/memcheck/atomic_out_of_bounds.cl
new file mode 100644
index 0000000..bbb58b9
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.cl
@@ -0,0 +1,5 @@
+kernel void atomic_out_of_bounds(global int *counters)
+{
+ int i = get_global_id(0);
+ atomic_inc(counters+i);
+}
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.ref b/tests/kernels/memcheck/atomic_out_of_bounds.ref
new file mode 100644
index 0000000..cfcff7d
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'counters': 16 bytes
+ counters[0] = 1
+ counters[1] = 1
+ counters[2] = 1
+ counters[3] = 1
+
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.sim b/tests/kernels/memcheck/atomic_out_of_bounds.sim
new file mode 100644
index 0000000..3c70419
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.sim
@@ -0,0 +1,6 @@
+atomic_out_of_bounds.cl
+atomic_out_of_bounds
+5 1 1
+1 1 1
+
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/dereference_null.cl b/tests/kernels/memcheck/dereference_null.cl
new file mode 100644
index 0000000..c5df927
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.cl
@@ -0,0 +1,4 @@
+kernel void dereference_null(global int *input, global int *output)
+{
+ output[0] *= input[0];
+}
diff --git a/tests/kernels/memcheck/dereference_null.ref b/tests/kernels/memcheck/dereference_null.ref
new file mode 100644
index 0000000..5a01471
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.ref
@@ -0,0 +1,5 @@
+ERROR EXPECTED
+
+Argument 'output': 4 bytes
+ output[0] = 0
+
diff --git a/tests/kernels/memcheck/dereference_null.sim b/tests/kernels/memcheck/dereference_null.sim
new file mode 100644
index 0000000..84da097
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.sim
@@ -0,0 +1,7 @@
+dereference_null.cl
+dereference_null
+1 1 1
+1 1 1
+
+<null>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/memcheck/read_out_of_bounds.cl b/tests/kernels/memcheck/read_out_of_bounds.cl
new file mode 100644
index 0000000..d76d7f1
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.cl
@@ -0,0 +1,12 @@
+kernel void read_out_of_bounds(global int *a, global int *b, global int *c)
+{
+ int i = get_global_id(0);
+ if (i < 4)
+ {
+ c[i] = a[i] + b[i];
+ }
+ else
+ {
+ c[i] = a[0] * (a[i] + b[i]);
+ }
+}
diff --git a/tests/kernels/memcheck/read_out_of_bounds.ref b/tests/kernels/memcheck/read_out_of_bounds.ref
new file mode 100644
index 0000000..539c267
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.ref
@@ -0,0 +1,9 @@
+ERROR EXPECTED
+
+Argument 'c': 20 bytes
+ c[0] = 0
+ c[1] = 2
+ c[2] = 4
+ c[3] = 6
+ c[4] = 0
+
diff --git a/tests/kernels/memcheck/read_out_of_bounds.sim b/tests/kernels/memcheck/read_out_of_bounds.sim
new file mode 100644
index 0000000..df72869
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.sim
@@ -0,0 +1,8 @@
+read_out_of_bounds.cl
+read_out_of_bounds
+5 1 1
+5 1 1
+
+<size=16 range=0:1:3>
+<size=16 range=0:1:3>
+<size=20 fill=0 dump>
diff --git a/tests/kernels/memcheck/read_write_only_memory.cl b/tests/kernels/memcheck/read_write_only_memory.cl
new file mode 100644
index 0000000..3f65143
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.cl
@@ -0,0 +1,5 @@
+kernel void read_write_only_memory(global int *input, global int *output)
+{
+ int i = get_global_id(0);
+ output[i] += input[i];
+}
diff --git a/tests/kernels/memcheck/read_write_only_memory.ref b/tests/kernels/memcheck/read_write_only_memory.ref
new file mode 100644
index 0000000..cb933ab
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'output': 16 bytes
+ output[0] = 0
+ output[1] = 1
+ output[2] = 2
+ output[3] = 3
+
diff --git a/tests/kernels/memcheck/read_write_only_memory.sim b/tests/kernels/memcheck/read_write_only_memory.sim
new file mode 100644
index 0000000..70981e0
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.sim
@@ -0,0 +1,7 @@
+read_write_only_memory.cl
+read_write_only_memory
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 ro>
+<size=16 fill=0 wo dump>
diff --git a/tests/kernels/memcheck/write_out_of_bounds.cl b/tests/kernels/memcheck/write_out_of_bounds.cl
new file mode 100644
index 0000000..fc4c3c6
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.cl
@@ -0,0 +1,5 @@
+kernel void write_out_of_bounds(global int *a, global int *b, global int *c)
+{
+ int i = get_global_id(0);
+ c[i] = a[i] + b[i];
+}
diff --git a/tests/kernels/memcheck/write_out_of_bounds.ref b/tests/kernels/memcheck/write_out_of_bounds.ref
new file mode 100644
index 0000000..6412f26
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'c': 16 bytes
+ c[0] = 0
+ c[1] = 2
+ c[2] = 4
+ c[3] = 6
+
diff --git a/tests/kernels/memcheck/write_out_of_bounds.sim b/tests/kernels/memcheck/write_out_of_bounds.sim
new file mode 100644
index 0000000..6fe07f6
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.sim
@@ -0,0 +1,8 @@
+write_out_of_bounds.cl
+write_out_of_bounds
+5 1 1
+5 1 1
+
+<size=20 range=0:1:4>
+<size=20 range=0:1:4>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/write_read_only_memory.cl b/tests/kernels/memcheck/write_read_only_memory.cl
new file mode 100644
index 0000000..8666891
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.cl
@@ -0,0 +1,5 @@
+kernel void write_read_only_memory(global int *input, global int *output)
+{
+ int i = get_global_id(0);
+ output[i] = input[i]++;
+}
diff --git a/tests/kernels/memcheck/write_read_only_memory.ref b/tests/kernels/memcheck/write_read_only_memory.ref
new file mode 100644
index 0000000..cb933ab
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'output': 16 bytes
+ output[0] = 0
+ output[1] = 1
+ output[2] = 2
+ output[3] = 3
+
diff --git a/tests/kernels/memcheck/write_read_only_memory.sim b/tests/kernels/memcheck/write_read_only_memory.sim
new file mode 100644
index 0000000..89c4b8c
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.sim
@@ -0,0 +1,7 @@
+write_read_only_memory.cl
+write_read_only_memory
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 ro>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/misc/array.cl b/tests/kernels/misc/array.cl
new file mode 100644
index 0000000..cd4e43a
--- /dev/null
+++ b/tests/kernels/misc/array.cl
@@ -0,0 +1,10 @@
+kernel void array(global long16 *output)
+{
+ long16 data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+
+ int i = get_global_id(0);
+
+ long16 *foo = data;
+
+ output[i] = foo[i];
+}
diff --git a/tests/kernels/misc/array.ref b/tests/kernels/misc/array.ref
new file mode 100644
index 0000000..1a1d2d0
--- /dev/null
+++ b/tests/kernels/misc/array.ref
@@ -0,0 +1,131 @@
+
+Argument 'output': 1024 bytes
+ output[0] = 0
+ output[1] = 0
+ output[2] = 0
+ output[3] = 0
+ output[4] = 0
+ output[5] = 0
+ output[6] = 0
+ output[7] = 0
+ output[8] = 0
+ output[9] = 0
+ output[10] = 0
+ output[11] = 0
+ output[12] = 0
+ output[13] = 0
+ output[14] = 0
+ output[15] = 0
+ output[16] = 1
+ output[17] = 1
+ output[18] = 1
+ output[19] = 1
+ output[20] = 1
+ output[21] = 1
+ output[22] = 1
+ output[23] = 1
+ output[24] = 1
+ output[25] = 1
+ output[26] = 1
+ output[27] = 1
+ output[28] = 1
+ output[29] = 1
+ output[30] = 1
+ output[31] = 1
+ output[32] = 2
+ output[33] = 2
+ output[34] = 2
+ output[35] = 2
+ output[36] = 2
+ output[37] = 2
+ output[38] = 2
+ output[39] = 2
+ output[40] = 2
+ output[41] = 2
+ output[42] = 2
+ output[43] = 2
+ output[44] = 2
+ output[45] = 2
+ output[46] = 2
+ output[47] = 2
+ output[48] = 3
+ output[49] = 3
+ output[50] = 3
+ output[51] = 3
+ output[52] = 3
+ output[53] = 3
+ output[54] = 3
+ output[55] = 3
+ output[56] = 3
+ output[57] = 3
+ output[58] = 3
+ output[59] = 3
+ output[60] = 3
+ output[61] = 3
+ output[62] = 3
+ output[63] = 3
+ output[64] = 4
+ output[65] = 4
+ output[66] = 4
+ output[67] = 4
+ output[68] = 4
+ output[69] = 4
+ output[70] = 4
+ output[71] = 4
+ output[72] = 4
+ output[73] = 4
+ output[74] = 4
+ output[75] = 4
+ output[76] = 4
+ output[77] = 4
+ output[78] = 4
+ output[79] = 4
+ output[80] = 5
+ output[81] = 5
+ output[82] = 5
+ output[83] = 5
+ output[84] = 5
+ output[85] = 5
+ output[86] = 5
+ output[87] = 5
+ output[88] = 5
+ output[89] = 5
+ output[90] = 5
+ output[91] = 5
+ output[92] = 5
+ output[93] = 5
+ output[94] = 5
+ output[95] = 5
+ output[96] = 6
+ output[97] = 6
+ output[98] = 6
+ output[99] = 6
+ output[100] = 6
+ output[101] = 6
+ output[102] = 6
+ output[103] = 6
+ output[104] = 6
+ output[105] = 6
+ output[106] = 6
+ output[107] = 6
+ output[108] = 6
+ output[109] = 6
+ output[110] = 6
+ output[111] = 6
+ output[112] = 7
+ output[113] = 7
+ output[114] = 7
+ output[115] = 7
+ output[116] = 7
+ output[117] = 7
+ output[118] = 7
+ output[119] = 7
+ output[120] = 7
+ output[121] = 7
+ output[122] = 7
+ output[123] = 7
+ output[124] = 7
+ output[125] = 7
+ output[126] = 7
+ output[127] = 7
+
diff --git a/tests/kernels/misc/array.sim b/tests/kernels/misc/array.sim
new file mode 100644
index 0000000..e0b46e3
--- /dev/null
+++ b/tests/kernels/misc/array.sim
@@ -0,0 +1,6 @@
+array.cl
+array
+8 1 1
+1 1 1
+
+<size=1024 fill=0 dump>
diff --git a/tests/kernels/misc/reduce.cl b/tests/kernels/misc/reduce.cl
new file mode 100644
index 0000000..28f53ca
--- /dev/null
+++ b/tests/kernels/misc/reduce.cl
@@ -0,0 +1,28 @@
+kernel void reduce(unsigned int n,
+ global unsigned int *data,
+ global unsigned int *result,
+ local unsigned int *localData)
+{
+ unsigned int lid = get_local_id(0);
+ unsigned int lsz = get_local_size(0);
+ unsigned int sum = 0;
+ for (unsigned int i = lid; i < n; i+=lsz)
+ {
+ sum += data[i];
+ }
+
+ localData[lid] = sum;
+ for (unsigned int offset = lsz/2; offset > 0; offset/=2)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (lid < offset)
+ {
+ localData[lid] += localData[lid + offset];
+ }
+ }
+
+ if (lid == 0)
+ {
+ *result = localData[lid];
+ }
+}
diff --git a/tests/kernels/misc/reduce.ref b/tests/kernels/misc/reduce.ref
new file mode 100644
index 0000000..fa92b4e
--- /dev/null
+++ b/tests/kernels/misc/reduce.ref
@@ -0,0 +1,4 @@
+
+Argument 'result': 4 bytes
+ result[0] = 120
+
diff --git a/tests/kernels/misc/reduce.sim b/tests/kernels/misc/reduce.sim
new file mode 100644
index 0000000..927a2e0
--- /dev/null
+++ b/tests/kernels/misc/reduce.sim
@@ -0,0 +1,11 @@
+reduce.cl
+reduce
+4 1 1
+4 1 1
+
+<size=4>
+16
+
+<size=64 range=0:1:15>
+<size=4 fill=0 dump>
+<size=16>
diff --git a/tests/kernels/misc/vecadd.cl b/tests/kernels/misc/vecadd.cl
new file mode 100644
index 0000000..04e2835
--- /dev/null
+++ b/tests/kernels/misc/vecadd.cl
@@ -0,0 +1,5 @@
+kernel void vecadd(global float *a, global float *b, global float *c)
+{
+ size_t i = get_global_id(0);
+ c[i] = a[i] + b[i];
+}
diff --git a/tests/kernels/misc/vecadd.ref b/tests/kernels/misc/vecadd.ref
new file mode 100644
index 0000000..9fa7b4c
--- /dev/null
+++ b/tests/kernels/misc/vecadd.ref
@@ -0,0 +1,1027 @@
+
+Argument 'c': 4096 bytes
+ c[0] = 0
+ c[1] = 2
+ c[2] = 4
+ c[3] = 6
+ c[4] = 8
+ c[5] = 10
+ c[6] = 12
+ c[7] = 14
+ c[8] = 16
+ c[9] = 18
+ c[10] = 20
+ c[11] = 22
+ c[12] = 24
+ c[13] = 26
+ c[14] = 28
+ c[15] = 30
+ c[16] = 32
+ c[17] = 34
+ c[18] = 36
+ c[19] = 38
+ c[20] = 40
+ c[21] = 42
+ c[22] = 44
+ c[23] = 46
+ c[24] = 48
+ c[25] = 50
+ c[26] = 52
+ c[27] = 54
+ c[28] = 56
+ c[29] = 58
+ c[30] = 60
+ c[31] = 62
+ c[32] = 64
+ c[33] = 66
+ c[34] = 68
+ c[35] = 70
+ c[36] = 72
+ c[37] = 74
+ c[38] = 76
+ c[39] = 78
+ c[40] = 80
+ c[41] = 82
+ c[42] = 84
+ c[43] = 86
+ c[44] = 88
+ c[45] = 90
+ c[46] = 92
+ c[47] = 94
+ c[48] = 96
+ c[49] = 98
+ c[50] = 100
+ c[51] = 102
+ c[52] = 104
+ c[53] = 106
+ c[54] = 108
+ c[55] = 110
+ c[56] = 112
+ c[57] = 114
+ c[58] = 116
+ c[59] = 118
+ c[60] = 120
+ c[61] = 122
+ c[62] = 124
+ c[63] = 126
+ c[64] = 128
+ c[65] = 130
+ c[66] = 132
+ c[67] = 134
+ c[68] = 136
+ c[69] = 138
+ c[70] = 140
+ c[71] = 142
+ c[72] = 144
+ c[73] = 146
+ c[74] = 148
+ c[75] = 150
+ c[76] = 152
+ c[77] = 154
+ c[78] = 156
+ c[79] = 158
+ c[80] = 160
+ c[81] = 162
+ c[82] = 164
+ c[83] = 166
+ c[84] = 168
+ c[85] = 170
+ c[86] = 172
+ c[87] = 174
+ c[88] = 176
+ c[89] = 178
+ c[90] = 180
+ c[91] = 182
+ c[92] = 184
+ c[93] = 186
+ c[94] = 188
+ c[95] = 190
+ c[96] = 192
+ c[97] = 194
+ c[98] = 196
+ c[99] = 198
+ c[100] = 200
+ c[101] = 202
+ c[102] = 204
+ c[103] = 206
+ c[104] = 208
+ c[105] = 210
+ c[106] = 212
+ c[107] = 214
+ c[108] = 216
+ c[109] = 218
+ c[110] = 220
+ c[111] = 222
+ c[112] = 224
+ c[113] = 226
+ c[114] = 228
+ c[115] = 230
+ c[116] = 232
+ c[117] = 234
+ c[118] = 236
+ c[119] = 238
+ c[120] = 240
+ c[121] = 242
+ c[122] = 244
+ c[123] = 246
+ c[124] = 248
+ c[125] = 250
+ c[126] = 252
+ c[127] = 254
+ c[128] = 256
+ c[129] = 258
+ c[130] = 260
+ c[131] = 262
+ c[132] = 264
+ c[133] = 266
+ c[134] = 268
+ c[135] = 270
+ c[136] = 272
+ c[137] = 274
+ c[138] = 276
+ c[139] = 278
+ c[140] = 280
+ c[141] = 282
+ c[142] = 284
+ c[143] = 286
+ c[144] = 288
+ c[145] = 290
+ c[146] = 292
+ c[147] = 294
+ c[148] = 296
+ c[149] = 298
+ c[150] = 300
+ c[151] = 302
+ c[152] = 304
+ c[153] = 306
+ c[154] = 308
+ c[155] = 310
+ c[156] = 312
+ c[157] = 314
+ c[158] = 316
+ c[159] = 318
+ c[160] = 320
+ c[161] = 322
+ c[162] = 324
+ c[163] = 326
+ c[164] = 328
+ c[165] = 330
+ c[166] = 332
+ c[167] = 334
+ c[168] = 336
+ c[169] = 338
+ c[170] = 340
+ c[171] = 342
+ c[172] = 344
+ c[173] = 346
+ c[174] = 348
+ c[175] = 350
+ c[176] = 352
+ c[177] = 354
+ c[178] = 356
+ c[179] = 358
+ c[180] = 360
+ c[181] = 362
+ c[182] = 364
+ c[183] = 366
+ c[184] = 368
+ c[185] = 370
+ c[186] = 372
+ c[187] = 374
+ c[188] = 376
+ c[189] = 378
+ c[190] = 380
+ c[191] = 382
+ c[192] = 384
+ c[193] = 386
+ c[194] = 388
+ c[195] = 390
+ c[196] = 392
+ c[197] = 394
+ c[198] = 396
+ c[199] = 398
+ c[200] = 400
+ c[201] = 402
+ c[202] = 404
+ c[203] = 406
+ c[204] = 408
+ c[205] = 410
+ c[206] = 412
+ c[207] = 414
+ c[208] = 416
+ c[209] = 418
+ c[210] = 420
+ c[211] = 422
+ c[212] = 424
+ c[213] = 426
+ c[214] = 428
+ c[215] = 430
+ c[216] = 432
+ c[217] = 434
+ c[218] = 436
+ c[219] = 438
+ c[220] = 440
+ c[221] = 442
+ c[222] = 444
+ c[223] = 446
+ c[224] = 448
+ c[225] = 450
+ c[226] = 452
+ c[227] = 454
+ c[228] = 456
+ c[229] = 458
+ c[230] = 460
+ c[231] = 462
+ c[232] = 464
+ c[233] = 466
+ c[234] = 468
+ c[235] = 470
+ c[236] = 472
+ c[237] = 474
+ c[238] = 476
+ c[239] = 478
+ c[240] = 480
+ c[241] = 482
+ c[242] = 484
+ c[243] = 486
+ c[244] = 488
+ c[245] = 490
+ c[246] = 492
+ c[247] = 494
+ c[248] = 496
+ c[249] = 498
+ c[250] = 500
+ c[251] = 502
+ c[252] = 504
+ c[253] = 506
+ c[254] = 508
+ c[255] = 510
+ c[256] = 512
+ c[257] = 514
+ c[258] = 516
+ c[259] = 518
+ c[260] = 520
+ c[261] = 522
+ c[262] = 524
+ c[263] = 526
+ c[264] = 528
+ c[265] = 530
+ c[266] = 532
+ c[267] = 534
+ c[268] = 536
+ c[269] = 538
+ c[270] = 540
+ c[271] = 542
+ c[272] = 544
+ c[273] = 546
+ c[274] = 548
+ c[275] = 550
+ c[276] = 552
+ c[277] = 554
+ c[278] = 556
+ c[279] = 558
+ c[280] = 560
+ c[281] = 562
+ c[282] = 564
+ c[283] = 566
+ c[284] = 568
+ c[285] = 570
+ c[286] = 572
+ c[287] = 574
+ c[288] = 576
+ c[289] = 578
+ c[290] = 580
+ c[291] = 582
+ c[292] = 584
+ c[293] = 586
+ c[294] = 588
+ c[295] = 590
+ c[296] = 592
+ c[297] = 594
+ c[298] = 596
+ c[299] = 598
+ c[300] = 600
+ c[301] = 602
+ c[302] = 604
+ c[303] = 606
+ c[304] = 608
+ c[305] = 610
+ c[306] = 612
+ c[307] = 614
+ c[308] = 616
+ c[309] = 618
+ c[310] = 620
+ c[311] = 622
+ c[312] = 624
+ c[313] = 626
+ c[314] = 628
+ c[315] = 630
+ c[316] = 632
+ c[317] = 634
+ c[318] = 636
+ c[319] = 638
+ c[320] = 640
+ c[321] = 642
+ c[322] = 644
+ c[323] = 646
+ c[324] = 648
+ c[325] = 650
+ c[326] = 652
+ c[327] = 654
+ c[328] = 656
+ c[329] = 658
+ c[330] = 660
+ c[331] = 662
+ c[332] = 664
+ c[333] = 666
+ c[334] = 668
+ c[335] = 670
+ c[336] = 672
+ c[337] = 674
+ c[338] = 676
+ c[339] = 678
+ c[340] = 680
+ c[341] = 682
+ c[342] = 684
+ c[343] = 686
+ c[344] = 688
+ c[345] = 690
+ c[346] = 692
+ c[347] = 694
+ c[348] = 696
+ c[349] = 698
+ c[350] = 700
+ c[351] = 702
+ c[352] = 704
+ c[353] = 706
+ c[354] = 708
+ c[355] = 710
+ c[356] = 712
+ c[357] = 714
+ c[358] = 716
+ c[359] = 718
+ c[360] = 720
+ c[361] = 722
+ c[362] = 724
+ c[363] = 726
+ c[364] = 728
+ c[365] = 730
+ c[366] = 732
+ c[367] = 734
+ c[368] = 736
+ c[369] = 738
+ c[370] = 740
+ c[371] = 742
+ c[372] = 744
+ c[373] = 746
+ c[374] = 748
+ c[375] = 750
+ c[376] = 752
+ c[377] = 754
+ c[378] = 756
+ c[379] = 758
+ c[380] = 760
+ c[381] = 762
+ c[382] = 764
+ c[383] = 766
+ c[384] = 768
+ c[385] = 770
+ c[386] = 772
+ c[387] = 774
+ c[388] = 776
+ c[389] = 778
+ c[390] = 780
+ c[391] = 782
+ c[392] = 784
+ c[393] = 786
+ c[394] = 788
+ c[395] = 790
+ c[396] = 792
+ c[397] = 794
+ c[398] = 796
+ c[399] = 798
+ c[400] = 800
+ c[401] = 802
+ c[402] = 804
+ c[403] = 806
+ c[404] = 808
+ c[405] = 810
+ c[406] = 812
+ c[407] = 814
+ c[408] = 816
+ c[409] = 818
+ c[410] = 820
+ c[411] = 822
+ c[412] = 824
+ c[413] = 826
+ c[414] = 828
+ c[415] = 830
+ c[416] = 832
+ c[417] = 834
+ c[418] = 836
+ c[419] = 838
+ c[420] = 840
+ c[421] = 842
+ c[422] = 844
+ c[423] = 846
+ c[424] = 848
+ c[425] = 850
+ c[426] = 852
+ c[427] = 854
+ c[428] = 856
+ c[429] = 858
+ c[430] = 860
+ c[431] = 862
+ c[432] = 864
+ c[433] = 866
+ c[434] = 868
+ c[435] = 870
+ c[436] = 872
+ c[437] = 874
+ c[438] = 876
+ c[439] = 878
+ c[440] = 880
+ c[441] = 882
+ c[442] = 884
+ c[443] = 886
+ c[444] = 888
+ c[445] = 890
+ c[446] = 892
+ c[447] = 894
+ c[448] = 896
+ c[449] = 898
+ c[450] = 900
+ c[451] = 902
+ c[452] = 904
+ c[453] = 906
+ c[454] = 908
+ c[455] = 910
+ c[456] = 912
+ c[457] = 914
+ c[458] = 916
+ c[459] = 918
+ c[460] = 920
+ c[461] = 922
+ c[462] = 924
+ c[463] = 926
+ c[464] = 928
+ c[465] = 930
+ c[466] = 932
+ c[467] = 934
+ c[468] = 936
+ c[469] = 938
+ c[470] = 940
+ c[471] = 942
+ c[472] = 944
+ c[473] = 946
+ c[474] = 948
+ c[475] = 950
+ c[476] = 952
+ c[477] = 954
+ c[478] = 956
+ c[479] = 958
+ c[480] = 960
+ c[481] = 962
+ c[482] = 964
+ c[483] = 966
+ c[484] = 968
+ c[485] = 970
+ c[486] = 972
+ c[487] = 974
+ c[488] = 976
+ c[489] = 978
+ c[490] = 980
+ c[491] = 982
+ c[492] = 984
+ c[493] = 986
+ c[494] = 988
+ c[495] = 990
+ c[496] = 992
+ c[497] = 994
+ c[498] = 996
+ c[499] = 998
+ c[500] = 1000
+ c[501] = 1002
+ c[502] = 1004
+ c[503] = 1006
+ c[504] = 1008
+ c[505] = 1010
+ c[506] = 1012
+ c[507] = 1014
+ c[508] = 1016
+ c[509] = 1018
+ c[510] = 1020
+ c[511] = 1022
+ c[512] = 1024
+ c[513] = 1026
+ c[514] = 1028
+ c[515] = 1030
+ c[516] = 1032
+ c[517] = 1034
+ c[518] = 1036
+ c[519] = 1038
+ c[520] = 1040
+ c[521] = 1042
+ c[522] = 1044
+ c[523] = 1046
+ c[524] = 1048
+ c[525] = 1050
+ c[526] = 1052
+ c[527] = 1054
+ c[528] = 1056
+ c[529] = 1058
+ c[530] = 1060
+ c[531] = 1062
+ c[532] = 1064
+ c[533] = 1066
+ c[534] = 1068
+ c[535] = 1070
+ c[536] = 1072
+ c[537] = 1074
+ c[538] = 1076
+ c[539] = 1078
+ c[540] = 1080
+ c[541] = 1082
+ c[542] = 1084
+ c[543] = 1086
+ c[544] = 1088
+ c[545] = 1090
+ c[546] = 1092
+ c[547] = 1094
+ c[548] = 1096
+ c[549] = 1098
+ c[550] = 1100
+ c[551] = 1102
+ c[552] = 1104
+ c[553] = 1106
+ c[554] = 1108
+ c[555] = 1110
+ c[556] = 1112
+ c[557] = 1114
+ c[558] = 1116
+ c[559] = 1118
+ c[560] = 1120
+ c[561] = 1122
+ c[562] = 1124
+ c[563] = 1126
+ c[564] = 1128
+ c[565] = 1130
+ c[566] = 1132
+ c[567] = 1134
+ c[568] = 1136
+ c[569] = 1138
+ c[570] = 1140
+ c[571] = 1142
+ c[572] = 1144
+ c[573] = 1146
+ c[574] = 1148
+ c[575] = 1150
+ c[576] = 1152
+ c[577] = 1154
+ c[578] = 1156
+ c[579] = 1158
+ c[580] = 1160
+ c[581] = 1162
+ c[582] = 1164
+ c[583] = 1166
+ c[584] = 1168
+ c[585] = 1170
+ c[586] = 1172
+ c[587] = 1174
+ c[588] = 1176
+ c[589] = 1178
+ c[590] = 1180
+ c[591] = 1182
+ c[592] = 1184
+ c[593] = 1186
+ c[594] = 1188
+ c[595] = 1190
+ c[596] = 1192
+ c[597] = 1194
+ c[598] = 1196
+ c[599] = 1198
+ c[600] = 1200
+ c[601] = 1202
+ c[602] = 1204
+ c[603] = 1206
+ c[604] = 1208
+ c[605] = 1210
+ c[606] = 1212
+ c[607] = 1214
+ c[608] = 1216
+ c[609] = 1218
+ c[610] = 1220
+ c[611] = 1222
+ c[612] = 1224
+ c[613] = 1226
+ c[614] = 1228
+ c[615] = 1230
+ c[616] = 1232
+ c[617] = 1234
+ c[618] = 1236
+ c[619] = 1238
+ c[620] = 1240
+ c[621] = 1242
+ c[622] = 1244
+ c[623] = 1246
+ c[624] = 1248
+ c[625] = 1250
+ c[626] = 1252
+ c[627] = 1254
+ c[628] = 1256
+ c[629] = 1258
+ c[630] = 1260
+ c[631] = 1262
+ c[632] = 1264
+ c[633] = 1266
+ c[634] = 1268
+ c[635] = 1270
+ c[636] = 1272
+ c[637] = 1274
+ c[638] = 1276
+ c[639] = 1278
+ c[640] = 1280
+ c[641] = 1282
+ c[642] = 1284
+ c[643] = 1286
+ c[644] = 1288
+ c[645] = 1290
+ c[646] = 1292
+ c[647] = 1294
+ c[648] = 1296
+ c[649] = 1298
+ c[650] = 1300
+ c[651] = 1302
+ c[652] = 1304
+ c[653] = 1306
+ c[654] = 1308
+ c[655] = 1310
+ c[656] = 1312
+ c[657] = 1314
+ c[658] = 1316
+ c[659] = 1318
+ c[660] = 1320
+ c[661] = 1322
+ c[662] = 1324
+ c[663] = 1326
+ c[664] = 1328
+ c[665] = 1330
+ c[666] = 1332
+ c[667] = 1334
+ c[668] = 1336
+ c[669] = 1338
+ c[670] = 1340
+ c[671] = 1342
+ c[672] = 1344
+ c[673] = 1346
+ c[674] = 1348
+ c[675] = 1350
+ c[676] = 1352
+ c[677] = 1354
+ c[678] = 1356
+ c[679] = 1358
+ c[680] = 1360
+ c[681] = 1362
+ c[682] = 1364
+ c[683] = 1366
+ c[684] = 1368
+ c[685] = 1370
+ c[686] = 1372
+ c[687] = 1374
+ c[688] = 1376
+ c[689] = 1378
+ c[690] = 1380
+ c[691] = 1382
+ c[692] = 1384
+ c[693] = 1386
+ c[694] = 1388
+ c[695] = 1390
+ c[696] = 1392
+ c[697] = 1394
+ c[698] = 1396
+ c[699] = 1398
+ c[700] = 1400
+ c[701] = 1402
+ c[702] = 1404
+ c[703] = 1406
+ c[704] = 1408
+ c[705] = 1410
+ c[706] = 1412
+ c[707] = 1414
+ c[708] = 1416
+ c[709] = 1418
+ c[710] = 1420
+ c[711] = 1422
+ c[712] = 1424
+ c[713] = 1426
+ c[714] = 1428
+ c[715] = 1430
+ c[716] = 1432
+ c[717] = 1434
+ c[718] = 1436
+ c[719] = 1438
+ c[720] = 1440
+ c[721] = 1442
+ c[722] = 1444
+ c[723] = 1446
+ c[724] = 1448
+ c[725] = 1450
+ c[726] = 1452
+ c[727] = 1454
+ c[728] = 1456
+ c[729] = 1458
+ c[730] = 1460
+ c[731] = 1462
+ c[732] = 1464
+ c[733] = 1466
+ c[734] = 1468
+ c[735] = 1470
+ c[736] = 1472
+ c[737] = 1474
+ c[738] = 1476
+ c[739] = 1478
+ c[740] = 1480
+ c[741] = 1482
+ c[742] = 1484
+ c[743] = 1486
+ c[744] = 1488
+ c[745] = 1490
+ c[746] = 1492
+ c[747] = 1494
+ c[748] = 1496
+ c[749] = 1498
+ c[750] = 1500
+ c[751] = 1502
+ c[752] = 1504
+ c[753] = 1506
+ c[754] = 1508
+ c[755] = 1510
+ c[756] = 1512
+ c[757] = 1514
+ c[758] = 1516
+ c[759] = 1518
+ c[760] = 1520
+ c[761] = 1522
+ c[762] = 1524
+ c[763] = 1526
+ c[764] = 1528
+ c[765] = 1530
+ c[766] = 1532
+ c[767] = 1534
+ c[768] = 1536
+ c[769] = 1538
+ c[770] = 1540
+ c[771] = 1542
+ c[772] = 1544
+ c[773] = 1546
+ c[774] = 1548
+ c[775] = 1550
+ c[776] = 1552
+ c[777] = 1554
+ c[778] = 1556
+ c[779] = 1558
+ c[780] = 1560
+ c[781] = 1562
+ c[782] = 1564
+ c[783] = 1566
+ c[784] = 1568
+ c[785] = 1570
+ c[786] = 1572
+ c[787] = 1574
+ c[788] = 1576
+ c[789] = 1578
+ c[790] = 1580
+ c[791] = 1582
+ c[792] = 1584
+ c[793] = 1586
+ c[794] = 1588
+ c[795] = 1590
+ c[796] = 1592
+ c[797] = 1594
+ c[798] = 1596
+ c[799] = 1598
+ c[800] = 1600
+ c[801] = 1602
+ c[802] = 1604
+ c[803] = 1606
+ c[804] = 1608
+ c[805] = 1610
+ c[806] = 1612
+ c[807] = 1614
+ c[808] = 1616
+ c[809] = 1618
+ c[810] = 1620
+ c[811] = 1622
+ c[812] = 1624
+ c[813] = 1626
+ c[814] = 1628
+ c[815] = 1630
+ c[816] = 1632
+ c[817] = 1634
+ c[818] = 1636
+ c[819] = 1638
+ c[820] = 1640
+ c[821] = 1642
+ c[822] = 1644
+ c[823] = 1646
+ c[824] = 1648
+ c[825] = 1650
+ c[826] = 1652
+ c[827] = 1654
+ c[828] = 1656
+ c[829] = 1658
+ c[830] = 1660
+ c[831] = 1662
+ c[832] = 1664
+ c[833] = 1666
+ c[834] = 1668
+ c[835] = 1670
+ c[836] = 1672
+ c[837] = 1674
+ c[838] = 1676
+ c[839] = 1678
+ c[840] = 1680
+ c[841] = 1682
+ c[842] = 1684
+ c[843] = 1686
+ c[844] = 1688
+ c[845] = 1690
+ c[846] = 1692
+ c[847] = 1694
+ c[848] = 1696
+ c[849] = 1698
+ c[850] = 1700
+ c[851] = 1702
+ c[852] = 1704
+ c[853] = 1706
+ c[854] = 1708
+ c[855] = 1710
+ c[856] = 1712
+ c[857] = 1714
+ c[858] = 1716
+ c[859] = 1718
+ c[860] = 1720
+ c[861] = 1722
+ c[862] = 1724
+ c[863] = 1726
+ c[864] = 1728
+ c[865] = 1730
+ c[866] = 1732
+ c[867] = 1734
+ c[868] = 1736
+ c[869] = 1738
+ c[870] = 1740
+ c[871] = 1742
+ c[872] = 1744
+ c[873] = 1746
+ c[874] = 1748
+ c[875] = 1750
+ c[876] = 1752
+ c[877] = 1754
+ c[878] = 1756
+ c[879] = 1758
+ c[880] = 1760
+ c[881] = 1762
+ c[882] = 1764
+ c[883] = 1766
+ c[884] = 1768
+ c[885] = 1770
+ c[886] = 1772
+ c[887] = 1774
+ c[888] = 1776
+ c[889] = 1778
+ c[890] = 1780
+ c[891] = 1782
+ c[892] = 1784
+ c[893] = 1786
+ c[894] = 1788
+ c[895] = 1790
+ c[896] = 1792
+ c[897] = 1794
+ c[898] = 1796
+ c[899] = 1798
+ c[900] = 1800
+ c[901] = 1802
+ c[902] = 1804
+ c[903] = 1806
+ c[904] = 1808
+ c[905] = 1810
+ c[906] = 1812
+ c[907] = 1814
+ c[908] = 1816
+ c[909] = 1818
+ c[910] = 1820
+ c[911] = 1822
+ c[912] = 1824
+ c[913] = 1826
+ c[914] = 1828
+ c[915] = 1830
+ c[916] = 1832
+ c[917] = 1834
+ c[918] = 1836
+ c[919] = 1838
+ c[920] = 1840
+ c[921] = 1842
+ c[922] = 1844
+ c[923] = 1846
+ c[924] = 1848
+ c[925] = 1850
+ c[926] = 1852
+ c[927] = 1854
+ c[928] = 1856
+ c[929] = 1858
+ c[930] = 1860
+ c[931] = 1862
+ c[932] = 1864
+ c[933] = 1866
+ c[934] = 1868
+ c[935] = 1870
+ c[936] = 1872
+ c[937] = 1874
+ c[938] = 1876
+ c[939] = 1878
+ c[940] = 1880
+ c[941] = 1882
+ c[942] = 1884
+ c[943] = 1886
+ c[944] = 1888
+ c[945] = 1890
+ c[946] = 1892
+ c[947] = 1894
+ c[948] = 1896
+ c[949] = 1898
+ c[950] = 1900
+ c[951] = 1902
+ c[952] = 1904
+ c[953] = 1906
+ c[954] = 1908
+ c[955] = 1910
+ c[956] = 1912
+ c[957] = 1914
+ c[958] = 1916
+ c[959] = 1918
+ c[960] = 1920
+ c[961] = 1922
+ c[962] = 1924
+ c[963] = 1926
+ c[964] = 1928
+ c[965] = 1930
+ c[966] = 1932
+ c[967] = 1934
+ c[968] = 1936
+ c[969] = 1938
+ c[970] = 1940
+ c[971] = 1942
+ c[972] = 1944
+ c[973] = 1946
+ c[974] = 1948
+ c[975] = 1950
+ c[976] = 1952
+ c[977] = 1954
+ c[978] = 1956
+ c[979] = 1958
+ c[980] = 1960
+ c[981] = 1962
+ c[982] = 1964
+ c[983] = 1966
+ c[984] = 1968
+ c[985] = 1970
+ c[986] = 1972
+ c[987] = 1974
+ c[988] = 1976
+ c[989] = 1978
+ c[990] = 1980
+ c[991] = 1982
+ c[992] = 1984
+ c[993] = 1986
+ c[994] = 1988
+ c[995] = 1990
+ c[996] = 1992
+ c[997] = 1994
+ c[998] = 1996
+ c[999] = 1998
+ c[1000] = 2000
+ c[1001] = 2002
+ c[1002] = 2004
+ c[1003] = 2006
+ c[1004] = 2008
+ c[1005] = 2010
+ c[1006] = 2012
+ c[1007] = 2014
+ c[1008] = 2016
+ c[1009] = 2018
+ c[1010] = 2020
+ c[1011] = 2022
+ c[1012] = 2024
+ c[1013] = 2026
+ c[1014] = 2028
+ c[1015] = 2030
+ c[1016] = 2032
+ c[1017] = 2034
+ c[1018] = 2036
+ c[1019] = 2038
+ c[1020] = 2040
+ c[1021] = 2042
+ c[1022] = 2044
+ c[1023] = 2046
+
diff --git a/tests/kernels/misc/vecadd.sim b/tests/kernels/misc/vecadd.sim
new file mode 100644
index 0000000..23e1a9a
--- /dev/null
+++ b/tests/kernels/misc/vecadd.sim
@@ -0,0 +1,8 @@
+vecadd.cl
+vecadd
+1024 1 1
+16 1 1
+
+<size=4096 range=0:1:1023>
+<size=4096 range=0:1:1023>
+<size=4096 fill=0 dump>
diff --git a/tests/kernels/run_kernel_test.py b/tests/kernels/run_kernel_test.py
new file mode 100644
index 0000000..9387b9c
--- /dev/null
+++ b/tests/kernels/run_kernel_test.py
@@ -0,0 +1,93 @@
+# run_kernel_test.py (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+#
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+
+import os
+import re
+import subprocess
+import sys
+
+# Check arguments
+if len(sys.argv) != 3:
+ print 'Usage: python run_kernel_test.py EXE SIMFILE'
+ sys.exit(1)
+if not os.path.isfile(sys.argv[2]):
+ print 'Test file not found'
+ sys.exit(1)
+
+# Construct paths to test inputs/outputs
+test_exe = sys.argv[1]
+test_file = sys.argv[2]
+test_dir = os.path.dirname(os.path.realpath(test_file))
+test_file = os.path.basename(test_file)
+test_name = os.path.splitext(test_file)[0]
+test_out = test_name + '.out'
+test_ref = test_dir + os.path.sep + test_name + '.ref'
+current_dir = os.getcwd()
+
+if os.environ.get('AM_TESTS') == '1':
+ # If running via automake, use build directory for output file
+ test_out = 'tests' + os.path.sep + 'kernels' + os.path.sep + \
+ test_dir.split(os.path.sep)[-1] + os.path.sep + test_out
+else:
+ # Otherwise, use test directory for output file
+ test_out = test_dir + os.path.sep + test_out
+
+# Run oclgrind-kernel
+out = open(test_out, 'w')
+os.chdir(test_dir)
+retval = subprocess.call([test_exe, '--data-races', test_file],
+ stdout=out, stderr=out)
+out.close()
+if retval != 0:
+ print 'oclgrind-kernel returned non-zero value (' + str(retval) + ')'
+ sys.exit(retval)
+
+# Open output and reference files
+os.chdir(current_dir)
+out = open(test_out).read().splitlines()
+ref = open(test_ref).read().splitlines()
+
+# Scan through file to reach argument data
+oi = 0
+ri = 0
+try:
+ while re.match('Argument \'.*\': [0-9]+ *bytes', out[oi]) == None:
+ oi += 1
+ while re.match('Argument \'.*\': [0-9]+ *bytes', ref[ri]) == None:
+ ri += 1
+except:
+ print 'Error searching for argument data'
+ sys.exit(1)
+
+# Check that an error was produced iff an error was expected
+# An error occurred if the global memory dump isn't at the start of the file
+# TODO: Improve this so that more details about the error are checked
+should_error = ri > 1
+if should_error and oi < 2:
+ print 'Error expected, but no error reported'
+ sys.exit(1)
+if not should_error and oi > 1:
+ print 'Error reported, but no error expected'
+ sys.exit(1)
+
+# Check that the global memory dump matches the reference
+# TODO: 32-bit machines will fail this due to memory address comparisons
+match = 1
+while oi < len(out):
+ if out[oi] != ref[ri]:
+ print '[%d:%d] "%s" vs "%s"' % (oi, ri, out[oi], ref[ri])
+ match = 0
+ oi += 1
+ ri += 1
+if not match:
+ print
+ print 'Output didn\'t match reference'
+ sys.exit(1)
+
+# Test passed
+sys.exit(0)
diff --git a/tests/kernels/wait_event/wait_event_chained.cl b/tests/kernels/wait_event/wait_event_chained.cl
new file mode 100644
index 0000000..1b86f8f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.cl
@@ -0,0 +1,13 @@
+kernel void wait_event_chained(global int *data, local int *scratch)
+{
+ event_t event;
+ event = async_work_group_copy(scratch, data, 1, 0);
+ for (int i = 1; i < 4; i++)
+ {
+ async_work_group_copy(scratch+i, data+i, 1, event);
+ }
+ wait_group_events(1, &event);
+
+ int i = get_local_id(0);
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/wait_event/wait_event_chained.ref b/tests/kernels/wait_event/wait_event_chained.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/wait_event/wait_event_chained.sim b/tests/kernels/wait_event/wait_event_chained.sim
new file mode 100644
index 0000000..c865d3c
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.sim
@@ -0,0 +1,7 @@
+wait_event_chained.cl
+wait_event_chained
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/wait_event/wait_event_divergent.cl b/tests/kernels/wait_event/wait_event_divergent.cl
new file mode 100644
index 0000000..d88f3f3
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.cl
@@ -0,0 +1,11 @@
+kernel void wait_event_divergent(global int *data, local int *scratch)
+{
+ int i = get_local_id(0);
+ event_t events[2];
+ events[0] = async_work_group_copy(scratch, data, 1, 0);
+ events[1] = async_work_group_copy(scratch+1, data+1, 1, 0);
+
+ wait_group_events(1, events+i);
+
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/wait_event/wait_event_divergent.ref b/tests/kernels/wait_event/wait_event_divergent.ref
new file mode 100644
index 0000000..56f64ac
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.ref
@@ -0,0 +1,6 @@
+ERROR EXPECTED
+
+Argument 'data': 8 bytes
+ data[0] = 0
+ data[1] = 0
+
diff --git a/tests/kernels/wait_event/wait_event_divergent.sim b/tests/kernels/wait_event/wait_event_divergent.sim
new file mode 100644
index 0000000..da1eb99
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.sim
@@ -0,0 +1,7 @@
+wait_event_divergent.cl
+wait_event_divergent
+2 1 1
+2 1 1
+
+<size=8 range=0:1:1 dump>
+<size=8>
diff --git a/tests/kernels/wait_event/wait_event_duplicates.cl b/tests/kernels/wait_event/wait_event_duplicates.cl
new file mode 100644
index 0000000..a625cc5
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.cl
@@ -0,0 +1,13 @@
+kernel void wait_event_duplicates(global int *data, local int *scratch)
+{
+ event_t events[4];
+ events[0] = async_work_group_copy(scratch, data, 1, 0);
+ events[1] = events[0];
+ events[2] = async_work_group_copy(scratch+1, data+1, 3, 0);
+ events[3] = events[0];
+
+ wait_group_events(4, events);
+
+ int i = get_local_id(0);
+ data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/wait_event/wait_event_duplicates.ref b/tests/kernels/wait_event/wait_event_duplicates.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.ref
@@ -0,0 +1,7 @@
+
+Argument 'data': 16 bytes
+ data[0] = 3
+ data[1] = 2
+ data[2] = 1
+ data[3] = 0
+
diff --git a/tests/kernels/wait_event/wait_event_duplicates.sim b/tests/kernels/wait_event/wait_event_duplicates.sim
new file mode 100644
index 0000000..39ea9b9
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.sim
@@ -0,0 +1,7 @@
+wait_event_duplicates.cl
+wait_event_duplicates
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
+<size=16>
diff --git a/tests/kernels/wait_event/wait_event_invalid.cl b/tests/kernels/wait_event/wait_event_invalid.cl
new file mode 100644
index 0000000..239530e
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.cl
@@ -0,0 +1,5 @@
+kernel void wait_event_invalid(global int *data)
+{
+ event_t event = 42;
+ wait_group_events(1, &event);
+}
diff --git a/tests/kernels/wait_event/wait_event_invalid.ref b/tests/kernels/wait_event/wait_event_invalid.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.ref
@@ -0,0 +1,8 @@
+ERROR EXPECTED
+
+Argument 'data': 16 bytes
+ data[0] = 0
+ data[1] = 1
+ data[2] = 2
+ data[3] = 3
+
diff --git a/tests/kernels/wait_event/wait_event_invalid.sim b/tests/kernels/wait_event/wait_event_invalid.sim
new file mode 100644
index 0000000..fdfff51
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.sim
@@ -0,0 +1,6 @@
+wait_event_invalid.cl
+wait_event_invalid
+4 1 1
+4 1 1
+
+<size=16 range=0:1:3 dump>
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/oclgrind.git