[Pkg-opencl-devel] [oclgrind] 01/01: Imported Upstream version 15.5

James Price jprice-guest at moszumanska.debian.org
Tue Aug 11 12:52:21 UTC 2015

This is an automated email from the git hooks/post-receive script.

jprice-guest pushed a commit to branch upstream
in repository oclgrind.

commit 0ae7b0acd988dc66f11a1e6e8528575369ad5090
Author: James Price <j.price at bristol.ac.uk>
Date:   Tue Aug 11 13:31:35 2015 +0100

    Imported Upstream version 15.5
 .gitignore                                         |   54 +
 CMakeLists.txt                                     |  299 ++
 LICENSE                                            |   29 +
 Makefile.am                                        |  147 +
 NEWS                                               |   58 +
 README                                             |  138 +
 cmake_config.h.in                                  |    5 +
 configure.ac                                       |  134 +
 m4/m4_ax_check_compile_flag.m4                     |   74 +
 src/CL/cl.h                                        | 1214 +++++
 src/CL/cl_d3d10.h                                  |  126 +
 src/CL/cl_d3d11.h                                  |  126 +
 src/CL/cl_dx9_media_sharing.h                      |  127 +
 src/CL/cl_egl.h                                    |  131 +
 src/CL/cl_ext.h                                    |  310 ++
 src/CL/cl_gl.h                                     |  162 +
 src/CL/cl_gl_ext.h                                 |   69 +
 src/CL/cl_platform.h                               | 1278 +++++
 src/CL/opencl.h                                    |   54 +
 src/core/Context.cpp                               |  547 ++
 src/core/Context.h                                 |  115 +
 src/core/Kernel.cpp                                |  534 ++
 src/core/Kernel.h                                  |   72 +
 src/core/KernelInvocation.cpp                      |  355 ++
 src/core/KernelInvocation.h                        |   64 +
 src/core/Memory.cpp                                |  464 ++
 src/core/Memory.h                                  |   68 +
 src/core/Plugin.cpp                                |   25 +
 src/core/Plugin.h                                  |   69 +
 src/core/Program.cpp                               |  728 +++
 src/core/Program.h                                 |   79 +
 src/core/Queue.cpp                                 |  260 +
 src/core/Queue.h                                   |  183 +
 src/core/WorkGroup.cpp                             |  428 ++
 src/core/WorkGroup.h                               |  100 +
 src/core/WorkItem.cpp                              | 1660 ++++++
 src/core/WorkItem.h                                |  213 +
 src/core/WorkItemBuiltins.cpp                      | 3561 +++++++++++++
 src/core/clc.h                                     | 1035 ++++
 src/core/common.cpp                                |  712 +++
 src/core/common.h                                  |  203 +
 src/core/gen_clc_h.cmake                           |   11 +
 src/core/gen_clc_h.sh                              |   18 +
 src/core/half.h                                    |  160 +
 src/install/INSTALL.darwin                         |   17 +
 src/install/INSTALL.linux                          |   20 +
 src/install/INSTALL.windows                        |    8 +
 src/install/install.bat                            |   23 +
 src/install/oclgrind-icd.reg                       |  Bin 0 -> 1042 bytes
 src/install/uninstall.bat                          |    1 +
 src/kernel/Simulation.cpp                          |  764 +++
 src/kernel/Simulation.h                            |   82 +
 src/kernel/oclgrind-kernel.cpp                     |  233 +
 src/plugins/InstructionCounter.cpp                 |  184 +
 src/plugins/InstructionCounter.h                   |   38 +
 src/plugins/InteractiveDebugger.cpp                | 1024 ++++
 src/plugins/InteractiveDebugger.h                  |   72 +
 src/plugins/Logger.cpp                             |   81 +
 src/plugins/Logger.h                               |   27 +
 src/plugins/MemCheck.cpp                           |  107 +
 src/plugins/MemCheck.h                             |   43 +
 src/plugins/RaceDetector.cpp                       |  336 ++
 src/plugins/RaceDetector.h                         |   94 +
 src/runtime/async_queue.cpp                        |  136 +
 src/runtime/async_queue.h                          |   21 +
 src/runtime/icd.def                                |    5 +
 src/runtime/icd.h                                  |  235 +
 src/runtime/oclgrind                               |  145 +
 src/runtime/runtime.cpp                            | 5594 ++++++++++++++++++++
 src/runtime/runtime.def                            |  119 +
 tests/apps/CMakeLists.txt                          |   33 +
 tests/apps/vecadd/vecadd.c                         |  190 +
 tests/kernels/TESTS                                |   56 +
 tests/kernels/alignment/packed.cl                  |   10 +
 tests/kernels/alignment/packed.ref                 |    4 +
 tests/kernels/alignment/packed.sim                 |   10 +
 tests/kernels/alignment/unaligned.cl               |    6 +
 tests/kernels/alignment/unaligned.ref              |    5 +
 tests/kernels/alignment/unaligned.sim              |    7 +
 tests/kernels/async_copy/async_copy.cl             |    8 +
 tests/kernels/async_copy/async_copy.ref            |    7 +
 tests/kernels/async_copy/async_copy.sim            |    7 +
 tests/kernels/async_copy/async_copy_divergent.cl   |   14 +
 tests/kernels/async_copy/async_copy_divergent.ref  |    8 +
 tests/kernels/async_copy/async_copy_divergent.sim  |    7 +
 tests/kernels/async_copy/async_copy_global_race.cl |   11 +
 .../kernels/async_copy/async_copy_global_race.ref  |    8 +
 .../kernels/async_copy/async_copy_global_race.sim  |    7 +
 tests/kernels/async_copy/async_copy_local_race.cl  |   10 +
 tests/kernels/async_copy/async_copy_local_race.ref |    8 +
 tests/kernels/async_copy/async_copy_local_race.sim |    7 +
 tests/kernels/async_copy/async_copy_loop.cl        |   14 +
 tests/kernels/async_copy/async_copy_loop.ref       |    7 +
 tests/kernels/async_copy/async_copy_loop.sim       |    7 +
 .../async_copy/async_copy_loop_divergent.cl        |   19 +
 .../async_copy/async_copy_loop_divergent.ref       |    8 +
 .../async_copy/async_copy_loop_divergent.sim       |    7 +
 tests/kernels/async_copy/async_copy_single_wi.cl   |   13 +
 tests/kernels/async_copy/async_copy_single_wi.ref  |    8 +
 tests/kernels/async_copy/async_copy_single_wi.sim  |    7 +
 tests/kernels/async_copy/async_copy_unwaited.cl    |    7 +
 tests/kernels/async_copy/async_copy_unwaited.ref   |    8 +
 tests/kernels/async_copy/async_copy_unwaited.sim   |    7 +
 tests/kernels/atomics/atomic_cmpxchg_false_race.cl |   36 +
 .../kernels/atomics/atomic_cmpxchg_false_race.ref  |    8 +
 .../kernels/atomics/atomic_cmpxchg_false_race.sim  |    7 +
 tests/kernels/atomics/atomic_cmpxchg_read_race.cl  |   12 +
 tests/kernels/atomics/atomic_cmpxchg_read_race.ref |    5 +
 tests/kernels/atomics/atomic_cmpxchg_read_race.sim |    7 +
 tests/kernels/atomics/atomic_cmpxchg_write_race.cl |    9 +
 .../kernels/atomics/atomic_cmpxchg_write_race.ref  |    5 +
 .../kernels/atomics/atomic_cmpxchg_write_race.sim  |    7 +
 tests/kernels/atomics/atomic_global_fence.cl       |   17 +
 tests/kernels/atomics/atomic_global_fence.ref      |    5 +
 tests/kernels/atomics/atomic_global_fence.sim      |    7 +
 tests/kernels/atomics/atomic_global_fence_race.cl  |   12 +
 tests/kernels/atomics/atomic_global_fence_race.ref |    6 +
 tests/kernels/atomics/atomic_global_fence_race.sim |    7 +
 tests/kernels/atomics/atomic_increment.cl          |    4 +
 tests/kernels/atomics/atomic_increment.ref         |    4 +
 tests/kernels/atomics/atomic_increment.sim         |    6 +
 tests/kernels/atomics/atomic_intergroup_race.cl    |   10 +
 tests/kernels/atomics/atomic_intergroup_race.ref   |    5 +
 tests/kernels/atomics/atomic_intergroup_race.sim   |    6 +
 tests/kernels/atomics/atomic_local_fence.cl        |   17 +
 tests/kernels/atomics/atomic_local_fence.ref       |    5 +
 tests/kernels/atomics/atomic_local_fence.sim       |    7 +
 tests/kernels/atomics/atomic_race_after.cl         |    8 +
 tests/kernels/atomics/atomic_race_after.ref        |    5 +
 tests/kernels/atomics/atomic_race_after.sim        |    6 +
 tests/kernels/atomics/atomic_race_before.cl        |    8 +
 tests/kernels/atomics/atomic_race_before.ref       |    5 +
 tests/kernels/atomics/atomic_race_before.sim       |    6 +
 tests/kernels/atomics/atomic_same_workitem.cl      |   14 +
 tests/kernels/atomics/atomic_same_workitem.ref     |    7 +
 tests/kernels/atomics/atomic_same_workitem.sim     |    6 +
 .../barrier/barrier_different_instructions.cl      |   14 +
 .../barrier/barrier_different_instructions.ref     |    8 +
 .../barrier/barrier_different_instructions.sim     |    6 +
 tests/kernels/barrier/barrier_divergence.cl        |    9 +
 tests/kernels/barrier/barrier_divergence.ref       |    8 +
 tests/kernels/barrier/barrier_divergence.sim       |    6 +
 tests/kernels/bugs/gvn_arbitrary_integers.cl       |    8 +
 tests/kernels/bugs/gvn_arbitrary_integers.ref      |    6 +
 tests/kernels/bugs/gvn_arbitrary_integers.sim      |    7 +
 tests/kernels/bugs/kernel_struct_argument.cl       |   11 +
 tests/kernels/bugs/kernel_struct_argument.ref      |    4 +
 tests/kernels/bugs/kernel_struct_argument.sim      |   11 +
 tests/kernels/bugs/many_alloca.cl                  |   21 +
 tests/kernels/bugs/many_alloca.ref                 |    4 +
 tests/kernels/bugs/many_alloca.sim                 |    9 +
 tests/kernels/bugs/multidim_array_in_struct.cl     |   40 +
 tests/kernels/bugs/multidim_array_in_struct.ref    |    4 +
 tests/kernels/bugs/multidim_array_in_struct.sim    |   13 +
 tests/kernels/bugs/null_argument.cl                |    9 +
 tests/kernels/bugs/null_argument.ref               |    4 +
 tests/kernels/bugs/null_argument.sim               |    6 +
 tests/kernels/bugs/sroa_addrspace_cast.cl          |   12 +
 tests/kernels/bugs/sroa_addrspace_cast.ref         |    4 +
 tests/kernels/bugs/sroa_addrspace_cast.sim         |    7 +
 tests/kernels/data-race/broadcast.cl               |    5 +
 tests/kernels/data-race/broadcast.ref              |    7 +
 tests/kernels/data-race/broadcast.sim              |    9 +
 tests/kernels/data-race/global_fence.cl            |   16 +
 tests/kernels/data-race/global_fence.ref           |    7 +
 tests/kernels/data-race/global_fence.sim           |    7 +
 tests/kernels/data-race/global_only_fence.cl       |   16 +
 tests/kernels/data-race/global_only_fence.ref      |    8 +
 tests/kernels/data-race/global_only_fence.sim      |    7 +
 tests/kernels/data-race/global_read_write_race.cl  |    8 +
 tests/kernels/data-race/global_read_write_race.ref |    8 +
 tests/kernels/data-race/global_read_write_race.sim |    6 +
 tests/kernels/data-race/global_write_write_race.cl |    4 +
 .../kernels/data-race/global_write_write_race.ref  |    5 +
 .../kernels/data-race/global_write_write_race.sim  |    6 +
 tests/kernels/data-race/increment.cl               |    5 +
 tests/kernels/data-race/increment.ref              |    7 +
 tests/kernels/data-race/increment.sim              |    6 +
 tests/kernels/data-race/intergroup_hidden_race.cl  |    9 +
 tests/kernels/data-race/intergroup_hidden_race.ref |    6 +
 tests/kernels/data-race/intergroup_hidden_race.sim |    7 +
 tests/kernels/data-race/intergroup_race.cl         |   19 +
 tests/kernels/data-race/intergroup_race.ref        |    8 +
 tests/kernels/data-race/intergroup_race.sim        |    6 +
 tests/kernels/data-race/intragroup_hidden_race.cl  |   10 +
 tests/kernels/data-race/intragroup_hidden_race.ref |    6 +
 tests/kernels/data-race/intragroup_hidden_race.sim |    7 +
 tests/kernels/data-race/local_only_fence.cl        |   16 +
 tests/kernels/data-race/local_only_fence.ref       |    8 +
 tests/kernels/data-race/local_only_fence.sim       |    7 +
 tests/kernels/data-race/local_read_write_race.cl   |   14 +
 tests/kernels/data-race/local_read_write_race.ref  |    5 +
 tests/kernels/data-race/local_read_write_race.sim  |    7 +
 tests/kernels/data-race/local_write_write_race.cl  |    7 +
 tests/kernels/data-race/local_write_write_race.ref |    8 +
 tests/kernels/data-race/local_write_write_race.sim |    7 +
 tests/kernels/data-race/uniform_write_race.cl      |    4 +
 tests/kernels/data-race/uniform_write_race.ref     |    4 +
 tests/kernels/data-race/uniform_write_race.sim     |    6 +
 tests/kernels/memcheck/async_copy_out_of_bounds.cl |    8 +
 .../kernels/memcheck/async_copy_out_of_bounds.ref  |    8 +
 .../kernels/memcheck/async_copy_out_of_bounds.sim  |    7 +
 tests/kernels/memcheck/atomic_out_of_bounds.cl     |    5 +
 tests/kernels/memcheck/atomic_out_of_bounds.ref    |    8 +
 tests/kernels/memcheck/atomic_out_of_bounds.sim    |    6 +
 tests/kernels/memcheck/dereference_null.cl         |    4 +
 tests/kernels/memcheck/dereference_null.ref        |    5 +
 tests/kernels/memcheck/dereference_null.sim        |    7 +
 tests/kernels/memcheck/read_out_of_bounds.cl       |   12 +
 tests/kernels/memcheck/read_out_of_bounds.ref      |    9 +
 tests/kernels/memcheck/read_out_of_bounds.sim      |    8 +
 tests/kernels/memcheck/read_write_only_memory.cl   |    5 +
 tests/kernels/memcheck/read_write_only_memory.ref  |    8 +
 tests/kernels/memcheck/read_write_only_memory.sim  |    7 +
 tests/kernels/memcheck/write_out_of_bounds.cl      |    5 +
 tests/kernels/memcheck/write_out_of_bounds.ref     |    8 +
 tests/kernels/memcheck/write_out_of_bounds.sim     |    8 +
 tests/kernels/memcheck/write_read_only_memory.cl   |    5 +
 tests/kernels/memcheck/write_read_only_memory.ref  |    8 +
 tests/kernels/memcheck/write_read_only_memory.sim  |    7 +
 tests/kernels/misc/array.cl                        |   10 +
 tests/kernels/misc/array.ref                       |  131 +
 tests/kernels/misc/array.sim                       |    6 +
 tests/kernels/misc/reduce.cl                       |   28 +
 tests/kernels/misc/reduce.ref                      |    4 +
 tests/kernels/misc/reduce.sim                      |   11 +
 tests/kernels/misc/vecadd.cl                       |    5 +
 tests/kernels/misc/vecadd.ref                      | 1027 ++++
 tests/kernels/misc/vecadd.sim                      |    8 +
 tests/kernels/run_kernel_test.py                   |   93 +
 tests/kernels/wait_event/wait_event_chained.cl     |   13 +
 tests/kernels/wait_event/wait_event_chained.ref    |    7 +
 tests/kernels/wait_event/wait_event_chained.sim    |    7 +
 tests/kernels/wait_event/wait_event_divergent.cl   |   11 +
 tests/kernels/wait_event/wait_event_divergent.ref  |    6 +
 tests/kernels/wait_event/wait_event_divergent.sim  |    7 +
 tests/kernels/wait_event/wait_event_duplicates.cl  |   13 +
 tests/kernels/wait_event/wait_event_duplicates.ref |    7 +
 tests/kernels/wait_event/wait_event_duplicates.sim |    7 +
 tests/kernels/wait_event/wait_event_invalid.cl     |    5 +
 tests/kernels/wait_event/wait_event_invalid.ref    |    8 +
 tests/kernels/wait_event/wait_event_invalid.sim    |    6 +
 242 files changed, 28529 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14830ae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,54 @@
+# Autotools generated files
+# Compiler output
+# Test output
+# Misc
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a35af1e
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,299 @@
+# CMakeLists.txt (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+cmake_minimum_required(VERSION 2.8.12)
+set(Oclgrind_VERSION_MAJOR 15)
+set(Oclgrind_VERSION_MINOR 5)
+# Enable C99 for GCC (required for tests)
+  set(CMAKE_C_FLAGS "-std=c99")
+# Enable rpath on OS X
+# Enable C++11 for Clang/GCC
+  set(CMAKE_CXX_FLAGS "-std=c++11")
+# Disable min/max macros on Windows
+  add_definitions(-DNOMINMAX)
+# Suppress warnings from OpenCL runtime API headers
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability")
+# Find LLVM
+message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
+# Check LLVM version
+  message(FATAL_ERROR "LLVM version must be >= 3.6")
+# Add flags for LLVM
+# Get LLVM libraries for linking
+  bitreader bitwriter core instrumentation ipo irreader
+  linker mcparser objcarcopts option)
+# Check for GNU readline library
+  set(READLINE_DIR "" CACHE PATH "Location of GNU readline library")
+  include_directories(${READLINE_DIR}/include)
+  link_directories(${READLINE_DIR}/lib)
+  check_include_files("stdio.h;readline/readline.h" HAVE_READLINE_H)
+  check_include_files("stdio.h;readline/history.h" HAVE_HISTORY_H)
+  check_library_exists(readline readline "${READLINE_DIR}/lib" HAVE_READLINE_LIB)
+  check_library_exists(readline add_history "${READLINE_DIR}/lib" HAVE_HISTORY_LIB)
+    set(HAVE_READLINE 1)
+    list(APPEND CORE_EXTRA_LIBS readline)
+  else()
+    set(HAVE_READLINE 0)
+    message(WARNING "GNU readline library not found (set READLINE_DIR)\n"
+                    "The interactive debugger will not have a command history.")
+  endif()
+# Generate stringified clc.h
+  OUTPUT src/core/clc_h.cpp
+    -P ${CMAKE_SOURCE_DIR}/src/core/gen_clc_h.cmake
+  DEPENDS src/core/clc.h src/core/gen_clc_h.cmake
+include_directories("src/" "${PROJECT_BINARY_DIR}")
+  src/core/common.h
+  src/core/Context.h
+  src/core/half.h
+  src/core/Kernel.h
+  src/core/KernelInvocation.h
+  src/core/Memory.h
+  src/core/Plugin.h
+  src/core/Program.h
+  src/core/Queue.h
+  src/core/WorkItem.h
+  src/core/WorkGroup.h)
+add_library(oclgrind ${CORE_LIB_TYPE}
+  src/core/clc_h.cpp
+  src/core/common.cpp
+  src/core/Context.cpp
+  src/core/Kernel.cpp
+  src/core/KernelInvocation.cpp
+  src/core/Memory.cpp
+  src/core/Plugin.cpp
+  src/core/Program.cpp
+  src/core/Queue.cpp
+  src/core/WorkItem.cpp
+  src/core/WorkItemBuiltins.cpp
+  src/core/WorkGroup.cpp
+  src/plugins/InstructionCounter.h
+  src/plugins/InstructionCounter.cpp
+  src/plugins/InteractiveDebugger.h
+  src/plugins/InteractiveDebugger.cpp
+  src/plugins/Logger.h
+  src/plugins/Logger.cpp
+  src/plugins/MemCheck.h
+  src/plugins/MemCheck.cpp
+  src/plugins/RaceDetector.h
+  src/plugins/RaceDetector.cpp)
+target_link_libraries(oclgrind ${CORE_EXTRA_LIBS}
+  clangAnalysis clangAST clangBasic clangCodeGen clangDriver clangEdit
+  clangFrontend clangLex clangParse clangSema clangSerialization
+  ${LLVM_LIBS})
+# Sources for OpenCL runtime API frontend
+  src/runtime/async_queue.h
+  src/runtime/async_queue.cpp
+  src/runtime/icd.h
+  src/runtime/runtime.cpp)
+# Add ICD exports on Windows
+  list(APPEND RUNTIME_SOURCES src/runtime/icd.def)
+add_library(oclgrind-rt-icd SHARED ${RUNTIME_SOURCES})
+set_target_properties(oclgrind-rt-icd PROPERTIES COMPILE_FLAGS -DOCLGRIND_ICD)
+target_link_libraries(oclgrind-rt-icd ${CMAKE_DL_LIBS} oclgrind)
+# Add runtime exports on Windows
+  list(APPEND RUNTIME_SOURCES src/runtime/runtime.def)
+add_library(oclgrind-rt SHARED ${RUNTIME_SOURCES})
+target_link_libraries(oclgrind-rt ${CMAKE_DL_LIBS} oclgrind)
+  src/kernel/oclgrind-kernel.cpp
+  src/kernel/Simulation.h
+  src/kernel/Simulation.cpp)
+target_link_libraries(oclgrind-kernel oclgrind)
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc32.pch
+ ${CMAKE_BINARY_DIR}/include/oclgrind/clc64.pch
+  OUTPUT include/oclgrind/clc.h
+    copy ${CMAKE_SOURCE_DIR}/src/core/clc.h include/oclgrind/clc.h
+  DEPENDS src/core/clc.h)
+# Generate precompiled headers for clc.h
+  OUTPUT include/oclgrind/clc32.pch
+    ${CLANG}
+    -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+    -emit-pch -triple spir-unknown-unknown
+    -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
+    ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+    -o include/oclgrind/clc32.pch
+  DEPENDS include/oclgrind/clc.h
+  OUTPUT include/oclgrind/clc64.pch
+    ${CLANG}
+    -cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin
+    -emit-pch -triple spir64-unknown-unknown
+    -relocatable-pch -isysroot ${CMAKE_BINARY_DIR}/include/oclgrind/
+    ${CMAKE_BINARY_DIR}/include/oclgrind/clc.h
+    -o include/oclgrind/clc64.pch
+  DEPENDS include/oclgrind/clc.h
+# Generate config.h
+configure_file("cmake_config.h.in" "config.h")
+# Install oclgrind script if not on Windows
+  file(READ src/runtime/oclgrind OCLGRIND_SCRIPT)
+    "__VERSION__" "${Oclgrind_VERSION_MAJOR}.${Oclgrind_VERSION_MINOR}"
+  # Generate ICD loader
+  file(WRITE ${CMAKE_BINARY_DIR}/oclgrind.icd "${OCLGRIND_RT_FILENAME}\n")
+  install(PROGRAMS
+    ${CMAKE_BINARY_DIR}/oclgrind
+  oclgrind-kernel
+  oclgrind oclgrind-rt oclgrind-rt-icd
+  DESTINATION include/oclgrind)
+  install(FILES
+    src/CL/cl.h
+    src/CL/cl_d3d10.h
+    src/CL/cl_d3d11.h
+    src/CL/cl_dx9_media_sharing.h
+    src/CL/cl_egl.h
+    src/CL/cl_ext.h
+    src/CL/cl_gl.h
+    src/CL/cl_gl_ext.h
+    src/CL/cl_platform.h
+    src/CL/opencl.h
+    DESTINATION include/CL)
+# Tests
+# Check for Python
+  # Add kernel tests
+  file(READ tests/kernels/TESTS KERNEL_TESTS)
+  foreach(test ${KERNEL_TESTS})
+    add_test(
+      NAME ${test}
+      COMMAND
+      ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tests/kernels/run_kernel_test.py
+      $<TARGET_FILE:oclgrind-kernel>
+      ${CMAKE_SOURCE_DIR}/tests/kernels/${test}.sim)
+  endforeach(${test})
+  # Set PCH directory
+  set_tests_properties(${KERNEL_TESTS} PROPERTIES
+  # Expected failures
+  set_tests_properties(
+    atomics/atomic_intergroup_race
+    data-race/intragroup_hidden_race
+  message(WARNING "Kernel tests will not be run (Python required)")
+# Add app tests
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f91a2f2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+University of Bristol. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..8fcd00f
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,147 @@
+# Makefile.am (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+AUTOMAKE_OPTIONS = subdir-objects
+AM_CFLAGS   = -std=c99
+AM_CPPFLAGS = -I$(top_srcdir)/src/ -Wall
+# Suppress warnings from OpenCL runtime API headers
+AM_CPPFLAGS += -Wno-ignored-attributes -Wno-gcc-compat -Wno-availability
+lib_LTLIBRARIES = liboclgrind.la liboclgrind-rt.la liboclgrind-rt-icd.la
+LLVM_LIBS = `$(llvm_config) --system-libs --libs bitreader bitwriter	\
+ core instrumentation ipo irreader linker mcparser objcarcopts option`
+liboclgrind_la_SOURCES = src/core/common.h src/core/common.cpp		\
+ src/core/Context.h src/core/Context.cpp src/core/half.h		\
+ src/core/Kernel.h src/core/Kernel.cpp src/core/KernelInvocation.h	\
+ src/core/KernelInvocation.cpp src/core/Memory.h src/core/Memory.cpp	\
+ src/core/Plugin.h src/core/Plugin.cpp src/core/Program.h		\
+ src/core/Program.cpp src/core/Queue.h src/core/Queue.cpp		\
+ src/core/WorkItem.h src/core/WorkItem.cpp				\
+ src/core/WorkItemBuiltins.cpp src/core/WorkGroup.h			\
+ src/core/WorkGroup.cpp src/plugins/InstructionCounter.h		\
+ src/plugins/InstructionCounter.cpp src/plugins/InteractiveDebugger.h	\
+ src/plugins/InteractiveDebugger.cpp src/plugins/Logger.h		\
+ src/plugins/Logger.cpp src/plugins/MemCheck.h				\
+ src/plugins/MemCheck.cpp src/plugins/RaceDetector.h			\
+ src/plugins/RaceDetector.cpp
+nodist_liboclgrind_la_SOURCES = src/core/clc_h.cpp config.h
+liboclgrind_la_LDFLAGS = -lclangFrontend -lclangDriver		\
+-lclangSerialization -lclangCodeGen -lclangParse -lclangSema	\
+-lclangAnalysis -lclangEdit -lclangAST -lclangLex -lclangBasic	\
+${LLVM_LIBS} $(oclgrind_extra_libs) -shared
+oclgrind_includedir = $(includedir)/oclgrind
+oclgrind_include_HEADERS = src/core/common.h src/core/Context.h	\
+ src/core/half.h src/core/Kernel.h src/core/KernelInvocation.h	\
+ src/core/Memory.h src/core/Plugin.h src/core/Program.h		\
+ src/core/Queue.h src/core/WorkItem.h src/core/WorkGroup.h config.h LICENSE
+src/core/clc_h.cpp: src/core/gen_clc_h.sh	src/core/clc.h
+	$(top_srcdir)/src/core/gen_clc_h.sh $(top_srcdir)/src/core/clc.h $@
+	cp -p src/include/oclgrind/clc.h      $(DESTDIR)$(includedir)/oclgrind/
+	cp -p src/include/oclgrind/clc32.pch  $(DESTDIR)$(includedir)/oclgrind/
+	cp -p src/include/oclgrind/clc64.pch  $(DESTDIR)$(includedir)/oclgrind/
+	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc.h
+	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc32.pch
+	rm -rf $(DESTDIR)$(includedir)/oclgrind/clc64.pch
+RUNTIME_SOURCES = src/runtime/async_queue.h				\
+ src/runtime/async_queue.cpp src/runtime/icd.h src/runtime/runtime.cpp
+liboclgrind_rt_la_SOURCES = $(RUNTIME_SOURCES)
+liboclgrind_rt_la_LIBADD = liboclgrind.la
+liboclgrind_rt_la_LDFLAGS = -shared
+liboclgrind_rt_icd_la_CPPFLAGS = -DOCLGRIND_ICD $(AM_CPPFLAGS)
+liboclgrind_rt_icd_la_SOURCES = $(RUNTIME_SOURCES)
+liboclgrind_rt_icd_la_LIBADD = liboclgrind.la
+liboclgrind_rt_icd_la_LDFLAGS = -shared
+bin_PROGRAMS = oclgrind-kernel
+oclgrind_kernel_SOURCES = src/kernel/oclgrind-kernel.cpp	\
+ src/kernel/Simulation.h src/kernel/Simulation.cpp
+oclgrind_kernel_LDADD = liboclgrind.la
+bin_SCRIPTS = oclgrind
+oclgrind: $(top_srcdir)/src/runtime/oclgrind
+	cat $(top_srcdir)/src/runtime/oclgrind \
+	| $(SED) 's|__VERSION__|'$(VERSION)'|g' \
+	>$@
+noinst_SCRIPTS = oclgrind.icd \
+ src/include/oclgrind/clc.h \
+ src/include/oclgrind/clc32.pch \
+ src/include/oclgrind/clc64.pch
+oclgrind.icd: liboclgrind-rt-icd.la
+	printf $(libdir)/ >$@
+	$(GREP) dlname $< | $(AWK) -F "'" '{print $$2}' >>$@
+src/include/oclgrind/clc.h: $(top_srcdir)/src/core/clc.h
+	mkdir -p src/include/oclgrind
+	cp $< $@
+src/include/oclgrind/clc32.pch: src/include/oclgrind/clc.h
+	$(clang) \
+		-cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+		-emit-pch -triple spir-unknown-unknown \
+		-relocatable-pch \
+                -isysroot $(abs_builddir)/src/include/oclgrind \
+		$< -o $@
+src/include/oclgrind/clc64.pch: src/include/oclgrind/clc.h
+	$(clang) \
+		-cc1 -x cl -cl-std=CL1.2 -O0 -g -fno-builtin \
+		-emit-pch -triple spir64-unknown-unknown \
+		-relocatable-pch \
+                -isysroot $(abs_builddir)/src/include/oclgrind \
+		$< -o $@
+check_PROGRAMS = tests/apps/vecadd/vecadd
+tests_apps_vecadd_vecadd_LDADD = liboclgrind-rt.la
+TESTS = $(check_PROGRAMS)
+  $(top_srcdir)/tests/kernels/run_kernel_test.py	\
+  ${abs_top_builddir}/oclgrind-kernel
+  export AM_TESTS=1; \
+  export OCLGRIND_PCH_DIR=$(abs_builddir)/src/include/oclgrind;
+XFAIL_TESTS =							\
+	tests/kernels/atomics/atomic_intergroup_race.sim 	\
+	tests/kernels/data-race/intragroup_hidden_race.sim
+	@echo
+	@echo "WARNING: Kernel tests skipped (Python required)."
+	@echo
+EXTRA_DIST = NEWS src/core/gen_clc_h.sh src/core/clc.h			\
+ src/runtime/oclgrind src/CL/cl.h src/CL/cl_gl.h src/CL/cl_platform.h	\
+ src/CL/cl_ext.h src/CL/cl_gl_ext.h src/CL/cl_egl.h src/CL/cl_d3d10.h	\
+ src/CL/cl_d3d11.h src/CL/cl_dx9_media_sharing.h src/CL/opencl.h	\
+ CMakeLists.txt tests/apps/CMakeLists.txt cmake_config.h.in		\
+ src/core/gen_clc_h.cmake src/runtime/icd.def src/runtime/runtime.def	\
+ src/install/INSTALL.darwin src/install/INSTALL.linux			\
+ src/install/INSTALL.windows src/install/install.bat			\
+ src/install/uninstall.bat src/install/oclgrind-icd.reg			\
+ tests/kernels/run_kernel_test.py tests/kernels/TESTS			\
+CLEANFILES = src/core/clc_h.cpp $(bin_SCRIPTS) $(noinst_SCRIPTS)	\
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..16766ab
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,58 @@
+For more information, please visit the Oclgrind Wiki:
+https://github.com/jrprice/Oclgrind/wiki
+Oclgrind 15.5
+This release updates to LLVM 3.6, which improves the OpenCL C compiler
+and provides some additional performance enhancements. See README for
+revised instructions on how to build Oclgrind from source.
+- Fixed race conditions in atomic operations
+- Interactive debugger breaks on Ctrl+C
+- Various other minor bug fixes
+Oclgrind 15.2
+This release significantly improves simulation performance, and fixes
+several bugs impacting on usage and stability.
+- Added detection for violations of read-only/write-only attributes
+- Added --build-options argument to append additional compiler flags
+- Added hostMemoryLoad and hostMemoryStore callbacks
+- Added workGroupBegin and workItemBegin callbacks
+- Split atomic callbacks into separate load and store
+- Multi-threaded simulation to improve performance
+- Various other performance improvements
+- Several general bug fixes and stability improvements
+Oclgrind 14.12
+This release incorporates a new plugin system, to allow third-party
+developers to build tools that utilise Oclgrind. More information can
+be found on the Wiki:
+In addition, this release contains the following changes:
+- Interactive debugger now has a command history
+- Detection for unaligned memory accesses
+- Limit the number of error messages printed to avoid flooding output
+- Various other bug fixes and improvements
+Oclgrind 14.5
+Initial release (beta).
+Implements a SPIR 1.2 interpreter which can be targeted either via an
+OpenCL 1.2 runtime API implementation or using a standalone kernel
+interface.
+Provides the following utilities:
+- Memory access error detection
+- Work-group divergence detection (barriers, async-copies)
+- Data-race detection (--data-races)
+- Simple interactive debugger (--interactive)
+- Instruction histograms (--inst-counts)
+- OpenCL runtime API error reporting (--check-api)
diff --git a/README b/README
new file mode 100644
index 0000000..6a65f57
--- /dev/null
+++ b/README
@@ -0,0 +1,138 @@
+This project implements a virtual OpenCL device simulator, including
+an OpenCL runtime with ICD support. The goal is to provide a platform
+for creating tools to aid OpenCL development. In particular, this
+project currently implements utilities for debugging memory access
+errors, detecting data-races and barrier divergence, collecting
+instruction histograms, and for interactive OpenCL kernel debugging.
+The simulator is built on an interpreter for LLVM IR. This project is
+being developed by James Price and Simon McIntosh-Smith at the
+University of Bristol.
+Binary releases can be found on the GitHub releases page:
+  https://github.com/jrprice/Oclgrind/releases
+To build this project, you will require the LLVM and Clang 3.6
+development libraries and headers. With some modifications, it may
+also be possible to use other (recent) versions of LLVM. If building
+LLVM from source, it is recommended to enable optimizations to improve
+the performance of Oclgrind (configure with --enable-optimized, or set
+CMAKE_BUILD_TYPE to RelWithDebInfo).
+You will also need to use a compiler that supports C++11.
+Building on Linux and OS X
+If you are building directly from the GitHub repository, you need to
+run 'autoreconf -i' to generate the necessary build files. This is not
+required if you are using a released source package.
+Run ./configure to generate the Makefile, optionally using
+--prefix=PATH to specify the target installation directory. If you
+don't have the LLVM/Clang includes and libraries on your search path,
+you can specify the location of your LLVM installation using the
+--with-llvm=PATH option. For example:
+./configure --prefix=$PWD/build/ --with-llvm=PATH/TO/LLVM/INSTALL
+This path should be the directory in which LLVM is installed (e.g. the
+path specified to --prefix or CMAKE_INSTALL_PREFIX when LLVM was built).
+Next, build and install with make:
+make check
+make install
+If installing to a non-default location, you should add the bin/
+directory to the PATH environment variable in order to make use of the
+oclgrind command. If you wish to use Oclgrind via the OpenCL ICD
+(optional), then you should create an ICD loading point by copying the
+oclgrind.icd file from the build directory to /etc/OpenCL/vendors/.
+Building on Windows
+A CMake build system is provided for building Oclgrind on Windows. At
+present, this only works with Visual Studio 2013 (or newer), and
+Windows 7.
+When configuring the CMake build, you may be prompted to supply a
+value for the LLVM_DIR parameter. This should be set to the directory
+containing your LLVM installation's LLVMConfig.cmake file (for
+example C:\Program Files\LLVM\share\llvm\cmake\).
+If you wish to use Oclgrind via the OpenCL ICD (optional), then you
+should also create an ICD loading point. To do this, you should add a
+REG_DWORD value to the Windows Registry under one or both of the
+registry keys below, with the name set to the absolute path of the
+oclgrind-rt-icd.dll library and the value set to 0.
+Key for 32-bit machines or 64-bit apps on a 64-bit machine:
+Key for 32-bit apps on a 64-bit machine:
+The recommended method of running an application with Oclgrind is to
+use the oclgrind command, for example:
+oclgrind ./application
+This command will make it such that the only OpenCL platform and device
+available to your application is Oclgrind. If you need more control
+over platform selection then installing an ICD loading point for
+Oclgrind will cause it to appear when an application calls
+clGetPlatformIDs(), alongside any other OpenCL platforms installed on
+your system.
+If it encounters any invalid memory accesses, Oclgrind will
+report the details to stderr, for example:
+> Invalid write of size 4 at global memory address 0x1000000000040
+>     Kernel:  vecadd
+>     Entity:  Global(16,0,0) Local(0,0,0) Group(16,0,0)
+>     store i32 %tmp9, i32 addrspace(1)* %tmp15, align 4
+>     At line 4 of input.cl
+>       c[i] = a[i] + b[i]
+Since it is interpreting an abstract intermediate representation and
+bounds-checking each memory access, Oclgrind will run quite slowly
+(typically a couple of orders of magnitude slower than a regular CPU
+implementation). Therefore, it is recommended to run your application
+with a small problem if possible.
+To enable an interactive, GDB-style debugging session, supply the -i
+flag to the oclgrind command, or export the environment variable
+OCLGRIND_INTERACTIVE=1. This will cause Oclgrind to automatically
+break at the beginning of each kernel invocation, and upon
+encountering an invalid memory access. Type 'help' for details of
+available commands.
+For more detailed information about using Oclgrind please visit the
+GitHub Wiki:
+    https://github.com/jrprice/Oclgrind/wiki/
+If you encounter any issues or have any questions, please use the
+GitHub issues page:
+    https://github.com/jrprice/Oclgrind/issues
+You can also contact the primary developer via email:
+James Price <j.price at bristol.ac.uk>
diff --git a/cmake_config.h.in b/cmake_config.h.in
new file mode 100644
index 0000000..3794dc8
--- /dev/null
+++ b/cmake_config.h.in
@@ -0,0 +1,5 @@
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..4b4c793
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,134 @@
+# configure.ac (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+AC_INIT([Oclgrind], [15.5], , [oclgrind], [https://github.com/jrprice/Oclgrind])
+AM_INIT_AUTOMAKE([foreign 1.12])
+# Check if we're compiling with Clang: look for "clang" in the output of
+# `$CC --version` and expose the result to Automake as USING_CLANG.
+AS_CASE([`$CC --version`], [*clang*], [using_clang=yes])
+# Compare with the POSIX '=' operator; '==' is a bash extension that makes
+# `test` fail under dash and other strict /bin/sh implementations.
+AM_CONDITIONAL([USING_CLANG], [test "$using_clang" = "yes"])
+# Check for C++11
+AX_CHECK_COMPILE_FLAG([-std=c++11], [],
+                      [AC_MSG_ERROR([C++11 support is required])])
+CXXFLAGS="$CXXFLAGS -std=c++11"
+CPPFLAGS="$CPPFLAGS -std=c++11"
+# --with-llvm option to specify root of LLVM/Clang installation
+        llvm,
+        [AS_HELP_STRING([--with-llvm],
+                       [directory containing LLVM/Clang installation])],
+        [AC_SUBST(clang, $withval/bin/clang)
+         AC_SUBST(llvm_config, $withval/bin/llvm-config)])
+# Find LLVM/Clang binaries (assume on PATH if --with-llvm not used)
+AC_CHECK_PROG(clang, [clang], `which clang`)
+AC_CHECK_PROG(llvm_config, [llvm-config], `which llvm-config`)
+if test -z $llvm_config; then
+  AC_MSG_ERROR([llvm-config not found (use --with-llvm=)])
+# Check version of LLVM
+AC_MSG_CHECKING([llvm version])
+llvm_full_version=`$llvm_config --version`
+llvm_version=`echo $llvm_full_version | cut -b 1,3`
+if test $llvm_version -lt 36; then
+  AC_MSG_ERROR([LLVM version must be >= 3.6])
+                   [$llvm_version],
+                   [Version of LLVM we are building against])
+# Add flags for LLVM
+CPPFLAGS="$CPPFLAGS `$llvm_config --cppflags`"
+LDFLAGS="$LDFLAGS `$llvm_config --ldflags`"
+# Check for LLVM/Clang headers/libraries
+        [llvm/IR/Instruction.h clang/CodeGen/CodeGenAction.h],
+        [:],
+        [AC_MSG_ERROR([LLVM/Clang includes not found (use --with-llvm=)])])
+        [clangFrontend],
+        [main],
+        [:],
+        [AC_MSG_ERROR([Clang library not found (use --with-llvm)])])
+# GNU readline library (for interactive debugger)
+        [readline],
+        AS_HELP_STRING([--with-readline],
+                       [location of GNU readline library]),
+        [CPPFLAGS="$CPPFLAGS -I$withval/include";
+         LDFLAGS="$LDFLAGS -L$withval/lib"])
+        [readline/readline.h],
+        [:],
+        [have_readline=false])
+        [readline/history.h],
+        [:],
+        [have_readline=false])
+        [readline],
+        [readline],
+        [:],
+        [have_readline=false])
+        [readline],
+        [add_history],
+        [:],
+        [have_readline=false])
+if test $have_readline = true; then
+    AC_DEFINE([HAVE_READLINE], [1], [Define to 1 if GNU readline found])
+    oclgrind_extra_libs="$oclgrind_extra_libs -lreadline"
+    AC_MSG_WARN([GNU readline library not found (use --with-readline)])
+AC_SUBST([oclgrind_extra_libs], [$oclgrind_extra_libs])
+# Check if Python is available (required to run tests)
+# Kernel tests
+m4_foreach([name], m4_split(m4_include(tests/kernels/TESTS), m4_newline),
+    KERNEL_TESTS="$KERNEL_TESTS tests/kernels/"name".sim"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".sim"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".cl"
+    KERNEL_TEST_INPUTS="$KERNEL_TEST_INPUTS tests/kernels/"name".ref"
+    KERNEL_TEST_OUTPUTS="$KERNEL_TEST_OUTPUTS tests/kernels/"name".out"
diff --git a/m4/m4_ax_check_compile_flag.m4 b/m4/m4_ax_check_compile_flag.m4
new file mode 100644
index 0000000..ca36397
--- /dev/null
+++ b/m4/m4_ax_check_compile_flag.m4
@@ -0,0 +1,74 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error.  (Warnings, however, are ignored)
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#   INPUT gives an alternative input source to AC_COMPILE_IFELSE.
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#   Copyright (c) 2008 Guido U. Draheim <guidod at gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans at gmail.com>
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   Public License for more details.
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+#serial 4
+[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
diff --git a/src/CL/cl.h b/src/CL/cl.h
new file mode 100644
index 0000000..203c659
--- /dev/null
+++ b/src/CL/cl.h
@@ -0,0 +1,1214 @@
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ ******************************************************************************/
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_ulong            cl_bitfield;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
+typedef intptr_t            cl_context_properties;
+typedef cl_uint             cl_context_info;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_bitfield         cl_mem_migration_flags;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_uint             cl_program_binary_type;
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+typedef struct _cl_image_format {   /* Pairs a channel order with a channel data type. */
+    cl_channel_order        image_channel_order;        /* one of the cl_channel_order constants, e.g. CL_RGBA */
+    cl_channel_type         image_channel_data_type;    /* one of the cl_channel_type constants, e.g. CL_UNORM_INT8 */
+} cl_image_format;
+typedef struct _cl_image_desc {     /* Describes an image: its object type, extents, pitches and related counts. */
+    cl_mem_object_type      image_type;         /* one of the cl_mem_object_type constants (CL_MEM_OBJECT_*) */
+    size_t                  image_width;        /* NOTE(review): extents presumably in pixels - confirm against the OpenCL spec */
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;   /* NOTE(review): presumably only meaningful for the *_ARRAY image types - confirm */
+    size_t                  image_row_pitch;    /* NOTE(review): pitches presumably in bytes - confirm against the OpenCL spec */
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+    cl_mem                  buffer;             /* NOTE(review): presumably the backing buffer for CL_MEM_OBJECT_IMAGE1D_BUFFER - confirm */
+} cl_image_desc;
+typedef struct _cl_buffer_region {  /* An (origin, size) pair identifying a region. */
+    size_t                  origin;     /* NOTE(review): presumably a byte offset into a parent buffer - confirm */
+    size_t                  size;       /* NOTE(review): presumably the region length in bytes - confirm */
+} cl_buffer_region;
+/* Error Codes */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+/* OpenCL Version */
+#define CL_VERSION_1_0                              1
+#define CL_VERSION_1_1                              1
+#define CL_VERSION_1_2                              1
+/* cl_bool */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+#define CL_BLOCKING                                 CL_TRUE
+#define CL_NON_BLOCKING                             CL_FALSE
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
+/* cl_device_info */
+#define CL_DEVICE_TYPE                              0x1000
+#define CL_DEVICE_VENDOR_ID                         0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
+#define CL_DEVICE_ADDRESS_BITS                      0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
+#define CL_DEVICE_MAX_SAMPLERS                      0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
+#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
+#define CL_DEVICE_AVAILABLE                         0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
+#define CL_DEVICE_NAME                              0x102B
+#define CL_DEVICE_VENDOR                            0x102C
+#define CL_DRIVER_VERSION                           0x102D
+#define CL_DEVICE_PROFILE                           0x102E
+#define CL_DEVICE_VERSION                           0x102F
+#define CL_DEVICE_EXTENSIONS                        0x1030
+#define CL_DEVICE_PLATFORM                          0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
+#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
+#define CL_DEVICE_PARENT_DEVICE                     0x1042
+#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
+#define CL_DEVICE_PARTITION_TYPE                    0x1046
+#define CL_DEVICE_REFERENCE_COUNT                   0x1047
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+// reserved                                         (1 << 6)    
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+#define CL_DEPTH                                    0x10BD
+#define CL_DEPTH_STENCIL                            0x10BE
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+#define CL_UNORM_INT24                              0x10DF
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+#define CL_IMAGE_ARRAY_SIZE                         0x1117
+#define CL_IMAGE_BUFFER                             0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
+#define CL_IMAGE_NUM_SAMPLES                        0x111A
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
+/* cl_program_info: selectors of the type clGetProgramInfo takes as its param_name argument */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
+/* cl_program_build_info: selectors of the type clGetProgramBuildInfo takes as its param_name argument (numbering starts at 0x1181) */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+/* cl_program_binary_type: result values of the CL_PROGRAM_BINARY_TYPE query (OpenCL 1.2). All four states are defined: no binary, compiled object, library, executable. */
+#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
+/* cl_build_status: 0 on success, negative for every other state; presumably the value reported for CL_PROGRAM_BUILD_STATUS - confirm against the OpenCL spec */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+/* cl_kernel_info: selectors of the type clGetKernelInfo takes as its param_name argument */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+/* cl_kernel_arg_info: selectors of the type clGetKernelArgInfo takes as its param_name argument */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
+#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
+#define CL_KERNEL_ARG_NAME                          0x119A
+/* cl_kernel_arg_address_qualifier: presumably the result values of the CL_KERNEL_ARG_ADDRESS_QUALIFIER query - confirm against the OpenCL spec */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
+/* cl_kernel_arg_access_qualifier: presumably the result values of the CL_KERNEL_ARG_ACCESS_QUALIFIER query - confirm against the OpenCL spec */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
+/* cl_kernel_arg_type_qualifier - bitfield: NONE is 0, every other value is a distinct single bit so they can be OR-ed together */
+#define CL_KERNEL_ARG_TYPE_NONE                     0
+#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+/* cl_kernel_work_group_info: selectors of the type clGetKernelWorkGroupInfo takes as its param_name argument; values run contiguously from 0x11B0 to 0x11B5 */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE    0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+/* cl_event_info: selectors of the type clGetEventInfo takes as its param_name argument; values run contiguously from 0x11D0 to 0x11D4 */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#define CL_EVENT_CONTEXT                            0x11D4
+/* cl_command_type: contiguous values 0x11F0..0x1208; presumably the result values of the CL_EVENT_COMMAND_TYPE query - confirm against the OpenCL spec */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+#define CL_COMMAND_BARRIER                          0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
+#define CL_COMMAND_FILL_BUFFER                      0x1207
+#define CL_COMMAND_FILL_IMAGE                       0x1208
+/* command execution status: CL_COMPLETE is 0 and the other states are positive (1..3); presumably what clSetUserEventStatus takes as execution_status - confirm against the OpenCL spec */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+/* cl_buffer_create_type: type of the buffer_create_type argument of clCreateSubBuffer; a single value is defined here */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+/* cl_profiling_info: selectors of the type clGetEventProfilingInfo takes as its param_name argument */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint          /* num_entries */,
+                 cl_platform_id * /* platforms */,
+                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetPlatformInfo(cl_platform_id   /* platform */, 
+                  cl_platform_info /* param_name */,
+                  size_t           /* param_value_size */, 
+                  void *           /* param_value */,
+                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id   /* platform */,
+               cl_device_type   /* device_type */, 
+               cl_uint          /* num_entries */, 
+               cl_device_id *   /* devices */, 
+               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id    /* device */,
+                cl_device_info  /* param_name */, 
+                size_t          /* param_value_size */, 
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id                         /* in_device */,
+                   const cl_device_partition_property * /* properties */,
+                   cl_uint                              /* num_devices */,
+                   cl_device_id *                       /* out_devices */,
+                   cl_uint *                            /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+/* Context APIs  */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+                cl_uint                 /* num_devices */,
+                const cl_device_id *    /* devices */,
+                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+                void *                  /* user_data */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+                        cl_device_type          /* device_type */,
+                        void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+                        void *                  /* user_data */,
+                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context         /* context */, 
+                 cl_context_info    /* param_name */, 
+                 size_t             /* param_value_size */, 
+                 void *             /* param_value */, 
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */, 
+                     cl_device_id                   /* device */, 
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
+                      cl_command_queue_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context   /* context */,
+               cl_mem_flags /* flags */,
+               size_t       /* size */,
+               void *       /* host_ptr */,
+               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem                   /* buffer */,
+                  cl_mem_flags             /* flags */,
+                  cl_buffer_create_type    /* buffer_create_type */,
+                  const void *             /* buffer_create_info */,
+                  cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context              /* context */,
+              cl_mem_flags            /* flags */,
+              const cl_image_format * /* image_format */,
+              const cl_image_desc *   /* image_desc */, 
+              void *                  /* host_ptr */,
+              cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context           /* context */,
+                           cl_mem_flags         /* flags */,
+                           cl_mem_object_type   /* image_type */,
+                           cl_uint              /* num_entries */,
+                           cl_image_format *    /* image_formats */,
+                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem           /* memobj */,
+                   cl_mem_info      /* param_name */, 
+                   size_t           /* param_value_size */,
+                   void *           /* param_value */,
+                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem           /* image */,
+               cl_image_info    /* param_name */, 
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
+                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */, 
+                cl_addressing_mode  /* addressing_mode */, 
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler         /* sampler */,
+                 cl_sampler_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Program Object APIs  */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context        /* context */,
+                          cl_uint           /* count */,
+                          const char **     /* strings */,
+                          const size_t *    /* lengths */,
+                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context                     /* context */,
+                          cl_uint                        /* num_devices */,
+                          const cl_device_id *           /* device_list */,
+                          const size_t *                 /* lengths */,
+                          const unsigned char **         /* binaries */,
+                          cl_int *                       /* binary_status */,
+                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context            /* context */,
+                                  cl_uint               /* num_devices */,
+                                  const cl_device_id *  /* device_list */,
+                                  const char *          /* kernel_names */,
+                                  cl_int *              /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program           /* program */,
+               cl_uint              /* num_devices */,
+               const cl_device_id * /* device_list */,
+               const char *         /* options */, 
+               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program           /* program */,
+                 cl_uint              /* num_devices */,
+                 const cl_device_id * /* device_list */,
+                 const char *         /* options */, 
+                 cl_uint              /* num_input_headers */,
+                 const cl_program *   /* input_headers */,
+                 const char **        /* header_include_names */,
+                 void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+                 void *               /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context           /* context */,
+              cl_uint              /* num_devices */,
+              const cl_device_id * /* device_list */,
+              const char *         /* options */, 
+              cl_uint              /* num_input_programs */,
+              const cl_program *   /* input_programs */,
+              void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+              void *               /* user_data */,
+              cl_int *             /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program         /* program */,
+                 cl_program_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program            /* program */,
+                      cl_device_id          /* device */,
+                      cl_program_build_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program      /* program */,
+               const char *    /* kernel_name */,
+               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program     /* program */,
+                         cl_uint        /* num_kernels */,
+                         cl_kernel *    /* kernels */,
+                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel    /* kernel */,
+               cl_uint      /* arg_index */,
+               size_t       /* arg_size */,
+               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel       /* kernel */,
+                cl_kernel_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel       /* kernel */,
+                   cl_uint         /* arg_indx */,
+                   cl_kernel_arg_info  /* param_name */,
+                   size_t          /* param_value_size */,
+                   void *          /* param_value */,
+                   size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
+                         cl_device_id               /* device */,
+                         cl_kernel_work_group_info  /* param_name */,
+                         size_t                     /* param_value_size */,
+                         void *                     /* param_value */,
+                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint             /* num_events */,
+                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event         /* event */,
+               cl_event_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context    /* context */,
+                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;               
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event   /* event */,
+                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event    /* event */,
+                    cl_int      /* command_exec_callback_type */,
+                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event            /* event */,
+                        cl_profiling_info   /* param_name */,
+                        size_t              /* param_value_size */,
+                        void *              /* param_value */,
+                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
+                    cl_mem              /* buffer */,
+                    cl_bool             /* blocking_read */,
+                    size_t              /* offset */,
+                    size_t              /* size */, 
+                    void *              /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
+                        cl_mem              /* buffer */,
+                        cl_bool             /* blocking_read */,
+                        const size_t *      /* buffer_offset */,
+                        const size_t *      /* host_offset */, 
+                        const size_t *      /* region */,
+                        size_t              /* buffer_row_pitch */,
+                        size_t              /* buffer_slice_pitch */,
+                        size_t              /* host_row_pitch */,
+                        size_t              /* host_slice_pitch */,                        
+                        void *              /* ptr */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
+                     cl_mem             /* buffer */, 
+                     cl_bool            /* blocking_write */, 
+                     size_t             /* offset */, 
+                     size_t             /* size */, 
+                     const void *       /* ptr */, 
+                     cl_uint            /* num_events_in_wait_list */, 
+                     const cl_event *   /* event_wait_list */, 
+                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
+                         cl_mem              /* buffer */,
+                         cl_bool             /* blocking_write */,
+                         const size_t *      /* buffer_offset */,
+                         const size_t *      /* host_offset */, 
+                         const size_t *      /* region */,
+                         size_t              /* buffer_row_pitch */,
+                         size_t              /* buffer_slice_pitch */,
+                         size_t              /* host_row_pitch */,
+                         size_t              /* host_slice_pitch */,                        
+                         const void *        /* ptr */,
+                         cl_uint             /* num_events_in_wait_list */,
+                         const cl_event *    /* event_wait_list */,
+                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
+                    cl_mem             /* buffer */, 
+                    const void *       /* pattern */, 
+                    size_t             /* pattern_size */, 
+                    size_t             /* offset */, 
+                    size_t             /* size */, 
+                    cl_uint            /* num_events_in_wait_list */, 
+                    const cl_event *   /* event_wait_list */, 
+                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
+                    cl_mem              /* src_buffer */,
+                    cl_mem              /* dst_buffer */, 
+                    size_t              /* src_offset */,
+                    size_t              /* dst_offset */,
+                    size_t              /* size */, 
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */, 
+                        cl_mem              /* src_buffer */,
+                        cl_mem              /* dst_buffer */, 
+                        const size_t *      /* src_origin */,
+                        const size_t *      /* dst_origin */,
+                        const size_t *      /* region */, 
+                        size_t              /* src_row_pitch */,
+                        size_t              /* src_slice_pitch */,
+                        size_t              /* dst_row_pitch */,
+                        size_t              /* dst_slice_pitch */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* image */,
+                   cl_bool              /* blocking_read */, 
+                   const size_t *       /* origin[3] */,
+                   const size_t *       /* region[3] */,
+                   size_t               /* row_pitch */,
+                   size_t               /* slice_pitch */, 
+                   void *               /* ptr */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue    /* command_queue */,
+                    cl_mem              /* image */,
+                    cl_bool             /* blocking_write */, 
+                    const size_t *      /* origin[3] */,
+                    const size_t *      /* region[3] */,
+                    size_t              /* input_row_pitch */,
+                    size_t              /* input_slice_pitch */, 
+                    const void *        /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue   /* command_queue */,
+                   cl_mem             /* image */, 
+                   const void *       /* fill_color */, 
+                   const size_t *     /* origin[3] */, 
+                   const size_t *     /* region[3] */, 
+                   cl_uint            /* num_events_in_wait_list */, 
+                   const cl_event *   /* event_wait_list */, 
+                   cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* src_image */,
+                   cl_mem               /* dst_image */, 
+                   const size_t *       /* src_origin[3] */,
+                   const size_t *       /* dst_origin[3] */,
+                   const size_t *       /* region[3] */, 
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_image */,
+                           cl_mem           /* dst_buffer */, 
+                           const size_t *   /* src_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           size_t           /* dst_offset */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_buffer */,
+                           cl_mem           /* dst_image */, 
+                           size_t           /* src_offset */,
+                           const size_t *   /* dst_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+                   cl_mem           /* buffer */,
+                   cl_bool          /* blocking_map */, 
+                   cl_map_flags     /* map_flags */,
+                   size_t           /* offset */,
+                   size_t           /* size */,
+                   cl_uint          /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event *       /* event */,
+                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue  /* command_queue */,
+                  cl_mem            /* image */, 
+                  cl_bool           /* blocking_map */, 
+                  cl_map_flags      /* map_flags */, 
+                  const size_t *    /* origin[3] */,
+                  const size_t *    /* region[3] */,
+                  size_t *          /* image_row_pitch */,
+                  size_t *          /* image_slice_pitch */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */,
+                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+                        cl_mem           /* memobj */,
+                        void *           /* mapped_ptr */,
+                        cl_uint          /* num_events_in_wait_list */,
+                        const cl_event *  /* event_wait_list */,
+                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue       /* command_queue */,
+                           cl_uint                /* num_mem_objects */,
+                           const cl_mem *         /* mem_objects */,
+                           cl_mem_migration_flags /* flags */,
+                           cl_uint                /* num_events_in_wait_list */,
+                           const cl_event *       /* event_wait_list */,
+                           cl_event *             /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
+					  void (CL_CALLBACK * /*user_func*/)(void *), 
+                      void *            /* args */,
+                      size_t            /* cb_args */, 
+                      cl_uint           /* num_mem_objects */,
+                      const cl_mem *    /* mem_list */,
+                      const void **     /* args_mem_loc */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+                            cl_uint           /* num_events_in_wait_list */,
+                            const cl_event *  /* event_wait_list */,
+                            cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+                             cl_uint           /* num_events_in_wait_list */,
+                             const cl_event *  /* event_wait_list */,
+                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function can not be found.  The client must
+ * check to make sure the address is not NULL, before using or 
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL 
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+                                         const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
+// Deprecated OpenCL 1.1 APIs
+clCreateImage2D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clCreateImage3D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */, 
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */, 
+                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_slice_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clEnqueueMarker(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+                        cl_uint          /* num_events */,
+                        const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+#ifdef __cplusplus
+#endif  /* __OPENCL_CL_H */
diff --git a/src/CL/cl_d3d10.h b/src/CL/cl_d3d10.h
new file mode 100644
index 0000000..81b0d37
--- /dev/null
+++ b/src/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+/* cl_khr_d3d10_sharing                                                       */
+#define cl_khr_d3d10_sharing 1
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+// Error Codes
+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+// cl_d3d10_device_source_khr
+#define CL_D3D10_DEVICE_KHR                          0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+// cl_d3d10_device_set_khr
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+// cl_context_info
+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
+// cl_mem_info
+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+// cl_image_info
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d10_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+#ifdef __cplusplus
+#endif  // __OPENCL_CL_D3D10_H
diff --git a/src/CL/cl_d3d11.h b/src/CL/cl_d3d11.h
new file mode 100644
index 0000000..d3c8bdc
--- /dev/null
+++ b/src/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+/* cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+// Error Codes
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
+// cl_d3d11_device_source_khr
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
+// cl_d3d11_device_set_khr
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
+// cl_context_info
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+// cl_mem_info
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
+// cl_image_info
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d11_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d11_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
+#endif  // __OPENCL_CL_D3D11_H
diff --git a/src/CL/cl_dx9_media_sharing.h b/src/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000..1ef543a
--- /dev/null
+++ b/src/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#ifdef __cplusplus
+extern "C" {
+/* cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+#if defined(_WIN32)
+#include <d3d9.h>
+/* Describes a DX9 media surface by its Direct3D 9 surface interface pointer
+ * (resource) and an accompanying HANDLE (shared_handle).
+ * NOTE(review): presumably this is what the surface_info argument of
+ * clCreateFromDX9MediaSurfaceKHR points to -- confirm against the extension spec. */
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+// Error Codes
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+// cl_dx9_media_adapter_type_khr
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+// cl_dx9_media_adapter_set_khr
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+// cl_context_info
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+// cl_mem_info
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+// cl_image_info
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+// cl_command_type
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+#ifdef __cplusplus
diff --git a/src/CL/cl_egl.h b/src/CL/cl_egl.h
new file mode 100644
index 0000000..c1bd4f3
--- /dev/null
+++ b/src/CL/cl_egl.h
@@ -0,0 +1,131 @@
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ ******************************************************************************/
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+#ifdef __APPLE__
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#ifdef __cplusplus
+extern "C" {
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+#define cl_khr_egl_image 1
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  /* context */,
+                        CLeglDisplayKHR             /* egldisplay */,
+                        CLeglImageKHR               /* eglimage */,
+                        cl_mem_flags                /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+	cl_context                  context,
+	CLeglDisplayKHR             egldisplay,
+	CLeglImageKHR               eglimage,
+	cl_mem_flags                flags,
+	const cl_egl_image_properties_khr * properties,
+	cl_int *                    errcode_ret);
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+#define cl_khr_egl_event 1
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context /* context */,
+                            EGLSyncKHR /* sync */,
+                            EGLDisplay /* display */,
+                            cl_int *   /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+	cl_context context,
+	EGLSyncKHR sync,
+	EGLDisplay display,
+	cl_int *   errcode_ret);
+#ifdef __cplusplus
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/src/CL/cl_ext.h b/src/CL/cl_ext.h
new file mode 100644
index 0000000..5ab2c13
--- /dev/null
+++ b/src/CL/cl_ext.h
@@ -0,0 +1,310 @@
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ ******************************************************************************/
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+#ifdef __cplusplus
+extern "C" {
+#ifdef __APPLE__
+        #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+        #include <CL/cl.h>
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+/* Memory object destruction
+ *
+ * Apple extension used to manage externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
+ * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback 
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
+ * which they were registered. The user callback functions are called and then the memory object is deleted 
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger 
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
+                                            const void * /* private_info */, 
+                                            size_t       /* cb */, 
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+/* cl_khr_icd extension */
+#define cl_khr_icd 1
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+/* Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a 
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+/* cl_nv_device_attribute_query extension */
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+/* cl_amd_device_attribute_query */
+#ifdef CL_VERSION_1_1
+   /***********************************
+    * cl_ext_device_fission extension *
+    ***********************************/
+    #define cl_ext_device_fission   1
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+    typedef cl_ulong  cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                            const cl_device_partition_property_ext * /* properties */,
+                            cl_uint /*num_entries*/,
+                            cl_device_id * /*out_devices*/,
+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+    typedef CL_API_ENTRY cl_int 
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                                const cl_device_partition_property_ext * /* properties */,
+                                                cl_uint /*num_entries*/,
+                                                cl_device_id * /*out_devices*/,
+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    /* clDeviceGetInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+/* cl_qcom_ext_host_ptr extension */
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+/* Declares clGetDeviceImageInfoQCOM: takes a device, the width, height and
+ * format of an image, and a cl_image_pitch_info_qcom selector (param_name).
+ * NOTE(review): the param_value_size / param_value / param_value_size_ret
+ * triple presumably follows the usual clGet*Info output convention - confirm
+ * against the cl_qcom_ext_host_ptr extension text. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+/* Descriptor for an external host memory allocation (cl_qcom_ext_host_ptr).
+ * It is embedded as the first member of cl_mem_ion_host_ptr. */
+typedef struct _cl_mem_ext_host_ptr
+{
+    // Type of external memory allocation.
+    // Legal values will be defined in layered extensions.
+    cl_uint  allocation_type;
+    // Host cache policy for this external memory allocation.
+    cl_uint  host_cache_policy;
+} cl_mem_ext_host_ptr;
+/* cl_qcom_ion_host_ptr extension */
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+/* Descriptor for an ION-backed host allocation (cl_qcom_ion_host_ptr). It
+ * starts with the generic cl_mem_ext_host_ptr header, followed by the ION
+ * file descriptor and the host pointer to the mapping. */
+typedef struct _cl_mem_ion_host_ptr
+{
+    // Type of external memory allocation.
+    // Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations.
+    cl_mem_ext_host_ptr  ext_host_ptr;
+    // ION file descriptor
+    int                  ion_filedesc;
+    // Host pointer to the ION allocated memory
+    void*                ion_hostptr;
+} cl_mem_ion_host_ptr;
+#endif /* CL_VERSION_1_1 */
+#ifdef __cplusplus
+#endif /* __CL_EXT_H */
diff --git a/src/CL/cl_gl.h b/src/CL/cl_gl.h
new file mode 100644
index 0000000..af2036c
--- /dev/null
+++ b/src/CL/cl_gl.h
@@ -0,0 +1,162 @@
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <CL/cl.h>
+#ifdef __cplusplus
+extern "C" {
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+/* Declares clCreateFromGLBuffer, which returns a cl_mem for the GL buffer
+ * object named by bufobj. errcode_ret is typed cl_int * so that it matches
+ * the other clCreateFromGL* declarations in this header. */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     cl_int *       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context      /* context */,
+                      cl_mem_flags    /* flags */,
+                      cl_GLenum       /* target */,
+                      cl_GLint        /* miplevel */,
+                      cl_GLuint       /* texture */,
+                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   /* context */,
+                           cl_mem_flags /* flags */,
+                           cl_GLuint    /* renderbuffer */,
+                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem                /* memobj */,
+                  cl_gl_object_type *   /* gl_object_type */,
+                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem               /* memobj */,
+                   cl_gl_texture_info   /* param_name */,
+                   size_t               /* param_value_size */,
+                   void *               /* param_value */,
+                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+// Deprecated OpenCL 1.1 APIs
+/* Deprecated OpenCL 1.1 entry point; takes the same parameter list as
+ * clCreateFromGLTexture. The declarator line is required: without it the
+ * prototype has no storage class or return type. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* Deprecated OpenCL 1.1 entry point; takes the same parameter list as
+ * clCreateFromGLTexture. The declarator line is required: without it the
+ * prototype has no storage class or return type. */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+/* cl_khr_gl_sharing extension  */
+#define cl_khr_gl_sharing 1
+typedef cl_uint     cl_gl_context_info;
+/* Additional Error Codes  */
+/* cl_gl_context_info  */
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+                      cl_gl_context_info            /* param_name */,
+                      size_t                        /* param_value_size */,
+                      void *                        /* param_value */,
+                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
+#ifdef __cplusplus
+#endif  /* __OPENCL_CL_GL_H */
diff --git a/src/CL/cl_gl_ext.h b/src/CL/cl_gl_ext.h
new file mode 100644
index 0000000..77d5353
--- /dev/null
+++ b/src/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+#ifdef __cplusplus
+extern "C" {
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+    #include <CL/cl_gl.h>
+ * For each extension, follow this template
+ *  cl_VEN_extname extension  */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+/*
+ *  cl_khr_gl_event  extension
+ *  See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context           /* context */,
+                           cl_GLsync            /* cl_GLsync */,
+                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+#ifdef __cplusplus
+#endif	/* __OPENCL_CL_GL_EXT_H  */
diff --git a/src/CL/cl_platform.h b/src/CL/cl_platform.h
new file mode 100644
index 0000000..7f6f5e8
--- /dev/null
+++ b/src/CL/cl_platform.h
@@ -0,0 +1,1278 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ **********************************************************************************/
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+#ifdef __APPLE__
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#ifdef __cplusplus
+extern "C" {
+#if defined(_WIN32)
+    #define CL_API_ENTRY
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+    #define CL_API_ENTRY
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#ifdef __APPLE__
+    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
+    #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1                  AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+    #else
+        #warning  This path should never happen outside of internal operating system development.  AvailabilityMacros do not function correctly here!
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #endif
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+    #define CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_1_2
+    #ifdef __GNUC__
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #endif
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #endif
+    #elif _WIN32
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)     
+        #endif
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
+        #endif
+    #else
+    #endif
+#if (defined (_WIN32) && defined(_MSC_VER))
+/* scalar types  */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+/* Single-precision limits for the _MSC_VER branch. The values are written as
+ * decimal literals, like CL_FLT_MAX and CL_FLT_MIN, because C99 hexadecimal
+ * floating constants are not reliably accepted by MSVC. CL_FLT_EPSILON is the
+ * exact decimal expansion of 2^-23. */
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF        ((cl_float) 1e50)
+#define CL_HUGE_VAL         ((cl_double) 1e500)
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#include <stdint.h>
+/* scalar types  */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          0x1.fffffep127f
+#define CL_FLT_MIN          0x1.0p-126f
+#define CL_FLT_EPSILON      0x1.0p-23f
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          0x1.fffffffffffffp1023
+#define CL_DBL_MIN          0x1.0p-1022
+#define CL_DBL_EPSILON      0x1.0p-52
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )  
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+#include <stddef.h>
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+/*
+ * Vector types
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned.
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned.
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
+/* Define basic vector types */
+#if defined( __VEC__ )
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef vector unsigned char     __cl_uchar16;
+   typedef vector signed char       __cl_char16;
+   typedef vector unsigned short    __cl_ushort8;
+   typedef vector signed short      __cl_short8;
+   typedef vector unsigned int      __cl_uint4;
+   typedef vector signed int        __cl_int4;
+   typedef vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#if defined( __SSE__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <xmmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef float __cl_float4   __attribute__((vector_size(16)));
+    #else
+        typedef __m128 __cl_float4;
+    #endif
+    #define __CL_FLOAT4__   1
+#if defined( __SSE2__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#if defined( __MMX__ )
+    #include <mmintrin.h>
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#if defined( __AVX__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h> 
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+/* Define alignment keys */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+    /* .xyzw and .s0123...{f|F} are supported */
+    /* .hi and .lo are supported */
+/* Define cl_vector types */
+/* ---- cl_charn ---- */
+typedef union
+    cl_char  CL_ALIGNED(2) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2;
+typedef union
+    cl_char  CL_ALIGNED(4) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[2];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4;
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+typedef union
+    cl_char   CL_ALIGNED(8) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[4];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[2];
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+typedef union
+    cl_char  CL_ALIGNED(16) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[8];
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[4];
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+/* ---- cl_ucharn ---- */
+typedef union
+    cl_uchar  CL_ALIGNED(2) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
+/* The guard uses the upper-case __CL_UCHAR2__ spelling tested by the other cl_ucharn unions; the lower-case spelling never matched, so v2 was never declared. */
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2;
+typedef union
+    cl_uchar  CL_ALIGNED(4) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[2];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4;
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+typedef union
+    cl_uchar   CL_ALIGNED(8) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[4];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[2];
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+typedef union
+    cl_uchar  CL_ALIGNED(16) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[8];
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[4];
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+/* ---- cl_shortn ---- */
+typedef union
+    cl_short  CL_ALIGNED(4) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2;
+typedef union
+    cl_short  CL_ALIGNED(8) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[2];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4;
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+typedef union
+    cl_short   CL_ALIGNED(16) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[4];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[2];
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+typedef union
+    cl_short  CL_ALIGNED(32) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[8];
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[4];
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+/* ---- cl_ushortn ---- */
+typedef union
+    cl_ushort  CL_ALIGNED(4) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2;
+typedef union
+    cl_ushort  CL_ALIGNED(8) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[2];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4;
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+typedef union
+    cl_ushort   CL_ALIGNED(16) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[4];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[2];
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+typedef union
+    cl_ushort  CL_ALIGNED(32) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[8];
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[4];
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+/* ---- cl_intn ---- */
+typedef union
+    cl_int  CL_ALIGNED(8) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2;
+typedef union
+    cl_int  CL_ALIGNED(16) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[2];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4;
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+typedef union
+    cl_int   CL_ALIGNED(32) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[4];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[2];
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+typedef union
+    cl_int  CL_ALIGNED(64) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[8];
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[4];
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+/* ---- cl_uintn ---- */
+typedef union
+    cl_uint  CL_ALIGNED(8) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2;
+typedef union
+    cl_uint  CL_ALIGNED(16) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[2];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4;
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+typedef union
+    cl_uint   CL_ALIGNED(32) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[4];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[2];
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+typedef union
+    cl_uint  CL_ALIGNED(64) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[8];
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[4];
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+/* ---- cl_longn ---- */
+typedef union
+    cl_long  CL_ALIGNED(16) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2;
+typedef union
+    cl_long  CL_ALIGNED(32) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[2];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4;
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+typedef union
+    cl_long   CL_ALIGNED(64) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[4];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[2];
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+typedef union
+    cl_long  CL_ALIGNED(128) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[8];
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[4];
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+/* ---- cl_ulongn ---- */
+typedef union
+    cl_ulong  CL_ALIGNED(16) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2;
+typedef union
+    cl_ulong  CL_ALIGNED(32) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[2];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4;
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+typedef union
+    cl_ulong   CL_ALIGNED(64) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[4];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[2];
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+typedef union
+    cl_ulong  CL_ALIGNED(128) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[8];
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[4];
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+/* --- cl_floatn ---- */
+typedef union
+    cl_float  CL_ALIGNED(8) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2;
+typedef union
+    cl_float  CL_ALIGNED(16) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[2];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4;
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+typedef union
+    cl_float   CL_ALIGNED(32) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[4];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[2];
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+typedef union
+    cl_float  CL_ALIGNED(64) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[8];
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[4];
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+/* --- cl_doublen ---- */
+typedef union
+    cl_double  CL_ALIGNED(16) s[2];
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2;
+typedef union
+    cl_double  CL_ALIGNED(32) s[4];
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[2];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4;
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+typedef union
+    cl_double   CL_ALIGNED(64) s[8];
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[4];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[2];
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+typedef union
+    cl_double  CL_ALIGNED(128) s[16];
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[8];
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[4];
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+/* Macro to facilitate debugging 
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, float * b )             \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      b[ get_global_id(0)] = a;                    \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source 
+ * string so you can do source level debugging.
+ */
+/* Stringizes the argument exactly as written, without macro-expanding it first. */
+#define  __CL_STRINGIFY( _x )               # _x
+/* Extra level of indirection so that _x (e.g. __LINE__) is macro-expanded before it is stringized. */
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
+/* String literal containing a #line directive with the current line number and file name, followed by two newlines. */
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" 
+#ifdef __cplusplus
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif  /* __CL_PLATFORM_H  */
diff --git a/src/CL/opencl.h b/src/CL/opencl.h
new file mode 100644
index 0000000..3f00524
--- /dev/null
+++ b/src/CL/opencl.h
@@ -0,0 +1,54 @@
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ ******************************************************************************/
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+#ifndef __OPENCL_H
+#define __OPENCL_H
+#ifdef __cplusplus
+extern "C" {
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+#ifdef __cplusplus
+#endif  /* __OPENCL_H   */
diff --git a/src/core/Context.cpp b/src/core/Context.cpp
new file mode 100644
index 0000000..6a8b4ff
--- /dev/null
+++ b/src/core/Context.cpp
@@ -0,0 +1,547 @@
+// Context.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <windows.h>
+#undef ERROR
+#include <dlfcn.h>
+#include <mutex>
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Instruction.h"
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Program.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+#include "plugins/InstructionCounter.h"
+#include "plugins/InteractiveDebugger.h"
+#include "plugins/Logger.h"
+#include "plugins/MemCheck.h"
+#include "plugins/RaceDetector.h"
+using namespace oclgrind;
+using namespace std;
+  m_globalMemory = new Memory(AddrSpaceGlobal, this);
+  m_kernelInvocation = NULL;
+  loadPlugins();
+  delete m_globalMemory;
+  unloadPlugins();
+// Returns true only if every entry in m_plugins reports itself as thread-safe;
+// the first plugin whose isThreadSafe() returns false makes the result false.
+bool Context::isThreadSafe() const
+  for (const PluginEntry &p : m_plugins)
+  {
+    if (!p.first->isThreadSafe())
+      return false;
+  }
+  return true;
+// Returns the Memory object held in m_globalMemory.
+Memory* Context::getGlobalMemory() const
+  return m_globalMemory;
+// Populates m_plugins and m_pluginLibraries.
+//
+// Logger and MemCheck are always created; InstructionCounter, RaceDetector
+// and InteractiveDebugger are created only when checkEnv() reports
+// OCLGRIND_INST_COUNTS, OCLGRIND_DATA_RACES and OCLGRIND_INTERACTIVE
+// respectively. All of these are stored with the flag 'true', which is the
+// flag unloadPlugins() tests before deleting a plugin.
+//
+// OCLGRIND_PLUGINS is then read as a ':'-separated list of library paths.
+// Each library is opened, its "initializePlugins" entry point is called with
+// this context, and the handle is appended to m_pluginLibraries. A library
+// that cannot be opened, or that lacks the entry point, is reported on cerr
+// and skipped; in the latter case the handle is released again so that it is
+// not leaked.
+void Context::loadPlugins()
+  // Create core plugins
+  m_plugins.push_back(make_pair(new Logger(this), true));
+  m_plugins.push_back(make_pair(new MemCheck(this), true));
+  if (checkEnv("OCLGRIND_INST_COUNTS"))
+    m_plugins.push_back(make_pair(new InstructionCounter(this), true));
+  if (checkEnv("OCLGRIND_DATA_RACES"))
+    m_plugins.push_back(make_pair(new RaceDetector(this), true));
+  if (checkEnv("OCLGRIND_INTERACTIVE"))
+    m_plugins.push_back(make_pair(new InteractiveDebugger(this), true));
+  // Load dynamic plugins
+  const char *dynamicPlugins = getenv("OCLGRIND_PLUGINS");
+  if (dynamicPlugins)
+  {
+    std::istringstream ss(dynamicPlugins);
+    std::string libpath;
+    while(std::getline(ss, libpath, ':'))
+    {
+#if defined(_WIN32) && !defined(__MINGW32__)
+      HMODULE library = LoadLibrary(libpath.c_str());
+      if (!library)
+      {
+        cerr << "Loading Oclgrind plugin failed (LoadLibrary): "
+             << GetLastError() << endl;
+        continue;
+      }
+      void *initialize = GetProcAddress(library, "initializePlugins");
+      if (!initialize)
+      {
+        cerr << "Loading Oclgrind plugin failed (GetProcAddress): "
+             << GetLastError() << endl;
+        // Release the handle only after the error code has been reported
+        FreeLibrary(library);
+        continue;
+      }
+      void *library = dlopen(libpath.c_str(), RTLD_NOW);
+      if (!library)
+      {
+        cerr << "Loading Oclgrind plugin failed (dlopen): "
+             << dlerror() << endl;
+        continue;
+      }
+      void *initialize = dlsym(library, "initializePlugins");
+      if (!initialize)
+      {
+        cerr << "Loading Oclgrind plugin failed (dlsym): "
+             << dlerror() << endl;
+        // Release the handle only after the error text has been reported
+        dlclose(library);
+        continue;
+      }
+      ((void(*)(Context*))initialize)(this);
+      m_pluginLibraries.push_back(library);
+    }
+  }
+// Releases everything set up by loadPlugins()/registerPlugin().
+// For each handle in m_pluginLibraries, the optional "releasePlugins" entry
+// point is looked up and, if present, called with this context; the library
+// handle is then closed. Afterwards every plugin whose entry flag is true is
+// deleted and m_plugins is emptied.
+void Context::unloadPlugins()
+  // Release dynamic plugin libraries
+  list<void*>::iterator plibItr;
+  for (plibItr = m_pluginLibraries.begin();
+       plibItr != m_pluginLibraries.end(); plibItr++)
+  {
+#if defined(_WIN32) && !defined(__MINGW32__)
+      // Windows API variant
+      void *release = GetProcAddress((HMODULE)*plibItr, "releasePlugins");
+      if (release)
+      {
+        ((void(*)(Context*))release)(this);
+      }
+      FreeLibrary((HMODULE)*plibItr);
+      // dlfcn variant
+      void *release = dlsym(*plibItr, "releasePlugins");
+      if (release)
+      {
+        ((void(*)(Context*))release)(this);
+      }
+      dlclose(*plibItr);
+  }
+  // Destroy internal plugins
+  PluginList::iterator pItr;
+  for (pItr = m_plugins.begin(); pItr != m_plugins.end(); pItr++)
+  {
+    // Entries stored with 'false' (see registerPlugin) are not deleted here
+    if (pItr->second)
+      delete pItr->first;
+  }
+  m_plugins.clear();
+// Appends plugin to m_plugins with the flag false, so unloadPlugins() will not
+// delete it.
+void Context::registerPlugin(Plugin *plugin)
+  m_plugins.push_back(make_pair(plugin, false));
+// Removes all entries equal to (plugin, false) from m_plugins; entries stored
+// with the flag true are not matched.
+void Context::unregisterPlugin(Plugin *plugin)
+  m_plugins.remove(make_pair(plugin, false));
+// Builds an ERROR message consisting of the error text followed by indented
+// "Kernel:", "Entity:" and location lines, then sends it to the plugins.
+// NOTE(review): the CURRENT_* tokens dereference the message's kernel
+// invocation without a null check in Message::operator<< - confirm this is
+// only called while a kernel invocation is active.
+void Context::logError(const char* error) const
+  Message msg(ERROR, this);
+  msg << error << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
+// Calls function(args...) on the plugin of every entry in m_plugins, in list
+// order.
+#define NOTIFY(function, ...)                     \
+{                                                 \
+  PluginList::const_iterator pluginItr;           \
+  for (pluginItr = m_plugins.begin();             \
+       pluginItr != m_plugins.end(); pluginItr++) \
+  {                                               \
+    pluginItr->first->function(__VA_ARGS__);      \
+  }                                               \
+// Forwards instructionExecuted(workItem, instruction, result) to every plugin.
+void Context::notifyInstructionExecuted(const WorkItem *workItem,
+                                        const llvm::Instruction *instruction,
+                                        const TypedValue& result) const
+  NOTIFY(instructionExecuted, workItem, instruction, result);
+// Records kernelInvocation as the current invocation (asserting that none is
+// already active) and then forwards kernelBegin to every plugin.
+void Context::notifyKernelBegin(const KernelInvocation *kernelInvocation) const
+  assert(m_kernelInvocation == NULL);
+  m_kernelInvocation = kernelInvocation;
+  NOTIFY(kernelBegin, kernelInvocation);
+// Forwards kernelEnd to every plugin, then asserts that kernelInvocation is
+// the current invocation and clears it.
+void Context::notifyKernelEnd(const KernelInvocation *kernelInvocation) const
+  NOTIFY(kernelEnd, kernelInvocation);
+  assert(m_kernelInvocation == kernelInvocation);
+  m_kernelInvocation = NULL;
+// Forwards memoryAllocated(memory, address, size, flags) to every plugin.
+void Context::notifyMemoryAllocated(const Memory *memory, size_t address,
+                                    size_t size, cl_mem_flags flags) const
+  NOTIFY(memoryAllocated, memory, address, size, flags);
+// Forwards memoryAtomicLoad, attributed to the current work-item, to every
+// plugin. Nothing is forwarded when there is no current kernel invocation or
+// it has no current work-item.
+void Context::notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
+                                     size_t address, size_t size) const
+  if (m_kernelInvocation && m_kernelInvocation->getCurrentWorkItem())
+  {
+    NOTIFY(memoryAtomicLoad, memory, m_kernelInvocation->getCurrentWorkItem(),
+           op, address, size);
+  }
+// Forwards memoryAtomicStore, attributed to the current work-item, to every
+// plugin. Nothing is forwarded when there is no current kernel invocation or
+// it has no current work-item.
+void Context::notifyMemoryAtomicStore(const Memory *memory, AtomicOp op,
+                                      size_t address, size_t size) const
+  if (m_kernelInvocation && m_kernelInvocation->getCurrentWorkItem())
+  {
+    NOTIFY(memoryAtomicStore, memory, m_kernelInvocation->getCurrentWorkItem(),
+           op, address, size);
+  }
+// Forwards memoryDeallocated(memory, address) to every plugin.
+void Context::notifyMemoryDeallocated(const Memory *memory,
+                                      size_t address) const
+  NOTIFY(memoryDeallocated, memory, address);
+// Reports a memory load to every plugin.
+// While a kernel invocation is active the load is attributed to the current
+// work-item if there is one, otherwise to the current work-group; if neither
+// exists nothing is forwarded. With no active kernel invocation the load is
+// reported through hostMemoryLoad.
+void Context::notifyMemoryLoad(const Memory *memory, size_t address,
+                               size_t size) const
+  if (m_kernelInvocation)
+  {
+    if (m_kernelInvocation->getCurrentWorkItem())
+    {
+      NOTIFY(memoryLoad, memory, m_kernelInvocation->getCurrentWorkItem(),
+             address, size);
+    }
+    else if (m_kernelInvocation->getCurrentWorkGroup())
+    {
+      NOTIFY(memoryLoad, memory, m_kernelInvocation->getCurrentWorkGroup(),
+             address, size);
+    }
+  }
+  else
+  {
+    NOTIFY(hostMemoryLoad, memory, address, size);
+  }
+// Reports a memory store (including the stored bytes) to every plugin.
+// While a kernel invocation is active the store is attributed to the current
+// work-item if there is one, otherwise to the current work-group; if neither
+// exists nothing is forwarded. With no active kernel invocation the store is
+// reported through hostMemoryStore.
+void Context::notifyMemoryStore(const Memory *memory, size_t address,
+                                size_t size, const uint8_t *storeData) const
+  if (m_kernelInvocation)
+  {
+    if (m_kernelInvocation->getCurrentWorkItem())
+    {
+      NOTIFY(memoryStore, memory, m_kernelInvocation->getCurrentWorkItem(),
+             address, size, storeData);
+    }
+    else if (m_kernelInvocation->getCurrentWorkGroup())
+    {
+      NOTIFY(memoryStore, memory, m_kernelInvocation->getCurrentWorkGroup(),
+             address, size, storeData);
+    }
+  }
+  else
+  {
+    NOTIFY(hostMemoryStore, memory, address, size, storeData);
+  }
+// Forwards the message to every plugin through its log(type, message) method.
+void Context::notifyMessage(MessageType type, const char *message) const
+  NOTIFY(log, type, message);
+// Forwards workGroupBarrier(workGroup, flags) to every plugin.
+void Context::notifyWorkGroupBarrier(const WorkGroup *workGroup,
+                                     uint32_t flags) const
+  NOTIFY(workGroupBarrier, workGroup, flags);
+// Forwards workGroupBegin(workGroup) to every plugin.
+void Context::notifyWorkGroupBegin(const WorkGroup *workGroup) const
+  NOTIFY(workGroupBegin, workGroup);
+// Forwards workGroupComplete(workGroup) to every plugin.
+void Context::notifyWorkGroupComplete(const WorkGroup *workGroup) const
+  NOTIFY(workGroupComplete, workGroup);
+// Forwards workItemBegin(workItem) to every plugin.
+void Context::notifyWorkItemBegin(const WorkItem *workItem) const
+  NOTIFY(workItemBegin, workItem);
+// Forwards workItemComplete(workItem) to every plugin.
+void Context::notifyWorkItemComplete(const WorkItem *workItem) const
+  NOTIFY(workItemComplete, workItem);
+#undef NOTIFY
+// Stores the message type and owning context, and captures the context's
+// current kernel invocation at construction time.
+Context::Message::Message(MessageType type, const Context *context)
+  m_type             = type;
+  m_context          = context;
+  m_kernelInvocation = context->m_kernelInvocation;
+// Handles the special tokens that can be streamed into a Message.
+// INDENT and UNINDENT append the current write position of m_stream to
+// m_indentModifiers (negated for UNINDENT) so that send() can adjust the
+// indentation from that point on. The remaining branches print details of
+// m_kernelInvocation: the kernel name, IDs of the current work-item or
+// work-group (or "(none)" / "(unknown)" when there is none), and the current
+// instruction or barrier.
+// NOTE(review): m_kernelInvocation is dereferenced without a null check in
+// those branches - confirm they are only reached while a kernel is running.
+Context::Message& Context::Message::operator<<(const special& id)
+  switch (id)
+  {
+  case INDENT:
+    m_indentModifiers.push_back( m_stream.tellp());
+    break;
+  case UNINDENT:
+    m_indentModifiers.push_back(-m_stream.tellp());
+    break;
+    *this << m_kernelInvocation->getKernel()->getName();
+    break;
+  {
+    const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+    if (workItem)
+    {
+      *this << workItem->getGlobalID();
+    }
+    else
+    {
+      *this << "(none)";
+    }
+    break;
+  }
+  {
+    const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+    if (workItem)
+    {
+      *this << workItem->getLocalID();
+    }
+    else
+    {
+      *this << "(none)";
+    }
+    break;
+  }
+  {
+    const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+    if (workGroup)
+    {
+      *this << workGroup->getGroupID();
+    }
+    else
+    {
+      *this << "(none)";
+    }
+    break;
+  }
+  {
+    const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+    const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+    if (workItem)
+    {
+      *this << "Global" << workItem->getGlobalID()
+            << " Local" << workItem->getLocalID() << " ";
+    }
+    if (workGroup)
+    {
+      *this << "Group" << workGroup->getGroupID();
+    }
+    if (!workItem && ! workGroup)
+    {
+      *this << "(unknown)";
+    }
+    break;
+  }
+  {
+    const llvm::Instruction *instruction = NULL;
+    const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+    const WorkGroup *workGroup = m_kernelInvocation->getCurrentWorkGroup();
+    if (workItem)
+    {
+      instruction = workItem->getCurrentInstruction();
+    }
+    else if (workGroup)
+    {
+      instruction = workGroup->getCurrentBarrier();
+    }
+    *this << instruction;
+    break;
+  }
+  }
+  return *this;
+// Writes a description of an LLVM instruction into the message.
+// For a non-null instruction this prints the instruction itself and then,
+// if "dbg" metadata is attached, the source line number and file name plus
+// the corresponding program source line with leading whitespace removed
+// ("(source not available)" if the program cannot supply the line). Without
+// metadata it prints "Debugging information not available."; for a null
+// instruction it prints "(location unknown)".
+// Returns *this so that calls can be chained.
+Context::Message& Context::Message::operator<<(
+  const llvm::Instruction *instruction)
+  // Use mutex as some part of LLVM used by dumpInstruction() is not thread-safe
+  static std::mutex mtx;
+  std::lock_guard<std::mutex> lock(mtx);
+  if (instruction)
+  {
+    // Output instruction
+    dumpInstruction(m_stream, instruction);
+    *this << endl;
+    // Output debug information
+    llvm::MDNode *md = instruction->getMetadata("dbg");
+    if (!md)
+    {
+      *this << "Debugging information not available." << endl;
+    }
+    else
+    {
+#if LLVM_VERSION > 36
+      llvm::DILocation *loc = (llvm::DILocation*)md;
+      unsigned lineNumber = loc->getLine();
+      llvm::StringRef filename = loc->getFilename();
+      llvm::DILocation loc((llvm::MDLocation*)md);
+      unsigned lineNumber = loc.getLineNumber();
+      llvm::StringRef filename = loc.getFilename();
+      *this << "At line " << dec << lineNumber
+           << " of " << filename.str() << ":" << endl;
+      // Get source line
+      const Program *program = m_kernelInvocation->getKernel()->getProgram();
+      const char *line = program->getSourceLine(lineNumber);
+      if (line)
+      {
+        // isspace() requires a value representable as unsigned char; a plain
+        // char may be negative (e.g. a UTF-8 byte), which would be undefined
+        while (isspace((unsigned char)line[0]))
+          line++;
+        *this << "  " << line;
+      }
+      else
+        *this << "  (source not available)";
+    }
+  }
+  else
+  {
+    *this << "(location unknown)";
+  }
+  return *this;
+// Applies a std::ostream manipulator (such as endl) to the underlying stream.
+Context::Message& Context::Message::operator<<(
+  std::ostream& (*t)(std::ostream&))
+  m_stream << t;
+  return *this;
+// Applies a std::ios manipulator to the underlying stream.
+Context::Message& Context::Message::operator<<(
+  std::ios& (*t)(std::ios&))
+  m_stream << t;
+  return *this;
+// Applies a std::ios_base manipulator (such as dec) to the underlying stream.
+Context::Message& Context::Message::operator<<(
+  std::ios_base& (*t)(std::ios_base&))
+  m_stream << t;
+  return *this;
+// Assembles the final message text and passes it to the context's plugins.
+// The buffered stream is re-read line by line. After each line, if the read
+// position has reached the next recorded indent modifier, the indentation
+// level is raised (non-negative entry) or lowered (negative entry); at most
+// one modifier is consumed per line. Every line except the last is followed
+// by a newline and one tab per indentation level.
+// NOTE(review): UNINDENT is stored as a negated position, so a modifier
+// recorded at position 0 cannot be told apart from INDENT - confirm that
+// UNINDENT is never streamed first.
+void Context::Message::send() const
+  string msg;
+  string line;
+  int currentIndent = 0;
+  list<int>::const_iterator itr = m_indentModifiers.begin();
+  // Reset stream state and rewind to the start before reading back
+  m_stream.clear();
+  m_stream.seekg(0);
+  while (m_stream.good())
+  {
+    getline(m_stream, line);
+    // TODO: Wrap long lines
+    msg += line;
+    // Check for indentation modifiers
+    long pos = m_stream.tellg();
+    if (itr != m_indentModifiers.end() && pos >= abs(*itr))
+    {
+      if (*itr >= 0)
+        currentIndent++;
+      else
+        currentIndent--;
+      itr++;
+    }
+    if (!m_stream.eof())
+    {
+      // Add newline and indentation
+      msg += '\n';
+      for (int i = 0; i < currentIndent; i++)
+        msg += '\t';
+    }
+  }
+  m_context->notifyMessage(m_type, msg.c_str());
diff --git a/src/core/Context.h b/src/core/Context.h
new file mode 100644
index 0000000..41be6c7
--- /dev/null
+++ b/src/core/Context.h
@@ -0,0 +1,115 @@
+// Context.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+namespace oclgrind
+  class KernelInvocation;
+  class Memory;
+  class Plugin;
+  class WorkGroup;
+  class WorkItem;
+  typedef std::pair<Plugin*, bool> PluginEntry;
+  typedef std::list<PluginEntry> PluginList;
+  class Context  // Owns the global memory and the plugin list, and exposes the notify* callbacks used during simulation
+  {
+  public:
+    Context();
+    virtual ~Context();
+    Memory* getGlobalMemory() const;
+    bool isThreadSafe() const;  // Consulted by KernelInvocation when choosing the number of worker threads
+    void logError(const char* error) const;
+    // Simulation callbacks (all const, so they can be invoked through a const Context pointer)
+    void notifyInstructionExecuted(const WorkItem *workItem,
+                                   const llvm::Instruction *instruction,
+                                   const TypedValue& result) const;
+    void notifyKernelBegin(const KernelInvocation *kernelInvocation) const;
+    void notifyKernelEnd(const KernelInvocation *kernelInvocation) const;
+    void notifyMemoryAllocated(const Memory *memory, size_t address,
+                               size_t size, cl_mem_flags flags) const;
+    void notifyMemoryAtomicLoad(const Memory *memory, AtomicOp op,
+                                size_t address, size_t size) const;
+    void notifyMemoryAtomicStore(const Memory *memory, AtomicOp op,
+                                 size_t address, size_t size) const;
+    void notifyMemoryDeallocated(const Memory *memory, size_t address) const;
+    void notifyMemoryLoad(const Memory *memory, size_t address,
+                          size_t size) const;
+    void notifyMemoryStore(const Memory *memory, size_t address, size_t size,
+                           const uint8_t *storeData) const;
+    void notifyMessage(MessageType type, const char *message) const;  // Receives the text assembled by Message::send
+    void notifyWorkGroupBarrier(const WorkGroup *workGroup,
+                                uint32_t flags) const;
+    void notifyWorkGroupBegin(const WorkGroup *workGroup) const;
+    void notifyWorkGroupComplete(const WorkGroup *workGroup) const;
+    void notifyWorkItemBegin(const WorkItem *workItem) const;
+    void notifyWorkItemComplete(const WorkItem *workItem) const;
+    // Plugins
+    void registerPlugin(Plugin *plugin);
+    void unregisterPlugin(Plugin *plugin);
+  private:
+    mutable const KernelInvocation *m_kernelInvocation;  // mutable: may be reassigned from the const callbacks
+    Memory *m_globalMemory;
+    PluginList m_plugins;  // List of (plugin, bool) pairs; NOTE(review): meaning of the bool is not visible here
+    std::list<void*> m_pluginLibraries;
+    void loadPlugins();
+    void unloadPlugins();
+  public:
+    class Message  // Stream-style builder for a diagnostic message; the text is delivered by send()
+    {
+    public:
+      enum special  // Markers that can be inserted to change the indentation of following lines
+      {
+        INDENT,
+        UNINDENT,
+      };
+      Message(MessageType type, const Context *context);
+      Message& operator<<(const special& id);
+      Message& operator<<(const llvm::Instruction *instruction);
+      template<typename T>
+      Message& operator<<(const T& t);  // Generic insertion; defined after the class
+      Message& operator<<(std::ostream& (*t)(std::ostream&));  // Manipulator overloads
+      Message& operator<<(std::ios& (*t)(std::ios&));
+      Message& operator<<(std::ios_base& (*t)(std::ios_base&));
+      void send() const;  // Passes the accumulated text to the context's notifyMessage
+    private:
+      MessageType                m_type;
+      const Context             *m_context;
+      const KernelInvocation    *m_kernelInvocation;
+      mutable std::stringstream  m_stream;  // mutable: send() is const but rewinds and re-reads the buffer
+      std::list<int>             m_indentModifiers;  // Signed stream positions; send() uses the sign to raise or lower the indent
+    };
+  };
+  template<typename T>  // Generic insertion: forwards any streamable value to the internal buffer
+  Context::Message& Context::Message::operator<<(const T& t)
+  {
+    m_stream << t;
+    return *this;  // Returned by reference so that insertions can be chained
+  }
diff --git a/src/core/Kernel.cpp b/src/core/Kernel.cpp
new file mode 100644
index 0000000..ab2741e
--- /dev/null
+++ b/src/core/Kernel.cpp
@@ -0,0 +1,534 @@
+// Kernel.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <sstream>
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "Kernel.h"
+#include "Program.h"
+#include "Memory.h"
+using namespace oclgrind;
+using namespace std;
+Kernel::Kernel(const Program *program,  // Builds a kernel for one function of a module: sets up its memories, module-scope variables and metadata node
+               const llvm::Function *function, const llvm::Module *module)
+ : m_program(program), m_function(function), m_name(function->getName())
+  m_localMemory = new Memory(AddrSpaceLocal, program->getContext());
+  m_privateMemory = new Memory(AddrSpacePrivate, program->getContext());
+  // Set-up global variables (each module-scope variable is handled according to its address space)
+  llvm::Module::const_global_iterator itr;
+  for (itr = module->global_begin(); itr != module->global_end(); itr++)
+  {
+    llvm::PointerType *type = itr->getType();
+    switch (type->getPointerAddressSpace())
+    {
+    case AddrSpacePrivate:
+    {
+      const llvm::Constant *init = itr->getInitializer();
+      // Allocate private memory for variable
+      unsigned size = getTypeSize(init->getType());
+      size_t address = m_privateMemory->allocateBuffer(size);
+      // Initialize variable
+      void *ptr = m_privateMemory->getPointer(address);
+      getConstantData((unsigned char*)ptr, init);
+      TypedValue value =  // Single pointer-sized element that will hold the buffer address
+      {
+        sizeof(size_t),
+        1,
+        new unsigned char[sizeof(size_t)]
+      };
+      value.setPointer(address);
+      m_arguments[itr] = value;  // Keyed by the global variable itself
+      break;
+    }
+    case AddrSpaceConstant:
+      m_constants.push_back(itr);  // Only recorded here; buffers are created later by allocateConstants
+      break;
+    case AddrSpaceLocal:
+    {
+      // Allocate buffer (its contents are not initialised from the initializer here)
+      unsigned size = getTypeSize(itr->getInitializer()->getType());
+      TypedValue v = {
+        sizeof(size_t),
+        1,
+        new unsigned char[sizeof(size_t)]
+      };
+      v.setPointer(m_localMemory->allocateBuffer(size));
+      m_arguments[itr] = v;
+      break;
+    }
+    default:
+      FATAL_ERROR("Unsupported GlobalVariable address space: %d",
+                  type->getPointerAddressSpace());
+    }
+  }
+  // Get metadata node containing kernel arg info
+  m_metadata = NULL;  // Stays NULL when no matching entry is found below
+  llvm::NamedMDNode *md = module->getNamedMetadata("opencl.kernels");
+  if (md)
+  {
+    for (unsigned i = 0; i < md->getNumOperands(); i++)
+    {
+      llvm::MDNode *node = md->getOperand(i);
+      llvm::ConstantAsMetadata *cam =
+        llvm::dyn_cast<llvm::ConstantAsMetadata>(node->getOperand(0).get());
+      if (!cam)
+        continue;  // First operand is not a constant, so it cannot name a function
+      llvm::Function *function = ((llvm::Function*)cam->getValue());  // Shadows the constructor parameter of the same name
+      if (function->getName() == m_name)
+      {
+        m_metadata = node;
+        break;
+      }
+    }
+  }
+// Copy constructor: shares the program, function and metadata pointers with
+// the source kernel, copies its constant lists, and takes its own clones of
+// both memory objects and of every argument value.
+Kernel::Kernel(const Kernel& kernel)
+ : m_program(kernel.m_program),
+   m_function(kernel.m_function),
+   m_constants(kernel.m_constants),
+   m_constantBuffers(kernel.m_constantBuffers),
+   m_localMemory(kernel.m_localMemory->clone()),
+   m_metadata(kernel.m_metadata),
+   m_name(kernel.m_name),
+   m_privateMemory(kernel.m_privateMemory->clone())
+  // Clone each argument so the two kernels never share value storage
+  for (const auto& argument : kernel.m_arguments)
+  {
+    m_arguments[argument.first] = argument.second.clone();
+  }
+  delete m_localMemory;  // NOTE(review): the signature line for this body is absent from this extract; presumably Kernel::~Kernel() - confirm
+  delete m_privateMemory;
+  TypedValueMap::iterator itr;
+  for (itr = m_arguments.begin(); itr != m_arguments.end(); itr++)
+  {
+    delete[] itr->second.data;  // Each stored value owns a new[]-allocated data array
+  }
+// Returns true when every formal argument of the kernel function has an entry
+// in the argument map, and false as soon as one is found to be missing.
+bool Kernel::allArgumentsSet() const
+  llvm::Function::const_arg_iterator arg = m_function->arg_begin();
+  while (arg != m_function->arg_end())
+  {
+    if (m_arguments.count(arg) == 0)
+      return false;
+    arg++;
+  }
+  return true;
+// Allocates a buffer in 'memory' for every constant-address-space variable
+// recorded by the constructor, copies the variable's initializer into it and
+// registers a pointer to the buffer in the argument map. The allocated
+// addresses are remembered so that deallocateConstants can release them.
+//   memory - memory object in which the constant buffers are created
+void Kernel::allocateConstants(Memory *memory)
+  list<const llvm::GlobalVariable*>::const_iterator itr;
+  for (itr = m_constants.begin(); itr != m_constants.end(); itr++)
+  {
+    const llvm::Constant *initializer = (*itr)->getInitializer();
+    const llvm::Type *type = initializer->getType();
+    // Allocate buffer
+    unsigned size = getTypeSize(type);
+    TypedValue v = {
+      sizeof(size_t),
+      1,
+      new unsigned char[sizeof(size_t)]
+    };
+    size_t address = memory->allocateBuffer(size);
+    v.setPointer(address);
+    m_constantBuffers.push_back(address);
+    // Release the pointer value left by a previous call (or copied from
+    // another kernel) before replacing it; otherwise its storage is leaked
+    // every time the constants are re-allocated
+    if (m_arguments.count(*itr))
+    {
+      delete[] m_arguments[*itr].data;
+    }
+    m_arguments[*itr] = v;
+    // Initialise buffer contents
+    unsigned char *data = new unsigned char[size];
+    getConstantData(data, initializer);
+    memory->store(data, address, size);
+    delete[] data;
+  }
+// Releases every buffer address recorded by allocateConstants from 'memory'
+// and then forgets the recorded addresses.
+void Kernel::deallocateConstants(Memory *memory)
+  for (size_t address : m_constantBuffers)
+    memory->deallocateBuffer(address);
+  m_constantBuffers.clear();
+const llvm::Argument* Kernel::getArgument(unsigned int index) const
+  assert(index < getNumArguments());
+  llvm::Function::const_arg_iterator argItr = m_function->arg_begin();
+  for (unsigned i = 0; i < index; i++)
+  {
+    argItr++;
+  }
+  return argItr;
+unsigned int Kernel::getArgumentAccessQualifier(unsigned int index) const  // Looks up the access qualifier string recorded for argument 'index'
+  assert(index < getNumArguments());
+  // Get metadata node
+  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_access_qual");
+  if (!node)
+  {
+    return -1;  // Converted to the unsigned return type
+  }
+  // Get qualifier string (operand 0 is the node's name, so argument i is read from operand i+1)
+  llvm::MDString *str
+    = llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+  string access = str->getString();
+  if (access == "read_only")  // NOTE(review): the branch bodies below are empty in this extract; the value returned for each qualifier is not visible here - confirm
+  {
+  }
+  else if (access == "write_only")
+  {
+  }
+  else if (access == "read_write")
+  {
+  }
+unsigned int Kernel::getArgumentAddressQualifier(unsigned int index) const  // Looks up the address space recorded for argument 'index'
+  assert(index < getNumArguments());
+  // Get metadata node
+  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_addr_space");
+  if (!node)
+  {
+    return -1;  // Converted to the unsigned return type
+  }
+  // Get address space (operand 0 is the node's name, so argument i is read from operand i+1)
+  switch(getMDOpAsConstInt(node->getOperand(index+1))->getZExtValue())
+  {
+    case AddrSpacePrivate:  // NOTE(review): as shown, every case falls through to the default; per-case return lines may be missing from this extract - confirm
+    case AddrSpaceGlobal:
+    case AddrSpaceConstant:
+    case AddrSpaceLocal:
+    default:
+      return -1;
+  }
+const llvm::MDNode* Kernel::getArgumentMetadata(string name) const
+  if (!m_metadata)
+  {
+    return NULL;
+  }
+  // Loop over all metadata nodes for this kernel
+  for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+  {
+    const llvm::MDOperand& op = m_metadata->getOperand(i);
+    if (llvm::MDNode *node = llvm::dyn_cast<llvm::MDNode>(op.get()))
+    {
+      // Check if node matches target name
+      if (node->getNumOperands() > 0 &&
+          ((llvm::MDString*)(node->getOperand(0).get()))->getString() == name)
+      {
+        return node;
+      }
+    }
+  }
+  return NULL;
+const llvm::StringRef Kernel::getArgumentName(unsigned int index) const  // Name of the formal argument at position 'index'
+  return getArgument(index)->getName();  // getArgument asserts that index is in range
+// Returns the type name recorded in the "kernel_arg_type" metadata for the
+// argument at position 'index', or an empty string when that metadata is
+// not available. 'index' must be less than getNumArguments().
+const llvm::StringRef Kernel::getArgumentTypeName(unsigned int index) const
+  assert(index < getNumArguments());
+  const llvm::MDNode *typeNames = getArgumentMetadata("kernel_arg_type");
+  if (typeNames == NULL)
+    return "";
+  // Operand 0 holds the node's name, so argument i is stored at operand i+1
+  llvm::MDString *entry =
+    llvm::dyn_cast<llvm::MDString>(typeNames->getOperand(index+1));
+  return entry->getString();
+unsigned int Kernel::getArgumentTypeQualifier(unsigned int index) const  // Builds a bitmask from the type qualifiers recorded for argument 'index'
+  assert(index < getNumArguments());
+  // Get metadata node
+  const llvm::MDNode *node = getArgumentMetadata("kernel_arg_type_qual");
+  if (!node)
+  {
+    return -1;  // Converted to the unsigned return type
+  }
+  // Get qualifiers (operand 0 is the node's name, so argument i is read from operand i+1)
+  llvm::MDString *str =
+    llvm::dyn_cast<llvm::MDString>(node->getOperand(index+1));
+  istringstream iss(str->getString().str());  // The qualifier string is split on whitespace below
+  unsigned int result = CL_KERNEL_ARG_TYPE_NONE;
+  while (!iss.eof())
+  {
+    string tok;
+    iss >> tok;
+    if (tok == "const")
+    {
+      result |= CL_KERNEL_ARG_TYPE_CONST;
+    }
+    else if (tok == "restrict")  // NOTE(review): this branch body is empty in this extract; the flag it sets is not visible here - confirm
+    {
+    }
+    else if (tok == "volatile")  // NOTE(review): this branch body is empty in this extract; the flag it sets is not visible here - confirm
+    {
+    }
+  }
+  return result;
+// Returns the size of the argument at position 'index'. For a pointer
+// argument carrying the by-value attribute this is the size of the pointed-to
+// type; for every other argument it is the size of the argument's own type.
+size_t Kernel::getArgumentSize(unsigned int index) const
+  const llvm::Argument *arg = getArgument(index);
+  const llvm::Type *argType = arg->getType();
+  const bool passedByValue = argType->isPointerTy() && arg->hasByValAttr();
+  if (!passedByValue)
+    return getTypeSize(argType);
+  return getTypeSize(argType->getPointerElementType());
+// Returns a string describing the attributes found in this kernel's metadata:
+// "reqd_work_group_size(x,y,z) ", "work_group_size_hint(x,y,z) " and
+// "vec_type_hint(<type><width>) ", each followed by a single space. Returns
+// an empty string when the kernel has no metadata node. Malformed entries
+// (missing name, too few operands, wrong operand kind) are skipped.
+string Kernel::getAttributes() const
+  ostringstream attributes("");
+  // m_metadata stays NULL when the constructor finds no matching
+  // "opencl.kernels" entry, so there is nothing to report in that case
+  if (!m_metadata)
+  {
+    return attributes.str();
+  }
+  for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+  {
+    llvm::MDNode *val =
+      llvm::dyn_cast_or_null<llvm::MDNode>(m_metadata->getOperand(i).get());
+    if (!val || val->getNumOperands() == 0)
+    {
+      continue;
+    }
+    // The first operand of an attribute node is its name
+    llvm::MDString *str =
+      llvm::dyn_cast_or_null<llvm::MDString>(val->getOperand(0).get());
+    if (!str)
+    {
+      continue;
+    }
+    string name = str->getString().str();
+    if (name == "reqd_work_group_size" ||
+        name == "work_group_size_hint")
+    {
+      // Name plus one value for each of the three dimensions
+      if (val->getNumOperands() < 4)
+      {
+        continue;
+      }
+      attributes << name << "("
+                 <<
+        getMDOpAsConstInt(val->getOperand(1))->getZExtValue()
+                 << "," <<
+        getMDOpAsConstInt(val->getOperand(2))->getZExtValue()
+                 << "," <<
+        getMDOpAsConstInt(val->getOperand(3))->getZExtValue()
+                 << ") ";
+    }
+    else if (name == "vec_type_hint")
+    {
+      if (val->getNumOperands() < 2)
+      {
+        continue;
+      }
+      // Get type hint
+      size_t n = 1;
+      llvm::ValueAsMetadata *vam =
+        llvm::dyn_cast_or_null<llvm::ValueAsMetadata>(
+          val->getOperand(1).get());
+      if (!vam)
+      {
+        continue;
+      }
+      const llvm::Type *type = vam->getType();
+      if (type->isVectorTy())
+      {
+        n = type->getVectorNumElements();
+        type = type->getVectorElementType();
+      }
+      // Generate attribute string: element type name immediately followed
+      // by the element count
+      attributes << name << "(" << flush;
+      llvm::raw_os_ostream out(attributes);
+      type->print(out);
+      out.flush();
+      attributes << n << ") ";
+    }
+  }
+  return attributes.str();
+const llvm::Function* Kernel::getFunction() const  // LLVM function wrapped by this kernel
+  return m_function;
+const Memory* Kernel::getLocalMemory() const  // Memory object created for the local address space
+  return m_localMemory;
+size_t Kernel::getLocalMemorySize() const  // Total currently allocated in the local memory object
+  return m_localMemory->getTotalAllocated();
+const std::string& Kernel::getName() const  // Name taken from the function at construction
+  return m_name;
+unsigned int Kernel::getNumArguments() const  // Number of formal arguments of the function
+  return m_function->arg_size();
+const Memory* Kernel::getPrivateMemory() const  // Memory object created for the private address space
+  return m_privateMemory;
+const Program* Kernel::getProgram() const  // Program passed in at construction
+  return m_program;
+// Writes the three dimensions of the kernel's "reqd_work_group_size"
+// attribute into reqdWorkGroupSize. All three entries are set to zero when
+// the kernel has no metadata or does not carry that attribute.
+//   reqdWorkGroupSize - output array of three elements
+void Kernel::getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const
+  memset(reqdWorkGroupSize, 0, 3*sizeof(size_t));
+  // m_metadata stays NULL when the constructor finds no matching
+  // "opencl.kernels" entry; the zeroed result is then the answer
+  if (!m_metadata)
+  {
+    return;
+  }
+  for (unsigned i = 0; i < m_metadata->getNumOperands(); i++)
+  {
+    llvm::MDNode *val =
+      llvm::dyn_cast_or_null<llvm::MDNode>(m_metadata->getOperand(i).get());
+    // Need the attribute name plus one value for each of three dimensions
+    if (!val || val->getNumOperands() < 4)
+    {
+      continue;
+    }
+    llvm::MDString *str =
+      llvm::dyn_cast_or_null<llvm::MDString>(val->getOperand(0).get());
+    if (str && str->getString() == "reqd_work_group_size")
+    {
+      for (int j = 0; j < 3; j++)
+      {
+        reqdWorkGroupSize[j] =
+          getMDOpAsConstInt(val->getOperand(j+1))->getZExtValue();
+      }
+    }
+  }
+void Kernel::setArgument(unsigned int index, TypedValue value)  // Stores 'value' as the argument at position 'index', replacing any earlier setting
+  assert(index < m_function->arg_size());
+  const llvm::Value *argument = getArgument(index);
+  unsigned int type = getArgumentAddressQualifier(index);
+  {  // NOTE(review): the condition guarding this block is absent from this extract; the body handles a local-memory argument - confirm that it tests 'type'
+    // Deallocate existing argument
+    if (m_arguments.count(argument))
+    {
+      m_localMemory->deallocateBuffer(m_arguments[argument].getPointer());
+      delete[] m_arguments[argument].data;
+    }
+    // Allocate local memory buffer (only value.size is used; the contents of value.data are not read)
+    TypedValue v = {
+      sizeof(size_t),
+      1,
+      new unsigned char[sizeof(size_t)]
+    };
+    v.setPointer(m_localMemory->allocateBuffer(value.size));
+    m_arguments[argument] = v;
+  }
+  else
+  {
+    if (((const llvm::Argument*)argument)->hasByValAttr())  // By-value argument: the data is copied into private memory and a pointer to it is stored
+    {
+      // Deallocate existing argument
+      if (m_arguments.count(argument))
+      {
+        m_privateMemory->deallocateBuffer(m_arguments[argument].getPointer());
+        delete[] m_arguments[argument].data;
+      }
+      TypedValue address =
+      {
+        sizeof(size_t),
+        1,
+        new unsigned char[sizeof(size_t)]
+      };
+      size_t size = value.size*value.num;  // Total byte count of the value
+      address.setPointer(m_privateMemory->allocateBuffer(size));
+      m_privateMemory->store(value.data, address.getPointer(), size);
+      m_arguments[argument] = address;
+    }
+    else
+    {
+      // Deallocate existing argument
+      if (m_arguments.count(argument))
+      {
+        delete[] m_arguments[argument].data;
+      }
+      const llvm::Type *type = argument->getType();  // Shadows the unsigned 'type' declared at the top of the function
+      if (type->isVectorTy())  // Vector arguments are re-described as num elements of the element size
+      {
+        value.num = type->getVectorNumElements();
+        value.size = getTypeSize(type->getVectorElementType());
+      }
+      m_arguments[argument] = value.clone();  // Store a clone rather than the caller's value
+    }
+  }
+TypedValueMap::const_iterator Kernel::args_begin() const  // Start of the argument map (also holds entries for module-scope variables)
+  return m_arguments.begin();
+TypedValueMap::const_iterator Kernel::args_end() const  // End of the argument map
+  return m_arguments.end();
diff --git a/src/core/Kernel.h b/src/core/Kernel.h
new file mode 100644
index 0000000..cf94e90
--- /dev/null
+++ b/src/core/Kernel.h
@@ -0,0 +1,72 @@
+// Kernel.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include "llvm/ADT/StringRef.h"
+namespace llvm
+  class Argument;
+  class Constant;
+  class Function;
+  class GlobalVariable;
+  class MDNode;
+  class Module;
+namespace oclgrind
+  class Memory;
+  class Program;
+  class Kernel  // One kernel function of a program, together with its argument values and its local/private memories
+  {
+  public:
+    Kernel(const Program *program,
+           const llvm::Function *function, const llvm::Module *module);
+    Kernel(const Kernel& kernel);  // Clones the memories and argument values of the source kernel
+    virtual ~Kernel();
+    TypedValueMap::const_iterator args_begin() const;
+    TypedValueMap::const_iterator args_end() const;
+    bool allArgumentsSet() const;  // True when every formal argument has a stored value
+    void allocateConstants(Memory *memory);  // Creates and fills buffers for constant-address-space variables
+    void deallocateConstants(Memory *memory);  // Releases the buffers created by allocateConstants
+    size_t getArgumentSize(unsigned int index) const;
+    unsigned int getArgumentAccessQualifier(unsigned int index) const;
+    unsigned int getArgumentAddressQualifier(unsigned int index) const;
+    const llvm::StringRef getArgumentName(unsigned int index) const;
+    const llvm::StringRef getArgumentTypeName(unsigned int index) const;
+    unsigned int getArgumentTypeQualifier(unsigned int index) const;
+    std::string getAttributes() const;
+    const llvm::Function* getFunction() const;
+    const Memory* getLocalMemory() const;
+    size_t getLocalMemorySize() const;
+    const std::string& getName() const;
+    unsigned int getNumArguments() const;
+    const Memory* getPrivateMemory() const;
+    const Program* getProgram() const;
+    void getRequiredWorkGroupSize(size_t reqdWorkGroupSize[3]) const;  // Output array of three sizes
+    void setArgument(unsigned int index, TypedValue value);
+  private:
+    const Program *m_program;
+    const llvm::Function *m_function;
+    TypedValueMap m_arguments;  // Values keyed by function argument or module-scope variable
+    std::list<const llvm::GlobalVariable*> m_constants;  // Constant-address-space variables recorded by the constructor
+    std::list<size_t> m_constantBuffers;  // Addresses allocated by allocateConstants
+    Memory *m_localMemory;
+    const llvm::MDNode *m_metadata;  // Matching "opencl.kernels" entry, or NULL when none was found
+    std::string m_name;
+    Memory *m_privateMemory;
+    const llvm::Argument* getArgument(unsigned int index) const;
+    const llvm::MDNode* getArgumentMetadata(std::string name) const;  // Child of m_metadata whose first operand is 'name', or NULL
+  };
diff --git a/src/core/KernelInvocation.cpp b/src/core/KernelInvocation.cpp
new file mode 100644
index 0000000..3d50031
--- /dev/null
+++ b/src/core/KernelInvocation.cpp
@@ -0,0 +1,355 @@
+// KernelInvocation.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <atomic>
+#include <sstream>
+#include <thread>
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+// TODO: Remove this when thread_local fixed on OS X
+#ifdef __APPLE__
+#define THREAD_LOCAL __thread
+#elif defined(_WIN32) && !defined(__MINGW32__)
+#define THREAD_LOCAL __declspec(thread)
+#define THREAD_LOCAL thread_local
+  WorkGroup *workGroup;  // Work-group the current thread is executing; reset to NULL at the start of runWorker
+  WorkItem  *workItem;  // Work-item the current thread is executing; reset to NULL at the start of runWorker
+} static THREAD_LOCAL workerState;  // Declared with THREAD_LOCAL, so each thread has its own copy
+static atomic<unsigned> nextGroupIndex;  // Index of the next pending work-group; incremented atomically by the workers
+// Records the launch parameters, decides how many worker threads to use and
+// builds the ordered list of work-group IDs to execute.
+//   context      - context that receives notifications and errors
+//   kernel       - kernel to execute
+//   workDim      - number of dimensions in use
+//   globalOffset - offset of the global index space
+//   globalSize   - total number of work-items in each dimension
+//   localSize    - work-group size in each dimension; every component must
+//                  be non-zero because it is used as a divisor below
+// The worker count comes from OCLGRIND_NUM_THREADS when set, otherwise from
+// the hardware concurrency; it is forced to 1 when that yields zero or the
+// context is not thread-safe. When OCLGRIND_QUICK is set only the first and
+// last work-groups are scheduled.
+KernelInvocation::KernelInvocation(const Context *context, const Kernel *kernel,
+                                   unsigned int workDim,
+                                   Size3 globalOffset,
+                                   Size3 globalSize,
+                                   Size3 localSize)
+  : m_context(context), m_kernel(kernel)
+  m_workDim      = workDim;
+  m_globalOffset = globalOffset;
+  m_globalSize   = globalSize;
+  m_localSize    = localSize;
+  m_numGroups.x = m_globalSize.x/m_localSize.x;
+  m_numGroups.y = m_globalSize.y/m_localSize.y;
+  m_numGroups.z = m_globalSize.z/m_localSize.z;
+  // Check for user overriding number of threads
+  m_numWorkers = 0;
+  const char *numThreads = getenv("OCLGRIND_NUM_THREADS");
+  if (numThreads)
+  {
+    char *next;
+    m_numWorkers = strtoul(numThreads, &next, 10);
+    if (strlen(next))
+    {
+      cerr << "Oclgrind: Invalid value for OCLGRIND_NUM_THREADS" << endl;
+    }
+  }
+  else
+  {
+    m_numWorkers = thread::hardware_concurrency();
+  }
+  if (!m_numWorkers || !m_context->isThreadSafe())
+    m_numWorkers = 1;
+  // Check for quick-mode environment variable
+  if (checkEnv("OCLGRIND_QUICK"))
+  {
+    // Only run first and last work-groups in quick-mode
+    Size3 firstGroup(0, 0, 0);
+    Size3 lastGroup(m_numGroups.x-1, m_numGroups.y-1, m_numGroups.z-1);
+    m_workGroups.push_back(firstGroup);
+    // With a single work-group the first and last IDs coincide; scheduling
+    // both would execute that group twice
+    if (!(lastGroup == firstGroup))
+    {
+      m_workGroups.push_back(lastGroup);
+    }
+  }
+  else
+  {
+    for (size_t k = 0; k < m_numGroups.z; k++)
+    {
+      for (size_t j = 0; j < m_numGroups.y; j++)
+      {
+        for (size_t i = 0; i < m_numGroups.x; i++)
+        {
+          m_workGroups.push_back(Size3(i, j, k));
+        }
+      }
+    }
+  }
+  // Destroy any remaining work-groups (NOTE(review): the signature line for this body is absent from this extract; presumably KernelInvocation::~KernelInvocation() - confirm)
+  while (!m_runningGroups.empty())
+  {
+    delete m_runningGroups.front();  // Groups in this list were parked by switchWorkItem
+    m_runningGroups.pop_front();
+  }
+const Context* KernelInvocation::getContext() const  // Context passed in at construction
+  return m_context;
+const WorkGroup* KernelInvocation::getCurrentWorkGroup() const  // Work-group being executed by the calling thread
+  return workerState.workGroup;
+const WorkItem* KernelInvocation::getCurrentWorkItem() const  // Work-item being executed by the calling thread
+  return workerState.workItem;
+Size3 KernelInvocation::getGlobalOffset() const  // Launch parameter stored at construction
+  return m_globalOffset;
+Size3 KernelInvocation::getGlobalSize() const  // Launch parameter stored at construction
+  return m_globalSize;
+const Kernel* KernelInvocation::getKernel() const  // Kernel passed in at construction
+  return m_kernel;
+Size3 KernelInvocation::getLocalSize() const  // Launch parameter stored at construction
+  return m_localSize;
+Size3 KernelInvocation::getNumGroups() const  // Global size divided by local size, per dimension
+  return m_numGroups;
+size_t KernelInvocation::getWorkDim() const  // Launch parameter stored at construction
+  return m_workDim;
+// Executes one kernel launch from start to finish: allocates the kernel's
+// constant buffers in the context's global memory, runs the invocation between
+// the kernel-begin and kernel-end notifications, then releases the constant
+// buffers again. If allocating the constants raises a FatalError, the error is
+// logged, any buffers allocated so far are released and the kernel is not run.
+//   context      - context providing global memory, notifications and logging
+//   kernel       - kernel to execute
+//   workDim, globalOffset, globalSize, localSize - launch parameters
+void KernelInvocation::run(const Context *context, Kernel *kernel,
+                           unsigned int workDim,
+                           Size3 globalOffset,
+                           Size3 globalSize,
+                           Size3 localSize)
+  try
+  {
+    // Allocate and initialise constant memory
+    kernel->allocateConstants(context->getGlobalMemory());
+  }
+  catch (FatalError& err)
+  {
+    ostringstream info;
+    info << endl << "OCLGRIND FATAL ERROR "
+         << "(" << err.getFile() << ":" << err.getLine() << ")"
+         << endl << err.what()
+         << endl << "When allocating kernel constants for '"
+         << kernel->getName() << "'";
+    context->logError(info.str().c_str());
+    // Release whatever was allocated before the failure; the kernel records
+    // each buffer address as soon as it is allocated, so nothing is leaked
+    kernel->deallocateConstants(context->getGlobalMemory());
+    return;
+  }
+  // Create kernel invocation
+  KernelInvocation *ki = new KernelInvocation(context, kernel, workDim,
+                                              globalOffset,
+                                              globalSize,
+                                              localSize);
+  // Run kernel
+  context->notifyKernelBegin(ki);
+  ki->run();
+  context->notifyKernelEnd(ki);
+  delete ki;
+  // Deallocate constant memory
+  kernel->deallocateConstants(context->getGlobalMemory());
+void KernelInvocation::run()
+  nextGroupIndex = 0;
+  // Create worker threads
+  // TODO: Run in main thread if only 1 worker
+  vector<thread> threads;
+  for (unsigned i = 0; i < m_numWorkers; i++)
+  {
+    threads.push_back(thread(&KernelInvocation::runWorker, this));
+  }
+  // Wait for workers to complete
+  for (unsigned i = 0; i < m_numWorkers; i++)
+  {
+    threads[i].join();
+  }
+void KernelInvocation::runWorker()  // Body of each worker thread: repeatedly claims a work-group and runs it to completion
+  workerState.workGroup = NULL;
+  workerState.workItem = NULL;
+  try
+  {
+    while (true)
+    {
+      // Move to next work-group
+      if (!m_runningGroups.empty())  // Groups parked by switchWorkItem are resumed before new ones are started
+      {
+        // Take next work-group from running pool
+        workerState.workGroup = m_runningGroups.front();
+        m_runningGroups.pop_front();
+      }
+      else
+      {
+        // Take next work-group from pending pool
+        unsigned index = nextGroupIndex++;  // Atomic increment, so each index is claimed by exactly one worker
+        if (index >= m_workGroups.size())
+          // No more work to do
+          break;
+        workerState.workGroup = new WorkGroup(this, m_workGroups[index]);
+        m_context->notifyWorkGroupBegin(workerState.workGroup);
+      }
+      // Execute work-group
+      workerState.workItem = workerState.workGroup->getNextWorkItem();
+      while (workerState.workItem)
+      {
+        // Run work-item until complete or at barrier
+        while (workerState.workItem->getState() == WorkItem::READY)
+        {
+          workerState.workItem->step();
+        }
+        // Move to next work-item
+        workerState.workItem = workerState.workGroup->getNextWorkItem();
+        if (workerState.workItem)
+          continue;
+        // No more work-items in READY state
+        // Check if there are work-items at a barrier
+        if (workerState.workGroup->hasBarrier())
+        {
+          // Resume execution
+          workerState.workGroup->clearBarrier();
+          workerState.workItem = workerState.workGroup->getNextWorkItem();
+        }
+      }
+      // Work-group has finished
+      m_context->notifyWorkGroupComplete(workerState.workGroup);
+      delete workerState.workGroup;
+      workerState.workGroup = NULL;  // Cleared so the error handler below does not delete it again
+    }
+  }
+  catch (FatalError& err)  // A fatal error ends this worker; it is logged through the context
+  {
+    ostringstream info;
+    info << endl << "OCLGRIND FATAL ERROR "
+         << "(" << err.getFile() << ":" << err.getLine() << ")"
+         << endl << err.what();
+    m_context->logError(info.str().c_str());
+    if (workerState.workGroup)
+      delete workerState.workGroup;
+  }
+bool KernelInvocation::switchWorkItem(const Size3 gid)  // Makes the work-item with global ID 'gid' the current one; returns false when its work-group cannot be found
+  assert(m_numWorkers == 1);  // The pools below are modified without locking
+  // Compute work-group ID
+  Size3 group(gid.x/m_localSize.x, gid.y/m_localSize.y, gid.z/m_localSize.z);
+  bool found = false;
+  WorkGroup *previousWorkGroup = workerState.workGroup;  // Dereferenced below, so a work-group must already be current
+  // Check if we're already running the work-group
+  if (group == previousWorkGroup->getGroupID())
+  {
+    found = true;
+  }
+  // Check if work-group is in running pool
+  if (!found)
+  {
+    std::list<WorkGroup*>::iterator rItr;
+    for (rItr = m_runningGroups.begin(); rItr != m_runningGroups.end(); rItr++)
+    {
+      if (group == (*rItr)->getGroupID())
+      {
+        workerState.workGroup = *rItr;
+        m_runningGroups.erase(rItr);  // Safe: the loop is left immediately after the erase
+        found = true;
+        break;
+      }
+    }
+  }
+  // Check if work-group is in pending pool
+  if (!found)
+  {
+    std::vector<Size3>::iterator pItr;
+    for (pItr = m_workGroups.begin()+nextGroupIndex;  // Only groups that have not been started yet are searched
+         pItr != m_workGroups.end(); pItr++)
+    {
+     if (group == *pItr)
+     {
+       workerState.workGroup = new WorkGroup(this, group);
+       found = true;
+       // Re-order list of groups accordingly
+       // Safe since this is not in a multi-threaded context
+       m_workGroups.erase(pItr);
+       m_workGroups.insert(m_workGroups.begin()+nextGroupIndex, group);
+       nextGroupIndex++;  // The moved group now counts as started
+       break;
+     }
+    }
+  }
+  if (!found)
+  {
+    return false;  // The current work-group and work-item are left unchanged
+  }
+  if (previousWorkGroup != workerState.workGroup)
+  {
+    m_runningGroups.push_back(previousWorkGroup);  // Park the group we switched away from so runWorker resumes it later
+  }
+  // Get work-item
+  Size3 lid(gid.x%m_localSize.x, gid.y%m_localSize.y, gid.z%m_localSize.z);  // Local ID within the work-group
+  workerState.workItem = workerState.workGroup->getWorkItem(lid);
+  return true;
diff --git a/src/core/KernelInvocation.h b/src/core/KernelInvocation.h
new file mode 100644
index 0000000..4f02447
--- /dev/null
+++ b/src/core/KernelInvocation.h
@@ -0,0 +1,64 @@
+// KernelInvocation.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+namespace oclgrind
+  class Context;
+  class Kernel;
+  class WorkGroup;
+  class WorkItem;
+  class KernelInvocation  // One launch of a kernel over an index space; created and destroyed only through the static run()
+  {
+  public:
+    static void run(const Context *context, Kernel *kernel,  // Entry point: sets up constants, executes the launch and cleans up
+                    unsigned int workDim,
+                    Size3 globalOffset,
+                    Size3 globalSize,
+                    Size3 localSize);
+    const Context* getContext() const;
+    const WorkGroup* getCurrentWorkGroup() const;  // For the calling thread
+    const WorkItem* getCurrentWorkItem() const;  // For the calling thread
+    Size3 getGlobalOffset() const;
+    Size3 getGlobalSize() const;
+    Size3 getLocalSize() const;
+    const Kernel* getKernel() const;
+    Size3 getNumGroups() const;
+    size_t getWorkDim() const;
+    bool switchWorkItem(const Size3 gid);  // Requires a single worker; returns false when the work-group cannot be found
+  private:
+    KernelInvocation(const Context *context, const Kernel *kernel,  // Private: instances are created by the static run()
+                     unsigned int workDim,
+                     Size3 globalOffset,
+                     Size3 globalSize,
+                     Size3 localSize);
+    virtual ~KernelInvocation();
+    void run();  // Starts the worker threads and waits for them
+    // Kernel launch parameters
+    const Context *m_context;
+    const Kernel  *m_kernel;
+    size_t m_workDim;
+    Size3  m_globalOffset;
+    Size3  m_globalSize;
+    Size3  m_localSize;
+    Size3  m_numGroups;
+    // Current execution state
+    std::vector<Size3>    m_workGroups;  // IDs of the work-groups to execute, in order
+    std::list<WorkGroup*> m_runningGroups;  // Started work-groups parked by switchWorkItem
+    // Worker threads
+    void runWorker();
+    unsigned m_numWorkers;
+  };
diff --git a/src/core/Memory.cpp b/src/core/Memory.cpp
new file mode 100644
index 0000000..cd33bc4
--- /dev/null
+++ b/src/core/Memory.cpp
@@ -0,0 +1,464 @@
+// Memory.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <mutex>
+#include "Context.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+// Multiple mutexes to mitigate risk of unnecessary synchronisation in atomics
+// The mutex for an access is selected from its byte offset: the offset is
+// divided by four (one slot per 32-bit word) and masked into the table, which
+// is why the table size must be a power of two.
+#define NUM_ATOMIC_MUTEXES 64 // Must be power of two
+mutex atomicMutex[NUM_ATOMIC_MUTEXES];
+#define ATOMIC_MUTEX(offset) \
+  atomicMutex[(((offset)>>2) & (NUM_ATOMIC_MUTEXES-1))]
+// Create an empty memory object for address space `addrSpace`. Allocation
+// and access events are reported to `context`.
+Memory::Memory(unsigned int addrSpace, const Context *context)
+{
+  m_context = context;
+  m_addressSpace = addrSpace;
+
+  // Start from a well-defined empty state (slot 0 is the reserved null slot)
+  clear();
+}
+
+// Release every buffer still held by this memory object.
+Memory::~Memory()
+{
+  clear();
+}
+// Allocate a zero-filled buffer of `size` bytes and return its base address.
+// Returns 0 when `size` exceeds MAX_BUFFER_SIZE or no buffer slot is left.
+size_t Memory::allocateBuffer(size_t size, cl_mem_flags flags)
+{
+  // Reject requests larger than one buffer can address
+  if (size > MAX_BUFFER_SIZE)
+    return 0;
+
+  // Choose the slot this buffer will occupy
+  const unsigned slot = getNextBuffer();
+  if (slot >= MAX_NUM_BUFFERS)
+    return 0;
+
+  // Build the descriptor with zeroed backing storage
+  Buffer *buf = new Buffer;
+  buf->size  = size;
+  buf->flags = flags;
+  buf->data  = new unsigned char[size];
+  memset(buf->data, 0, size);
+
+  // Reuse a previously released slot, or grow the table by one
+  if (slot < m_memory.size())
+    m_memory[slot] = buf;
+  else
+    m_memory.push_back(buf);
+
+  m_totalAllocated += size;
+
+  const size_t address = ((size_t)slot) << NUM_ADDRESS_BITS;
+  m_context->notifyMemoryAllocated(this, address, size, flags);
+  return address;
+}
+// Perform the 32-bit atomic operation `op` on the word at `address` and
+// return the value that was stored there beforehand. Returns 0 without
+// touching memory if the address is not valid for a 4-byte access.
+// AtomicCmpXchg is not handled here; see atomicCmpxchg().
+uint32_t Memory::atomic(AtomicOp op, size_t address, uint32_t value)
+{
+  m_context->notifyMemoryAtomicLoad(this, op, address, 4);
+  m_context->notifyMemoryAtomicStore(this, op, address, 4);
+
+  // Bounds check
+  if (!isAddressValid(address, 4))
+  {
+    return 0;
+  }
+
+  // Get buffer
+  size_t offset = EXTRACT_OFFSET(address);
+  Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+  uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+
+  // The mutex is only taken for the global address space. It is held through
+  // an RAII lock so that it is released on every exit path, including the
+  // FATAL_ERROR below (which would otherwise leave the mutex locked if it
+  // leaves this function by throwing).
+  unique_lock<mutex> lock;
+  if (m_addressSpace == AddrSpaceGlobal)
+    lock = unique_lock<mutex>(ATOMIC_MUTEX(offset));
+
+  uint32_t old = *ptr;
+  switch(op)
+  {
+  case AtomicAdd:
+    *ptr = old + value;
+    break;
+  case AtomicAnd:
+    *ptr = old & value;
+    break;
+  case AtomicCmpXchg:
+    FATAL_ERROR("AtomicCmpXchg in generic atomic handler");
+    break;
+  case AtomicDec:
+    *ptr = old - 1;
+    break;
+  case AtomicInc:
+    *ptr = old + 1;
+    break;
+  case AtomicMax:
+    *ptr = old > value ? old : value;
+    break;
+  case AtomicMin:
+    *ptr = old < value ? old : value;
+    break;
+  case AtomicOr:
+    *ptr = old | value;
+    break;
+  case AtomicSub:
+    *ptr = old - value;
+    break;
+  case AtomicXchg:
+    *ptr = value;
+    break;
+  case AtomicXor:
+    *ptr = old ^ value;
+    break;
+  }
+
+  return old;
+}
+// Atomic compare-and-exchange on the 32-bit word at `address`: if the word
+// equals `cmp` it is replaced by `value`. Returns the previous contents, or
+// 0 if the address is not valid for a 4-byte access.
+uint32_t Memory::atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value)
+{
+  m_context->notifyMemoryAtomicLoad(this, AtomicCmpXchg, address, 4);
+
+  // Bounds check
+  if (!isAddressValid(address, 4))
+  {
+    return 0;
+  }
+
+  // Get buffer
+  size_t offset = EXTRACT_OFFSET(address);
+  Buffer *buffer = m_memory[EXTRACT_BUFFER(address)];
+  uint32_t *ptr = (uint32_t*)(buffer->data + offset);
+
+  // The mutex is only taken for the global address space. An RAII lock keeps
+  // it from staying locked if the store notification below exits by throwing.
+  unique_lock<mutex> lock;
+  if (m_addressSpace == AddrSpaceGlobal)
+    lock = unique_lock<mutex>(ATOMIC_MUTEX(offset));
+
+  // Perform cmpxchg
+  uint32_t old = *ptr;
+  if (old == cmp)
+  {
+    *ptr = value;
+    // A store is only reported when the exchange actually happened
+    m_context->notifyMemoryAtomicStore(this, AtomicCmpXchg, address, 4);
+  }
+
+  return old;
+}
+// Release every buffer and reset this object to its empty state: a table
+// holding only the NULL slot 0, no free-list entries and a zero byte count.
+void Memory::clear()
+{
+  for (size_t slot = 0; slot < m_memory.size(); slot++)
+  {
+    Buffer *buf = m_memory[slot];
+    if (!buf)
+      continue;
+
+    // Storage supplied by the host (CL_MEM_USE_HOST_PTR) is not freed here
+    if (!(buf->flags & CL_MEM_USE_HOST_PTR))
+      delete[] buf->data;
+    delete buf;
+
+    m_context->notifyMemoryDeallocated(this, slot << NUM_ADDRESS_BITS);
+  }
+
+  m_memory.resize(1);
+  m_memory[0] = NULL;
+  m_freeBuffers = queue<unsigned>();
+  m_totalAllocated = 0;
+}
+// Create a copy of this memory object with the same buffer layout. Buffers
+// flagged CL_MEM_USE_HOST_PTR share the host pointer with the original; all
+// others get their own copy of the data. The caller owns the returned object.
+Memory* Memory::clone() const
+{
+  Memory *mem = new Memory(m_addressSpace, m_context);
+
+  // Clone buffers (slot 0 is the reserved NULL slot)
+  mem->m_memory.resize(m_memory.size());
+  mem->m_memory[0] = NULL;
+  for (unsigned i = 1; i < m_memory.size(); i++)
+  {
+    const Buffer *src = m_memory[i];
+    if (!src)
+    {
+      // Slot was released by deallocateBuffer(); keep it empty in the clone
+      mem->m_memory[i] = NULL;
+      continue;
+    }
+
+    Buffer *dst = new Buffer;
+    dst->size   = src->size;
+    dst->flags  = src->flags;
+    if (src->flags & CL_MEM_USE_HOST_PTR)
+    {
+      // Host storage is shared, so there is nothing to copy
+      dst->data = src->data;
+    }
+    else
+    {
+      dst->data = new unsigned char[src->size];
+      memcpy(dst->data, src->data, src->size);
+    }
+    mem->m_memory[i] = dst;
+    m_context->notifyMemoryAllocated(mem, ((size_t)i<<NUM_ADDRESS_BITS),
+                                     src->size, src->flags);
+  }
+
+  // Clone state
+  mem->m_freeBuffers = m_freeBuffers;
+  mem->m_totalAllocated = m_totalAllocated;
+
+  return mem;
+}
+// Register the caller's storage `ptr` as a buffer of `size` bytes and return
+// its base address. The data is used in place, not copied. Returns 0 when
+// `size` exceeds MAX_BUFFER_SIZE or no buffer slot is left.
+size_t Memory::createHostBuffer(size_t size, void *ptr, cl_mem_flags flags)
+{
+  // Reject requests larger than one buffer can address
+  if (size > MAX_BUFFER_SIZE)
+    return 0;
+
+  // Choose the slot this buffer will occupy
+  const unsigned slot = getNextBuffer();
+  if (slot >= MAX_NUM_BUFFERS)
+    return 0;
+
+  // Build a descriptor that points at the caller's storage
+  Buffer *buf = new Buffer;
+  buf->size  = size;
+  buf->flags = flags;
+  buf->data  = (unsigned char*)ptr;
+
+  // Reuse a previously released slot, or grow the table by one
+  if (slot < m_memory.size())
+    m_memory[slot] = buf;
+  else
+    m_memory.push_back(buf);
+
+  m_totalAllocated += size;
+
+  const size_t address = ((size_t)slot) << NUM_ADDRESS_BITS;
+  m_context->notifyMemoryAllocated(this, address, size, flags);
+  return address;
+}
+// Copy `size` bytes from address `src` to address `dst` within this memory
+// object. Returns false, copying nothing, if either range is invalid.
+// The load is reported before the source check and the store before the
+// destination check.
+bool Memory::copy(size_t dst, size_t src, size_t size)
+{
+  m_context->notifyMemoryLoad(this, src, size);
+
+  // Check source address
+  if (!isAddressValid(src, size))
+  {
+    return false;
+  }
+  size_t src_offset = EXTRACT_OFFSET(src);
+  Buffer *src_buffer = m_memory.at(EXTRACT_BUFFER(src));
+
+  m_context->notifyMemoryStore(this, dst, size, src_buffer->data + src_offset);
+
+  // Check destination address
+  if (!isAddressValid(dst, size))
+  {
+    return false;
+  }
+  size_t dst_offset = EXTRACT_OFFSET(dst);
+  Buffer *dst_buffer = m_memory.at(EXTRACT_BUFFER(dst));
+
+  // Copy data. Source and destination may lie in the same buffer and overlap,
+  // which memcpy does not permit, so use memmove.
+  memmove(dst_buffer->data + dst_offset,
+          src_buffer->data + src_offset,
+          size);
+
+  return true;
+}
+// Release the buffer containing `address` and make its slot reusable.
+// The slot must refer to a live buffer (checked by assertion only).
+void Memory::deallocateBuffer(size_t address)
+{
+  const unsigned slot = EXTRACT_BUFFER(address);
+  assert(slot < m_memory.size() && m_memory[slot]);
+
+  Buffer *buf = m_memory[slot];
+
+  // Storage supplied by the host (CL_MEM_USE_HOST_PTR) is not freed here
+  if (!(buf->flags & CL_MEM_USE_HOST_PTR))
+    delete[] buf->data;
+
+  m_totalAllocated -= buf->size;
+  m_freeBuffers.push(slot);
+
+  delete buf;
+  m_memory[slot] = NULL;
+
+  m_context->notifyMemoryDeallocated(this, address);
+}
+// Print a hex dump of every live buffer to standard output, four bytes per
+// row, each row prefixed with the address of its first byte.
+void Memory::dump() const
+{
+  for (unsigned b = 1; b < m_memory.size(); b++)
+  {
+    // Slots released by deallocateBuffer() hold NULL and must be skipped
+    // before the descriptor is dereferenced
+    if (!m_memory[b] || !m_memory[b]->data)
+    {
+      continue;
+    }
+
+    for (unsigned i = 0; i < m_memory[b]->size; i++)
+    {
+      if (i%4 == 0)
+      {
+        cout << endl << hex << uppercase
+             << setw(16) << setfill(' ') << right
+             << ((((size_t)b)<<NUM_ADDRESS_BITS) | i) << ":";
+      }
+      cout << " " << hex << uppercase << setw(2) << setfill('0')
+           << (int)m_memory[b]->data[i];
+    }
+  }
+  cout << endl;
+}
+// Address space identifier this memory object was constructed with.
+unsigned int Memory::getAddressSpace() const
+{
+  return m_addressSpace;
+}
+// Return the descriptor of the buffer containing `address`, or NULL if the
+// address refers to slot 0, an out-of-range slot, a released slot, or a
+// buffer without data.
+const Memory::Buffer* Memory::getBuffer(size_t address) const
+{
+  size_t buf = EXTRACT_BUFFER(address);
+  // Released slots hold NULL, so test the descriptor before reading ->data
+  if (buf == 0 || buf >= m_memory.size() ||
+      !m_memory[buf] || !m_memory[buf]->data)
+  {
+    return NULL;
+  }
+
+  return m_memory[buf];
+}
+// Largest size, in bytes, accepted for a single buffer.
+size_t Memory::getMaxAllocSize()
+{
+  return MAX_BUFFER_SIZE;
+}
+// Return the slot index the next buffer should use: the oldest released slot
+// if one is queued (it is removed from the queue), otherwise the index one
+// past the end of the buffer table.
+unsigned Memory::getNextBuffer()
+{
+  if (!m_freeBuffers.empty())
+  {
+    const unsigned slot = m_freeBuffers.front();
+    m_freeBuffers.pop();
+    return slot;
+  }
+
+  return m_memory.size();
+}
+// Translate `address` into a host pointer, or NULL if the address is invalid.
+void* Memory::getPointer(size_t address) const
+{
+  // isAddressValid() checks a one-byte access when no size is given
+  if (!isAddressValid(address))
+    return NULL;
+
+  Buffer *buf = m_memory[EXTRACT_BUFFER(address)];
+  return buf->data + EXTRACT_OFFSET(address);
+}
+// Sum of the sizes, in bytes, of all buffers currently registered.
+size_t Memory::getTotalAllocated() const
+{
+  return m_totalAllocated;
+}
+// Return true if the `size` bytes starting at `address` lie entirely inside
+// a live buffer of this memory object.
+bool Memory::isAddressValid(size_t address, size_t size) const
+{
+  size_t buffer = EXTRACT_BUFFER(address);
+  size_t offset = EXTRACT_OFFSET(address);
+
+  // Slot 0 is reserved, and released slots hold NULL
+  if (buffer == 0 ||
+      buffer >= m_memory.size() ||
+      !m_memory[buffer])
+  {
+    return false;
+  }
+
+  // Same test as offset+size <= bufferSize, but written so that a very
+  // large `size` cannot wrap the sum around and pass the check
+  const size_t bufferSize = m_memory[buffer]->size;
+  return size <= bufferSize && offset <= bufferSize - size;
+}
+// Read `size` bytes starting at `address` into `dest`. Returns false and
+// leaves `dest` untouched if the range is not valid.
+bool Memory::load(unsigned char *dest, size_t address, size_t size) const
+{
+  // The access is reported before it is validated
+  m_context->notifyMemoryLoad(this, address, size);
+
+  if (!isAddressValid(address, size))
+    return false;
+
+  const Buffer *buf = m_memory[EXTRACT_BUFFER(address)];
+  memcpy(dest, buf->data + EXTRACT_OFFSET(address), size);
+  return true;
+}
+// Return a host pointer to the `size` bytes that start `offset` bytes past
+// `address`, or NULL if that region is not fully inside a live buffer.
+unsigned char* Memory::mapBuffer(size_t address, size_t offset, size_t size)
+{
+  size_t buffer = EXTRACT_BUFFER(address);
+
+  // Bounds check on the base address
+  if (!isAddressValid(address, size))
+  {
+    return NULL;
+  }
+
+  // The returned pointer is advanced by `offset`, so the mapped region
+  // [base+offset, base+offset+size) must fit as well. Subtractions are
+  // ordered so that no intermediate value can wrap around.
+  const size_t total = m_memory[buffer]->size;
+  const size_t base  = EXTRACT_OFFSET(address);
+  if (base > total ||
+      offset > total - base ||
+      size > total - base - offset)
+  {
+    return NULL;
+  }
+
+  return m_memory[buffer]->data + offset + base;
+}
+// Write `size` bytes from `source` to `address`. Returns false and writes
+// nothing if the range is not valid.
+bool Memory::store(const unsigned char *source, size_t address, size_t size)
+{
+  // The access is reported before it is validated
+  m_context->notifyMemoryStore(this, address, size, source);
+
+  if (!isAddressValid(address, size))
+    return false;
+
+  Buffer *buf = m_memory[EXTRACT_BUFFER(address)];
+  memcpy(buf->data + EXTRACT_OFFSET(address), source, size);
+  return true;
+}
diff --git a/src/core/Memory.h b/src/core/Memory.h
new file mode 100644
index 0000000..42eb63a
--- /dev/null
+++ b/src/core/Memory.h
@@ -0,0 +1,68 @@
+// Memory.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+// A size_t address is split into a buffer index (the upper NUM_BUFFER_BITS
+// bits) and a byte offset within that buffer (the lower NUM_ADDRESS_BITS).
+#define NUM_BUFFER_BITS ( (sizeof(size_t)==4) ? 8 : 16)
+#define MAX_NUM_BUFFERS ((size_t)1 << NUM_BUFFER_BITS)
+#define NUM_ADDRESS_BITS ((sizeof(size_t)<<3) - NUM_BUFFER_BITS)
+#define MAX_BUFFER_SIZE ((size_t)1 << NUM_ADDRESS_BITS)
+// The argument is parenthesised so that expression arguments such as
+// EXTRACT_OFFSET(a | b) bind as intended
+#define EXTRACT_BUFFER(address) \
+  ((address) >> NUM_ADDRESS_BITS)
+#define EXTRACT_OFFSET(address) \
+  ((address) & (((size_t)-1) >> NUM_BUFFER_BITS))
+namespace oclgrind
+  class Context;
+  // A table of buffers belonging to one address space. Addresses handed out
+  // by this class encode a buffer index and a byte offset (see the
+  // EXTRACT_BUFFER / EXTRACT_OFFSET macros); buffer index 0 is never valid.
+  class Memory
+  {
+  public:
+    // Descriptor of a single buffer
+    typedef struct
+    {
+      size_t size;          // Size of the buffer in bytes
+      cl_mem_flags flags;   // Flags supplied when the buffer was created
+      unsigned char *data;  // Backing storage
+    } Buffer;
+  public:
+    Memory(unsigned int addrSpace, const Context *context);
+    virtual ~Memory();
+    // Allocate a zero-filled buffer; returns its base address, or 0 on failure
+    size_t allocateBuffer(size_t size, cl_mem_flags flags=0);
+    // 32-bit atomic operations; both return the previous value of the word
+    uint32_t atomic(AtomicOp op, size_t address, uint32_t value = 0);
+    uint32_t atomicCmpxchg(size_t address, uint32_t cmp, uint32_t value);
+    // Release all buffers and reset to the empty state
+    void clear();
+    // Create a copy of this object; the caller owns the result
+    Memory *clone() const;
+    // Register caller-provided storage as a buffer; returns 0 on failure
+    size_t createHostBuffer(size_t size, void *ptr, cl_mem_flags flags=0);
+    // Copy bytes between two addresses; returns false on an invalid range
+    bool copy(size_t dest, size_t src, size_t size);
+    void deallocateBuffer(size_t address);
+    // Print a hex dump of all buffers to standard output
+    void dump() const;
+    unsigned int getAddressSpace() const;
+    const Buffer* getBuffer(size_t address) const;
+    void* getPointer(size_t address) const;
+    size_t getTotalAllocated() const;
+    // True if `size` bytes starting at `address` lie inside a live buffer
+    bool isAddressValid(size_t address, size_t size=1) const;
+    // Read from / write to memory; both return false on an invalid range
+    bool load(unsigned char *dst, size_t address, size_t size=1) const;
+    unsigned char* mapBuffer(size_t address, size_t offset, size_t size);
+    bool store(const unsigned char *source, size_t address, size_t size=1);
+    static size_t getMaxAllocSize();
+  private:
+    const Context *m_context;            // Receives memory event notifications
+    std::queue<unsigned> m_freeBuffers;  // Released slot indices, reused first
+    std::vector<Buffer*> m_memory;       // Buffer table; NULL marks an empty slot
+    unsigned int m_addressSpace;
+    size_t m_totalAllocated;             // Sum of sizes of registered buffers
+    // Slot index to use for the next buffer
+    unsigned getNextBuffer();
+  };
diff --git a/src/core/Plugin.cpp b/src/core/Plugin.cpp
new file mode 100644
index 0000000..8880f2d
--- /dev/null
+++ b/src/core/Plugin.cpp
@@ -0,0 +1,25 @@
+// Plugin.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "Plugin.h"
+using namespace oclgrind;
+// Remember the context this plugin is attached to.
+Plugin::Plugin(const Context *context)
+  : m_context(context)
+{
+}
+
+// Nothing to release: the context pointer is only stored, never deleted here.
+Plugin::~Plugin()
+{
+}
+// Default answer for plugins that do not override this: thread-safe.
+bool Plugin::isThreadSafe() const
+{
+  return true;
+}
diff --git a/src/core/Plugin.h b/src/core/Plugin.h
new file mode 100644
index 0000000..d4a8ea7
--- /dev/null
+++ b/src/core/Plugin.h
@@ -0,0 +1,69 @@
+// Plugin.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#pragma once
+#include "common.h"
+namespace oclgrind
+  class Context;
+  class Kernel;
+  class KernelInvocation;
+  class Memory;
+  class WorkGroup;
+  class WorkItem;
+  // Base class for plugins. Every event callback below has an empty default
+  // body, so a subclass only overrides the events it is interested in.
+  class Plugin
+  {
+  public:
+    Plugin(const Context *context);
+    virtual ~Plugin();
+    // Memory accesses reported as coming from the host
+    virtual void hostMemoryLoad(const Memory *memory,
+                                size_t address, size_t size){}
+    virtual void hostMemoryStore(const Memory *memory,
+                                 size_t address, size_t size,
+                                 const uint8_t *storeData){}
+    virtual void instructionExecuted(const WorkItem *workItem,
+                                     const llvm::Instruction *instruction,
+                                     const TypedValue& result){}
+    // Start and end of a kernel invocation
+    virtual void kernelBegin(const KernelInvocation *kernelInvocation){}
+    virtual void kernelEnd(const KernelInvocation *kernelInvocation){}
+    virtual void log(MessageType type, const char *message){}
+    virtual void memoryAllocated(const Memory *memory, size_t address,
+                                 size_t size, cl_mem_flags flags){}
+    // Atomic accesses attributed to a work-item
+    virtual void memoryAtomicLoad(const Memory *memory,
+                                  const WorkItem *workItem,
+                                  AtomicOp op, size_t address, size_t size){}
+    virtual void memoryAtomicStore(const Memory *memory,
+                                   const WorkItem *workItem,
+                                   AtomicOp op, size_t address, size_t size){}
+    virtual void memoryDeallocated(const Memory *memory, size_t address){}
+    // Loads and stores come in two overloads: one attributed to a work-item
+    // and one attributed to a work-group
+    virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+                            size_t address, size_t size){}
+    virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+                            size_t address, size_t size){}
+    virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+                             size_t address, size_t size,
+                             const uint8_t *storeData){}
+    virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+                             size_t address, size_t size,
+                             const uint8_t *storeData){}
+    // Work-group and work-item lifecycle events
+    virtual void workGroupBarrier(const WorkGroup *workGroup, uint32_t flags){}
+    virtual void workGroupBegin(const WorkGroup *workGroup){}
+    virtual void workGroupComplete(const WorkGroup *workGroup){}
+    virtual void workItemBegin(const WorkItem *workItem){}
+    virtual void workItemComplete(const WorkItem *workItem){}
+    // Defined out of line; subclasses may override
+    virtual bool isThreadSafe() const;
+  protected:
+    const Context *m_context;  // Context passed to the constructor
+  };
diff --git a/src/core/Program.cpp b/src/core/Program.cpp
new file mode 100644
index 0000000..31fdc5b
--- /dev/null
+++ b/src/core/Program.cpp
@@ -0,0 +1,728 @@
+// Program.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <fstream>
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <windows.h>
+#include <dlfcn.h>
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "clang/CodeGen/CodeGenAction.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "Kernel.h"
+#include "Program.h"
+#include "WorkItem.h"
+// NOTE(review): these three dump-name patterns are not referenced by the code
+// visible in this file; build() formats its dump paths from a temporary
+// directory instead. Confirm whether they are still needed.
+#define CL_DUMP_NAME "/tmp/oclgrind_%lX.cl"
+#define IR_DUMP_NAME "/tmp/oclgrind_%lX.s"
+#define BC_DUMP_NAME "/tmp/oclgrind_%lX.bc"
+// Names under which build() registers in-memory files with Clang: the
+// directory for remapped headers, the program source, and the clc.h header
+#if defined(_WIN32)
+#define REMAP_DIR "Z:/remapped/"
+#define REMAP_DIR "/remapped/"
+#define REMAP_INPUT "input.cl"
+#define CLC_H_PATH REMAP_DIR"clc.h"
+// Contents of clc.h, defined in another translation unit
+extern const char CLC_H_DATA[];
+// Extension names; build() passes each one to Clang as a -D definition
+const char *EXTENSIONS[] =
+  "cl_khr_fp64",
+  "cl_khr_3d_image_writes",
+  "cl_khr_global_int32_base_atomics",
+  "cl_khr_global_int32_extended_atomics",
+  "cl_khr_local_int32_base_atomics",
+  "cl_khr_local_int32_extended_atomics",
+  "cl_khr_byte_addressable_store",
+using namespace oclgrind;
+using namespace std;
+// Wrap an existing LLVM module as a program. There is no source text, the
+// build status starts as CL_BUILD_SUCCESS and a UID is assigned immediately.
+Program::Program(const Context *context, llvm::Module *module)
+  : m_module(module), m_context(context)
+{
+  m_buildLog.clear();
+  m_buildOptions.clear();
+  m_buildStatus = CL_BUILD_SUCCESS;
+  m_uid = generateUID();
+}
+// Create an unbuilt program from OpenCL C source text. The source is also
+// split into lines so that getSourceLine() can return them individually.
+Program::Program(const Context *context, const string& source)
+  : m_context(context)
+{
+  m_source = source;
+  m_buildLog = "";
+  m_buildOptions = "";
+  m_buildStatus = CL_BUILD_NONE;
+  m_uid = 0;
+
+  // Split source into individual lines
+  m_sourceLines.clear();
+  if (!source.empty())
+  {
+    std::stringstream ss(source);
+    std::string line;
+    while(std::getline(ss, line, '\n'))
+    {
+      m_sourceLines.push_back(line);
+    }
+  }
+}
+
+// Free the interpreter caches held by this program.
+Program::~Program()
+{
+  clearInterpreterCache();
+}
+// Compile this program's OpenCL C source into an LLVM module using Clang.
+// `options` may be NULL; further options are read from the
+// OCLGRIND_BUILD_OPTIONS environment variable. Each entry of `headers` is
+// registered as a remapped file under REMAP_DIR. The build log and build
+// status members are updated as a side effect.
+// Returns true if the build status ends up as CL_BUILD_SUCCESS.
+bool Program::build(const char *options, list<Header> headers)
+  m_buildStatus = CL_BUILD_IN_PROGRESS;
+  m_buildOptions = options ? options : "";
+  // Create build log
+  m_buildLog = "";
+  llvm::raw_string_ostream buildLog(m_buildLog);
+  // Do nothing if program was created with binary
+  if (m_source.empty() && m_module)
+  {
+    m_buildStatus = CL_BUILD_SUCCESS;
+    return true;
+  }
+  // Rebuilding: drop the caches and module from the previous build
+  if (m_module)
+  {
+    clearInterpreterCache();
+    m_module.reset();
+  }
+  // Assign a new UID to this program
+  m_uid = generateUID();
+  // Set compiler arguments
+  // NOTE(review): "-cl-std=CL1.2" is pushed here and pushed again further
+  // down whenever `cl12` is still true - confirm the duplicate is intended.
+  vector<const char*> args;
+  args.push_back("-cl-std=CL1.2");
+  args.push_back("-cl-kernel-arg-info");
+  args.push_back("-fno-builtin");
+  args.push_back("-g");
+  args.push_back("-triple");
+  if (sizeof(size_t) == 4)
+    args.push_back("spir-unknown-unknown");
+  else
+    args.push_back("spir64-unknown-unknown");
+  // Define extensions
+  for (unsigned i = 0; i < sizeof(EXTENSIONS)/sizeof(const char*); i++)
+  {
+    args.push_back("-D");
+    args.push_back(EXTENSIONS[i]);
+  }
+  // Disable Clang's optimizations.
+  // We will manually run optimization passes and legalize the IR later.
+  args.push_back("-O0");
+  bool optimize = true;
+  bool cl12     = true;
+  // Add OpenCL build options
+  const char *mainOptions = options;
+  const char *extraOptions = getenv("OCLGRIND_BUILD_OPTIONS");
+  if (!mainOptions)
+    mainOptions = "";
+  if (!extraOptions)
+    extraOptions = "";
+  // +2 covers the separating space and the terminating NUL
+  char *tmpOptions = new char[strlen(mainOptions) + strlen(extraOptions) + 2];
+  sprintf(tmpOptions, "%s %s", mainOptions, extraOptions);
+  // strtok() splits tmpOptions in place; `args` keeps pointers into it, so
+  // tmpOptions must stay alive until the compiler has consumed `args`
+  for (char *opt = strtok(tmpOptions, " "); opt; opt = strtok(NULL, " "))
+  {
+    // Ignore options that break PCH
+    if (strcmp(opt, "-cl-fast-relaxed-math") != 0 &&
+        strcmp(opt, "-cl-finite-math-only") != 0 &&
+        strcmp(opt, "-cl-single-precision-constant") != 0)
+    {
+      // Check for optimization flags
+      // (they only set `optimize`; they are not forwarded to Clang)
+      if (strcmp(opt, "-O0") == 0 || strcmp(opt, "-cl-opt-disable") == 0)
+      {
+        optimize = false;
+        continue;
+      }
+      else if (strncmp(opt, "-O", 2) == 0)
+      {
+        optimize = true;
+        continue;
+      }
+      // Check for -cl-std flag
+      if (strncmp(opt, "-cl-std=", 8) == 0)
+      {
+        if (strcmp(opt+8, "CL1.2") != 0)
+        {
+          cl12 = false;
+          args.push_back(opt);
+        }
+        continue;
+      }
+      args.push_back(opt);
+    }
+  }
+  if (cl12)
+  {
+    args.push_back("-cl-std=CL1.2");
+  }
+  // Pre-compiled header
+  // (only attempted for CL1.2 and when OCLGRIND_DISABLE_PCH is not set)
+  char *pchdir = NULL;
+  char *pch    = NULL;
+  if (!checkEnv("OCLGRIND_DISABLE_PCH") && cl12)
+  {
+    const char *pchdirOverride = getenv("OCLGRIND_PCH_DIR");
+    if (pchdirOverride)
+    {
+      // NOTE(review): strdup() allocates with malloc(), but pchdir is
+      // released with delete[] at the end of this function - mismatched
+      // allocation/deallocation; confirm and fix.
+      pchdir = strdup(pchdirOverride);
+    }
+    else
+    {
+      // Get directory containing library
+#if defined(_WIN32) && !defined(__MINGW32__)
+      char libpath[4096];
+      HMODULE dll;
+      if (GetModuleHandleEx(
+            (LPCSTR)&Program::createFromBitcode, &dll) &&
+          GetModuleFileName(dll, libpath, sizeof(libpath)))
+      {
+      Dl_info dlinfo;
+      if (dladdr((const void*)Program::createFromBitcode, &dlinfo))
+      {
+        const char *libpath = dlinfo.dli_fname;
+        // Construct path to PCH directory
+        const char *dirend;
+#if defined(_WIN32) && !defined(__MINGW32__)
+        if ((dirend = strrchr(libpath, '\\')))
+        if ((dirend = strrchr(libpath, '/')))
+        {
+          const char *includes_relative = "/../include/oclgrind/";
+          size_t length = dirend - libpath;
+          pchdir = new char[length + strlen(includes_relative) + 1];
+          // strncpy() copies the directory part without a terminator; the
+          // strcpy() that follows appends the suffix and terminates the string
+          strncpy(pchdir, libpath, length);
+          strcpy(pchdir + length, includes_relative);
+        }
+      }
+    }
+    if (pchdir)
+    {
+      // Select precompiled header
+      pch = new char[strlen(pchdir) + 20];
+      sprintf(pch, "%s/clc%d.pch", pchdir, (sizeof(size_t) == 4 ? 32 : 64));
+      // Check if precompiled header exists
+      ifstream pchfile(pch);
+      if (!pchfile.good())
+      {
+        buildLog << "WARNING: Unable to find precompiled header:\n"
+                 << pch << "\n";
+        delete[] pch;
+        pch = NULL;
+      }
+      pchfile.close();
+    }
+    else
+    {
+      buildLog << "WARNING: Unable to determine precompiled header path\n";
+    }
+  }
+  if (pch)
+  {
+    args.push_back("-isysroot");
+    args.push_back(pchdir);
+    args.push_back("-include-pch");
+    args.push_back(pch);
+  }
+  else
+  {
+    // Fall back to embedded clc.h
+    args.push_back("-include");
+    args.push_back(CLC_H_PATH);
+  }
+  // Append input file to arguments (remapped later)
+  args.push_back(REMAP_INPUT);
+  // Create diagnostics engine
+  // (diagnostics are printed into the build log stream)
+  clang::DiagnosticOptions *diagOpts = new clang::DiagnosticOptions();
+  llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID(
+    new clang::DiagnosticIDs());
+  clang::TextDiagnosticPrinter *diagConsumer =
+    new clang::TextDiagnosticPrinter(buildLog, diagOpts);
+  clang::DiagnosticsEngine diags(diagID, diagOpts, diagConsumer);
+  // Create compiler instance
+  clang::CompilerInstance compiler;
+  compiler.createDiagnostics(diagConsumer, false);
+  // Create compiler invocation
+  clang::CompilerInvocation *invocation = new clang::CompilerInvocation;
+  clang::CompilerInvocation::CreateFromArgs(*invocation,
+                                            &args[0], &args[0] + args.size(),
+                                            compiler.getDiagnostics());
+  compiler.setInvocation(invocation);
+  // Remap include files
+  std::unique_ptr<llvm::MemoryBuffer> buffer;
+  compiler.getHeaderSearchOpts().AddPath(REMAP_DIR, clang::frontend::Quoted,
+                                         false, true);
+  list<Header>::iterator itr;
+  for (itr = headers.begin(); itr != headers.end(); itr++)
+  {
+    buffer = llvm::MemoryBuffer::getMemBuffer(itr->second->m_source, "", false);
+    compiler.getPreprocessorOpts().addRemappedFile(REMAP_DIR + itr->first,
+                                                   buffer.release());
+  }
+  // Remap clc.h
+  buffer = llvm::MemoryBuffer::getMemBuffer(CLC_H_DATA, "", false);
+  compiler.getPreprocessorOpts().addRemappedFile(CLC_H_PATH, buffer.release());
+  // Remap input file
+  buffer = llvm::MemoryBuffer::getMemBuffer(m_source, "", false);
+  compiler.getPreprocessorOpts().addRemappedFile(REMAP_INPUT, buffer.release());
+  // Compile
+  llvm::LLVMContext& context = llvm::getGlobalContext();
+  clang::EmitLLVMOnlyAction action(&context);
+  if (compiler.ExecuteAction(action))
+  {
+    // Retrieve module
+    m_module = action.takeModule();
+    // Strip debug intrinsics if not in interactive mode
+    if (!checkEnv("OCLGRIND_INTERACTIVE"))
+    {
+      stripDebugIntrinsics();
+    }
+    // Initialize pass managers
+    llvm::legacy::PassManager modulePasses;
+    llvm::legacy::FunctionPassManager functionPasses(m_module.get());
+#if LLVM_VERSION < 37
+    modulePasses.add(new llvm::DataLayoutPass());
+    functionPasses.add(new llvm::DataLayoutPass());
+    // Run optimizations on module
+    if (optimize)
+    {
+      // Populate pass managers with -Oz
+      llvm::PassManagerBuilder builder;
+      builder.OptLevel = 2;
+      builder.SizeLevel = 2;
+      builder.populateModulePassManager(modulePasses);
+      builder.populateFunctionPassManager(functionPasses);
+    }
+    // Run passes
+    // (function passes over every function first, then the module passes)
+    functionPasses.doInitialization();
+    llvm::Module::iterator fItr;
+    for (fItr = m_module->begin(); fItr != m_module->end(); fItr++)
+      functionPasses.run(*fItr);
+    functionPasses.doFinalization();
+    modulePasses.run(*m_module);
+    m_buildStatus = CL_BUILD_SUCCESS;
+  }
+  else
+  {
+    m_buildStatus = CL_BUILD_ERROR;
+  }
+  // Dump temps if required
+  if (checkEnv(ENV_DUMP_SPIR))
+  {
+    // Temporary directory
+    // NOTE(review): getenv("TEMP") may return NULL, and tmpdir is passed to
+    // snprintf()/sprintf() below without a check.
+#if defined(_WIN32)
+    const char *tmpdir = getenv("TEMP");
+    const char *tmpdir = "/tmp";
+    // Construct unique output filenames
+    // (the ".XX" placeholder has the same length as each two-letter extension)
+    size_t sz = snprintf(NULL, 0, "%s/oclgrind_%lX.XX", tmpdir, m_uid) + 1;
+    char *tempCL = new char[sz];
+    char *tempIR = new char[sz];
+    char *tempBC = new char[sz];
+    sprintf(tempCL, "%s/oclgrind_%lX.cl", tmpdir, m_uid);
+    sprintf(tempIR, "%s/oclgrind_%lX.ll", tmpdir, m_uid);
+    sprintf(tempBC, "%s/oclgrind_%lX.bc", tmpdir, m_uid);
+    // Dump source
+    ofstream cl;
+    cl.open(tempCL);
+    cl << m_source;
+    cl.close();
+    // IR and bitcode only exist after a successful build
+    if (m_buildStatus == CL_BUILD_SUCCESS)
+    {
+      // Dump IR
+      std::error_code err;
+      llvm::raw_fd_ostream ir(tempIR, err, llvm::sys::fs::F_None);
+      llvm::AssemblyAnnotationWriter asmWriter;
+      m_module->print(ir, &asmWriter);
+      ir.close();
+      // Dump bitcode
+      llvm::raw_fd_ostream bc(tempBC, err, llvm::sys::fs::F_None);
+      llvm::WriteBitcodeToFile(m_module.get(), bc);
+      bc.close();
+    }
+    delete[] tempCL;
+    delete[] tempIR;
+    delete[] tempBC;
+  }
+  // Release the option and PCH path strings now that compilation is finished
+  delete[] tmpOptions;
+  delete[] pchdir;
+  delete[] pch;
+  return m_buildStatus == CL_BUILD_SUCCESS;
+// Delete every cached InterpreterCache object and empty the cache map.
+void Program::clearInterpreterCache()
+{
+  InterpreterCacheMap::iterator entry = m_interpreterCache.begin();
+  while (entry != m_interpreterCache.end())
+  {
+    delete entry->second;
+    ++entry;
+  }
+  m_interpreterCache.clear();
+}
+// Create a program from `length` bytes of LLVM bitcode held in memory.
+// Returns NULL if the buffer cannot be created or the bitcode fails to parse.
+Program* Program::createFromBitcode(const Context *context,
+                                    const unsigned char *bitcode,
+                                    size_t length)
+{
+  // Wrap the caller's bytes in an LLVM memory buffer
+  llvm::StringRef bytes((const char*)bitcode, length);
+  unique_ptr<llvm::MemoryBuffer> memBuffer =
+    llvm::MemoryBuffer::getMemBuffer(bytes, "", false);
+  if (!memBuffer)
+    return NULL;
+
+  // Parse bitcode into IR module
+  llvm::ErrorOr<llvm::Module*> parsed =
+    parseBitcodeFile(memBuffer->getMemBufferRef(), llvm::getGlobalContext());
+  if (!parsed)
+    return NULL;
+
+  return new Program(context, parsed.get());
+}
+// Create a program from the LLVM bitcode file `filename`.
+// Returns NULL if the file cannot be read or the bitcode fails to parse.
+Program* Program::createFromBitcodeFile(const Context *context,
+                                        const string filename)
+{
+  // Read the whole file into a memory buffer
+  llvm::ErrorOr<unique_ptr<llvm::MemoryBuffer>> fileBuffer =
+    llvm::MemoryBuffer::getFile(filename);
+  if (!fileBuffer)
+    return NULL;
+
+  // Parse bitcode into IR module
+  llvm::ErrorOr<llvm::Module*> parsed =
+    parseBitcodeFile(fileBuffer->get()->getMemBufferRef(),
+                     llvm::getGlobalContext());
+  if (!parsed)
+    return NULL;
+
+  return new Program(context, parsed.get());
+}
+// Link the modules of `programs` into one new program.
+// Returns NULL if any input program has no module (it was never built
+// successfully) or if linking fails.
+Program* Program::createFromPrograms(const Context *context,
+                                     list<const Program*> programs)
+{
+  llvm::Module *module = new llvm::Module("oclgrind_linked",
+                                          llvm::getGlobalContext());
+  llvm::Linker linker(module);
+
+  // Link modules
+  list<const Program*>::iterator itr;
+  for (itr = programs.begin(); itr != programs.end(); itr++)
+  {
+    // An unbuilt program has a NULL module, which cannot be cloned; a failed
+    // link leaves nothing useful either. In both cases release the partially
+    // linked module rather than leaking it.
+    if (!(*itr)->m_module ||
+        linker.linkInModule(CloneModule((*itr)->m_module.get())))
+    {
+      delete module;
+      return NULL;
+    }
+  }
+
+  return new Program(context, linker.getModule());
+}
+// Create a Kernel object for the kernel function called `name`.
+// Returns NULL if the program has no module, the module lists no kernel of
+// that name, or constructing the kernel raises a FatalError (which is
+// reported on stderr).
+Kernel* Program::createKernel(const string name)
+{
+  if (!m_module)
+    return NULL;
+
+  // The SPIR kernel list is stored in the "opencl.kernels" named metadata
+  llvm::NamedMDNode *kernelList = m_module->getNamedMetadata("opencl.kernels");
+  if (!kernelList)
+    return NULL;
+
+  // Search the list for a function with the requested name
+  llvm::Function *function = NULL;
+  const unsigned numEntries = kernelList->getNumOperands();
+  for (unsigned k = 0; k < numEntries && !function; ++k)
+  {
+    llvm::MDNode *entry = kernelList->getOperand(k);
+    llvm::ConstantAsMetadata *cam =
+      llvm::dyn_cast<llvm::ConstantAsMetadata>(entry->getOperand(0).get());
+    if (!cam)
+      continue;
+
+    // A non-function entry would mean an invalid module; such entries are
+    // simply skipped
+    llvm::Function *candidate =
+      llvm::dyn_cast<llvm::Function>(cam->getValue());
+    if (candidate && candidate->getName() == name)
+      function = candidate;
+  }
+  if (!function)
+    return NULL;
+
+  try
+  {
+    // Make sure an interpreter cache exists for this function
+    if (m_interpreterCache.find(function) == m_interpreterCache.end())
+      m_interpreterCache[function] = new InterpreterCache(function);
+
+    return new Kernel(this, function, m_module.get());
+  }
+  catch (FatalError& err)
+  {
+    cerr << endl << "OCLGRIND FATAL ERROR "
+         << "(" << err.getFile() << ":" << err.getLine() << ")"
+         << endl << err.what()
+         << endl << "When creating kernel '" << name << "'"
+         << endl;
+    return NULL;
+  }
+}
+// Serialise the module to LLVM bitcode and return it in a new[]-allocated
+// array of getBinarySize() bytes, which the caller must delete[].
+// Returns NULL if the program has no module.
+unsigned char* Program::getBinary() const
+{
+  if (!m_module)
+    return NULL;
+
+  std::string storage;
+  llvm::raw_string_ostream out(storage);
+  llvm::WriteBitcodeToFile(m_module.get(), out);
+
+  // str() flushes the stream into `storage` and returns a reference to it
+  const std::string& bits = out.str();
+
+  unsigned char *bitcode = new unsigned char[bits.length()];
+  memcpy(bitcode, bits.c_str(), bits.length());
+  return bitcode;
+}
+// Size in bytes of the module's bitcode, or 0 if the program has no module.
+// The module is serialised on every call to measure it.
+size_t Program::getBinarySize() const
+{
+  if (!m_module)
+    return 0;
+
+  std::string storage;
+  llvm::raw_string_ostream out(storage);
+  llvm::WriteBitcodeToFile(m_module.get(), out);
+
+  // str() flushes the stream before the length is read
+  return out.str().length();
+}
+// Text accumulated in the build log by the most recent build() call.
+const string& Program::getBuildLog() const
+{
+  return m_buildLog;
+}
+// Option string passed to the most recent build() call (empty if none).
+const string& Program::getBuildOptions() const
+{
+  return m_buildOptions;
+}
+// Current build status (one of the CL_BUILD_* values set by this class).
+unsigned int Program::getBuildStatus() const
+{
+  return m_buildStatus;
+}
+// Context this program was created with.
+const Context* Program::getContext() const
+{
+  return m_context;
+}
+// Produce an identifier for this program by reseeding the C random number
+// generator from now() and drawing one value.
+// NOTE(review): reseeding on every call means two calls that observe the
+// same now() value yield the same UID - confirm this is acceptable.
+unsigned long Program::generateUID() const
+{
+  srand(now());
+  const unsigned long uid = rand();
+  return uid;
+}
+// Return the interpreter cache created for `kernel` by createKernel(), or
+// NULL if there is none.
+const InterpreterCache* Program::getInterpreterCache(
+  const llvm::Function *kernel) const
+{
+  // Use find() rather than operator[]: a lookup must not insert a NULL entry
+  // into the cache for a function that was never cached
+  InterpreterCacheMap::const_iterator itr = m_interpreterCache.find(kernel);
+  if (itr == m_interpreterCache.end())
+    return NULL;
+  return itr->second;
+}
+// Return the names of all kernel functions listed in the module's
+// "opencl.kernels" metadata. The list is empty if the program has no module
+// or the module lists no kernels.
+list<string> Program::getKernelNames() const
+{
+  list<string> names;
+
+  // A program that has not been built successfully has no module to query
+  if (!m_module)
+    return names;
+
+  // Query the SPIR kernel list
+  llvm::NamedMDNode* tuple = m_module->getNamedMetadata("opencl.kernels");
+  if (tuple)
+  {
+    for (unsigned i = 0; i < tuple->getNumOperands(); ++i)
+    {
+      llvm::MDNode* kernel = tuple->getOperand(i);
+      llvm::ConstantAsMetadata *cam =
+        llvm::dyn_cast<llvm::ConstantAsMetadata>(kernel->getOperand(0).get());
+      if (!cam)
+        continue;
+
+      llvm::Function *kernelFunction =
+        llvm::dyn_cast<llvm::Function>(cam->getValue());
+      // Shouldn't really happen - this would mean an invalid Module as input
+      if (!kernelFunction)
+        continue;
+
+      names.push_back(kernelFunction->getName());
+    }
+  }
+
+  return names;
+}
+// Number of entries in the module's "opencl.kernels" metadata (0 if the
+// metadata is absent). The program must have a module (asserted).
+unsigned int Program::getNumKernels() const
+{
+  assert(m_module);
+
+  llvm::NamedMDNode *kernelList = m_module->getNamedMetadata("opencl.kernels");
+  return kernelList ? kernelList->getNumOperands() : 0;
+}
+// Source text this program was created from (empty for binary programs).
+const string& Program::getSource() const
+{
+  return m_source;
+}
+// Return source line `lineNumber` (1-based), or NULL if the number is 0 or
+// past the last line. The pointer refers to storage owned by this program.
+const char* Program::getSourceLine(size_t lineNumber) const
+{
+  if (lineNumber == 0)
+    return NULL;
+
+  const size_t index = lineNumber - 1;
+  if (index >= m_sourceLines.size())
+    return NULL;
+
+  return m_sourceLines[index].c_str();
+}
+// Number of lines the source text was split into.
+size_t Program::getNumSourceLines() const
+{
+  return m_sourceLines.size();
+}
+// Identifier assigned to this program (0 until a UID has been generated).
+unsigned long Program::getUID() const
+{
+  return m_uid;
+}
+// Remove every call to an llvm.dbg.* intrinsic from the module.
+void Program::stripDebugIntrinsics()
+{
+  // Get list of llvm.dbg intrinsics
+  // (collected first and erased afterwards, so the instruction iterators are
+  // never used after their instruction has been removed)
+  set<llvm::Instruction*> intrinsics;
+  for (llvm::Module::iterator F = m_module->begin(); F != m_module->end(); F++)
+  {
+    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; I++)
+    {
+      if (I->getOpcode() == llvm::Instruction::Call)
+      {
+        llvm::CallInst *call = (llvm::CallInst*)&*I;
+        // The callee is not necessarily a Function (e.g. an indirect call),
+        // so it is handled as a plain Value; only its name is needed
+        llvm::Value *callee = call->getCalledValue()->stripPointerCasts();
+        if (callee->getName().startswith("llvm.dbg"))
+        {
+          intrinsics.insert(&*I);
+        }
+      }
+    }
+  }
+
+  // Remove instructions
+  set<llvm::Instruction*>::iterator itr;
+  for (itr = intrinsics.begin(); itr != intrinsics.end(); itr++)
+  {
+    // Unlinks the instruction from its basic block and deletes it
+    (*itr)->eraseFromParent();
+  }
+}
diff --git a/src/core/Program.h b/src/core/Program.h
new file mode 100644
index 0000000..f888746
--- /dev/null
+++ b/src/core/Program.h
@@ -0,0 +1,79 @@
+// Program.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+namespace llvm
+  class Function;
+  class Module;
+namespace oclgrind
+  class Context;
+  class InterpreterCache;
+  class Kernel;
+  // Holds a program's source text, build log/options/status and LLVM module,
+  // and exposes queries over them.
+  class Program
+  {
+  public:
+    // A (string, program) pair accepted by build().
+    // NOTE(review): the string is presumably the header's include name.
+    typedef std::pair<std::string, const Program*> Header;
+  public:
+    Program(const Context *context, const std::string& source);
+    virtual ~Program();
+    // Factory functions returning a newly created Program from bitcode held
+    // in memory, from a bitcode file, or from a list of existing programs.
+    static Program* createFromBitcode(const Context *context,
+                                      const unsigned char *bitcode,
+                                      size_t length);
+    static Program* createFromBitcodeFile(const Context *context,
+                                          const std::string filename);
+    static Program* createFromPrograms(const Context *context,
+                                       std::list<const Program*>);
+    // Build with the given option string and optional list of headers.
+    bool build(const char *options,
+               std::list<Header> headers = std::list<Header>());
+    Kernel* createKernel(const std::string name);
+    const std::string& getBuildLog() const;
+    const std::string& getBuildOptions() const;
+    unsigned char* getBinary() const;
+    size_t getBinarySize() const;
+    unsigned int getBuildStatus() const;
+    const Context *getContext() const;
+    const InterpreterCache* getInterpreterCache(
+      const llvm::Function *kernel) const;
+    std::list<std::string> getKernelNames() const;
+    // Operand count of the module's "opencl.kernels" metadata (0 if absent).
+    unsigned int getNumKernels() const;
+    const std::string& getSource() const;
+    // lineNumber is 1-based; NULL is returned when it is out of range.
+    const char* getSourceLine(size_t lineNumber) const;
+    size_t getNumSourceLines() const;
+    unsigned long getUID() const;
+  private:
+    // Private constructor taking an existing LLVM module.
+    Program(const Context *context, llvm::Module *module);
+    // The module is owned through a unique_ptr.
+    std::unique_ptr<llvm::Module> m_module;
+    std::string m_source;
+    std::string m_buildLog;
+    std::string m_buildOptions;
+    unsigned int m_buildStatus;
+    const Context *m_context;
+    // Backing store for getSourceLine() / getNumSourceLines().
+    std::vector<std::string> m_sourceLines;
+    unsigned long m_uid;
+    unsigned long generateUID() const;
+    // Removes calls to llvm.dbg.* intrinsics from m_module.
+    void stripDebugIntrinsics();
+    // Cache keyed by kernel function; mutable so that const accessors can
+    // update it.
+    typedef std::map<const llvm::Function*, InterpreterCache*>
+      InterpreterCacheMap;
+    mutable InterpreterCacheMap m_interpreterCache;
+    void clearInterpreterCache();
+  };
diff --git a/src/core/Queue.cpp b/src/core/Queue.cpp
new file mode 100644
index 0000000..e9e082f
--- /dev/null
+++ b/src/core/Queue.cpp
@@ -0,0 +1,260 @@
+// Queue.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <cassert>
+#include "Context.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Queue.h"
+using namespace oclgrind;
+using namespace std;
+// Create an empty command queue bound to the given context.
+Queue::Queue(const Context *context)
+  : m_context(context)
+{
+}
+// Nothing is released here: the queue stores only pointers to commands.
+// NOTE(review): commands still queued at destruction are not freed -
+// confirm that callers drain the queue first.
+Queue::~Queue()
+{
+}
+// A new event starts in the CL_QUEUED state, stamped with the current time;
+// its start and end times are zero until Queue::update() sets them.
+Event::Event()
+{
+  state = CL_QUEUED;
+  queueTime = now();
+  startTime = endTime = 0;
+}
+// Attach a freshly allocated event to the command, append the command to the
+// back of the queue, and return that event to the caller.
+Event* Queue::enqueue(Command *cmd)
+{
+  cmd->event = new Event();
+  m_queue.push(cmd);
+  return cmd->event;
+}
+// Copy cmd->size bytes from cmd->src to cmd->dst within global memory.
+void Queue::executeCopyBuffer(CopyCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  memory->copy(cmd->dst, cmd->src, cmd->size);
+}
+// Copy a rectangular region within global memory, one row at a time.
+// For both source and destination, offset[0] is added once, while offset[1]
+// and offset[2] are scaled by the row (y) and slice (z) index respectively.
+// Each row is region[0] bytes long.
+void Queue::executeCopyBufferRect(CopyRectCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  for (unsigned z = 0; z < cmd->region[2]; z++)
+  {
+    for (unsigned y = 0; y < cmd->region[1]; y++)
+    {
+      // Offsets of this row relative to each base address
+      const size_t srcOffset = cmd->src_offset[0]
+                             + y * cmd->src_offset[1]
+                             + z * cmd->src_offset[2];
+      const size_t dstOffset = cmd->dst_offset[0]
+                             + y * cmd->dst_offset[1]
+                             + z * cmd->dst_offset[2];
+      memory->copy(cmd->dst + dstOffset, cmd->src + srcOffset, cmd->region[0]);
+    }
+  }
+}
+// Tile cmd->pattern (pattern_size bytes) across global memory starting at
+// cmd->address. The pattern is written size/pattern_size times; any
+// remainder of size that does not hold a whole pattern is left untouched.
+void Queue::executeFillBuffer(FillBufferCommand *cmd)
+{
+  // An empty pattern cannot be tiled (and would divide by zero below)
+  if (cmd->pattern_size == 0)
+  {
+    return;
+  }
+  Memory *memory = m_context->getGlobalMemory();
+  // Use size_t so the counter cannot wrap before reaching a large count
+  const size_t count = cmd->size / cmd->pattern_size;
+  for (size_t i = 0; i < count; i++)
+  {
+    memory->store(cmd->pattern,
+                  cmd->address + i*cmd->pattern_size,
+                  cmd->pattern_size);
+  }
+}
+// Store cmd->color (pixelSize bytes) into every pixel of the region that
+// starts at cmd->origin. Pixel addresses are built from cmd->base using
+// pixelSize, rowPitch and slicePitch for the x, y and z coordinates.
+void Queue::executeFillImage(FillImageCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  for (unsigned z = 0; z < cmd->region[2]; z++)
+  {
+    const size_t sliceBase = cmd->base
+                           + (cmd->origin[2] + z) * cmd->slicePitch;
+    for (unsigned y = 0; y < cmd->region[1]; y++)
+    {
+      const size_t rowBase = sliceBase
+                           + (cmd->origin[1] + y) * cmd->rowPitch;
+      for (unsigned x = 0; x < cmd->region[0]; x++)
+      {
+        const size_t pixel = rowBase + (cmd->origin[0] + x) * cmd->pixelSize;
+        memory->store(cmd->color, pixel, cmd->pixelSize);
+      }
+    }
+  }
+}
+// Hand the command's kernel and its work dimensions, global offset, global
+// size and local size to KernelInvocation::run for this queue's context.
+void Queue::executeKernel(KernelCommand *cmd)
+{
+  KernelInvocation::run(m_context, cmd->kernel, cmd->work_dim,
+                        cmd->globalOffset, cmd->globalSize, cmd->localSize);
+}
+// Invoke the command's host callback, passing it the command's argument
+// block (which may be NULL).
+void Queue::executeNativeKernel(NativeKernelCommand *cmd)
+{
+  void *arguments = cmd->args;
+  cmd->func(arguments);
+}
+// Load cmd->size bytes from global memory at cmd->address into cmd->ptr.
+void Queue::executeReadBuffer(BufferCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  memory->load(cmd->ptr, cmd->address, cmd->size);
+}
+// Read a rectangular region from global memory into host memory, one row of
+// region[0] bytes at a time. On both sides offset[0] is added once, while
+// offset[1] and offset[2] are scaled by the row (y) and slice (z) index.
+void Queue::executeReadBufferRect(BufferRectCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  for (unsigned z = 0; z < cmd->region[2]; z++)
+  {
+    for (unsigned y = 0; y < cmd->region[1]; y++)
+    {
+      const size_t hostOffset = cmd->host_offset[0]
+                              + y * cmd->host_offset[1]
+                              + z * cmd->host_offset[2];
+      const size_t bufferOffset = cmd->buffer_offset[0]
+                                + y * cmd->buffer_offset[1]
+                                + z * cmd->buffer_offset[2];
+      unsigned char *hostRow = cmd->ptr + hostOffset;
+      memory->load(hostRow, cmd->address + bufferOffset, cmd->region[0]);
+    }
+  }
+}
+// Store cmd->size bytes from cmd->ptr into global memory at cmd->address.
+void Queue::executeWriteBuffer(BufferCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  memory->store(cmd->ptr, cmd->address, cmd->size);
+}
+// Write a rectangular region from host memory into global memory, one row of
+// region[0] bytes at a time. On both sides offset[0] is added once, while
+// offset[1] and offset[2] are scaled by the row (y) and slice (z) index.
+void Queue::executeWriteBufferRect(BufferRectCommand *cmd)
+{
+  Memory *memory = m_context->getGlobalMemory();
+  for (unsigned z = 0; z < cmd->region[2]; z++)
+  {
+    for (unsigned y = 0; y < cmd->region[1]; y++)
+    {
+      const size_t hostOffset = cmd->host_offset[0]
+                              + y * cmd->host_offset[1]
+                              + z * cmd->host_offset[2];
+      const size_t bufferOffset = cmd->buffer_offset[0]
+                                + y * cmd->buffer_offset[1]
+                                + z * cmd->buffer_offset[2];
+      const unsigned char *hostRow = cmd->ptr + hostOffset;
+      memory->store(hostRow, cmd->address + bufferOffset, cmd->region[0]);
+    }
+  }
+}
+// True when no commands are waiting in the queue.
+bool Queue::isEmpty() const
+{
+  return m_queue.empty();
+}
+// Try to make progress on the command at the head of the queue.
+//
+// Returns NULL when the queue is empty, or when the head command is still
+// waiting on an event that has neither completed nor failed. If an event in
+// its wait list carries a negative state, that state is copied to the
+// command's own event and the command is dequeued and returned without being
+// executed. Otherwise the command is executed, its event is marked
+// CL_COMPLETE, and the command is dequeued and returned.
+//
+// The returned command is not deleted here.
+Queue::Command* Queue::update()
+{
+  if (m_queue.empty())
+  {
+    return NULL;
+  }
+  // Get next command
+  Command *cmd = m_queue.front();
+  // Check if all events in wait list have completed
+  while (!cmd->waitList.empty())
+  {
+    if (cmd->waitList.front()->state == CL_COMPLETE)
+    {
+      cmd->waitList.pop_front();
+    }
+    else if (cmd->waitList.front()->state < 0)
+    {
+      // A dependency failed: propagate its state and skip execution
+      cmd->event->state = cmd->waitList.front()->state;
+      m_queue.pop();
+      return cmd;
+    }
+    else
+    {
+      // Dependency still pending; try again on a later call
+      return NULL;
+    }
+  }
+  cmd->event->startTime = now();
+  cmd->event->state = CL_RUNNING;
+  // Dispatch command
+  switch (cmd->type)
+  {
+  case COPY:
+    executeCopyBuffer((CopyCommand*)cmd);
+    break;
+  case COPY_RECT:
+    executeCopyBufferRect((CopyRectCommand*)cmd);
+    break;
+  case EMPTY:
+    break;
+  case FILL_BUFFER:
+    executeFillBuffer((FillBufferCommand*)cmd);
+    break;
+  case FILL_IMAGE:
+    executeFillImage((FillImageCommand*)cmd);
+    break;
+  case READ:
+    executeReadBuffer((BufferCommand*)cmd);
+    break;
+  case READ_RECT:
+    executeReadBufferRect((BufferRectCommand*)cmd);
+    break;
+  case KERNEL:
+    executeKernel((KernelCommand*)cmd);
+    break;
+  case NATIVE_KERNEL:
+    executeNativeKernel((NativeKernelCommand*)cmd);
+    break;
+  case WRITE:
+    executeWriteBuffer((BufferCommand*)cmd);
+    break;
+  case WRITE_RECT:
+    executeWriteBufferRect((BufferRectCommand*)cmd);
+    break;
+  default:
+    assert(false && "Unhandled command type in queue.");
+  }
+  cmd->event->endTime = now();
+  cmd->event->state = CL_COMPLETE;
+  // Remove command from queue and return it to the caller
+  m_queue.pop();
+  return cmd;
+}
diff --git a/src/core/Queue.h b/src/core/Queue.h
new file mode 100644
index 0000000..7736d47
--- /dev/null
+++ b/src/core/Queue.h
@@ -0,0 +1,183 @@
+// Queue.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#pragma once
+#include "common.h"
+namespace oclgrind
+  class Context;
+  class Kernel;
+  // Status and timing record that Queue attaches to each enqueued command.
+  struct Event
+  {
+    // Execution status. Queue::update() sets CL_RUNNING and CL_COMPLETE, and
+    // treats a negative value on a wait-list event as a reason to skip the
+    // dependent command.
+    int state;
+    // Timestamps; startTime and endTime are taken from now() in
+    // Queue::update() immediately before and after the command is executed.
+    double queueTime, startTime, endTime;
+    Event();
+  };
+  // FIFO queue of commands executed against a context's global memory.
+  // Commands are added with enqueue() and processed one at a time by update().
+  class Queue
+  {
+  public:
+    // Base type of all commands. The type tag selects the handler used by
+    // update(); waitList holds events that must complete before it runs.
+    struct Command
+    {
+      CommandType type;
+      std::list<Event*> waitList;
+      Command()
+      {
+        type = EMPTY;
+      }
+      // Virtual so that deleting a derived command through a Command* runs
+      // the derived destructor (FillBufferCommand and NativeKernelCommand
+      // release memory there).
+      virtual ~Command()
+      {
+      }
+    private:
+      // Assigned by Queue::enqueue()
+      Event *event;
+      friend class Queue;
+    };
+    // Linear transfer of size bytes between host memory at ptr and global
+    // memory at address; the direction is chosen by the command type.
+    struct BufferCommand : Command
+    {
+      unsigned char *ptr;
+      size_t address, size;
+      BufferCommand(CommandType t)
+      {
+        type = t;
+      }
+    };
+    // Rectangular transfer between host memory and global memory.
+    // Offsets are consumed as offset[0] + y*offset[1] + z*offset[2].
+    struct BufferRectCommand : Command
+    {
+      unsigned char *ptr;
+      size_t address;
+      size_t region[3];
+      size_t host_offset[3];
+      size_t buffer_offset[3];
+      BufferRectCommand(CommandType t)
+      {
+        type = t;
+      }
+    };
+    // Linear copy of size bytes from src to dst within global memory.
+    struct CopyCommand : Command
+    {
+      size_t src, dst, size;
+      CopyCommand()
+      {
+        type = COPY;
+      }
+    };
+    // Rectangular copy within global memory.
+    // Offsets are consumed as offset[0] + y*offset[1] + z*offset[2].
+    struct CopyRectCommand : Command
+    {
+      size_t src, dst;
+      size_t region[3];
+      size_t src_offset[3];
+      size_t dst_offset[3];
+      CopyRectCommand()
+      {
+        type = COPY_RECT;
+      }
+    };
+    // Fill of size bytes at address with a repeated pattern. The command
+    // keeps its own heap copy of the pattern and frees it on destruction.
+    struct FillBufferCommand : Command
+    {
+      size_t address, size;
+      size_t pattern_size;
+      unsigned char *pattern;
+      FillBufferCommand(const unsigned char *p, size_t sz)
+      {
+        type = FILL_BUFFER;
+        pattern = new unsigned char[sz];
+        pattern_size = sz;
+        memcpy(pattern, p, sz);
+      }
+      ~FillBufferCommand()
+      {
+        delete[] pattern;
+      }
+    };
+    // Fill of an image region with a single colour. Sixteen bytes of colour
+    // data are always copied; pixelSize of them are stored per pixel.
+    struct FillImageCommand : Command
+    {
+      size_t base;
+      size_t origin[3], region[3];
+      size_t rowPitch, slicePitch;
+      size_t pixelSize;
+      unsigned char color[16];
+      FillImageCommand(size_t b, const size_t o[3], const size_t r[3],
+                       size_t rp, size_t sp,
+                       size_t ps, const unsigned char *col)
+      {
+        type = FILL_IMAGE;
+        base = b;
+        memcpy(origin, o, sizeof(size_t)*3);
+        memcpy(region, r, sizeof(size_t)*3);
+        rowPitch = rp;
+        slicePitch = sp;
+        pixelSize = ps;
+        memcpy(color, col, 16);
+      }
+    };
+    // Kernel launch description passed to KernelInvocation::run.
+    struct KernelCommand : Command
+    {
+      Kernel *kernel;
+      unsigned int work_dim;
+      Size3 globalOffset;
+      Size3 globalSize;
+      Size3 localSize;
+      KernelCommand()
+      {
+        type = KERNEL;
+      }
+    };
+    // Host callback invocation. When an argument block is supplied, the
+    // command keeps its own malloc'd copy and frees it on destruction.
+    struct NativeKernelCommand : Command
+    {
+      void (CL_CALLBACK *func)(void *);
+      void *args;
+      NativeKernelCommand(void (CL_CALLBACK *f)(void *),
+                          void *a, size_t sz)
+      {
+        type = NATIVE_KERNEL;
+        func = f;
+        if (a)
+        {
+          args = malloc(sz);
+          memcpy(args, a, sz);
+        }
+        else
+        {
+          args = NULL;
+        }
+      }
+      ~NativeKernelCommand()
+      {
+        // free(NULL) is a no-op, so no guard is needed
+        free(args);
+      }
+    };
+  public:
+    Queue(const Context *context);
+    virtual ~Queue();
+    // Appends the command and returns the event created for it.
+    Event* enqueue(Command *command);
+    // Handlers for the individual command types, dispatched by update().
+    void executeCopyBuffer(CopyCommand *cmd);
+    void executeCopyBufferRect(CopyRectCommand *cmd);
+    void executeFillBuffer(FillBufferCommand *cmd);
+    void executeFillImage(FillImageCommand *cmd);
+    void executeKernel(KernelCommand *cmd);
+    void executeNativeKernel(NativeKernelCommand *cmd);
+    void executeReadBuffer(BufferCommand *cmd);
+    void executeReadBufferRect(BufferRectCommand *cmd);
+    void executeWriteBuffer(BufferCommand *cmd);
+    void executeWriteBufferRect(BufferRectCommand *cmd);
+    bool isEmpty() const;
+    // Processes the command at the head of the queue, if it is ready, and
+    // returns it; returns NULL if nothing could be processed.
+    Command* update();
+  private:
+    const Context *m_context;
+    std::queue<Command*> m_queue;
+  };
diff --git a/src/core/WorkGroup.cpp b/src/core/WorkGroup.cpp
new file mode 100644
index 0000000..23daf9d
--- /dev/null
+++ b/src/core/WorkGroup.cpp
@@ -0,0 +1,428 @@
+// WorkGroup.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <sstream>
+#include "llvm/IR/Module.h"
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+// Build the work-group with ID wgid for the given kernel invocation: record
+// its ID, size and linear index, clone the kernel's local memory, and create
+// one work-item per local ID (x varying fastest), all initially running.
+WorkGroup::WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid)
+ : m_context(kernelInvocation->getContext())
+{
+  m_groupID = wgid;
+  m_groupSize = kernelInvocation->getLocalSize();
+  // Linearise the 3D group ID as x + (y + z*numGroups.y)*numGroups.x
+  m_groupIndex = (m_groupID.x +
+                 (m_groupID.y +
+                  m_groupID.z*(kernelInvocation->getNumGroups().y)) *
+                 kernelInvocation->getNumGroups().x);
+  // Allocate local memory
+  m_localMemory = kernelInvocation->getKernel()->getLocalMemory()->clone();
+  // Initialise work-items
+  for (size_t k = 0; k < m_groupSize.z; k++)
+  {
+    for (size_t j = 0; j < m_groupSize.y; j++)
+    {
+      for (size_t i = 0; i < m_groupSize.x; i++)
+      {
+        WorkItem *workItem = new WorkItem(kernelInvocation, this,
+                                          Size3(i, j, k));
+        m_workItems.push_back(workItem);
+        m_running.insert(workItem);
+        m_context->notifyWorkItemBegin(workItem);
+      }
+    }
+  }
+  // Event 0 means "no event" in async_copy(), so numbering starts at 1
+  m_nextEvent = 1;
+  m_barrier = NULL;
+}
+// Release the work-items and the cloned local memory owned by this group.
+WorkGroup::~WorkGroup()
+{
+  // Delete work-items
+  for (unsigned i = 0; i < m_workItems.size(); i++)
+  {
+    delete m_workItems[i];
+  }
+  delete m_localMemory;
+}
+// Register an asynchronous copy requested by workItem and return the event
+// associated with it.
+//
+// The first already-registered copy that this work-item has not yet joined
+// is taken to be the same copy issued by another work-item. Its parameters
+// are compared with the request and a divergence error is reported on any
+// mismatch. Either way the work-item is added to that copy and the copy's
+// event is returned. If no such copy exists, a new one is registered; a new
+// event number is allocated when the incoming event is 0.
+size_t WorkGroup::async_copy(
+  const WorkItem *workItem,
+  const llvm::Instruction *instruction,
+  AsyncCopyType type,
+  size_t dest,
+  size_t src,
+  size_t size,
+  size_t num,
+  size_t srcStride,
+  size_t destStride,
+  size_t event)
+  AsyncCopy copy =
+  {
+    instruction,
+    type,
+    dest,
+    src,
+    size,
+    num,
+    srcStride,
+    destStride,
+    event
+  };
+  // Check if copy has already been registered by another work-item
+  list< pair<AsyncCopy,set<const WorkItem*> > >::iterator itr;
+  for (itr = m_asyncCopies.begin(); itr != m_asyncCopies.end(); itr++)
+  {
+    // Skip copies this work-item has already taken part in
+    if (itr->second.count(workItem))
+    {
+      continue;
+    }
+    // Check for divergence
+    // (the event field is not part of this comparison)
+    if ((itr->first.instruction->getDebugLoc()
+         != copy.instruction->getDebugLoc()) ||
+        (itr->first.type != copy.type) ||
+        (itr->first.dest != copy.dest) ||
+        (itr->first.src != copy.src) ||
+        (itr->first.size != copy.size) ||
+        (itr->first.num != copy.num) ||
+        (itr->first.srcStride != copy.srcStride) ||
+        (itr->first.destStride != copy.destStride))
+    {
+      Context::Message msg(ERROR, m_context);
+      msg << "Work-group divergence detected (async copy)" << endl
+          << msg.INDENT
+          << "Kernel:     " << msg.CURRENT_KERNEL << endl
+          << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+          << endl
+          << "Work-item:  " << msg.CURRENT_ENTITY << endl
+          << msg.CURRENT_LOCATION << endl
+          << "dest=0x" << hex << copy.dest << ", "
+          << "src=0x" << hex << copy.src << endl
+          << "elem_size=" << dec << copy.size << ", "
+          << "num_elems=" << dec << copy.num << ", "
+          << "src_stride=" << dec << copy.srcStride << ", "
+          << "dest_stride=" << dec << copy.destStride << endl
+          << endl
+          << "Previous work-items executed:" << endl
+          << itr->first.instruction << endl
+          << "dest=0x" << hex << itr->first.dest << ", "
+          << "src=0x" << hex << itr->first.src << endl
+          << "elem_size=" << dec << itr->first.size << ", "
+          << "num_elems=" << dec << itr->first.num << ", "
+          << "src_stride=" << dec << itr->first.srcStride << ", "
+          << "dest_stride=" << dec << itr->first.destStride << endl;
+      msg.send();
+    }
+    // Join the existing copy even when divergence was reported
+    itr->second.insert(workItem);
+    return itr->first.event;
+  }
+  // Create new event if necessary
+  if (copy.event == 0)
+  {
+    copy.event = m_nextEvent++;
+  }
+  // Register new copy and event
+  m_asyncCopies.push_back(make_pair(copy, set<const WorkItem*>()));
+  m_asyncCopies.back().second.insert(workItem);
+  // NOTE(review): this tests the incoming 'event' argument rather than
+  // copy.event; the two differ when a new event was allocated above.
+  if (!m_events.count(event))
+  {
+    m_events[copy.event] = list<AsyncCopy>();
+  }
+  m_events[copy.event].push_back(copy);
+  return copy.event;
+// Release the current barrier (which must exist).
+//
+// Reports divergence if not every work-item reached the barrier, returns the
+// waiting work-items to the running set, performs the async copies attached
+// to each event the barrier waits on, retires those events and their copy
+// records, notifies the context, and finally deletes the barrier.
+void WorkGroup::clearBarrier()
+  assert(m_barrier);
+  // Check for divergence
+  if (m_barrier->workItems.size() != m_workItems.size())
+  {
+    Context::Message msg(ERROR, m_context);
+    msg << "Work-group divergence detected (barrier)" << endl
+        << msg.INDENT
+        << "Kernel:     " << msg.CURRENT_KERNEL << endl
+        << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+        << "Only " << dec << m_barrier->workItems.size() << " out of "
+        << m_workItems.size() << " work-items executed barrier" << endl
+        << m_barrier->instruction << endl;
+    msg.send();
+  }
+  // Move work-items to running state
+  set<WorkItem*>::iterator itr;
+  for (itr = m_barrier->workItems.begin();
+       itr != m_barrier->workItems.end();
+       itr++)
+  {
+    (*itr)->clearBarrier();
+    m_running.insert(*itr);
+  }
+  m_barrier->workItems.clear();
+  // Deal with events
+  while (!m_barrier->events.empty())
+  {
+    size_t event = m_barrier->events.front();
+    // Perform copy
+    // (works on a copy of the list, so erasing the event below is safe)
+    list<AsyncCopy> copies = m_events[event];
+    list<AsyncCopy>::iterator itr;
+    for (itr = copies.begin(); itr != copies.end(); itr++)
+    {
+      Memory *destMem, *srcMem;
+      if (itr->type == GLOBAL_TO_LOCAL)
+      {
+        destMem = m_localMemory;
+        srcMem = m_context->getGlobalMemory();
+      }
+      else
+      {
+        destMem = m_context->getGlobalMemory();
+        srcMem = m_localMemory;
+      }
+      size_t src = itr->src;
+      size_t dest = itr->dest;
+      // Transfer one element at a time through a temporary buffer;
+      // strides are multiplied by the element size to get byte steps
+      unsigned char *buffer = new unsigned char[itr->size];
+      for (unsigned i = 0; i < itr->num; i++)
+      {
+        srcMem->load(buffer, src, itr->size);
+        destMem->store(buffer, dest, itr->size);
+        src += itr->srcStride * itr->size;
+        dest += itr->destStride * itr->size;
+      }
+      delete[] buffer;
+    }
+    m_events.erase(event);
+    // Remove copies from list for this event
+    list< pair<AsyncCopy,set<const WorkItem*> > >::iterator cItr;
+    for (cItr = m_asyncCopies.begin(); cItr != m_asyncCopies.end();)
+    {
+      if (cItr->first.event == event)
+      {
+        // Check that all work-items registered the copy
+        if (cItr->second.size() != m_workItems.size())
+        {
+          Context::Message msg(ERROR, m_context);
+          msg << "Work-group divergence detected (async copy)" << endl
+              << msg.INDENT
+              << "Kernel:     " << msg.CURRENT_KERNEL << endl
+              << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+              << "Only " << dec << cItr->second.size() << " out of "
+              << m_workItems.size() << " work-items executed copy" << endl
+              << cItr->first.instruction << endl;
+          msg.send();
+        }
+        // erase() returns the iterator to the next element
+        cItr = m_asyncCopies.erase(cItr);
+      }
+      else
+      {
+        cItr++;
+      }
+    }
+    // list::remove drops every occurrence of this event, so the enclosing
+    // loop makes progress even if the event was listed more than once
+    m_barrier->events.remove(event);
+  }
+  m_context->notifyWorkGroupBarrier(this, m_barrier->fence);
+  delete m_barrier;
+  m_barrier = NULL;
+// Instruction of the pending barrier, or NULL when no barrier is pending.
+const llvm::Instruction* WorkGroup::getCurrentBarrier() const
+{
+  if (!m_barrier)
+  {
+    return NULL;
+  }
+  return m_barrier->instruction;
+}
+// Accessor for this group's three-component ID.
+Size3 WorkGroup::getGroupID() const
+{
+  return m_groupID;
+}
+// Accessor for the linear group index computed in the constructor.
+size_t WorkGroup::getGroupIndex() const
+{
+  return m_groupIndex;
+}
+// Accessor for the group's size in each dimension.
+Size3 WorkGroup::getGroupSize() const
+{
+  return m_groupSize;
+}
+// Accessor for this group's local memory object.
+Memory* WorkGroup::getLocalMemory() const
+{
+  return m_localMemory;
+}
+// First work-item in the running set (ordered by WorkItemCmp), or NULL when
+// no work-item is currently running.
+WorkItem* WorkGroup::getNextWorkItem() const
+{
+  if (!m_running.empty())
+  {
+    return *m_running.begin();
+  }
+  return NULL;
+}
+// Look up a work-item by local ID, using the linear layout in which x varies
+// fastest, then y, then z. No bounds checking is performed.
+WorkItem* WorkGroup::getWorkItem(Size3 localID) const
+{
+  return m_workItems[(localID.z*m_groupSize.y + localID.y)*m_groupSize.x +
+                     localID.x];
+}
+// True while a barrier is pending for this group.
+bool WorkGroup::hasBarrier() const
+{
+  return m_barrier != NULL;
+}
+// Record that workItem has reached a barrier.
+//
+// The first work-item to arrive creates the barrier from its instruction,
+// fence flags and event list, logging an error for any event that is not
+// currently registered. Later arrivals are compared with the recorded
+// barrier and a divergence error is reported on any mismatch. In all cases
+// the work-item is moved from the running set into the barrier's waiting set.
+void WorkGroup::notifyBarrier(WorkItem *workItem,
+                              const llvm::Instruction *instruction,
+                              uint64_t fence, list<size_t> events)
+  if (!m_barrier)
+  {
+    // Create new barrier
+    m_barrier = new Barrier;
+    m_barrier->instruction = instruction;
+    m_barrier->fence = fence;
+    m_barrier->events = events;
+    // Check for invalid events
+    list<size_t>::iterator itr;
+    for (itr = events.begin(); itr != events.end(); itr++)
+    {
+      if (!m_events.count(*itr))
+      {
+        m_context->logError("Invalid wait event");
+      }
+    }
+  }
+  else
+  {
+    // Check for divergence
+    bool divergence = false;
+    if (instruction->getDebugLoc() != m_barrier->instruction->getDebugLoc() ||
+        fence != m_barrier->fence ||
+        events.size() != m_barrier->events.size())
+    {
+      divergence = true;
+    }
+    // Check events are all the same
+    // (only reached when the sizes match, so both lists can be walked in
+    // lock-step; the first differing pair is remembered for the report)
+    int divergentEventIndex = -1;
+    size_t newEvent = -1;
+    size_t oldEvent = -1;
+    if (!divergence)
+    {
+      int i = 0;
+      list<size_t>::iterator cItr = events.begin();
+      list<size_t>::iterator pItr = m_barrier->events.begin();
+      for (; cItr != events.end(); cItr++, pItr++, i++)
+      {
+        if (*cItr != *pItr)
+        {
+          divergence = true;
+          divergentEventIndex = i;
+          newEvent = *cItr;
+          oldEvent = *pItr;
+          break;
+        }
+      }
+    }
+    if (divergence)
+    {
+      Context::Message msg(ERROR, m_context);
+      msg << "Work-group divergence detected (barrier)" << endl
+          << msg.INDENT
+          << "Kernel:     " << msg.CURRENT_KERNEL << endl
+          << "Work-group: " << msg.CURRENT_WORK_GROUP << endl
+          << endl
+          << "Work-item:  " << msg.CURRENT_ENTITY << endl
+          << msg.CURRENT_LOCATION << endl
+          << "fence=0x" << hex << fence << ", "
+          << "num_events=" << dec << events.size() << endl;
+      if (divergentEventIndex >= 0)
+      {
+        msg << "events[" << dec << divergentEventIndex << "]="
+            << newEvent << endl;
+      }
+      msg << endl
+          << "Previous work-items executed:" << endl
+          << m_barrier->instruction << endl
+          << "fence=0x" << hex << m_barrier->fence << ", "
+          << "num_events=" << dec << m_barrier->events.size() << endl;
+      if (divergentEventIndex >= 0)
+      {
+        msg << "events[" << dec << divergentEventIndex << "]="
+            << oldEvent << endl;
+      }
+      msg.send();
+    }
+  }
+  // The work-item now waits at the barrier, divergent or not
+  m_running.erase(workItem);
+  m_barrier->workItems.insert(workItem);
+// Remove a finished work-item from the running set. If that leaves the
+// group with nothing running and no pending barrier while async-copy events
+// are still registered, an error is logged.
+void WorkGroup::notifyFinished(WorkItem *workItem)
+{
+  m_running.erase(workItem);
+  const bool groupIdle = m_running.empty() && !m_barrier;
+  if (groupIdle && !m_events.empty())
+  {
+    m_context->logError("Work-item finished without waiting for events");
+  }
+}
+// Strict ordering of work-items by global ID: z is the most significant
+// component, then y, then x.
+bool WorkGroup::WorkItemCmp::operator()(const WorkItem *lhs,
+                                        const WorkItem *rhs) const
+{
+  const Size3 left = lhs->getGlobalID();
+  const Size3 right = rhs->getGlobalID();
+  return (left.z != right.z) ? (left.z < right.z)
+       : (left.y != right.y) ? (left.y < right.y)
+       : (left.x < right.x);
+}
diff --git a/src/core/WorkGroup.h b/src/core/WorkGroup.h
new file mode 100644
index 0000000..88319cf
--- /dev/null
+++ b/src/core/WorkGroup.h
@@ -0,0 +1,100 @@
+// WorkGroup.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#define CLK_LOCAL_MEM_FENCE  (1<<0)
+#define CLK_GLOBAL_MEM_FENCE (1<<1)
+namespace oclgrind
+  class Context;
+  class Memory;
+  class Kernel;
+  class KernelInvocation;
+  class WorkItem;
+  // A group of work-items sharing one local memory object. Tracks which
+  // work-items are running, the barrier they may be waiting at, and the
+  // asynchronous copies registered by the group.
+  class WorkGroup
+  {
+  public:
+    // Direction of an asynchronous copy.
+    enum AsyncCopyType{GLOBAL_TO_LOCAL, LOCAL_TO_GLOBAL};
+  private:
+    // Comparator for ordering work-items
+    struct WorkItemCmp
+    {
+      bool operator()(const WorkItem *lhs, const WorkItem *rhs) const;
+    };
+    // Work-items that are not waiting at a barrier and have not finished.
+    std::set<WorkItem*, WorkItemCmp> m_running;
+    // Parameters of one asynchronous copy request. size is the element size
+    // in bytes, num the element count; strides are in elements.
+    typedef struct
+    {
+      const llvm::Instruction *instruction;
+      AsyncCopyType type;
+      size_t dest;
+      size_t src;
+      size_t size;
+      size_t num;
+      size_t srcStride;
+      size_t destStride;
+      size_t event;
+    } AsyncCopy;
+    // State of the pending barrier: the instruction and fence flags recorded
+    // by the first work-item to arrive, the work-items waiting at it, and
+    // the events it waits on.
+    typedef struct
+    {
+      const llvm::Instruction *instruction;
+      std::set<WorkItem*, WorkItemCmp> workItems;
+      uint64_t fence;
+      std::list<size_t> events;
+    } Barrier;
+  public:
+    WorkGroup(const KernelInvocation *kernelInvocation, Size3 wgid);
+    virtual ~WorkGroup();
+    // Registers an asynchronous copy for workItem and returns its event.
+    // Passing event == 0 requests a newly allocated event.
+    size_t async_copy(
+      const WorkItem *workItem,
+      const llvm::Instruction *instruction,
+      AsyncCopyType type,
+      size_t dest,
+      size_t src,
+      size_t size,
+      size_t num,
+      size_t srcStride,
+      size_t destStride,
+      size_t event);
+    // Releases the pending barrier, performing the copies for its events.
+    void clearBarrier();
+    // Instruction of the pending barrier, or NULL if there is none.
+    const llvm::Instruction* getCurrentBarrier() const;
+    Size3 getGroupID() const;
+    size_t getGroupIndex() const;
+    Size3 getGroupSize() const;
+    Memory* getLocalMemory() const;
+    // First running work-item, or NULL if none is running.
+    WorkItem *getNextWorkItem() const;
+    WorkItem *getWorkItem(Size3 localID) const;
+    bool hasBarrier() const;
+    // Called when workItem reaches a barrier instruction.
+    void notifyBarrier(WorkItem *workItem, const llvm::Instruction *instruction,
+                       uint64_t fence,
+                       std::list<size_t> events=std::list<size_t>());
+    // Called when workItem has finished executing.
+    void notifyFinished(WorkItem *workItem);
+  private:
+    size_t m_groupIndex;
+    Size3 m_groupID;
+    Size3 m_groupSize;
+    const Context *m_context;
+    Memory *m_localMemory;
+    // All work-items of the group, indexed linearly with x varying fastest.
+    std::vector<WorkItem*> m_workItems;
+    // Pending barrier, or NULL when none is active.
+    Barrier *m_barrier;
+    // Next event number to hand out from async_copy().
+    size_t m_nextEvent;
+    // Each registered copy paired with the work-items that have issued it.
+    std::list< std::pair<AsyncCopy,std::set<const WorkItem*> > > m_asyncCopies;
+    // Copies grouped by the event that waits on them.
+    std::map < size_t, std::list<AsyncCopy> > m_events;
+  };
diff --git a/src/core/WorkItem.cpp b/src/core/WorkItem.cpp
new file mode 100644
index 0000000..9d37ade
--- /dev/null
+++ b/src/core/WorkItem.cpp
@@ -0,0 +1,1660 @@
+// WorkItem.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/InstIterator.h"
+#include "Context.h"
+#include "Kernel.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "Program.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+struct WorkItem::Position
+  llvm::Function::const_iterator       prevBlock;
+  llvm::Function::const_iterator       currBlock;
+  llvm::Function::const_iterator       nextBlock;
+  llvm::BasicBlock::const_iterator     currInst;
+  std::stack<const llvm::Instruction*> callStack;
+  std::stack< std::list<size_t> >      allocations;
+WorkItem::WorkItem(const KernelInvocation *kernelInvocation,
+                   WorkGroup *workGroup, Size3 lid)
+  : m_context(kernelInvocation->getContext()),
+    m_kernelInvocation(kernelInvocation),
+    m_workGroup(workGroup)
+  m_localID = lid;
+  // Compute global ID, per dimension:
+  // local ID + group ID * group size + global offset
+  Size3 groupID = workGroup->getGroupID();
+  Size3 groupSize = workGroup->getGroupSize();
+  Size3 globalOffset = kernelInvocation->getGlobalOffset();
+  m_globalID.x = lid.x + groupID.x*groupSize.x + globalOffset.x;
+  m_globalID.y = lid.y + groupID.y*groupSize.y + globalOffset.y;
+  m_globalID.z = lid.z + groupID.z*groupSize.z + globalOffset.z;
+  Size3 globalSize = kernelInvocation->getGlobalSize();
+  m_globalIndex = (m_globalID.x + // linearised as x + (y + z*sizeY)*sizeX
+                  (m_globalID.y +
+                   m_globalID.z*globalSize.y) * globalSize.x);
+  const Kernel *kernel = kernelInvocation->getKernel();
+  // Load interpreter cache for the kernel's function
+  m_cache = kernel->getProgram()->getInterpreterCache(kernel->getFunction());
+  // Set initial number of values to store based on cache
+  m_values.resize(m_cache->getNumValues());
+  m_privateMemory = kernel->getPrivateMemory()->clone(); // released with delete in the destructor
+  // Initialise kernel arguments: store a pool-cloned copy of each argument
+  // value under its key
+  TypedValueMap::const_iterator argItr;
+  for (argItr = kernel->args_begin(); argItr != kernel->args_end(); argItr++)
+  {
+    setValue(argItr->first, m_pool.clone(argItr->second));
+  }
+  // Initialize interpreter state: ready to run, positioned at the first
+  // instruction of the kernel function's first basic block
+  m_state    = READY;
+  m_position = new Position;
+  m_position->prevBlock = NULL;
+  m_position->nextBlock = NULL;
+  m_position->currBlock = kernel->getFunction()->begin();
+  m_position->currInst = m_position->currBlock->begin();
+  delete m_privateMemory;
+  delete m_position;
+void WorkItem::clearBarrier() // Returns a work-item in the BARRIER state to READY; any other state is left unchanged.
+  if (m_state == BARRIER)
+  {
+    m_state = READY;
+  }
+void WorkItem::dispatch(const llvm::Instruction *instruction,
+                        TypedValue& result) // Routes the instruction to the handler for its LLVM opcode; opcodes without a handler raise FATAL_ERROR.
+  switch (instruction->getOpcode())
+  {
+  case llvm::Instruction::Add:
+    add(instruction, result);
+    break;
+  case llvm::Instruction::Alloca:
+    alloc(instruction, result);
+    break;
+  case llvm::Instruction::And:
+    bwand(instruction, result);
+    break;
+  case llvm::Instruction::AShr:
+    ashr(instruction, result);
+    break;
+  case llvm::Instruction::BitCast:
+    bitcast(instruction, result);
+    break;
+  case llvm::Instruction::Br:
+    br(instruction, result);
+    break;
+  case llvm::Instruction::Call:
+    call(instruction, result);
+    break;
+  case llvm::Instruction::ExtractElement:
+    extractelem(instruction, result);
+    break;
+  case llvm::Instruction::ExtractValue:
+    extractval(instruction, result);
+    break;
+  case llvm::Instruction::FAdd:
+    fadd(instruction, result);
+    break;
+  case llvm::Instruction::FCmp:
+    fcmp(instruction, result);
+    break;
+  case llvm::Instruction::FDiv:
+    fdiv(instruction, result);
+    break;
+  case llvm::Instruction::FMul:
+    fmul(instruction, result);
+    break;
+  case llvm::Instruction::FPExt:
+    fpext(instruction, result);
+    break;
+  case llvm::Instruction::FPToSI:
+    fptosi(instruction, result);
+    break;
+  case llvm::Instruction::FPToUI:
+    fptoui(instruction, result);
+    break;
+  case llvm::Instruction::FPTrunc:
+    fptrunc(instruction, result);
+    break;
+  case llvm::Instruction::FRem:
+    frem(instruction, result);
+    break;
+  case llvm::Instruction::FSub:
+    fsub(instruction, result);
+    break;
+  case llvm::Instruction::GetElementPtr:
+    gep(instruction, result);
+    break;
+  case llvm::Instruction::ICmp:
+    icmp(instruction, result);
+    break;
+  case llvm::Instruction::InsertElement:
+    insertelem(instruction, result);
+    break;
+  case llvm::Instruction::InsertValue:
+    insertval(instruction, result);
+    break;
+  case llvm::Instruction::IntToPtr:
+    inttoptr(instruction, result);
+    break;
+  case llvm::Instruction::Load:
+    load(instruction, result);
+    break;
+  case llvm::Instruction::LShr:
+    lshr(instruction, result);
+    break;
+  case llvm::Instruction::Mul:
+    mul(instruction, result);
+    break;
+  case llvm::Instruction::Or:
+    bwor(instruction, result);
+    break;
+  case llvm::Instruction::PHI:
+    phi(instruction, result);
+    break;
+  case llvm::Instruction::PtrToInt:
+    ptrtoint(instruction, result);
+    break;
+  case llvm::Instruction::Ret:
+    ret(instruction, result);
+    break;
+  case llvm::Instruction::SDiv:
+    sdiv(instruction, result);
+    break;
+  case llvm::Instruction::Select:
+    select(instruction, result);
+    break;
+  case llvm::Instruction::SExt:
+    sext(instruction, result);
+    break;
+  case llvm::Instruction::Shl:
+    shl(instruction, result);
+    break;
+  case llvm::Instruction::ShuffleVector:
+    shuffle(instruction, result);
+    break;
+  case llvm::Instruction::SIToFP:
+    sitofp(instruction, result);
+    break;
+  case llvm::Instruction::SRem:
+    srem(instruction, result);
+    break;
+  case llvm::Instruction::Store:
+    store(instruction, result);
+    break;
+  case llvm::Instruction::Sub:
+    sub(instruction, result);
+    break;
+  case llvm::Instruction::Switch:
+    swtch(instruction, result);
+    break;
+  case llvm::Instruction::Trunc:
+    itrunc(instruction, result);
+    break;
+  case llvm::Instruction::UDiv:
+    udiv(instruction, result);
+    break;
+  case llvm::Instruction::UIToFP:
+    uitofp(instruction, result);
+    break;
+  case llvm::Instruction::URem:
+    urem(instruction, result);
+    break;
+  case llvm::Instruction::Unreachable:
+    FATAL_ERROR("Encountered unreachable instruction"); // NOTE(review): no break here - presumably FATAL_ERROR never returns; otherwise control falls through to Xor. Confirm.
+  case llvm::Instruction::Xor:
+    bwxor(instruction, result);
+    break;
+  case llvm::Instruction::ZExt:
+    zext(instruction, result);
+    break;
+  default:
+    FATAL_ERROR("Unsupported instruction: %s", instruction->getOpcodeName());
+  }
+void WorkItem::execute(const llvm::Instruction *instruction) // Runs a single instruction: allocates its result, commits buffered PHI results, dispatches on the opcode, records the result and notifies the context.
+  // Prepare private variable for instruction result
+  pair<unsigned,unsigned> resultSize = getValueSize(instruction);
+  // Prepare result (first/second of the size pair fill the first two fields;
+  // data stays NULL when the size is zero)
+  TypedValue result = {
+    resultSize.first,
+    resultSize.second,
+    NULL
+  };
+  if (result.size)
+  {
+    result.data = m_pool.alloc(result.size*result.num);
+  }
+  if (instruction->getOpcode() != llvm::Instruction::PHI && // PHI results are buffered in m_phiTemps; commit them once a non-PHI instruction is reached
+      m_phiTemps.size() > 0)
+  {
+    TypedValueMap::iterator itr;
+    for (itr = m_phiTemps.begin(); itr != m_phiTemps.end(); itr++)
+    {
+      setValue(itr->first, itr->second);
+    }
+    m_phiTemps.clear();
+  }
+  // Execute instruction
+  dispatch(instruction, result);
+  // Store result (only when the instruction produces a value)
+  if (result.size)
+  {
+    if (instruction->getOpcode() != llvm::Instruction::PHI)
+    {
+      setValue(instruction, result);
+    }
+    else
+    {
+      m_phiTemps[instruction] = result; // held back until a later non-PHI instruction commits it
+    }
+  }
+  m_context->notifyInstructionExecuted(this, instruction, result);
+TypedValue WorkItem::getValue(const llvm::Value *key) const
+  return m_values[m_cache->getValueID(key)];
+const stack<const llvm::Instruction*>& WorkItem::getCallStack() const
+  return m_position->callStack;
+const llvm::Instruction* WorkItem::getCurrentInstruction() const
+  return m_position->currInst;
+Size3 WorkItem::getGlobalID() const
+  return m_globalID;
+size_t WorkItem::getGlobalIndex() const
+  return m_globalIndex;
+Size3 WorkItem::getLocalID() const
+  return m_localID;
+Memory* WorkItem::getMemory(unsigned int addrSpace) const // Maps an address-space identifier to the Memory object that serves it; unsupported spaces raise FATAL_ERROR.
+  switch (addrSpace)
+  {
+    case AddrSpacePrivate:
+      return m_privateMemory;
+    case AddrSpaceGlobal:
+    case AddrSpaceConstant: // global and constant spaces are both served by the context's global memory
+      return m_context->getGlobalMemory();
+    case AddrSpaceLocal:
+      return m_workGroup->getLocalMemory();
+    default:
+      FATAL_ERROR("Unsupported address space: %d", addrSpace);
+  }
+TypedValue WorkItem::getOperand(const llvm::Value *operand) const // Resolves an operand to a TypedValue: stored values for arguments/globals/instructions, on-the-fly evaluation for constant expressions, cached constants for the listed constant kinds; anything else raises FATAL_ERROR.
+  unsigned valID = operand->getValueID();
+  if (valID == llvm::Value::ArgumentVal ||
+      valID == llvm::Value::GlobalVariableVal ||
+      valID >= llvm::Value::InstructionVal)
+  {
+    return getValue(operand);
+  }
+  //else if (valID == llvm::Value::BasicBlockVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::FunctionVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::GlobalAliasVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::BlockAddressVal)
+  //{
+  //}
+  else if (valID == llvm::Value::ConstantExprVal)
+  {
+    pair<unsigned,unsigned> size = getValueSize(operand);
+    TypedValue result;
+    result.size = size.first;
+    result.num  = size.second;
+    result.data = m_pool.alloc(getTypeSize(operand->getType()));
+    // Use of const_cast here is ugly, but ConstExpr instructions
+    // shouldn't actually modify WorkItem state anyway
+    const_cast<WorkItem*>(this)->dispatch(
+      m_cache->getConstantExpr(operand), result); // the cache supplies the instruction form of the constant expression
+    return result;
+  }
+  else if (valID == llvm::Value::UndefValueVal            ||
+           valID == llvm::Value::ConstantAggregateZeroVal ||
+           valID == llvm::Value::ConstantDataArrayVal     ||
+           valID == llvm::Value::ConstantDataVectorVal    ||
+           valID == llvm::Value::ConstantIntVal           ||
+           valID == llvm::Value::ConstantFPVal            ||
+           valID == llvm::Value::ConstantArrayVal         ||
+           valID == llvm::Value::ConstantStructVal        ||
+           valID == llvm::Value::ConstantVectorVal        ||
+           valID == llvm::Value::ConstantPointerNullVal)
+  {
+    return m_cache->getConstant(operand);
+  }
+  //else if (valID == llvm::Value::MDNodeVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::MDStringVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::InlineAsmVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::PseudoSourceValueVal)
+  //{
+  //}
+  //else if (valID == llvm::Value::FixedStackPseudoSourceValueVal)
+  //{
+  //}
+  else
+  {
+    FATAL_ERROR("Unhandled operand type: %d", valID);
+  }
+  // Unreachable: every branch above either returns or raises FATAL_ERROR
+  assert(false);
+Memory* WorkItem::getPrivateMemory() const
+  return m_privateMemory;
+WorkItem::State WorkItem::getState() const
+  return m_state;
+const unsigned char* WorkItem::getValueData(const llvm::Value *value) const // Returns the raw data pointer of the stored value, or NULL when hasValue() reports the value as unknown.
+  if (!hasValue(value))
+  {
+    return NULL;
+  }
+  return getValue(value).data;
+const llvm::Value* WorkItem::getVariable(std::string name) const // Looks the name up in m_variables; returns the mapped llvm::Value, or NULL when there is no entry.
+  VariableMap::const_iterator itr;
+  itr = m_variables.find(name);
+  if (itr == m_variables.end())
+  {
+    return NULL;
+  }
+  return itr->second;
+const WorkGroup* WorkItem::getWorkGroup() const
+  return m_workGroup;
+bool WorkItem::hasValue(const llvm::Value *key) const
+  return m_cache->hasValue(key);
+bool WorkItem::printValue(const llvm::Value *value) const // Prints the stored data of the value according to its LLVM type; returns false (printing nothing) when hasValue() fails, true otherwise.
+  if (!hasValue(value))
+  {
+    return false;
+  }
+  printTypedData(value->getType(), getValue(value).data);
+  return true;
+// Print the current value of the named variable.
+// Returns false when getVariable() finds no variable of that name, true
+// once the value has been printed.
+bool WorkItem::printVariable(string name) const
+  // Find variable
+  const llvm::Value *value = getVariable(name);
+  if (!value)
+  {
+    return false;
+  }
+  // Get variable value
+  TypedValue result = getValue(value);
+  // Test the value's dynamic kind rather than casting it to an Instruction
+  // and reading its opcode: the cast is only valid for actual instructions.
+  if (llvm::isa<llvm::AllocaInst>(value))
+  {
+    // If value is alloca result, look-up data at address
+    const llvm::Type *elemType = value->getType()->getPointerElementType();
+    size_t address = result.getPointer();
+    unsigned char *data = (unsigned char*)m_privateMemory->getPointer(address);
+    printTypedData(elemType, data);
+  }
+  else
+  {
+    printTypedData(value->getType(), result.data);
+  }
+  return true;
+void WorkItem::setValue(const llvm::Value *key, TypedValue value)
+  m_values[m_cache->getValueID(key)] = value;
+// Execute the instruction at the current position, advance the position
+// and return the resulting work-item state. Must only be called while the
+// work-item is READY.
+WorkItem::State WorkItem::step()
+  assert(m_state == READY);
+  Position &pos = *m_position;
+  // Run the instruction under the cursor, then move the cursor past it
+  execute(pos.currInst);
+  ++pos.currInst;
+  // A pending block transfer takes precedence over the advanced cursor
+  if (pos.nextBlock)
+  {
+    // Move to next basic block
+    pos.prevBlock = pos.currBlock;
+    pos.currBlock = pos.nextBlock;
+    pos.nextBlock = NULL;
+    pos.currInst  = pos.currBlock->begin();
+  }
+  return m_state;
+//// Instruction execution ////
+#define INSTRUCTION(name) \
+  void WorkItem::name(const llvm::Instruction *instruction, TypedValue& result)
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) + opB.getUInt(i), i);
+  }
+  const llvm::AllocaInst *allocInst = ((const llvm::AllocaInst*)instruction);
+  const llvm::Type *type = allocInst->getAllocatedType();
+  // Perform allocation
+  unsigned size = getTypeSize(type);
+  size_t address = m_privateMemory->allocateBuffer(size);
+  if (!address)
+    FATAL_ERROR("Insufficient private memory (alloca)");
+  // Create pointer to alloc'd memory
+  result.setPointer(address);
+  // Track allocation in stack frame
+  if (!m_position->allocations.empty())
+    m_position->allocations.top().push_back(address);
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  uint64_t shiftMask =
+    (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+    * 8 - 1;
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getSInt(i) >> (opB.getUInt(i) & shiftMask), i);
+  }
+  const llvm::Value *op = instruction->getOperand(0);
+  // Check for address space casts
+  if (instruction->getType()->isPointerTy())
+  {
+    unsigned srcAddrSpace = op->getType()->getPointerAddressSpace();
+    unsigned dstAddrSpace = instruction->getType()->getPointerAddressSpace();
+    if (srcAddrSpace != dstAddrSpace)
+    {
+      FATAL_ERROR("Invalid pointer cast from %s to %s address spaces",
+                  getAddressSpaceName(srcAddrSpace),
+                  getAddressSpaceName(dstAddrSpace));
+    }
+  }
+  TypedValue operand = getOperand(op);
+  memcpy(result.data, operand.data, result.size*result.num);
+  if (instruction->getNumOperands() == 1)
+  {
+    // Unconditional branch
+    m_position->nextBlock = (const llvm::BasicBlock*)instruction->getOperand(0);
+  }
+  else
+  {
+    // Conditional branch
+    bool pred = getOperand(instruction->getOperand(0)).getUInt();
+    const llvm::Value *iftrue = instruction->getOperand(2);
+    const llvm::Value *iffalse = instruction->getOperand(1);
+    m_position->nextBlock = (const llvm::BasicBlock*)(pred ? iftrue : iffalse);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) & opB.getUInt(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) | opB.getUInt(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) ^ opB.getUInt(i), i);
+  }
+  const llvm::CallInst *callInst = (const llvm::CallInst*)instruction;
+  const llvm::Function *function = callInst->getCalledFunction();
+  // Check for indirect function calls
+  if (!callInst->getCalledFunction())
+  {
+    // Resolve indirect function pointer
+    const llvm::Value *func = callInst->getCalledValue();
+    const llvm::Value *funcPtr = ((const llvm::User*)func)->getOperand(0);
+    function = (const llvm::Function*)funcPtr;
+  }
+  // Check if function has definition
+  if (!function->isDeclaration())
+  {
+    m_position->callStack.push(m_position->currInst);
+    m_position->allocations.push(list<size_t>());
+    m_position->nextBlock = function->begin();
+    // Set function arguments
+    llvm::Function::const_arg_iterator argItr;
+    for (argItr = function->arg_begin();
+         argItr != function->arg_end(); argItr++)
+    {
+      const llvm::Value *arg = callInst->getArgOperand(argItr->getArgNo());
+      setValue(argItr, m_pool.clone(getOperand(arg)));
+    }
+    return;
+  }
+  // Call builtin function
+  InterpreterCache::Builtin builtin = m_cache->getBuiltin(function);
+  builtin.function.func(this, callInst,
+                        builtin.name, builtin.overload,
+                        result, builtin.function.op);
+  const llvm::ExtractElementInst *extract =
+    (const llvm::ExtractElementInst*)instruction;
+  unsigned index     = getOperand(extract->getIndexOperand()).getUInt();
+  TypedValue operand = getOperand(extract->getVectorOperand());
+  memcpy(result.data, operand.data + result.size*index, result.size);
+  const llvm::ExtractValueInst *extract =
+    (const llvm::ExtractValueInst*)instruction;
+  const llvm::Value *agg = extract->getAggregateOperand();
+  llvm::ArrayRef<unsigned int> indices = extract->getIndices();
+  // Compute offset for target value
+  int offset = 0;
+  const llvm::Type *type = agg->getType();
+  for (unsigned i = 0; i < indices.size(); i++)
+  {
+    if (type->isArrayTy())
+    {
+      type = type->getArrayElementType();
+      offset += getTypeSize(type) * indices[i];
+    }
+    else if (type->isStructTy())
+    {
+      offset += getStructMemberOffset((const llvm::StructType*)type,
+                                      indices[i]);
+      type = type->getStructElementType(indices[i]);
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+    }
+  }
+  // Copy target value to result
+  memcpy(result.data, getOperand(agg).data + offset, getTypeSize(type));
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(opA.getFloat(i) + opB.getFloat(i), i);
+  }
+  // Floating-point comparison. Each lane of the result is set to t when the
+  // predicate holds (all-ones for vector results, 1 for scalars), else 0.
+  const llvm::CmpInst *cmpInst = (const llvm::CmpInst*)instruction;
+  llvm::CmpInst::Predicate pred = cmpInst->getPredicate();
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  uint64_t t = result.num > 1 ? -1 : 1;
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    double a = opA.getFloat(i);
+    double b = opB.getFloat(i);
+    uint64_t r = 0;
+    switch (pred)
+    {
+    case llvm::CmpInst::FCMP_OEQ:
+    case llvm::CmpInst::FCMP_UEQ:
+      r = a == b;
+      break;
+    case llvm::CmpInst::FCMP_ONE:
+    case llvm::CmpInst::FCMP_UNE:
+      r = a != b;
+      break;
+    case llvm::CmpInst::FCMP_OGT:
+    case llvm::CmpInst::FCMP_UGT:
+      r = a > b;
+      break;
+    case llvm::CmpInst::FCMP_OGE:
+    case llvm::CmpInst::FCMP_UGE:
+      r = a >= b;
+      break;
+    case llvm::CmpInst::FCMP_OLT:
+    case llvm::CmpInst::FCMP_ULT:
+      r = a < b;
+      break;
+    case llvm::CmpInst::FCMP_OLE:
+    case llvm::CmpInst::FCMP_ULE:
+      r = a <= b;
+      break;
+    case llvm::CmpInst::FCMP_FALSE:
+      r = false;
+      break;
+    case llvm::CmpInst::FCMP_TRUE:
+      r = true;
+      break;
+    case llvm::CmpInst::FCMP_ORD:
+      // Holds when neither operand is NaN; the NaN case is handled below
+      r = true;
+      break;
+    case llvm::CmpInst::FCMP_UNO:
+      // Holds only when an operand is NaN; the NaN case is handled below
+      r = false;
+      break;
+    default:
+      FATAL_ERROR("Unsupported FCmp predicate: %d", pred);
+    }
+    // Deal with NaN operands: ordered predicates fail, unordered ones hold.
+    // FCMP_FALSE and FCMP_TRUE are constant regardless of the operands, so
+    // they must not be overridden here.
+    if (pred != llvm::CmpInst::FCMP_FALSE &&
+        pred != llvm::CmpInst::FCMP_TRUE &&
+        (::isnan(a) || ::isnan(b)))
+    {
+      r = !llvm::CmpInst::isOrdered(pred);
+    }
+    result.setUInt(r ? t : 0, i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(opA.getFloat(i) / opB.getFloat(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(opA.getFloat(i) * opB.getFloat(i), i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(op.getFloat(i), i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setSInt((int64_t)op.getFloat(i), i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt((uint64_t)op.getFloat(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(fmod(opA.getFloat(i), opB.getFloat(i)), i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(op.getFloat(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(opA.getFloat(i) - opB.getFloat(i), i);
+  }
+  const llvm::GetElementPtrInst *gepInst =
+    (const llvm::GetElementPtrInst*)instruction;
+  // Get base address
+  const llvm::Value *base = gepInst->getPointerOperand();
+  size_t address = getOperand(base).getPointer();
+  const llvm::Type *ptrType = gepInst->getPointerOperandType();
+  // Iterate over indices
+  llvm::User::const_op_iterator opItr;
+  for (opItr = gepInst->idx_begin(); opItr != gepInst->idx_end(); opItr++)
+  {
+    int64_t offset = getOperand(opItr->get()).getSInt();
+    if (ptrType->isPointerTy())
+    {
+      // Get pointer element size
+      const llvm::Type *elemType = ptrType->getPointerElementType();
+      address += offset*getTypeSize(elemType);
+      ptrType = elemType;
+    }
+    else if (ptrType->isArrayTy())
+    {
+      // Get array element size
+      const llvm::Type *elemType = ptrType->getArrayElementType();
+      address += offset*getTypeSize(elemType);
+      ptrType = elemType;
+    }
+    else if (ptrType->isVectorTy())
+    {
+      // Get vector element size
+      const llvm::Type *elemType = ptrType->getVectorElementType();
+      address += offset*getTypeSize(elemType);
+      ptrType = elemType;
+    }
+    else if (ptrType->isStructTy())
+    {
+      address +=
+        getStructMemberOffset((const llvm::StructType*)ptrType, offset);
+      ptrType = ptrType->getStructElementType(offset);
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported GEP base type: %d", ptrType->getTypeID());
+    }
+  }
+  result.setPointer(address);
+  const llvm::CmpInst *cmpInst = (const llvm::CmpInst*)instruction;
+  llvm::CmpInst::Predicate pred = cmpInst->getPredicate();
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  uint64_t t = result.num > 1 ? -1 : 1;
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    // Load operands
+    uint64_t ua = opA.getUInt(i);
+    uint64_t ub = opB.getUInt(i);
+    int64_t  sa = opA.getSInt(i);
+    int64_t  sb = opB.getSInt(i);
+    uint64_t r;
+    switch (pred)
+    {
+    case llvm::CmpInst::ICMP_EQ:
+      r = ua == ub;
+      break;
+    case llvm::CmpInst::ICMP_NE:
+      r = ua != ub;
+      break;
+    case llvm::CmpInst::ICMP_UGT:
+      r = ua > ub;
+      break;
+    case llvm::CmpInst::ICMP_UGE:
+      r = ua >= ub;
+      break;
+    case llvm::CmpInst::ICMP_ULT:
+      r = ua < ub;
+      break;
+    case llvm::CmpInst::ICMP_ULE:
+      r = ua <= ub;
+      break;
+    case llvm::CmpInst::ICMP_SGT:
+      r = sa > sb;
+      break;
+    case llvm::CmpInst::ICMP_SGE:
+      r = sa >= sb;
+      break;
+    case llvm::CmpInst::ICMP_SLT:
+      r = sa < sb;
+      break;
+    case llvm::CmpInst::ICMP_SLE:
+      r = sa <= sb;
+      break;
+    default:
+      FATAL_ERROR("Unsupported ICmp predicate: %d", pred);
+    }
+    result.setUInt(r ? t : 0, i);
+  }
+  TypedValue vector  = getOperand(instruction->getOperand(0));
+  TypedValue element = getOperand(instruction->getOperand(1));
+  unsigned index     = getOperand(instruction->getOperand(2)).getUInt();
+  memcpy(result.data, vector.data, result.size*result.num);
+  memcpy(result.data + index*result.size, element.data, result.size);
+  const llvm::InsertValueInst *insert =
+    (const llvm::InsertValueInst*)instruction;
+  // Load original aggregate data
+  const llvm::Value *agg = insert->getAggregateOperand();
+  memcpy(result.data, getOperand(agg).data, result.size*result.num);
+  // Compute offset for inserted value
+  int offset = 0;
+  llvm::ArrayRef<unsigned int> indices = insert->getIndices();
+  const llvm::Type *type = agg->getType();
+  for (unsigned i = 0; i < indices.size(); i++)
+  {
+    if (type->isArrayTy())
+    {
+      type = type->getArrayElementType();
+      offset += getTypeSize(type) * indices[i];
+    }
+    else if (type->isStructTy())
+    {
+      offset += getStructMemberOffset((const llvm::StructType*)type,
+                                      indices[i]);
+      type = type->getStructElementType(indices[i]);
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported aggregate type: %d", type->getTypeID())
+    }
+  }
+  // Copy inserted value into result
+  const llvm::Value *value = insert->getInsertedValueOperand();
+  memcpy(result.data + offset, getOperand(value).data,
+         getTypeSize(value->getType()));
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setPointer(op.getUInt(i), i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    memcpy(result.data+i*result.size, op.data+i*op.size, result.size);
+  }
+  // Load result.size*result.num bytes from the pointer operand's address
+  // space into the result buffer.
+  const llvm::LoadInst *loadInst = (const llvm::LoadInst*)instruction;
+  unsigned addressSpace = loadInst->getPointerAddressSpace();
+  size_t address = getOperand(loadInst->getPointerOperand()).getPointer();
+  // Check address is correctly aligned. An alignment of 0 means none was
+  // specified on the instruction; skip the check then, because (0 - 1)
+  // would form an all-ones mask and flag nearly every address.
+  unsigned alignment = loadInst->getAlignment();
+  if (alignment && (address & (alignment-1)))
+  {
+    m_context->logError("Invalid memory load - source pointer is "
+                        "not aligned to the pointed type");
+  }
+  // Load data
+  getMemory(addressSpace)->load(result.data, address, result.size*result.num);
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  uint64_t shiftMask =
+    (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+    * 8 - 1;
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) >> (opB.getUInt(i) & shiftMask), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) * opB.getUInt(i), i);
+  }
+  const llvm::PHINode *phiNode = (const llvm::PHINode*)instruction;
+  const llvm::Value *value = phiNode->getIncomingValueForBlock(
+    (const llvm::BasicBlock*)m_position->prevBlock);
+  memcpy(result.data, getOperand(value).data, result.size*result.num);
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(op.getPointer(i), i);
+  }
+  const llvm::ReturnInst *retInst = (const llvm::ReturnInst*)instruction;
+  if (!m_position->callStack.empty())
+  {
+    m_position->currInst = m_position->callStack.top();
+    m_position->currBlock = m_position->currInst->getParent();
+    m_position->callStack.pop();
+    // Set return value
+    const llvm::Value *returnVal = retInst->getReturnValue();
+    if (returnVal)
+    {
+      setValue(m_position->currInst, m_pool.clone(getOperand(returnVal)));
+    }
+    // Clear stack allocations
+    list<size_t>& allocs = m_position->allocations.top();
+    list<size_t>::iterator itr;
+    for (itr = allocs.begin(); itr != allocs.end(); itr++)
+    {
+      m_privateMemory->deallocateBuffer(*itr);
+    }
+    m_position->allocations.pop();
+  }
+  else
+  {
+    m_position->nextBlock = NULL;
+    m_state = FINISHED;
+    m_workGroup->notifyFinished(this);
+    m_context->notifyWorkItemComplete(this);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    int64_t a = opA.getSInt(i);
+    int64_t b = opB.getSInt(i);
+    int64_t r = 0;
+    if (b && !(a == INT64_MIN && b == -1))
+    {
+      r = a / b;
+    }
+    result.setSInt(r, i);
+  }
+  const llvm::SelectInst *selectInst = (const llvm::SelectInst*)instruction;
+  TypedValue opCondition = getOperand(selectInst->getCondition());
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    const bool cond =
+      selectInst->getCondition()->getType()->isVectorTy() ?
+      opCondition.getUInt(i) :
+      opCondition.getUInt();
+    const llvm::Value *op = cond ?
+      selectInst->getTrueValue() :
+      selectInst->getFalseValue();
+    memcpy(result.data + i*result.size,
+           getOperand(op).data + i*result.size,
+           result.size);
+  }
+  const llvm::Value *operand = instruction->getOperand(0);
+  TypedValue value = getOperand(operand);
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    int64_t val = value.getSInt(i);
+    if (operand->getType()->getPrimitiveSizeInBits() == 1)
+    {
+      val = val ? -1 : 0;
+    }
+    result.setSInt(val, i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  uint64_t shiftMask =
+    (result.num > 1 ? result.size : max((size_t)result.size, sizeof(uint32_t)))
+    * 8 - 1;
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) << (opB.getUInt(i) & shiftMask), i);
+  }
+  const llvm::ShuffleVectorInst *shuffle =
+    (const llvm::ShuffleVectorInst*)instruction;
+  const llvm::Value *v1 = shuffle->getOperand(0);
+  const llvm::Value *v2 = shuffle->getOperand(1);
+  TypedValue mask = getOperand(shuffle->getMask());
+  unsigned num = v1->getType()->getVectorNumElements();
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    if (shuffle->getMask()->getAggregateElement(i)->getValueID()
+          == llvm::Value::UndefValueVal)
+    {
+      // Don't care / undef
+      continue;
+    }
+    const llvm::Value *src = v1;
+    unsigned int index = mask.getUInt(i);
+    if (index >= num)
+    {
+      index -= num;
+      src = v2;
+    }
+    memcpy(result.data + i*result.size,
+           getOperand(src).data + index*result.size, result.size);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(op.getSInt(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    int64_t a = opA.getSInt(i);
+    int64_t b = opB.getSInt(i);
+    int64_t r = 0;
+    if (b && !(a == INT64_MIN && b == -1))
+    {
+      r = a % b;
+    }
+    result.setSInt(r, i);
+  }
+  // Interpret a store instruction: write the value operand to the address
+  // held by the pointer operand, in that pointer's address space.
+  const llvm::StoreInst *storeInst = (const llvm::StoreInst*)instruction;
+  unsigned addressSpace = storeInst->getPointerAddressSpace();
+  size_t address = getOperand(storeInst->getPointerOperand()).getPointer();
+  // Check address is correctly aligned. An alignment of zero means the
+  // instruction carries no explicit alignment, so there is nothing to check;
+  // without this guard (0 - 1) wraps to an all-ones mask and every non-zero
+  // address would be reported as misaligned.
+  unsigned alignment = storeInst->getAlignment();
+  if (alignment && (address & (alignment-1)))
+  {
+    m_context->logError("Invalid memory store - source pointer is "
+                        "not aligned to the pointed type");
+  }
+  // Store data (size*num bytes of the value operand)
+  TypedValue operand = getOperand(storeInst->getValueOperand());
+  getMemory(addressSpace)->store(operand.data, address,
+                                 operand.size*operand.num);
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(opA.getUInt(i) - opB.getUInt(i), i);
+  }
+  const llvm::SwitchInst *swtch = (const llvm::SwitchInst*)instruction;
+  const llvm::Value *cond = swtch->getCondition();
+  uint64_t val = getOperand(cond).getUInt();
+  const llvm::ConstantInt *cval =
+    (const llvm::ConstantInt*)llvm::ConstantInt::get(cond->getType(), val);
+  m_position->nextBlock = swtch->findCaseValue(cval).getCaseSuccessor();
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    uint64_t a = opA.getUInt(i);
+    uint64_t b = opB.getUInt(i);
+    result.setUInt(b ? a / b : 0, i);
+  }
+  TypedValue op = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setFloat(op.getUInt(i), i);
+  }
+  TypedValue opA = getOperand(instruction->getOperand(0));
+  TypedValue opB = getOperand(instruction->getOperand(1));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    uint64_t a = opA.getUInt(i);
+    uint64_t b = opB.getUInt(i);
+    result.setUInt(b ? a % b : 0, i);
+  }
+  TypedValue operand = getOperand(instruction->getOperand(0));
+  for (unsigned i = 0; i < result.num; i++)
+  {
+    result.setUInt(operand.getUInt(i), i);
+  }
+// WorkItem::InterpreterCache //
+InterpreterCache::InterpreterCache(llvm::Function *kernel)
+  // TODO: Determine this number dynamically?
+  m_valueIDs.reserve(1024);
+  // Add global variables to cache
+  // TODO: Only add variables that are used?
+  const llvm::Module *module = kernel->getParent();
+  llvm::Module::const_global_iterator G;
+  for (G = module->global_begin(); G != module->global_end(); G++)
+  {
+    addValueID(G);
+  }
+  set<llvm::Function*> processed;
+  set<llvm::Function*> pending;
+  pending.insert(kernel);
+  while (!pending.empty())
+  {
+    // Get next function to process
+    llvm::Function *function = *pending.begin();
+    processed.insert(function);
+    pending.erase(function);
+    // Iterate through the function arguments
+    llvm::Function::arg_iterator A;
+    for (A = function->arg_begin(); A != function->arg_end(); A++)
+    {
+      addValueID(A);
+    }
+    // Iterate through instructions in function
+    llvm::inst_iterator I;
+    for (I = inst_begin(function); I != inst_end(function); I++)
+    {
+      addValueID(&*I);
+      // Check for function calls
+      if (I->getOpcode() == llvm::Instruction::Call)
+      {
+        const llvm::CallInst *call = ((const llvm::CallInst*)&*I);
+        llvm::Function *callee =
+          (llvm::Function*)call->getCalledValue()->stripPointerCasts();
+        if (callee->isDeclaration())
+        {
+          // Resolve builtin function calls
+          addBuiltin(callee);
+        }
+        else if (!processed.count(callee))
+        {
+          // Process called function
+          pending.insert(callee);
+        }
+      }
+      // Process operands
+      for (llvm::User::value_op_iterator O = I->value_op_begin();
+           O != I->value_op_end(); O++)
+      {
+        addOperand(*O);
+      }
+    }
+  }
+  ConstantMap::iterator constItr;
+  for (constItr  = m_constants.begin();
+       constItr != m_constants.end(); constItr++)
+  {
+    delete[] constItr->second.data;
+  }
+  ConstExprMap::iterator constExprItr;
+  for (constExprItr  = m_constExpressions.begin();
+       constExprItr != m_constExpressions.end(); constExprItr++)
+  {
+    delete constExprItr->second;
+  }
+// Resolve a called function to a builtin implementation and cache the
+// result in m_builtins, keyed by the llvm::Function pointer. Lookup is by
+// exact name first, then by name prefix; FATAL_ERROR is raised when neither
+// table has a match.
+void InterpreterCache::addBuiltin(
+  const llvm::Function *function)
+  // Check if already in cache
+  InterpreterCache::BuiltinMap::iterator fItr = m_builtins.find(function);
+  if (fItr != m_builtins.end())
+  {
+    return;
+  }
+  // Extract unmangled name and overload
+  // Names starting with "_Z" are split as _Z<len><name><overload>: the
+  // decimal number after the prefix gives the count of name characters and
+  // whatever follows the name is kept verbatim as the overload string.
+  string name, overload;
+  const string fullname = function->getName().str();
+  if (fullname.compare(0,2, "_Z") == 0)
+  {
+    int len = atoi(fullname.c_str()+2);
+    // NOTE(review): find_first_not_of returns npos when only digits follow
+    // "_Z"; start would then be -1 - confirm such names cannot occur.
+    int start = fullname.find_first_not_of("0123456789", 2);
+    name = fullname.substr(start, len);
+    overload = fullname.substr(start + len);
+  }
+  else
+  {
+    // No "_Z" prefix: use the symbol name as-is with an empty overload
+    name = fullname;
+    overload = "";
+  }
+  // Find builtin function in map
+  BuiltinFunctionMap::iterator bItr = workItemBuiltins.find(name);
+  if (bItr != workItemBuiltins.end())
+  {
+    // Add builtin to cache
+    const InterpreterCache::Builtin builtin = {bItr->second, name, overload};
+    m_builtins[function] = builtin;
+    return;
+  }
+  // Check for builtin with matching prefix
+  // The first list entry whose key is a prefix of the name wins
+  BuiltinFunctionPrefixList::iterator pItr;
+  for (pItr = workItemPrefixBuiltins.begin();
+       pItr != workItemPrefixBuiltins.end(); pItr++)
+  {
+    if (name.compare(0, pItr->first.length(), pItr->first) == 0)
+    {
+      // Add builtin to cache
+      const InterpreterCache::Builtin builtin = {pItr->second, name, overload};
+      m_builtins[function] = builtin;
+      return;
+    }
+  }
+  // Function didn't match any builtins
+  FATAL_ERROR("Undefined external function: %s", name.c_str());
+InterpreterCache::Builtin InterpreterCache::getBuiltin(
+  const llvm::Function *function) const
+  return m_builtins.at(function);
+void InterpreterCache::addConstant(const llvm::Value *value)
+  // Nothing to do when this constant has been cached before
+  if (m_constants.find(value) != m_constants.end())
+  {
+    return;
+  }
+  // Build the cache entry: element size and count come from getValueSize,
+  // the backing buffer is sized by getTypeSize and filled by getConstantData
+  const pair<unsigned,unsigned> dims = getValueSize(value);
+  TypedValue entry;
+  entry.size = dims.first;
+  entry.num  = dims.second;
+  entry.data = new unsigned char[getTypeSize(value->getType())];
+  getConstantData(entry.data, (const llvm::Constant*)value);
+  m_constants[value] = entry;
+TypedValue InterpreterCache::getConstant(const llvm::Value *operand) const
+  ConstantMap::const_iterator itr = m_constants.find(operand);
+  if (itr == m_constants.end())
+  {
+    FATAL_ERROR("Constant not found in cache (ID %d)", operand->getValueID());
+  }
+  return itr->second;
+const llvm::Instruction* InterpreterCache::getConstantExpr(
+  const llvm::Value *expr) const
+  ConstExprMap::const_iterator itr = m_constExpressions.find(expr);
+  if (itr == m_constExpressions.end())
+  {
+    FATAL_ERROR("Constant expression not found in cache");
+  }
+  return itr->second;
+unsigned InterpreterCache::addValueID(const llvm::Value *value)
+  // Reuse the existing index when the value has been registered already
+  ValueMap::iterator found = m_valueIDs.find(value);
+  if (found != m_valueIDs.end())
+  {
+    return found->second;
+  }
+  // Otherwise hand out the next index, which is the current map size
+  const unsigned next = m_valueIDs.size();
+  return m_valueIDs.insert(make_pair(value, next)).first->second;
+unsigned InterpreterCache::getValueID(const llvm::Value *value) const
+  ValueMap::const_iterator itr = m_valueIDs.find(value);
+  if (itr == m_valueIDs.end())
+  {
+    FATAL_ERROR("Value not found in cache (ID %d)", value->getValueID());
+  }
+  return itr->second;
+unsigned InterpreterCache::getNumValues() const
+  return m_valueIDs.size();
+bool InterpreterCache::hasValue(const llvm::Value *value) const
+  return m_valueIDs.count(value);
+void InterpreterCache::addOperand(const llvm::Value *operand)
+  addValueID(operand);
+  // Resolve constants
+  if (operand->getValueID() == llvm::Value::UndefValueVal            ||
+      operand->getValueID() == llvm::Value::ConstantAggregateZeroVal ||
+      operand->getValueID() == llvm::Value::ConstantDataArrayVal     ||
+      operand->getValueID() == llvm::Value::ConstantDataVectorVal    ||
+      operand->getValueID() == llvm::Value::ConstantIntVal           ||
+      operand->getValueID() == llvm::Value::ConstantFPVal            ||
+      operand->getValueID() == llvm::Value::ConstantArrayVal         ||
+      operand->getValueID() == llvm::Value::ConstantStructVal        ||
+      operand->getValueID() == llvm::Value::ConstantVectorVal        ||
+      operand->getValueID() == llvm::Value::ConstantPointerNullVal)
+  {
+    addConstant(operand);
+  }
+  else if (operand->getValueID() == llvm::Value::ConstantExprVal)
+  {
+    // Resolve constant expressions
+    const llvm::ConstantExpr *expr = (const llvm::ConstantExpr*)operand;
+    if (!m_constExpressions.count(expr))
+    {
+      for (llvm::User::const_op_iterator O = expr->op_begin();
+           O != expr->op_end(); O++)
+      {
+        addOperand(*O);
+      }
+      m_constExpressions[expr] = getConstExprAsInstruction(expr);
+      // TODO: Resolve actual value?
+    }
+  }
+// WorkItem::MemoryPool //
+WorkItem::MemoryPool::MemoryPool(size_t blockSize) : m_blockSize(blockSize)
+  // Force first allocation to create new block
+  m_offset = m_blockSize;
+  list<unsigned char*>::iterator itr;
+  for (itr = m_blocks.begin(); itr != m_blocks.end(); itr++)
+  {
+    delete[] *itr;
+  }
+unsigned char* WorkItem::MemoryPool::alloc(size_t size)
+  // Returns a buffer of `size` bytes carved from the pool. Every block
+  // created here is recorded in m_blocks.
+  // Check if requested size larger than block size
+  if (size > m_blockSize)
+  {
+    // Oversized buffers allocated separately from main pool. They go to the
+    // back of the list so the front stays the block currently being filled.
+    unsigned char *buffer = new unsigned char[size];
+    m_blocks.push_back(buffer);
+    return buffer;
+  }
+  // Check if enough space in current block. The emptiness test covers a
+  // zero-byte request on a pool with no blocks, which would otherwise call
+  // front() on an empty list.
+  if (m_blocks.empty() || m_offset + size > m_blockSize)
+  {
+    // Allocate new block
+    m_blocks.push_front(new unsigned char[m_blockSize]);
+    m_offset = 0;
+  }
+  // Hand out the next free bytes of the front block and advance the cursor
+  unsigned char *buffer = m_blocks.front() + m_offset;
+  m_offset += size;
+  return buffer;
+TypedValue WorkItem::MemoryPool::clone(const TypedValue& source)
+  // Copy the element size and count, then duplicate the payload into
+  // storage obtained from this pool
+  TypedValue copy;
+  copy.size = source.size;
+  copy.num = source.num;
+  copy.data = alloc(copy.size*copy.num);
+  memcpy(copy.data, source.data, copy.size*copy.num);
+  return copy;
diff --git a/src/core/WorkItem.h b/src/core/WorkItem.h
new file mode 100644
index 0000000..ae8380c
--- /dev/null
+++ b/src/core/WorkItem.h
@@ -0,0 +1,213 @@
+// WorkItem.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+namespace llvm
+  class CallInst;
+  class ConstExpr;
+  class DbgValueInst;
+  class Function;
+  class Module;
+namespace oclgrind
+  class Context;
+  class Kernel;
+  class KernelInvocation;
+  class Memory;
+  class WorkGroup;
+  class WorkItem;
+  class WorkItemBuiltins;
+  // Data structures for builtin functions
+  // Pairs a builtin's entry point with an opaque pointer that is handed
+  // back to it as the final argument on each call.
+  typedef struct _BuiltinFunction
+  {
+    // Entry point: (work-item, call instruction, two strings, result
+    // slot, opaque pointer)
+    void (*func)(WorkItem*, const llvm::CallInst*,
+                 const std::string&, const std::string&, TypedValue&, void*);
+    // Opaque pointer stored alongside func
+    void *op;
+    // Default-construct with null members so that an entry created
+    // implicitly (e.g. by a map's operator[]) never holds indeterminate
+    // pointers.
+    _BuiltinFunction() : func(nullptr), op(nullptr) {}
+    _BuiltinFunction(void (*f)(WorkItem*, const llvm::CallInst*,
+                     const std::string&, const std::string&, TypedValue&,
+                     void*),
+                     void *o) : func(f), op(o) {}
+  } BuiltinFunction;
+  typedef std::unordered_map<std::string,BuiltinFunction> BuiltinFunctionMap;
+  typedef std::list< std::pair<std::string, BuiltinFunction> >
+    BuiltinFunctionPrefixList;
+  extern BuiltinFunctionMap workItemBuiltins;
+  extern BuiltinFunctionPrefixList workItemPrefixBuiltins;
+  // Per-kernel cache for various interpreter state information
+  class InterpreterCache
+  {
+  public:
+    typedef struct
+    {
+      BuiltinFunction function;
+      std::string name, overload;
+    } Builtin;
+    InterpreterCache(llvm::Function *kernel);
+    ~InterpreterCache();
+    void addBuiltin(const llvm::Function *function);
+    Builtin getBuiltin(const llvm::Function *function) const;
+    void addConstant(const llvm::Value *constant);
+    TypedValue getConstant(const llvm::Value *operand) const;
+    const llvm::Instruction* getConstantExpr(const llvm::Value *expr) const;
+    unsigned addValueID(const llvm::Value *value);
+    unsigned getValueID(const llvm::Value *value) const;
+    unsigned getNumValues() const;
+    bool hasValue(const llvm::Value *value) const;
+  private:
+    typedef std::unordered_map<const llvm::Value*, unsigned> ValueMap;
+    typedef std::unordered_map<const llvm::Function*, Builtin> BuiltinMap;
+    typedef std::unordered_map<const llvm::Value*, TypedValue> ConstantMap;
+    typedef std::unordered_map<const llvm::Value*, const llvm::Instruction*>
+      ConstExprMap;
+    BuiltinMap m_builtins;
+    ConstantMap m_constants;
+    ConstExprMap m_constExpressions;
+    ValueMap m_valueIDs;
+    void addOperand(const llvm::Value *value);
+  };
+  class WorkItem
+  {
+    friend class WorkItemBuiltins;
+  public:
+    enum State {READY, BARRIER, FINISHED};
+  private:
+    class MemoryPool
+    {
+    public:
+      MemoryPool(size_t blockSize = 1024);
+      ~MemoryPool();
+      unsigned char* alloc(size_t size);
+      TypedValue clone(const TypedValue& source);
+    private:
+      size_t m_blockSize;
+      size_t m_offset;
+      std::list<unsigned char *> m_blocks;
+    } mutable m_pool;
+  public:
+    WorkItem(const KernelInvocation *kernelInvocation,
+             WorkGroup *workGroup, Size3 lid);
+    virtual ~WorkItem();
+    void clearBarrier();
+    void dispatch(const llvm::Instruction *instruction, TypedValue& result);
+    void execute(const llvm::Instruction *instruction);
+    const std::stack<const llvm::Instruction*>& getCallStack() const;
+    const llvm::Instruction* getCurrentInstruction() const;
+    Size3 getGlobalID() const;
+    size_t getGlobalIndex() const;
+    Size3 getLocalID() const;
+    TypedValue getOperand(const llvm::Value *operand) const;
+    Memory* getPrivateMemory() const;
+    State getState() const;
+    const unsigned char* getValueData(const llvm::Value *value) const;
+    const llvm::Value* getVariable(std::string name) const;
+    const WorkGroup* getWorkGroup() const;
+    bool printValue(const llvm::Value *value) const;
+    bool printVariable(std::string name) const;
+    State step();
+    // SPIR instructions
+  private:
+#define INSTRUCTION(name) \
+  void name(const llvm::Instruction *instruction, TypedValue& result)
+    INSTRUCTION(add);
+    INSTRUCTION(alloc);
+    INSTRUCTION(ashr);
+    INSTRUCTION(bitcast);
+    INSTRUCTION(bwand);
+    INSTRUCTION(bwor);
+    INSTRUCTION(bwxor);
+    INSTRUCTION(call);
+    INSTRUCTION(extractelem);
+    INSTRUCTION(extractval);
+    INSTRUCTION(fadd);
+    INSTRUCTION(fcmp);
+    INSTRUCTION(fdiv);
+    INSTRUCTION(fmul);
+    INSTRUCTION(fpext);
+    INSTRUCTION(fptosi);
+    INSTRUCTION(fptoui);
+    INSTRUCTION(fptrunc);
+    INSTRUCTION(frem);
+    INSTRUCTION(fsub);
+    INSTRUCTION(gep);
+    INSTRUCTION(icmp);
+    INSTRUCTION(insertelem);
+    INSTRUCTION(insertval);
+    INSTRUCTION(inttoptr);
+    INSTRUCTION(itrunc);
+    INSTRUCTION(load);
+    INSTRUCTION(lshr);
+    INSTRUCTION(mul);
+    INSTRUCTION(phi);
+    INSTRUCTION(ptrtoint);
+    INSTRUCTION(ret);
+    INSTRUCTION(sdiv);
+    INSTRUCTION(select);
+    INSTRUCTION(sext);
+    INSTRUCTION(shl);
+    INSTRUCTION(shuffle);
+    INSTRUCTION(sitofp);
+    INSTRUCTION(srem);
+    INSTRUCTION(store);
+    INSTRUCTION(sub);
+    INSTRUCTION(swtch);
+    INSTRUCTION(udiv);
+    INSTRUCTION(uitofp);
+    INSTRUCTION(urem);
+    INSTRUCTION(zext);
+  private:
+    typedef std::map<std::string, const llvm::Value*> VariableMap;
+    size_t m_globalIndex;
+    Size3 m_globalID;
+    Size3 m_localID;
+    TypedValueMap m_phiTemps;
+    VariableMap m_variables;
+    const Context *m_context;
+    const KernelInvocation *m_kernelInvocation;
+    Memory *m_privateMemory;
+    WorkGroup *m_workGroup;
+    State m_state;
+    struct Position;
+    Position *m_position;
+    Memory* getMemory(unsigned int addrSpace) const;
+    // Store for instruction results and other operand values
+    std::vector<TypedValue> m_values;
+    TypedValue getValue(const llvm::Value *key) const;
+    bool hasValue(const llvm::Value *key) const;
+    void setValue(const llvm::Value *key, TypedValue value);
+    const InterpreterCache *m_cache;
+  };
diff --git a/src/core/WorkItemBuiltins.cpp b/src/core/WorkItemBuiltins.cpp
new file mode 100644
index 0000000..cce6da6
--- /dev/null
+++ b/src/core/WorkItemBuiltins.cpp
@@ -0,0 +1,3561 @@
+// WorkItemBuiltins.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+#include <algorithm>
+#include <fenv.h>
+#include <mutex>
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#if LLVM_VERSION > 36
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "CL/cl.h"
+#include "Context.h"
+#include "half.h"
+#include "KernelInvocation.h"
+#include "Memory.h"
+#include "WorkGroup.h"
+#include "WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+#define CLK_ADDRESS_NONE 0x0000
+#define CLK_ADDRESS_CLAMP 0x0004
+#define CLK_ADDRESS_REPEAT 0x0006
+#define CLK_ADDRESS_MASK 0x000E
+#define CLK_FILTER_NEAREST 0x0010
+#define CLK_FILTER_LINEAR 0x0020
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+namespace oclgrind
+  static mutex printfMutex;
+  class WorkItemBuiltins
+  {
+    // Utility macros for creating builtins
+#define DEFINE_BUILTIN(name)                                           \
+  static void name(WorkItem *workItem, const llvm::CallInst *callInst, \
+                   const string& fnName, const string& overload,       \
+                   TypedValue& result, void *)
+#define ARG(i) (callInst->getArgOperand(i))
+#define UARGV(i,v) workItem->getOperand(ARG(i)).getUInt(v)
+#define SARGV(i,v) workItem->getOperand(ARG(i)).getSInt(v)
+#define FARGV(i,v) workItem->getOperand(ARG(i)).getFloat(v)
+#define PARGV(i,v) workItem->getOperand(ARG(i)).getPointer(v)
+#define UARG(i) UARGV(i, 0)
+#define SARG(i) SARGV(i, 0)
+#define FARG(i) FARGV(i, 0)
+#define PARG(i) PARGV(i, 0)
+    // Functions that apply generic builtins to each component of a vector
+    static void f1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, double (*func)(double))
+    {
+      // Apply the unary floating-point function to each component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const double x = FARGV(0, lane);
+        result.setFloat(func(x), lane);
+      }
+    }
+    static void f2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, double (*func)(double, double))
+    {
+      // Apply the binary floating-point function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const double x = FARGV(0, lane);
+        const double y = FARGV(1, lane);
+        result.setFloat(func(x, y), lane);
+      }
+    }
+    static void f3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, double (*func)(double, double, double))
+    {
+      // Apply the ternary floating-point function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const double x = FARGV(0, lane);
+        const double y = FARGV(1, lane);
+        const double z = FARGV(2, lane);
+        result.setFloat(func(x, y, z), lane);
+      }
+    }
+    static void u1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, uint64_t (*func)(uint64_t))
+    {
+      // Apply the unary unsigned-integer function to each component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const uint64_t x = UARGV(0, lane);
+        result.setUInt(func(x), lane);
+      }
+    }
+    static void u2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, uint64_t (*func)(uint64_t, uint64_t))
+    {
+      // Apply the binary unsigned-integer function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const uint64_t x = UARGV(0, lane);
+        const uint64_t y = UARGV(1, lane);
+        result.setUInt(func(x, y), lane);
+      }
+    }
+    static void u3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, uint64_t (*func)(uint64_t, uint64_t, uint64_t))
+    {
+      // Apply the ternary unsigned-integer function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const uint64_t x = UARGV(0, lane);
+        const uint64_t y = UARGV(1, lane);
+        const uint64_t z = UARGV(2, lane);
+        result.setUInt(func(x, y, z), lane);
+      }
+    }
+    static void s1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, int64_t (*func)(int64_t))
+    {
+      // Apply the unary signed-integer function to each component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const int64_t x = SARGV(0, lane);
+        result.setSInt(func(x), lane);
+      }
+    }
+    static void s2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, int64_t (*func)(int64_t, int64_t))
+    {
+      // Apply the binary signed-integer function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const int64_t x = SARGV(0, lane);
+        const int64_t y = SARGV(1, lane);
+        result.setSInt(func(x, y), lane);
+      }
+    }
+    static void s3arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                      const string& name, const string& overload,
+                      TypedValue& result, int64_t (*func)(int64_t, int64_t, int64_t))
+    {
+      // Apply the ternary signed-integer function component by component
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const int64_t x = SARGV(0, lane);
+        const int64_t y = SARGV(1, lane);
+        const int64_t z = SARGV(2, lane);
+        result.setSInt(func(x, y, z), lane);
+      }
+    }
+    static void rel1arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                        const string& name, const string& overload,
+                        TypedValue& result, int64_t (*func)(double))
+    {
+      // The predicate's value is multiplied by -1 when the result has more
+      // than one component and by 1 otherwise
+      const int64_t scale = (result.num > 1) ? -1 : 1;
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const int64_t flag = func(FARGV(0, lane));
+        result.setSInt(flag*scale, lane);
+      }
+    }
+    static void rel2arg(WorkItem *workItem, const llvm::CallInst *callInst,
+                        const string& name, const string& overload,
+                        TypedValue& result, int64_t (*func)(double, double))
+    {
+      // The predicate's value is multiplied by -1 when the result has more
+      // than one component and by 1 otherwise
+      const int64_t scale = (result.num > 1) ? -1 : 1;
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        const double x = FARGV(0, lane);
+        const double y = FARGV(1, lane);
+        const int64_t flag = func(x, y);
+        result.setSInt(flag*scale, lane);
+      }
+    }
+    // Extract the (first) argument type from an overload string
+    static char getOverloadArgType(const string& overload)
+    {
+      // Unless the string starts with 'D', its first character is the code
+      if (overload[0] != 'D')
+      {
+        return overload[0];
+      }
+      // Otherwise skip the two leading characters and the decimal number
+      // after them; the code is one character past the end of that number
+      // (presumably the "Dv<N>_<type>" vector form - confirm).
+      char *afterNumber;
+      strtol(overload.c_str() + 2, &afterNumber, 10);
+      return afterNumber[1];
+    }
+    ///////////////////////////////////////
+    // Async Copy and Prefetch Functions //
+    ///////////////////////////////////////
+    DEFINE_BUILTIN(async_work_group_copy)
+    {
+      // Arguments are consumed in order: dest, src, count, [stride], event
+      int next = 0;
+      const llvm::Value *destOp = ARG(next++);
+      const llvm::Value *srcOp = ARG(next++);
+      size_t dest = workItem->getOperand(destOp).getPointer();
+      size_t src = workItem->getOperand(srcOp).getPointer();
+      // Element size is taken from the destination pointer's pointee type
+      unsigned elemSize =
+        getTypeSize(destOp->getType()->getPointerElementType());
+      uint64_t num = UARG(next++);
+      // Only the strided variant carries a stride argument; default is 1
+      uint64_t stride = 1;
+      if (fnName == "async_work_group_strided_copy")
+      {
+        stride = UARG(next++);
+      }
+      size_t event = UARG(next++);
+      // A local-memory destination selects GLOBAL_TO_LOCAL and applies the
+      // stride to the source; anything else selects LOCAL_TO_GLOBAL and
+      // applies the stride to the destination
+      const bool toLocal =
+        destOp->getType()->getPointerAddressSpace() == AddrSpaceLocal;
+      WorkGroup::AsyncCopyType type =
+        toLocal ? WorkGroup::GLOBAL_TO_LOCAL : WorkGroup::LOCAL_TO_GLOBAL;
+      size_t srcStride = toLocal ? stride : 1;
+      size_t destStride = toLocal ? 1 : stride;
+      // Register copy with the work-group and return the resulting event
+      event = workItem->m_workGroup->async_copy(
+        workItem, callInst, type, dest, src,
+        elemSize, num, srcStride, destStride, event);
+      result.setUInt(event);
+    }
+    DEFINE_BUILTIN(wait_group_events)
+    {
+      const uint64_t count = UARG(0);
+      size_t cursor = PARG(1);
+      // Read `count` consecutive size_t event values from private memory;
+      // give up without entering the barrier if any load fails
+      list<size_t> pending;
+      for (unsigned n = 0; n < count; n++, cursor += sizeof(size_t))
+      {
+        size_t handle;
+        if (!workItem->m_privateMemory->load((unsigned char*)&handle,
+            cursor, sizeof(size_t)))
+        {
+          return;
+        }
+        pending.push_back(handle);
+      }
+      // Put the work-item into the barrier state and tell its work-group
+      // which events it is waiting on
+      workItem->m_state = WorkItem::BARRIER;
+      workItem->m_workGroup->notifyBarrier(workItem, callInst,
+                                           CLK_LOCAL_MEM_FENCE, pending);
+    }
+    DEFINE_BUILTIN(prefetch)
+    {
+      // Do nothing.
+    }
+    //////////////////////
+    // Atomic Functions //
+    //////////////////////
+    DEFINE_BUILTIN(atomic_add)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_add");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicAdd, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_and)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_and");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicAnd, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_cmpxchg)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_cmpxchg");
+      }
+      // Arguments 1 and 2 are forwarded unchanged to the compare-exchange
+      const uint32_t previous = target->atomicCmpxchg(addr, UARG(1), UARG(2));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_dec)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_dec");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicDec, addr);
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_inc)
+    {
+      // Select the memory object for the pointer argument's address space
+      Memory *memory =
+        workItem->getMemory(ARG(0)->getType()->getPointerAddressSpace());
+      size_t address = PARG(0);
+      // Verify the address is 4-byte aligned. The message names this
+      // builtin; it previously said "atomic_dec" (copy-paste slip), which
+      // pointed users at the wrong call.
+      if ((address & 0x3) != 0) {
+        workItem->m_context->logError("Unaligned address on atomic_inc");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      uint32_t old = memory->atomic(AtomicInc, address);
+      result.setUInt(old);
+    }
+    DEFINE_BUILTIN(atomic_max)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_max");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicMax, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_min)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_min");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicMin, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_or)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_or");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicOr, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_sub)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_sub");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicSub, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_xchg)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_xchg");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicXchg, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    DEFINE_BUILTIN(atomic_xor)
+    {
+      // Select the memory object for the pointer argument's address space
+      const unsigned space = ARG(0)->getType()->getPointerAddressSpace();
+      Memory *target = workItem->getMemory(space);
+      const size_t addr = PARG(0);
+      // Report (but do not abort on) addresses that are not 4-byte aligned
+      if (addr & 0x3)
+      {
+        workItem->m_context->logError("Unaligned address on atomic_xor");
+      }
+      // The result is the 32-bit value returned by the memory operation
+      const uint32_t previous = target->atomic(AtomicXor, addr, UARG(1));
+      result.setUInt(previous);
+    }
+    //////////////////////
+    // Common Functions //
+    //////////////////////
+    // Returns a when a compares greater than b, otherwise b
+    template<typename T> T static _max_(T a, T b)
+    {
+      if (a > b)
+      {
+        return a;
+      }
+      return b;
+    }
+    // Returns a when a compares less than b, otherwise b
+    template<typename T> T static _min_(T a, T b)
+    {
+      if (a < b)
+      {
+        return a;
+      }
+      return b;
+    }
+    // Raises x to at least min, then caps the outcome at max
+    template<typename T> T static _clamp_(T x, T min, T max)
+    {
+      const T raised = _max_(x, min);
+      return _min_(raised, max);
+    }
+    static double _degrees_(double x)
+    {
+      // Scale by 180/pi
+      const double factor = 180 / M_PI;
+      return x * factor;
+    }
+    static double _radians_(double x)
+    {
+      // Scale by pi/180
+      const double factor = M_PI / 180;
+      return x * factor;
+    }
+    // Sign of x: 1.0 for positive values, -1.0 for negative values, 0.0 for
+    // NaN, and the input itself for zeros so that +0.0 stays +0.0 and -0.0
+    // stays -0.0.
+    static double _sign_(double x)
+    {
+      if (::isnan(x)) return  0.0;
+      if (x  >  0.0)  return  1.0;
+      if (x  <  0.0)  return -1.0;
+      // Only +0.0 and -0.0 reach this point. Comparing against -0.0 cannot
+      // tell them apart (the two zeros compare equal), so return x unchanged
+      // to keep its sign bit.
+      return x;
+    }
+    DEFINE_BUILTIN(clamp)
+    {
+      // Dispatch on the type code of the first argument
+      const char type = getOverloadArgType(overload);
+      switch (type)
+      {
+        case 'f':
+        case 'd':
+        {
+          if (!ARG(1)->getType()->isVectorTy())
+          {
+            // Non-vector bounds: component 0 of arguments 1 and 2 is applied
+            // to every component of argument 0
+            for (unsigned lane = 0; lane < result.num; ++lane)
+            {
+              const double value = FARGV(0, lane);
+              const double lower = FARG(1);
+              const double upper = FARG(2);
+              result.setFloat(_clamp_(value, lower, upper), lane);
+            }
+          }
+          else
+          {
+            // Vector bounds: clamp component by component
+            f3arg(workItem, callInst, fnName, overload, result, _clamp_);
+          }
+          break;
+        }
+        case 'h':
+        case 't':
+        case 'j':
+        case 'm':
+          // Handled as unsigned integers
+          u3arg(workItem, callInst, fnName, overload, result, _clamp_);
+          break;
+        case 'c':
+        case 's':
+        case 'i':
+        case 'l':
+          // Handled as signed integers
+          s3arg(workItem, callInst, fnName, overload, result, _clamp_);
+          break;
+        default:
+          FATAL_ERROR("Unsupported argument type: %c", type);
+      }
+    }
+    // max builtin: element-wise maximum of arguments 0 and 1. The
+    // implementation is selected from the overload's argument type; unknown
+    // types raise a fatal error.
+    DEFINE_BUILTIN(max)
+    {
+      switch (getOverloadArgType(overload))
+      {
+        // Floating-point overloads
+        case 'f':
+        case 'd':
+          if (ARG(1)->getType()->isVectorTy())
+          {
+            f2arg(workItem, callInst, fnName, overload, result, fmax);
+          }
+          else
+          {
+            // Scalar second argument: compare it with every element of
+            // argument 0
+            for (unsigned i = 0; i < result.num; i++)
+            {
+              double x = FARGV(0, i);
+              double y = FARG(1);
+              result.setFloat(_max_(x, y), i);
+            }
+          }
+          break;
+        // Unsigned integer overloads
+        case 'h':
+        case 't':
+        case 'j':
+        case 'm':
+          u2arg(workItem, callInst, fnName, overload, result, _max_);
+          break;
+        // Signed integer overloads
+        case 'c':
+        case 's':
+        case 'i':
+        case 'l':
+          s2arg(workItem, callInst, fnName, overload, result, _max_);
+          break;
+        default:
+          FATAL_ERROR("Unsupported argument type: %c",
+                      getOverloadArgType(overload));
+      }
+    }
+    // min builtin: element-wise minimum of arguments 0 and 1. The
+    // implementation is selected from the overload's argument type; unknown
+    // types raise a fatal error.
+    DEFINE_BUILTIN(min)
+    {
+      switch (getOverloadArgType(overload))
+      {
+        // Floating-point overloads
+        case 'f':
+        case 'd':
+          if (ARG(1)->getType()->isVectorTy())
+          {
+            f2arg(workItem, callInst, fnName, overload, result, fmin);
+          }
+          else
+          {
+            // Scalar second argument: compare it with every element of
+            // argument 0
+            for (unsigned i = 0; i < result.num; i++)
+            {
+              double x = FARGV(0, i);
+              double y = FARG(1);
+              result.setFloat(_min_(x, y), i);
+            }
+          }
+          break;
+        // Unsigned integer overloads
+        case 'h':
+        case 't':
+        case 'j':
+        case 'm':
+          u2arg(workItem, callInst, fnName, overload, result, _min_);
+          break;
+        // Signed integer overloads
+        case 'c':
+        case 's':
+        case 'i':
+        case 'l':
+          s2arg(workItem, callInst, fnName, overload, result, _min_);
+          break;
+        default:
+          FATAL_ERROR("Unsupported argument type: %c",
+                      getOverloadArgType(overload));
+      }
+    }
+    // mix builtin: linear blend of arguments 0 (x) and 1 (y), computed per
+    // element as x + (y - x) * a. The weight a comes from argument 2, which
+    // may be a vector (one weight per element) or a scalar shared by all
+    // elements.
+    DEFINE_BUILTIN(mix)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        double y = FARGV(1, i);
+        double a = ARG(2)->getType()->isVectorTy() ? FARGV(2, i) : FARG(2);
+        double r = x + (y - x) * a;
+        result.setFloat(r, i);
+      }
+    }
+    // smoothstep builtin: for each element of argument 2 (x), computes
+    // t = clamp((x - edge0) / (edge1 - edge0), 0, 1) and returns
+    // t * t * (3 - 2 * t). Each edge (arguments 0 and 1) may be a vector or
+    // a scalar shared by all elements.
+    DEFINE_BUILTIN(smoothstep)
+    {
+      const bool edge0IsVector = ARG(0)->getType()->isVectorTy();
+      const bool edge1IsVector = ARG(1)->getType()->isVectorTy();
+      for (unsigned lane = 0; lane < result.num; lane++)
+      {
+        double lower;
+        if (edge0IsVector)
+        {
+          lower = FARGV(0, lane);
+        }
+        else
+        {
+          lower = FARG(0);
+        }
+        double upper;
+        if (edge1IsVector)
+        {
+          upper = FARGV(1, lane);
+        }
+        else
+        {
+          upper = FARG(1);
+        }
+        double value = FARGV(2, lane);
+        double t = _clamp_<double>((value - lower) / (upper - lower), 0, 1);
+        result.setFloat(t * t * (3 - 2*t), lane);
+      }
+    }
+    // step builtin: for each element of argument 1 (x), returns 0.0 when
+    // x < edge and 1.0 otherwise. The edge (argument 0) may be a vector or a
+    // scalar shared by all elements.
+    DEFINE_BUILTIN(step)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double edge = ARG(0)->getType()->isVectorTy() ? FARGV(0, i) : FARG(0);
+        double x = FARGV(1, i);
+        double r = (x < edge) ? 0.0 : 1.0;
+        result.setFloat(r, i);
+      }
+    }
+    /////////////////////////
+    // Geometric Functions //
+    /////////////////////////
+    // cross builtin: cross product of the first three elements of arguments
+    // 0 and 1. Element 3 of the result is always set to zero.
+    DEFINE_BUILTIN(cross)
+    {
+      double u[3];
+      double v[3];
+      for (unsigned c = 0; c < 3; c++)
+      {
+        u[c] = FARGV(0, c);
+      }
+      for (unsigned c = 0; c < 3; c++)
+      {
+        v[c] = FARGV(1, c);
+      }
+      result.setFloat(u[1]*v[2] - u[2]*v[1], 0);
+      result.setFloat(u[2]*v[0] - u[0]*v[2], 1);
+      result.setFloat(u[0]*v[1] - u[1]*v[0], 2);
+      result.setFloat(0, 3);
+    }
+    // dot builtin: sum of the element-wise products of arguments 0 and 1.
+    // Scalar arguments are treated as single-element vectors.
+    DEFINE_BUILTIN(dot)
+    {
+      unsigned num = 1;
+      if (ARG(0)->getType()->isVectorTy())
+      {
+        num = ARG(0)->getType()->getVectorNumElements();
+      }
+      double r = 0.0;
+      for (unsigned i = 0; i < num; i++)
+      {
+        double a = FARGV(0, i);
+        double b = FARGV(1, i);
+        r += a * b;
+      }
+      result.setFloat(r);
+    }
+    // distance builtin: square root of the sum of squared element
+    // differences between arguments 0 and 1. Scalar arguments are treated as
+    // single-element vectors.
+    DEFINE_BUILTIN(distance)
+    {
+      const bool isVector = ARG(0)->getType()->isVectorTy();
+      const unsigned count =
+        isVector ? ARG(0)->getType()->getVectorNumElements() : 1;
+      double sumSq = 0.0;
+      for (unsigned c = 0; c < count; c++)
+      {
+        const double delta = FARGV(0, c) - FARGV(1, c);
+        sumSq += delta*delta;
+      }
+      result.setFloat(sqrt(sumSq));
+    }
+    // length builtin: square root of the sum of the squared elements of
+    // argument 0. A scalar argument is treated as a single-element vector.
+    DEFINE_BUILTIN(length)
+    {
+      const bool isVector = ARG(0)->getType()->isVectorTy();
+      const unsigned count =
+        isVector ? ARG(0)->getType()->getVectorNumElements() : 1;
+      double sumSq = 0.0;
+      for (unsigned c = 0; c < count; c++)
+      {
+        const double element = FARGV(0, c);
+        sumSq += element * element;
+      }
+      result.setFloat(sqrt(sumSq));
+    }
+    // normalize builtin: divides every element of argument 0 by the square
+    // root of the sum of its squared elements. No special handling is applied
+    // when that divisor is zero.
+    DEFINE_BUILTIN(normalize)
+    {
+      double sumSq = 0.0;
+      for (unsigned c = 0; c < result.num; c++)
+      {
+        const double element = FARGV(0, c);
+        sumSq += element * element;
+      }
+      const double norm = sqrt(sumSq);
+      for (unsigned c = 0; c < result.num; c++)
+      {
+        const double element = FARGV(0, c);
+        result.setFloat(element/norm, c);
+      }
+    }
+    /////////////////////
+    // Image Functions //
+    /////////////////////
+    // Returns the size in bytes of one channel for the format's channel data
+    // type, or 0 when the data type is not one of those handled here.
+    static size_t getChannelSize(const cl_image_format& format)
+    {
+      size_t bytes = 0;
+      switch (format.image_channel_data_type)
+      {
+        // 8-bit channel types
+        case CL_SNORM_INT8:
+        case CL_UNORM_INT8:
+        case CL_SIGNED_INT8:
+        case CL_UNSIGNED_INT8:
+          bytes = 1;
+          break;
+        // 16-bit channel types
+        case CL_SNORM_INT16:
+        case CL_UNORM_INT16:
+        case CL_SIGNED_INT16:
+        case CL_UNSIGNED_INT16:
+        case CL_HALF_FLOAT:
+          bytes = 2;
+          break;
+        // 32-bit channel types
+        case CL_SIGNED_INT32:
+        case CL_UNSIGNED_INT32:
+        case CL_FLOAT:
+          bytes = 4;
+          break;
+        default:
+          break;
+      }
+      return bytes;
+    }
+    // Returns the number of channels stored per pixel for the format's
+    // channel order, or 0 when the order is not one of those handled here.
+    static size_t getNumChannels(const cl_image_format& format)
+    {
+      size_t count = 0;
+      switch (format.image_channel_order)
+      {
+        // Single-channel orders
+        case CL_R:
+        case CL_Rx:
+        case CL_A:
+        case CL_INTENSITY:
+        case CL_LUMINANCE:
+          count = 1;
+          break;
+        // Two-channel orders
+        case CL_RG:
+        case CL_RGx:
+        case CL_RA:
+          count = 2;
+          break;
+        // Three-channel orders
+        case CL_RGB:
+        case CL_RGBx:
+          count = 3;
+          break;
+        // Four-channel orders
+        case CL_RGBA:
+        case CL_ARGB:
+        case CL_BGRA:
+          count = 4;
+          break;
+        default:
+          break;
+      }
+      return count;
+    }
+    // Returns true for the channel orders listed below and false for all
+    // others. The color readers in this file use it to pick 0 rather than 1
+    // as the alpha component of the border color.
+    static bool hasZeroAlphaBorder(const cl_image_format& format)
+    {
+      bool zeroAlpha = false;
+      switch (format.image_channel_order)
+      {
+        case CL_A:
+        case CL_INTENSITY:
+        case CL_Rx:
+        case CL_RA:
+        case CL_RGx:
+        case CL_RGBx:
+        case CL_ARGB:
+        case CL_BGRA:
+        case CL_RGBA:
+          zeroAlpha = true;
+          break;
+        default:
+          break;
+      }
+      return zeroAlpha;
+    }
+    // get_image_array_size builtin: returns the array size stored in the
+    // descriptor of the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_array_size)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const size_t arraySize = img->desc.image_array_size;
+      result.setUInt(arraySize);
+    }
+    // get_image_channel_data_type builtin: returns the channel data type
+    // stored in the format of the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_channel_data_type)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const cl_image_format& fmt = img->format;
+      result.setSInt(fmt.image_channel_data_type);
+    }
+    // get_image_channel_order builtin: returns the channel order stored in
+    // the format of the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_channel_order)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const cl_image_format& fmt = img->format;
+      result.setSInt(fmt.image_channel_order);
+    }
+    // get_image_dim builtin: returns the image width and height in elements
+    // 0 and 1 of the result. When the result has more than two elements, the
+    // depth goes in element 2 and element 3 is set to zero.
+    DEFINE_BUILTIN(get_image_dim)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const bool wantsDepth = result.num > 2;
+      result.setSInt(img->desc.image_width, 0);
+      result.setSInt(img->desc.image_height, 1);
+      if (wantsDepth)
+      {
+        result.setSInt(img->desc.image_depth, 2);
+        result.setSInt(0, 3);
+      }
+    }
+    // get_image_depth builtin: returns the depth stored in the descriptor of
+    // the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_depth)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const size_t depth = img->desc.image_depth;
+      result.setSInt(depth);
+    }
+    // get_image_height builtin: returns the height stored in the descriptor
+    // of the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_height)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const size_t height = img->desc.image_height;
+      result.setSInt(height);
+    }
+    // get_image_width builtin: returns the width stored in the descriptor of
+    // the image passed as argument 0.
+    DEFINE_BUILTIN(get_image_width)
+    {
+      const Image *img = *(Image**)(workItem->getValue(ARG(0)).data);
+      const size_t width = img->desc.image_width;
+      result.setSInt(width);
+    }
+    // Reads element `index` of the coordinate operand `value` as a float.
+    // `type` selects how the operand is interpreted: 'i' reads it as a signed
+    // integer, 'f' reads it as a floating-point value. Any other type raises
+    // a fatal error.
+    static inline float getCoordinate(const llvm::Value *value, int index,
+                                      char type, WorkItem *workItem)
+    {
+      if (type == 'i')
+      {
+        return workItem->getOperand(value).getSInt(index);
+      }
+      if (type == 'f')
+      {
+        return workItem->getOperand(value).getFloat(index);
+      }
+      FATAL_ERROR("Unsupported coordinate type: '%c'", type);
+    }
+    // Maps a coordinate to the index of the nearest pixel along one image
+    // dimension, according to the sampler's addressing mode.
+    //   sampler - sampler bitfield; only the CLK_ADDRESS_MASK bits are used
+    //   n       - normalized coordinate (used by the repeat modes)
+    //   u       - unnormalized coordinate (used by the other modes)
+    //   size    - extent of the image along this dimension
+    // CLK_ADDRESS_NONE performs no range limiting and CLK_ADDRESS_CLAMP can
+    // return -1 or size, so the result may lie outside the image. An
+    // unrecognised addressing mode raises a fatal error.
+    static inline int getNearestCoordinate(uint32_t sampler,
+                                           float n, // Normalized
+                                           float u, // Unnormalized
+                                           size_t size)
+    {
+      switch (sampler & CLK_ADDRESS_MASK)
+      {
+        case CLK_ADDRESS_NONE:
+          return floor(u);
+        case CLK_ADDRESS_CLAMP_TO_EDGE:
+          // Limit to the first/last pixel of the image
+          return _clamp_<int>(floor(u), 0, size - 1);
+        case CLK_ADDRESS_CLAMP:
+          // Limit to one pixel beyond either edge of the image
+          return _clamp_<int>(floor(u), -1, size);
+        case CLK_ADDRESS_REPEAT:
+          // Keep only the fractional part of the normalized coordinate
+          return (int)floorf((n - floorf(n))*size) % size;
+        case CLK_ADDRESS_MIRRORED_REPEAT:
+          // Reflect the normalized coordinate about the nearest even integer
+          return _min_<int>((int)floorf(fabsf(n - 2.f * rintf(0.5f*n)) * size),
+                            size - 1);
+        default:
+          FATAL_ERROR("Unsupported sampler addressing mode: %X",
+                      sampler & CLK_ADDRESS_MASK);
+      }
+    }
+    // Computes the indices of the two adjacent pixels along one image
+    // dimension that a linear filter combines, according to the sampler's
+    // addressing mode.
+    //   sampler - sampler bitfield; only the CLK_ADDRESS_MASK bits are used
+    //   n       - normalized coordinate (used by the repeat modes)
+    //   u       - unnormalized coordinate (used by the other modes)
+    //   size    - extent of the image along this dimension
+    //   c0, c1  - outputs: indices of the lower and upper pixel
+    // Returns the unnormalized coordinate; the repeat modes recompute it
+    // from n, the other modes return u unchanged. An unrecognised addressing
+    // mode raises a fatal error.
+    static inline float getAdjacentCoordinates(uint32_t sampler,
+                                               float n, // Normalized
+                                               float u, // Unnormalized
+                                               size_t size,
+                                               int *c0, int *c1)
+    {
+      switch (sampler & CLK_ADDRESS_MASK)
+      {
+        case CLK_ADDRESS_NONE:
+          *c0 = floor(u);
+          *c1 = floor(u) + 1;
+          return u;
+        case CLK_ADDRESS_CLAMP_TO_EDGE:
+          // Limit both indices to the first/last pixel of the image
+          *c0 = _clamp_<int>(floorf(u - 0.5f), 0, size - 1);
+          *c1 = _clamp_<int>(floorf(u - 0.5f) + 1, 0, size - 1);
+          return u;
+        case CLK_ADDRESS_CLAMP:
+          // Limit both indices to one pixel beyond either edge of the image
+          *c0 = _clamp_<int>((floorf(u - 0.5f)), -1, size);
+          *c1 = _clamp_<int>((floorf(u - 0.5f)) + 1, -1, size);
+          return u;
+        case CLK_ADDRESS_REPEAT:
+        {
+          // Keep only the fractional part of the normalized coordinate, then
+          // wrap indices that step off either end of the image
+          u = (n - floorf(n)) * size;
+          *c0 = (int)floorf(u - 0.5f);
+          *c1 = *c0 + 1;
+          if (*c0 < 0) *c0 += size;
+          if (*c1 >= size) *c1 -= size;
+          return u;
+        }
+        case CLK_ADDRESS_MIRRORED_REPEAT:
+        {
+          // Reflect the normalized coordinate about the nearest even integer,
+          // then limit the indices to the image
+          u = fabsf(n - 2.0f * rintf(0.5f * n)) * size;
+          *c0 = (int)floorf(u - 0.5f);
+          *c1 = *c0 + 1;
+          *c0 = _max_(*c0, 0);
+          *c1 = _min_(*c1, (int)size-1);
+          return u;
+        }
+        default:
+          FATAL_ERROR("Unsupported sampler addressing mode: %X",
+                      sampler & CLK_ADDRESS_MASK);
+      }
+    }
+    // Maps an output color component (0..3) to the index of the stored
+    // channel that supplies it for the given image format.
+    // Returns the stored channel index, or -1 when the component is not
+    // stored in the image; in that case the constant value to use instead
+    // is written to *ret. An unrecognised channel order raises a fatal error.
+    static inline int getInputChannel(const cl_image_format& format,
+                                      int output, float *ret)
+    {
+      // By default, component N is read from stored channel N
+      int input = output;
+      switch (format.image_channel_order)
+      {
+        // The first three groups deliberately fall through into each other,
+        // so an order with fewer channels also gets the defaults of the
+        // groups below it.
+        case CL_R:
+        case CL_Rx:
+          if (output == 1)
+          {
+            *ret = 0.f;
+            return -1;
+          }
+          // fall through
+        case CL_RG:
+        case CL_RGx:
+          if (output == 2)
+          {
+            *ret = 0.f;
+            return -1;
+          }
+          // fall through
+        case CL_RGB:
+        case CL_RGBx:
+          if (output == 3)
+          {
+            *ret = 1.f;
+            return -1;
+          }
+          break;
+        case CL_RGBA:
+          break;
+        case CL_BGRA:
+          // Components 0 and 2 are stored in swapped positions
+          if (output == 0) input = 2;
+          if (output == 2) input = 0;
+          break;
+        case CL_ARGB:
+          // Component 3 is stored first, followed by components 0, 1, 2
+          if (output == 0) input = 1;
+          if (output == 1) input = 2;
+          if (output == 2) input = 3;
+          if (output == 3) input = 0;
+          break;
+        case CL_A:
+          // Only component 3 is stored; every other component is 0
+          if (output == 3) input = 0;
+          else
+          {
+            *ret = 0.f;
+            return -1;
+          }
+          break;
+        case CL_RA:
+          // Component 0 and component 3 are stored; the others are 0
+          if (output == 3) input = 1;
+          else if (output != 0)
+          {
+            *ret = 0.f;
+            return -1;
+          }
+          break;
+        case CL_INTENSITY:
+          // The single stored channel supplies every component
+          input = 0;
+          break;
+        case CL_LUMINANCE:
+          // The single stored channel supplies components 0..2;
+          // component 3 is 1
+          if (output == 3)
+          {
+            *ret = 1.f;
+            return -1;
+          }
+          input = 0;
+          break;
+        default:
+          FATAL_ERROR("Unsupported image channel order: %X",
+                      format.image_channel_order);
+      }
+      return input;
+    }
+    // Reads color component `c` of the pixel at (i, j, k) in array layer
+    // `layer` and converts it to a float according to the image's channel
+    // data type. Coordinates outside the image yield the border color, a
+    // component that is not stored yields the constant chosen by
+    // getInputChannel, and a failed memory load yields 0. An unsupported
+    // channel data type raises a fatal error.
+    static inline float readNormalizedColor(const Image *image,
+                                            WorkItem *workItem,
+                                            int i, int j, int k,
+                                            int layer, int c)
+    {
+      // Check for out-of-range coordinates
+      const bool inside = i >= 0 && i < image->desc.image_width &&
+                          j >= 0 && j < image->desc.image_height &&
+                          k >= 0 && k < image->desc.image_depth;
+      if (!inside)
+      {
+        // Border color: alpha is 1 unless the format has a zero alpha border
+        const bool opaqueAlpha = (c == 3 && !hasZeroAlphaBorder(image->format));
+        return opaqueAlpha ? 1.f : 0.f;
+      }
+      // Remap channels
+      float constant;
+      const int channel = getInputChannel(image->format, c, &constant);
+      if (channel < 0)
+      {
+        return constant;
+      }
+      // Calculate pixel address
+      const size_t channelSize = getChannelSize(image->format);
+      const size_t numChannels = getNumChannels(image->format);
+      const size_t pixelSize = channelSize*numChannels;
+      const size_t slice = k + layer*image->desc.image_depth;
+      const size_t row = j + slice * image->desc.image_height;
+      const size_t pixel = i + row * image->desc.image_width;
+      const size_t address = image->address
+                              + pixel * pixelSize
+                              + channel*channelSize;
+      // Load channel data
+      unsigned char *raw = workItem->m_pool.alloc(channelSize);
+      const bool loaded =
+        workItem->getMemory(AddrSpaceGlobal)->load(raw, address, channelSize);
+      if (!loaded)
+      {
+        return 0.f;
+      }
+      // Compute normalized color value
+      float value;
+      switch (image->format.image_channel_data_type)
+      {
+        case CL_FLOAT:
+          value = *(float*)raw;
+          break;
+        case CL_HALF_FLOAT:
+          value = halfToFloat(*(uint16_t*)raw);
+          break;
+        case CL_SNORM_INT8:
+          value = _clamp_(*(int8_t*)raw / 127.f, -1.f, 1.f);
+          break;
+        case CL_SNORM_INT16:
+          value = _clamp_(*(int16_t*)raw / 32767.f, -1.f, 1.f);
+          break;
+        case CL_UNORM_INT8:
+          value = _clamp_(*(uint8_t*)raw / 255.f, 0.f, 1.f);
+          break;
+        case CL_UNORM_INT16:
+          value = _clamp_(*(uint16_t*)raw / 65535.f, 0.f, 1.f);
+          break;
+        default:
+          FATAL_ERROR("Unsupported image channel data type: %X",
+                      image->format.image_channel_data_type);
+      }
+      return value;
+    }
+    // Reads color component `c` of the pixel at (i, j, k) in array layer
+    // `layer` as a signed integer. Coordinates outside the image yield the
+    // border color, a component that is not stored yields the constant
+    // chosen by getInputChannel, and a failed memory load yields 0. An
+    // unsupported channel data type raises a fatal error.
+    static inline int32_t readSignedColor(const Image *image,
+                                          WorkItem *workItem,
+                                          int i, int j, int k,
+                                          int layer, int c)
+    {
+      // Check for out-of-range coordinates
+      const bool inside = i >= 0 && i < image->desc.image_width &&
+                          j >= 0 && j < image->desc.image_height &&
+                          k >= 0 && k < image->desc.image_depth;
+      if (!inside)
+      {
+        // Border color: alpha is 1 unless the format has a zero alpha border
+        const bool opaqueAlpha = (c == 3 && !hasZeroAlphaBorder(image->format));
+        return opaqueAlpha ? 1 : 0;
+      }
+      // Remap channels
+      float constant;
+      const int channel = getInputChannel(image->format, c, &constant);
+      if (channel < 0)
+      {
+        return (int32_t)constant;
+      }
+      // Calculate pixel address
+      const size_t channelSize = getChannelSize(image->format);
+      const size_t numChannels = getNumChannels(image->format);
+      const size_t pixelSize = channelSize*numChannels;
+      const size_t slice = k + layer*image->desc.image_depth;
+      const size_t row = j + slice * image->desc.image_height;
+      const size_t pixel = i + row * image->desc.image_width;
+      const size_t address = image->address
+                              + pixel * pixelSize
+                              + channel*channelSize;
+      // Load channel data
+      unsigned char *raw = workItem->m_pool.alloc(channelSize);
+      const bool loaded =
+        workItem->getMemory(AddrSpaceGlobal)->load(raw, address, channelSize);
+      if (!loaded)
+      {
+        return 0;
+      }
+      // Widen the stored value to 32 bits
+      int32_t value;
+      switch (image->format.image_channel_data_type)
+      {
+        case CL_SIGNED_INT32:
+          value = *(int32_t*)raw;
+          break;
+        case CL_SIGNED_INT16:
+          value = *(int16_t*)raw;
+          break;
+        case CL_SIGNED_INT8:
+          value = *(int8_t*)raw;
+          break;
+        default:
+          FATAL_ERROR("Unsupported image channel data type: %X",
+                      image->format.image_channel_data_type);
+      }
+      return value;
+    }
+    // Reads color component `c` of the pixel at (i, j, k) in array layer
+    // `layer` as an unsigned integer. Coordinates outside the image yield
+    // the border color, a component that is not stored yields the constant
+    // chosen by getInputChannel, and a failed memory load yields 0. An
+    // unsupported channel data type raises a fatal error.
+    static inline uint32_t readUnsignedColor(const Image *image,
+                                             WorkItem *workItem,
+                                             int i, int j, int k,
+                                             int layer, int c)
+    {
+      // Check for out-of-range coordinates
+      const bool inside = i >= 0 && i < image->desc.image_width &&
+                          j >= 0 && j < image->desc.image_height &&
+                          k >= 0 && k < image->desc.image_depth;
+      if (!inside)
+      {
+        // Border color: alpha is 1 unless the format has a zero alpha border
+        const bool opaqueAlpha = (c == 3 && !hasZeroAlphaBorder(image->format));
+        return opaqueAlpha ? 1 : 0;
+      }
+      // Remap channels
+      float constant;
+      const int channel = getInputChannel(image->format, c, &constant);
+      if (channel < 0)
+      {
+        return (uint32_t)constant;
+      }
+      // Calculate pixel address
+      const size_t channelSize = getChannelSize(image->format);
+      const size_t numChannels = getNumChannels(image->format);
+      const size_t pixelSize = channelSize*numChannels;
+      const size_t slice = k + layer*image->desc.image_depth;
+      const size_t row = j + slice * image->desc.image_height;
+      const size_t pixel = i + row * image->desc.image_width;
+      const size_t address = image->address
+                              + pixel * pixelSize
+                              + channel*channelSize;
+      // Load channel data
+      unsigned char *raw = workItem->m_pool.alloc(channelSize);
+      const bool loaded =
+        workItem->getMemory(AddrSpaceGlobal)->load(raw, address, channelSize);
+      if (!loaded)
+      {
+        return 0;
+      }
+      // Widen the stored value to 32 bits
+      uint32_t value;
+      switch (image->format.image_channel_data_type)
+      {
+        case CL_UNSIGNED_INT32:
+          value = *(uint32_t*)raw;
+          break;
+        case CL_UNSIGNED_INT16:
+          value = *(uint16_t*)raw;
+          break;
+        case CL_UNSIGNED_INT8:
+          value = *(uint8_t*)raw;
+          break;
+        default:
+          FATAL_ERROR("Unsupported image channel data type: %X",
+                      image->format.image_channel_data_type);
+      }
+      return value;
+    }
+    // Returns x minus the largest integer not greater than x, i.e. the
+    // fractional part measured from floor(x).
+    static inline float frac(float x)
+    {
+      const float whole = floorf(x);
+      return x - whole;
+    }
+    // Trilinear interpolation between eight corner values. In vXYZ, the
+    // digits give the corner's position along the a, b and c axes; a, b and
+    // c are the weights of the "1" corners along each axis.
+    static inline float interpolate(float v000, float v010,
+                                    float v100, float v110,
+                                    float v001, float v011,
+                                    float v101, float v111,
+                                    float a, float b, float c)
+    {
+      // Weights of the "0" corners along each axis
+      const float na = 1-a;
+      const float nb = 1-b;
+      const float nc = 1-c;
+      // Accumulate the terms in a fixed order so that rounding is stable
+      float sum = na * nb * nc * v000;
+      sum +=  a * nb * nc * v100;
+      sum += na *  b * nc * v010;
+      sum +=  a *  b * nc * v110;
+      sum += na * nb *  c * v001;
+      sum +=  a * nb *  c * v101;
+      sum += na *  b *  c * v011;
+      sum +=  a *  b *  c * v111;
+      return sum;
+    }
+    // read_imagef builtin: reads one pixel of floating-point color data from
+    // the image in argument 0 and stores its four components in the result.
+    // With more than two call arguments, argument 1 is the sampler and
+    // argument 2 holds the coordinates; otherwise argument 1 holds the
+    // coordinates and a CLK_ADDRESS_NONE | CLK_FILTER_NEAREST sampler is
+    // used. The sampler decides between nearest-pixel lookup and linear
+    // interpolation of the adjacent pixels.
+    DEFINE_BUILTIN(read_imagef)
+    {
+      const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+      int coordIndex = 1;
+      // Check for sampler version
+      if (callInst->getNumArgOperands() > 2)
+      {
+        sampler = UARG(1);
+        coordIndex = 2;
+      }
+      // Get coordinates
+      // The last character of the overload string selects how the
+      // coordinate operand is read; components that are absent stay at zero
+      float s = 0.f, t = 0.f, r = 0.f;
+      char coordType = *overload.rbegin();
+      s = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+      if (ARG(coordIndex)->getType()->isVectorTy())
+      {
+        t = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+        if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+        {
+          r = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+        }
+      }
+      // Get unnormalized coordinates
+      // Normalized coordinates are scaled by the image extent
+      float u = 0.f, v = 0.f, w = 0.f;
+      bool noormCoords = sampler & CLK_NORMALIZED_COORDS_TRUE;
+      if (noormCoords)
+      {
+        u = s * image->desc.image_width;
+        v = t * image->desc.image_height;
+        w = r * image->desc.image_depth;
+      }
+      else
+      {
+        u = s;
+        v = t;
+        w = r;
+      }
+      // Get array layer index
+      // For image arrays the last coordinate selects the layer (rounded and
+      // clamped to the valid range) and is then zeroed so that it does not
+      // take part in the pixel lookup
+      int layer = 0;
+      if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(t), 0, image->desc.image_array_size - 1);
+        v = t = 0.f;
+      }
+      else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(r), 0, image->desc.image_array_size - 1);
+        w = r = 0.f;
+      }
+      float values[4];
+      if (sampler & CLK_FILTER_LINEAR)
+      {
+        // Get coordinates of adjacent pixels
+        int i0 = 0, i1 = 0, j0 = 0, j1 = 0, k0 = 0, k1 = 0;
+        u = getAdjacentCoordinates(sampler, s, u, image->desc.image_width,
+                                   &i0, &i1);
+        v = getAdjacentCoordinates(sampler, t, v, image->desc.image_height,
+                                   &j0, &j1);
+        w = getAdjacentCoordinates(sampler, r, w, image->desc.image_depth,
+                                   &k0, &k1);
+        // Make sure y and z coordinates are equal for 1 and 2D images
+        // (both ends of an unused axis then refer to the same pixel)
+        if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D ||
+            image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        {
+          j0 = j1;
+          k0 = k1;
+        }
+        else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D ||
+                 image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+        {
+          k0 = k1;
+        }
+        // Perform linear interpolation
+        // a, b and c are the interpolation weights along each axis
+        float a = frac(u - 0.5f);
+        float b = frac(v - 0.5f);
+        float c = frac(w - 0.5f);
+        // Interpolate each of the four color components separately
+        for (int i = 0; i < 4; i++)
+        {
+          values[i] = interpolate(
+            readNormalizedColor(image, workItem, i0, j0, k0, layer, i),
+            readNormalizedColor(image, workItem, i0, j1, k0, layer, i),
+            readNormalizedColor(image, workItem, i1, j0, k0, layer, i),
+            readNormalizedColor(image, workItem, i1, j1, k0, layer, i),
+            readNormalizedColor(image, workItem, i0, j0, k1, layer, i),
+            readNormalizedColor(image, workItem, i0, j1, k1, layer, i),
+            readNormalizedColor(image, workItem, i1, j0, k1, layer, i),
+            readNormalizedColor(image, workItem, i1, j1, k1, layer, i),
+            a, b, c);
+        }
+      }
+      else
+      {
+        // Read values from nearest pixel
+        int i = getNearestCoordinate(sampler, s, u, image->desc.image_width);
+        int j = getNearestCoordinate(sampler, t, v, image->desc.image_height);
+        int k = getNearestCoordinate(sampler, r, w, image->desc.image_depth);
+        values[0] = readNormalizedColor(image, workItem, i, j, k, layer, 0);
+        values[1] = readNormalizedColor(image, workItem, i, j, k, layer, 1);
+        values[2] = readNormalizedColor(image, workItem, i, j, k, layer, 2);
+        values[3] = readNormalizedColor(image, workItem, i, j, k, layer, 3);
+      }
+      // Store values in result
+      for (int i = 0; i < 4; i++)
+      {
+        result.setFloat(values[i], i);
+      }
+    }
+    // read_imagei builtin: reads one pixel of signed integer color data from
+    // the image in argument 0, always from the nearest pixel, and stores its
+    // four components in the result. With more than two call arguments,
+    // argument 1 is the sampler and argument 2 holds the coordinates;
+    // otherwise argument 1 holds the coordinates and a
+    // CLK_ADDRESS_NONE | CLK_FILTER_NEAREST sampler is used.
+    DEFINE_BUILTIN(read_imagei)
+    {
+      const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      // Pick up the sampler argument when the overload has one
+      const bool hasSampler = callInst->getNumArgOperands() > 2;
+      uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+      if (hasSampler)
+      {
+        sampler = UARG(1);
+      }
+      const int coordIndex = hasSampler ? 2 : 1;
+      // Coordinates as supplied by the caller; absent components stay zero
+      const char coordType = *overload.rbegin();
+      float coord[3] = {0.f, 0.f, 0.f};
+      coord[0] = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+      if (ARG(coordIndex)->getType()->isVectorTy())
+      {
+        coord[1] = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+        if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+        {
+          coord[2] = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+        }
+      }
+      // Unnormalized coordinates: scaled by the image extent when the
+      // sampler uses normalized coordinates
+      const size_t extent[3] =
+      {
+        image->desc.image_width,
+        image->desc.image_height,
+        image->desc.image_depth,
+      };
+      const bool normalized = sampler & CLK_NORMALIZED_COORDS_TRUE;
+      float pixel[3];
+      for (int d = 0; d < 3; d++)
+      {
+        pixel[d] = normalized ? coord[d] * extent[d] : coord[d];
+      }
+      // For image arrays the last coordinate selects the layer and is then
+      // zeroed so that it does not take part in the pixel lookup
+      int layer = 0;
+      if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(coord[1]), 0,
+                             image->desc.image_array_size - 1);
+        coord[1] = 0.f;
+        pixel[1] = 0.f;
+      }
+      else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(coord[2]), 0,
+                             image->desc.image_array_size - 1);
+        coord[2] = 0.f;
+        pixel[2] = 0.f;
+      }
+      // Locate the nearest pixel along each dimension
+      int texel[3];
+      for (int d = 0; d < 3; d++)
+      {
+        texel[d] = getNearestCoordinate(sampler, coord[d], pixel[d],
+                                        extent[d]);
+      }
+      // Read all four components, then store them in the result
+      int32_t values[4];
+      for (int c = 0; c < 4; c++)
+      {
+        values[c] = readSignedColor(image, workItem,
+                                    texel[0], texel[1], texel[2], layer, c);
+      }
+      for (int c = 0; c < 4; c++)
+      {
+        result.setSInt(values[c], c);
+      }
+    }
+    // read_imageui builtin: reads one pixel of unsigned integer color data
+    // from the image in argument 0, always from the nearest pixel, and
+    // stores its four components in the result. With more than two call
+    // arguments, argument 1 is the sampler and argument 2 holds the
+    // coordinates; otherwise argument 1 holds the coordinates and a
+    // CLK_ADDRESS_NONE | CLK_FILTER_NEAREST sampler is used.
+    DEFINE_BUILTIN(read_imageui)
+    {
+      const Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      // Pick up the sampler argument when the overload has one
+      const bool hasSampler = callInst->getNumArgOperands() > 2;
+      uint32_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+      if (hasSampler)
+      {
+        sampler = UARG(1);
+      }
+      const int coordIndex = hasSampler ? 2 : 1;
+      // Coordinates as supplied by the caller; absent components stay zero
+      const char coordType = *overload.rbegin();
+      float coord[3] = {0.f, 0.f, 0.f};
+      coord[0] = getCoordinate(ARG(coordIndex), 0, coordType, workItem);
+      if (ARG(coordIndex)->getType()->isVectorTy())
+      {
+        coord[1] = getCoordinate(ARG(coordIndex), 1, coordType, workItem);
+        if (ARG(coordIndex)->getType()->getVectorNumElements() > 2)
+        {
+          coord[2] = getCoordinate(ARG(coordIndex), 2, coordType, workItem);
+        }
+      }
+      // Unnormalized coordinates: scaled by the image extent when the
+      // sampler uses normalized coordinates
+      const size_t extent[3] =
+      {
+        image->desc.image_width,
+        image->desc.image_height,
+        image->desc.image_depth,
+      };
+      const bool normalized = sampler & CLK_NORMALIZED_COORDS_TRUE;
+      float pixel[3];
+      for (int d = 0; d < 3; d++)
+      {
+        pixel[d] = normalized ? coord[d] * extent[d] : coord[d];
+      }
+      // For image arrays the last coordinate selects the layer and is then
+      // zeroed so that it does not take part in the pixel lookup
+      int layer = 0;
+      if (image->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(coord[1]), 0,
+                             image->desc.image_array_size - 1);
+        coord[1] = 0.f;
+        pixel[1] = 0.f;
+      }
+      else if (image->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+      {
+        layer = _clamp_<int>(rintf(coord[2]), 0,
+                             image->desc.image_array_size - 1);
+        coord[2] = 0.f;
+        pixel[2] = 0.f;
+      }
+      // Locate the nearest pixel along each dimension
+      int texel[3];
+      for (int d = 0; d < 3; d++)
+      {
+        texel[d] = getNearestCoordinate(sampler, coord[d], pixel[d],
+                                        extent[d]);
+      }
+      // Read all four components, then store them in the result
+      uint32_t values[4];
+      for (int c = 0; c < 4; c++)
+      {
+        values[c] = readUnsignedColor(image, workItem,
+                                      texel[0], texel[1], texel[2], layer, c);
+      }
+      for (int c = 0; c < 4; c++)
+      {
+        result.setUInt(values[c], c);
+      }
+    }
+    // write_imagef builtin: writes the floating-point color in argument 2 to
+    // the pixel of the image in argument 0 addressed by the integer
+    // coordinates in argument 1. The color components are reordered to match
+    // the image's channel order and converted to its channel data type
+    // before being stored; unsupported orders or data types raise a fatal
+    // error.
+    DEFINE_BUILTIN(write_imagef)
+    {
+      Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      // Get pixel coordinates; components that are absent stay at zero
+      int x = SARGV(1, 0);
+      int y = 0;
+      int z = 0;
+      if (ARG(1)->getType()->isVectorTy())
+      {
+        y = SARGV(1, 1);
+        const bool hasThird = ARG(1)->getType()->getVectorNumElements() > 2;
+        if (hasThird)
+        {
+          z = SARGV(1, 2);
+        }
+      }
+      // Get color data
+      float values[4];
+      for (int c = 0; c < 4; c++)
+      {
+        values[c] = (float)FARGV(2, c);
+      }
+      // Re-order color values so that values[n] holds stored channel n
+      switch (image->format.image_channel_order)
+      {
+        case CL_ARGB:
+          swap(values[2], values[3]);
+          swap(values[1], values[2]);
+          swap(values[0], values[1]);
+          break;
+        case CL_BGRA:
+          swap(values[0], values[2]);
+          break;
+        case CL_A:
+          values[0] = values[3];
+          break;
+        case CL_RA:
+          values[1] = values[3];
+          break;
+        // Orders whose stored channels already match the component order
+        case CL_R:
+        case CL_Rx:
+        case CL_RG:
+        case CL_RGx:
+        case CL_RGB:
+        case CL_RGBx:
+        case CL_RGBA:
+        case CL_INTENSITY:
+        case CL_LUMINANCE:
+          break;
+        default:
+          FATAL_ERROR("Unsupported image channel order: %X",
+                      image->format.image_channel_order);
+      }
+      // Calculate pixel address
+      const size_t channelSize = getChannelSize(image->format);
+      const size_t numChannels = getNumChannels(image->format);
+      const size_t pixelSize = channelSize*numChannels;
+      const size_t row = y + z*image->desc.image_height;
+      const size_t pixel = x + row * image->desc.image_width;
+      const size_t pixelAddress = image->address + pixel * pixelSize;
+      // Generate channel values
+      Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+      unsigned char *data = workItem->m_pool.alloc(pixelSize);
+      for (unsigned ch = 0; ch < numChannels; ch++)
+      {
+        const float value = values[ch];
+        switch (image->format.image_channel_data_type)
+        {
+          case CL_FLOAT:
+            ((float*)data)[ch] = value;
+            break;
+          case CL_HALF_FLOAT:
+            ((uint16_t*)data)[ch] = floatToHalf(value);
+            break;
+          case CL_SNORM_INT8:
+            ((int8_t*)data)[ch] = rint(_clamp_(value * 127.f,
+                                               -128.f, 127.f));
+            break;
+          case CL_SNORM_INT16:
+            ((int16_t*)data)[ch] = rint(_clamp_(value * 32767.f,
+                                                -32768.f, 32767.f));
+            break;
+          case CL_UNORM_INT8:
+            data[ch] = rint(_clamp_(value * 255.f, 0.f, 255.f));
+            break;
+          case CL_UNORM_INT16:
+            ((uint16_t*)data)[ch] = rint(_clamp_(value * 65535.f,
+                                                 0.f, 65535.f));
+            break;
+          default:
+            FATAL_ERROR("Unsupported image channel data type: %X",
+                        image->format.image_channel_data_type);
+        }
+      }
+      // Write pixel data
+      memory->store(data, pixelAddress, pixelSize);
+    }
+    // write_imagei: store a four-component signed integer colour into one
+    // pixel of the image passed as argument 0.
+    //   arg 1 - pixel coordinate; a scalar supplies x only, a vector supplies
+    //           y and (when it has more than two elements) z as well
+    //   arg 2 - colour components, reordered below to match the channel order
+    // Invokes FATAL_ERROR for unhandled channel orders or channel data types.
+    DEFINE_BUILTIN(write_imagei)
+    {
+      Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      // Pixel coordinates; dimensions the coordinate lacks stay at zero
+      int x = SARGV(1, 0);
+      int y = 0;
+      int z = 0;
+      if (ARG(1)->getType()->isVectorTy())
+      {
+        y = SARGV(1, 1);
+        if (ARG(1)->getType()->getVectorNumElements() > 2)
+        {
+          z = SARGV(1, 2);
+        }
+      }
+      // Colour components as supplied by the caller
+      int32_t values[4];
+      for (unsigned c = 0; c < 4; c++)
+      {
+        values[c] = (int32_t)SARGV(2, c);
+      }
+      // Move components into the slots the channel order expects
+      switch (image->format.image_channel_order)
+      {
+      case CL_A:
+        values[0] = values[3];
+        break;
+      case CL_RA:
+        values[1] = values[3];
+        break;
+      case CL_ARGB:
+      {
+        // Rotate right by one: (c0, c1, c2, c3) -> (c3, c0, c1, c2)
+        int32_t last = values[3];
+        values[3] = values[2];
+        values[2] = values[1];
+        values[1] = values[0];
+        values[0] = last;
+        break;
+      }
+      case CL_BGRA:
+        swap(values[0], values[2]);
+        break;
+      case CL_R:
+      case CL_Rx:
+      case CL_RG:
+      case CL_RGx:
+      case CL_RGB:
+      case CL_RGBx:
+      case CL_RGBA:
+      case CL_INTENSITY:
+      case CL_LUMINANCE:
+        // No reordering needed
+        break;
+      default:
+        FATAL_ERROR("Unsupported image channel order: %X",
+                    image->format.image_channel_order);
+      }
+      // Locate the pixel: row-major within a slice, slices stacked along z
+      size_t channelSize = getChannelSize(image->format);
+      size_t numChannels = getNumChannels(image->format);
+      size_t pixelSize = channelSize*numChannels;
+      size_t pixelIndex = x + (y + z*image->desc.image_height)
+                          * image->desc.image_width;
+      size_t pixelAddress = image->address + pixelIndex * pixelSize;
+      // Encode each channel, saturating to the range of the storage type
+      Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+      unsigned char *pixel = workItem->m_pool.alloc(pixelSize);
+      for (unsigned ch = 0; ch < numChannels; ch++)
+      {
+        switch (image->format.image_channel_data_type)
+        {
+          case CL_SIGNED_INT8:
+            ((int8_t*)pixel)[ch] = _clamp_(values[ch], -128, 127);
+            break;
+          case CL_SIGNED_INT16:
+            ((int16_t*)pixel)[ch] = _clamp_(values[ch], -32768, 32767);
+            break;
+          case CL_SIGNED_INT32:
+            ((int32_t*)pixel)[ch] = values[ch];
+            break;
+          default:
+            FATAL_ERROR("Unsupported image channel data type: %X",
+                        image->format.image_channel_data_type);
+        }
+      }
+      // Write pixel data
+      memory->store(pixel, pixelAddress, pixelSize);
+    }
+    // write_imageui: store a four-component unsigned integer colour into one
+    // pixel of the image passed as argument 0.
+    //   arg 1 - pixel coordinate; a scalar supplies x only, a vector supplies
+    //           y and (when it has more than two elements) z as well
+    //   arg 2 - colour components, reordered below to match the channel order
+    // Invokes FATAL_ERROR for unhandled channel orders or channel data types.
+    DEFINE_BUILTIN(write_imageui)
+    {
+      Image *image = *(Image**)(workItem->getValue(ARG(0)).data);
+      // Pixel coordinates; dimensions the coordinate lacks stay at zero
+      int x = SARGV(1, 0);
+      int y = 0;
+      int z = 0;
+      if (ARG(1)->getType()->isVectorTy())
+      {
+        y = SARGV(1, 1);
+        if (ARG(1)->getType()->getVectorNumElements() > 2)
+        {
+          z = SARGV(1, 2);
+        }
+      }
+      // Colour components as supplied by the caller
+      uint32_t values[4];
+      for (unsigned c = 0; c < 4; c++)
+      {
+        values[c] = (uint32_t)SARGV(2, c);
+      }
+      // Move components into the slots the channel order expects
+      switch (image->format.image_channel_order)
+      {
+      case CL_A:
+        values[0] = values[3];
+        break;
+      case CL_RA:
+        values[1] = values[3];
+        break;
+      case CL_ARGB:
+      {
+        // Rotate right by one: (c0, c1, c2, c3) -> (c3, c0, c1, c2)
+        uint32_t last = values[3];
+        values[3] = values[2];
+        values[2] = values[1];
+        values[1] = values[0];
+        values[0] = last;
+        break;
+      }
+      case CL_BGRA:
+        swap(values[0], values[2]);
+        break;
+      case CL_R:
+      case CL_Rx:
+      case CL_RG:
+      case CL_RGx:
+      case CL_RGB:
+      case CL_RGBx:
+      case CL_RGBA:
+      case CL_INTENSITY:
+      case CL_LUMINANCE:
+        // No reordering needed
+        break;
+      default:
+        FATAL_ERROR("Unsupported image channel order: %X",
+                    image->format.image_channel_order);
+      }
+      // Locate the pixel: row-major within a slice, slices stacked along z
+      size_t channelSize = getChannelSize(image->format);
+      size_t numChannels = getNumChannels(image->format);
+      size_t pixelSize = channelSize*numChannels;
+      size_t pixelIndex = x + (y + z*image->desc.image_height)
+                          * image->desc.image_width;
+      size_t pixelAddress = image->address + pixelIndex * pixelSize;
+      // Encode each channel, saturating to the maximum of the storage type
+      Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+      unsigned char *pixel = workItem->m_pool.alloc(pixelSize);
+      for (unsigned ch = 0; ch < numChannels; ch++)
+      {
+        switch (image->format.image_channel_data_type)
+        {
+          case CL_UNSIGNED_INT8:
+            ((uint8_t*)pixel)[ch] = _min_<uint32_t>(values[ch], UINT8_MAX);
+            break;
+          case CL_UNSIGNED_INT16:
+            ((uint16_t*)pixel)[ch] = _min_<uint32_t>(values[ch], UINT16_MAX);
+            break;
+          case CL_UNSIGNED_INT32:
+            ((uint32_t*)pixel)[ch] = values[ch];
+            break;
+          default:
+            FATAL_ERROR("Unsupported image channel data type: %X",
+                        image->format.image_channel_data_type);
+        }
+      }
+      // Write pixel data
+      memory->store(pixel, pixelAddress, pixelSize);
+    }
+    ///////////////////////
+    // Integer Functions //
+    ///////////////////////
+    // abs: element-wise absolute value of argument 0.
+    // Unsigned overloads ('h', 't', 'j', 'm') are copied through unchanged;
+    // signed overloads ('c', 's', 'i', 'l') go through abs(). Any other
+    // overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(abs_builtin)
+    {
+      const char type = getOverloadArgType(overload);
+      const bool isUnsigned =
+        (type == 'h' || type == 't' || type == 'j' || type == 'm');
+      const bool isSigned =
+        (type == 'c' || type == 's' || type == 'i' || type == 'l');
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        if (isUnsigned)
+        {
+          result.setUInt(UARGV(0,i), i);
+        }
+        else if (isSigned)
+        {
+          result.setSInt(abs(SARGV(0,i)), i);
+        }
+        else
+        {
+          FATAL_ERROR("Unsupported argument type: %c", type);
+        }
+      }
+    }
+    // abs_diff: element-wise |a - b| of arguments 0 and 1.
+    // The difference is always formed as max - min so that it is
+    // non-negative. For signed overloads the subtraction is carried out in
+    // unsigned arithmetic: max - min can exceed INT64_MAX (for example
+    // INT64_MAX - INT64_MIN), which would be signed overflow if computed
+    // directly on int64_t values.
+    // Any overload type other than the eight integer codes invokes
+    // FATAL_ERROR.
+    DEFINE_BUILTIN(abs_diff)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          {
+            uint64_t a = UARGV(0, i);
+            uint64_t b = UARGV(1, i);
+            result.setUInt(_max_(a,b) - _min_(a,b), i);
+            break;
+          }
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            int64_t a = SARGV(0, i);
+            int64_t b = SARGV(1, i);
+            // Modular subtraction on the bit patterns gives the exact
+            // distance without any signed overflow
+            uint64_t hi = (uint64_t)_max_(a,b);
+            uint64_t lo = (uint64_t)_min_(a,b);
+            result.setSInt((int64_t)(hi - lo), i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    // add_sat: element-wise saturating addition of arguments 0 and 1.
+    // Results that do not fit the element type are pinned to that type's
+    // minimum or maximum instead of wrapping.
+    //   'h','t','j' - the sum is formed in 64 bits and capped at the type max
+    //   'm'         - wrap-around is detected by the sum being smaller than
+    //                 an operand
+    //   'c','s','i' - the sum is formed in 64 bits and clamped to the type
+    //   'l'         - overflow is tested BEFORE adding, so no signed overflow
+    //                 is ever evaluated and the INT64_MIN + INT64_MIN case
+    //                 (which wraps to exactly 0) is saturated correctly
+    // Any other overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(add_sat)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        const uint64_t ua = UARGV(0,i);
+        const uint64_t ub = UARGV(1,i);
+        const int64_t  sa = SARGV(0,i);
+        const int64_t  sb = SARGV(1,i);
+        // Unsigned addition wraps by definition, so this is always safe
+        uint64_t uresult = ua + ub;
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+            uresult = _min_<uint64_t>(uresult, UINT8_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 't':
+            uresult = _min_<uint64_t>(uresult, UINT16_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 'j':
+            uresult = _min_<uint64_t>(uresult, UINT32_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 'm':
+            uresult = (ub > uresult) ? UINT64_MAX : uresult;
+            result.setUInt(uresult, i);
+            break;
+          case 'c':
+            result.setSInt(_clamp_<int64_t>(sa + sb, INT8_MIN, INT8_MAX), i);
+            break;
+          case 's':
+            result.setSInt(_clamp_<int64_t>(sa + sb, INT16_MIN, INT16_MAX), i);
+            break;
+          case 'i':
+            result.setSInt(_clamp_<int64_t>(sa + sb, INT32_MIN, INT32_MAX), i);
+            break;
+          case 'l':
+          {
+            int64_t sresult;
+            if (sb > 0 && sa > INT64_MAX - sb)
+            {
+              sresult = INT64_MAX;
+            }
+            else if (sb < 0 && sa < INT64_MIN - sb)
+            {
+              sresult = INT64_MIN;
+            }
+            else
+            {
+              sresult = sa + sb;
+            }
+            result.setSInt(sresult, i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    {
+      // Per element: number of zero bits above the highest set bit, relative
+      // to the result element width (result.size bytes, i.e. size<<3 bits).
+      // NOTE(review): the DEFINE_BUILTIN header for this body is not visible
+      // here; the logic is a count-leading-zeros - confirm the builtin name.
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        // Count how many bit positions are needed to hold the value
+        int used = 0;
+        for (uint64_t bits = UARGV(0, i); bits; bits >>= 1)
+        {
+          used++;
+        }
+        uint64_t zeros = ((result.size<<3) - used);
+        result.setUInt(zeros, i);
+      }
+    }
+    {
+      // Halving add: floor((a + b) / 2) per element, computed without ever
+      // forming the full sum so nothing can overflow.
+      // Both branches use the same identity:
+      //   (a >> 1) + (b >> 1) + (a & b & 1)
+      // The last term restores the unit lost when both low bits are set.
+      // The unsigned branch previously rebuilt the carry with `1L<<63`,
+      // which is not a valid shift where `long` is 32 bits wide.
+      // NOTE(review): the DEFINE_BUILTIN header for this body is not visible
+      // here; presumably this is the hadd builtin - confirm.
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          {
+            uint64_t a = UARGV(0, i);
+            uint64_t b = UARGV(1, i);
+            uint64_t c = (a & b) & 1;
+            result.setUInt((a>>1) + (b>>1) + c, i);
+            break;
+          }
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            int64_t a = SARGV(0, i);
+            int64_t b = SARGV(1, i);
+            int64_t c = (a & b) & 1;
+            result.setSInt((a>>1) + (b>>1) + c, i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    // Integer multiply-add: returns a*b + c with ordinary unsigned
+    // wrap-around (no saturation).
+    static uint64_t _mad_(uint64_t a, uint64_t b, uint64_t c)
+    {
+      const uint64_t product = a*b;
+      return product + c;
+    }
+    // Upper half of an unsigned multiplication.
+    //   x, y - operands
+    //   bits - operand width in bits; 64 selects the split-multiply path
+    // Returns the high 64 bits of the 128-bit product when bits == 64;
+    // otherwise returns the 64-bit product shifted right by `bits`.
+    static uint64_t _umul_hi_(uint64_t x, uint64_t y, uint64_t bits)
+    {
+      if (bits != 64)
+      {
+        return (x*y) >> bits;
+      }
+      // Split both operands into 32-bit halves
+      const uint64_t xLo = x & UINT32_MAX;
+      const uint64_t xHi = x >> 32;
+      const uint64_t yLo = y & UINT32_MAX;
+      const uint64_t yHi = y >> 32;
+      // Fold the partial products together, carrying upward 32 bits at a time
+      const uint64_t mid = xHi*yLo + ((xLo*yLo) >> 32);
+      const uint64_t midLo = mid & UINT32_MAX;
+      const uint64_t midHi = mid >> 32;
+      const uint64_t carry = ((midLo + xLo*yHi) >> 32) + midHi;
+      return xHi*yHi + carry;
+    }
+    // Upper half of a signed multiplication.
+    //   x, y - operands
+    //   bits - operand width in bits; 64 selects the split-multiply path
+    // Returns the high 64 bits of the 128-bit signed product when
+    // bits == 64; otherwise returns the 64-bit product shifted right by
+    // `bits`.
+    // The low-half x low-half partial product can be as large as
+    // (2^32-1)^2, which does not fit in int64_t, so it is formed in
+    // uint64_t (well-defined wrap-free there) instead of overflowing a
+    // signed multiply. All other partial products fit in int64_t.
+    static int64_t _smul_hi_(int64_t x, int64_t y, int64_t bits)
+    {
+      if (bits == 64)
+      {
+        int64_t xl = x & UINT32_MAX;
+        int64_t xh = x >> 32;
+        int64_t yl = y & UINT32_MAX;
+        int64_t yh = y >> 32;
+        uint64_t xlyl = (uint64_t)xl * (uint64_t)yl;
+        int64_t xlyh = xl*yh;
+        int64_t xhyl = xh*yl;
+        int64_t xhyh = xh*yh;
+        // Only the top 32 bits of the low product carry into the result
+        int64_t  a = xhyl + (int64_t)(xlyl >> 32);
+        int64_t al = a & UINT32_MAX;
+        int64_t ah = a >> 32;
+        int64_t  b = ((al + xlyh)>>32) + ah;
+        return xhyh + b;
+      }
+      else
+      {
+        return (x*y) >> bits;
+      }
+    }
+    // mad_hi: element-wise (high half of arg0 * arg1) + arg2.
+    // The high half is taken at the result element width (result.size<<3
+    // bits) via _umul_hi_ / _smul_hi_, chosen by the signedness of the
+    // overload type. Any other overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(mad_hi)
+    {
+      const char type = getOverloadArgType(overload);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (type)
+        {
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            int64_t hi = _smul_hi_(SARGV(0, i), SARGV(1, i), result.size<<3);
+            result.setSInt(hi + SARGV(2, i), i);
+            break;
+          }
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          {
+            uint64_t hi = _umul_hi_(UARGV(0, i), UARGV(1, i), result.size<<3);
+            result.setUInt(hi + UARGV(2, i), i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c", type);
+        }
+      }
+    }
+    // mad_sat: element-wise saturating a*b + c of arguments 0, 1 and 2.
+    //   'h','t','j' - a*b + c of operands up to 32 bits fits in 64 bits, so
+    //                 it is computed directly and capped at the type max
+    //   'c','s','i' - likewise computed in 64 bits and clamped to the type
+    //   'm'         - saturates when the product has a non-zero high half
+    //                 or the addition wraps
+    //   'l'         - the exact result is formed as a 128-bit two's
+    //                 complement value (hi:lo) and saturated only if it
+    //                 does not fit in int64_t
+    // The 'l' case previously treated any non-zero high half as overflow,
+    // which saturated every negative in-range product (e.g. -1 * 1 + 0
+    // produced INT64_MIN), and it evaluated signed multiplications and
+    // additions that could overflow.
+    // Any other overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(mad_sat)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        const uint64_t ua = UARGV(0,i);
+        const uint64_t ub = UARGV(1,i);
+        const uint64_t uc = UARGV(2,i);
+        const int64_t  sa = SARGV(0,i);
+        const int64_t  sb = SARGV(1,i);
+        const int64_t  sc = SARGV(2,i);
+        // Unsigned arithmetic wraps by definition, so this is always safe
+        uint64_t uresult = ua*ub + uc;
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+            uresult = _min_<uint64_t>(uresult, UINT8_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 't':
+            uresult = _min_<uint64_t>(uresult, UINT16_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 'j':
+            uresult = _min_<uint64_t>(uresult, UINT32_MAX);
+            result.setUInt(uresult, i);
+            break;
+          case 'm':
+          {
+            uint64_t hi = _umul_hi_(ua, ub, 64);
+            if (hi || uc > uresult)
+            {
+              uresult = UINT64_MAX;
+            }
+            result.setUInt(uresult, i);
+            break;
+          }
+          case 'c':
+            result.setSInt(_clamp_<int64_t>(sa*sb + sc, INT8_MIN, INT8_MAX), i);
+            break;
+          case 's':
+            result.setSInt(_clamp_<int64_t>(sa*sb + sc,
+                                            INT16_MIN, INT16_MAX), i);
+            break;
+          case 'i':
+            result.setSInt(_clamp_<int64_t>(sa*sb + sc,
+                                            INT32_MIN, INT32_MAX), i);
+            break;
+          case 'l':
+          {
+            // 128-bit product as (hi:lo); lo is the wrapped low half
+            uint64_t lo = (uint64_t)sa * (uint64_t)sb;
+            int64_t  hi = _smul_hi_(sa, sb, 64);
+            // Add the sign-extended addend, propagating the carry into hi.
+            // |hi| is at most 2^62, so these small adjustments cannot
+            // overflow.
+            uint64_t sumLo = lo + (uint64_t)sc;
+            int64_t  sumHi = hi + (sc < 0 ? -1 : 0) + (sumLo < lo ? 1 : 0);
+            // The value fits in int64_t exactly when hi is the sign
+            // extension of lo
+            bool loNegative = sumLo > (uint64_t)INT64_MAX;
+            int64_t sresult;
+            if (sumHi == (loNegative ? -1 : 0))
+            {
+              sresult = (int64_t)sumLo;
+            }
+            else
+            {
+              sresult = (sumHi < 0) ? INT64_MIN : INT64_MAX;
+            }
+            result.setSInt(sresult, i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    // Plain unsigned multiplication: returns a*b with ordinary wrap-around.
+    static uint64_t _mul_(uint64_t a, uint64_t b)
+    {
+      const uint64_t product = a*b;
+      return product;
+    }
+    // mul_hi: element-wise high half of arg0 * arg1.
+    // The high half is taken at the result element width (result.size<<3
+    // bits) via _umul_hi_ / _smul_hi_, chosen by the signedness of the
+    // overload type. Any other overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(mul_hi)
+    {
+      const char type = getOverloadArgType(overload);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (type)
+        {
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+            result.setSInt(
+              _smul_hi_(SARGV(0, i), SARGV(1, i), result.size<<3), i);
+            break;
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+            result.setUInt(
+              _umul_hi_(UARGV(0, i), UARGV(1, i), result.size<<3), i);
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c", type);
+        }
+      }
+    }
+    // Returns the number of set bits in x.
+    static uint64_t _popcount_(uint64_t x)
+    {
+      int count = 0;
+      // Each pass clears the lowest set bit, so the loop runs once per 1-bit
+      for (; x; x &= (x - 1))
+      {
+        count++;
+      }
+      return count;
+    }
+    // rhadd: element-wise rounding halving add, (a + b + 1) >> 1, computed
+    // without ever forming the full sum so nothing can overflow.
+    // Both branches use the same identity:
+    //   (a >> 1) + (b >> 1) + ((a | b) & 1)
+    // The last term rounds up whenever either low bit is set.
+    // The unsigned branch previously tested `a > UINT64_MAX-(b+1)`; when b
+    // is UINT64_MAX, b+1 wraps to 0 and the carry was lost (rhadd(0,
+    // UINT64_MAX) returned 0 instead of 2^63). It also relied on `1L<<63`,
+    // which is not a valid shift where `long` is 32 bits wide.
+    // Any overload type other than the eight integer codes invokes
+    // FATAL_ERROR.
+    DEFINE_BUILTIN(rhadd)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          {
+            uint64_t a = UARGV(0, i);
+            uint64_t b = UARGV(1, i);
+            uint64_t c = (a | b) & 1;
+            result.setUInt((a>>1) + (b>>1) + c, i);
+            break;
+          }
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            int64_t a = SARGV(0, i);
+            int64_t b = SARGV(1, i);
+            int64_t c = (a | b) & 1;
+            result.setSInt((a>>1) + (b>>1) + c, i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    // rotate: element-wise left rotation of argument 0 by argument 1 bits.
+    // The shift amount is reduced modulo the element width
+    // (result.size<<3 bits). A reduced amount of zero returns the value
+    // unchanged: the complementary right shift would otherwise equal the
+    // full width, and shifting a 64-bit value by 64 is undefined.
+    DEFINE_BUILTIN(rotate)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        uint64_t width = (result.size << 3);
+        uint64_t v  = UARGV(0, i);
+        uint64_t ls = UARGV(1, i) % width;
+        if (ls == 0)
+        {
+          result.setUInt(v, i);
+          continue;
+        }
+        uint64_t rs = width - ls;
+        result.setUInt((v << ls) | (v >> rs), i);
+      }
+    }
+    // sub_sat: element-wise saturating subtraction, arg0 - arg1.
+    // Results that do not fit the element type are pinned to that type's
+    // minimum or maximum instead of wrapping.
+    //   'h','t','j' - a wrapped 64-bit difference exceeds the type max and
+    //                 is replaced by 0
+    //   'm'         - yields 0 when the subtrahend is larger
+    //   'c','s','i' - the difference is formed in 64 bits and clamped
+    //   'l'         - overflow is tested BEFORE subtracting, so no signed
+    //                 overflow is ever evaluated and 0 - INT64_MIN
+    //                 saturates to INT64_MAX (it was previously missed)
+    // Any other overload type invokes FATAL_ERROR.
+    DEFINE_BUILTIN(sub_sat)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        const uint64_t ua = UARGV(0,i);
+        const uint64_t ub = UARGV(1,i);
+        const int64_t  sa = SARGV(0,i);
+        const int64_t  sb = SARGV(1,i);
+        // Unsigned subtraction wraps by definition, so this is always safe
+        uint64_t uresult = ua - ub;
+        switch (getOverloadArgType(overload))
+        {
+          case 'h':
+            uresult = uresult > UINT8_MAX ? 0 : uresult;
+            result.setUInt(uresult, i);
+            break;
+          case 't':
+            uresult = uresult > UINT16_MAX ? 0 : uresult;
+            result.setUInt(uresult, i);
+            break;
+          case 'j':
+            uresult = uresult > UINT32_MAX ? 0 : uresult;
+            result.setUInt(uresult, i);
+            break;
+          case 'm':
+            uresult = (ub > ua) ? 0 : uresult;
+            result.setUInt(uresult, i);
+            break;
+          case 'c':
+            result.setSInt(_clamp_<int64_t>(sa - sb, INT8_MIN, INT8_MAX), i);
+            break;
+          case 's':
+            result.setSInt(_clamp_<int64_t>(sa - sb, INT16_MIN, INT16_MAX), i);
+            break;
+          case 'i':
+            result.setSInt(_clamp_<int64_t>(sa - sb, INT32_MIN, INT32_MAX), i);
+            break;
+          case 'l':
+          {
+            int64_t sresult;
+            if (sb < 0 && sa > INT64_MAX + sb)
+            {
+              sresult = INT64_MAX;
+            }
+            else if (sb > 0 && sa < INT64_MIN + sb)
+            {
+              sresult = INT64_MIN;
+            }
+            else
+            {
+              sresult = sa - sb;
+            }
+            result.setSInt(sresult, i);
+            break;
+          }
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    // upsample: element-wise combination of two values into one of twice
+    // the width. Argument 0 is shifted left by half the result width
+    // (result.size<<2 bits) and OR-ed with argument 1.
+    DEFINE_BUILTIN(upsample)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        uint64_t high = UARGV(0,i)<<(result.size<<2);
+        uint64_t low  = UARGV(1, i);
+        result.setUInt(high | low, i);
+      }
+    }
+    ////////////////////
+    // Math Functions //
+    ////////////////////
+    // Scalar helpers for the math builtins. The *pi variants either divide
+    // an inverse trigonometric result by pi or scale the argument by pi
+    // before the trigonometric call.
+    // acos(x) / pi
+    static double _acospi_(double x)
+    {
+      return (acos(x) / M_PI);
+    }
+    // asin(x) / pi
+    static double _asinpi_(double x)
+    {
+      return (asin(x) / M_PI);
+    }
+    // atan(x) / pi
+    static double _atanpi_(double x)
+    {
+      return (atan(x) / M_PI);
+    }
+    // atan2(x, y) / pi
+    static double _atan2pi_(double x, double y)
+    {
+      return (atan2(x, y) / M_PI);
+    }
+    // cos(pi * x)
+    static double _cospi_(double x)
+    {
+      return (cos(x * M_PI));
+    }
+    // 10 raised to the power x
+    static double _exp10_(double x)
+    {
+      return pow(10, x);
+    }
+    // x / y
+    static double _fdivide_(double x, double y)
+    {
+      return x/y;
+    }
+    // 1 / x
+    static double _frecip_(double x)
+    {
+      return 1.0/x;
+    }
+    // 1 / sqrt(x)
+    static double _rsqrt_(double x)
+    {
+      return 1.0 / sqrt(x);
+    }
+    // sin(pi * x)
+    static double _sinpi_(double x)
+    {
+      return (sin(x * M_PI));
+    }
+    // tan(pi * x)
+    static double _tanpi_(double x)
+    {
+      return (tan(x * M_PI));
+    }
+    // Fused multiply-add: returns a*b + c rounded once.
+    // Uses the math library's fma() so the product is not rounded before
+    // the addition; a plain `a*b + c` rounds twice unless the compiler
+    // happens to contract it.
+    static double _fma_(double a, double b, double c)
+    {
+      return fma(a, b, c);
+    }
+    // Returns whichever of x and y has the larger magnitude. When neither
+    // magnitude compares greater than the other, falls back to fmax(x, y).
+    static double _maxmag_(double x, double y)
+    {
+      const double magX = fabs(x);
+      const double magY = fabs(y);
+      if (magX > magY)
+      {
+        return x;
+      }
+      if (magY > magX)
+      {
+        return y;
+      }
+      return fmax(x, y);
+    }
+    // Returns whichever of x and y has the smaller magnitude. When neither
+    // magnitude compares less than the other, falls back to fmin(x, y).
+    static double _minmag_(double x, double y)
+    {
+      const double magX = fabs(x);
+      const double magY = fabs(y);
+      if (magX < magY)
+      {
+        return x;
+      }
+      if (magY < magX)
+      {
+        return y;
+      }
+      return fmin(x, y);
+    }
+    // fract: element-wise fractional part of argument 0.
+    // For each element, floor(x) is stored through the pointer in argument
+    // 1 (in that pointer's address space) and min(x - floor(x), L) is
+    // returned, where L is the largest representable value below 1.
+    // L is chosen from the result element size: nextafterf(1, 0) for
+    // 4-byte results, nextafter(1, 0) otherwise. This replaces a
+    // platform-conditional pair of declarations whose #if was left
+    // unterminated here (declaring `r` twice), and it keeps the limit in
+    // the precision of the value actually returned.
+    DEFINE_BUILTIN(fract)
+    {
+      Memory *memory =
+        workItem->getMemory(ARG(1)->getType()->getPointerAddressSpace());
+      size_t iptr = PARG(1);
+      // Largest value strictly less than 1 at the result precision
+      const double limit = (result.size == 4) ?
+        (double)nextafterf(1.0f, 0.0f) : nextafter(1.0, 0.0);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        double fl = floor(x);
+        double r = fmin(x - fl, limit);
+        size_t offset = i*result.size;
+        // Stage floor(x) in the result buffer so it can be stored with the
+        // correct element encoding, then overwrite it with the fraction
+        result.setFloat(fl, i);
+        memory->store(result.data + offset, iptr + offset, result.size);
+        result.setFloat(r, i);
+      }
+    }
+    // frexp: element-wise split of argument 0 into mantissa and exponent.
+    // The mantissa is returned; each exponent is stored as a 4-byte integer
+    // through the pointer in argument 1 (in that pointer's address space).
+    DEFINE_BUILTIN(frexp_builtin)
+    {
+      const unsigned addrSpace = ARG(1)->getType()->getPointerAddressSpace();
+      Memory *memory = workItem->getMemory(addrSpace);
+      size_t expBase = PARG(1);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        int32_t exponent;
+        double mantissa = frexp(FARGV(0, i), &exponent);
+        memory->store((const unsigned char*)&exponent, expBase + i*4, 4);
+        result.setFloat(mantissa, i);
+      }
+    }
+    // ilogb: element-wise ilogb() of argument 0, returned as a signed
+    // integer.
+    DEFINE_BUILTIN(ilogb_builtin)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        result.setSInt(ilogb(x), i);
+      }
+    }
+    // ldexp: element-wise ldexp(arg0, arg1), i.e. argument 0 scaled by two
+    // raised to the integer in argument 1.
+    DEFINE_BUILTIN(ldexp_builtin)
+    {
+      unsigned i = 0;
+      while (i < result.num)
+      {
+        result.setFloat(ldexp(FARGV(0, i), SARGV(1, i)), i);
+        i++;
+      }
+    }
+    // lgamma_r: element-wise lgamma() of argument 0.
+    // The lgamma value is returned; for each element a 4-byte sign (-1 when
+    // tgamma(x) is negative, otherwise 1) is stored through the pointer in
+    // argument 1 (in that pointer's address space).
+    DEFINE_BUILTIN(lgamma_r)
+    {
+      const unsigned addrSpace = ARG(1)->getType()->getPointerAddressSpace();
+      Memory *memory = workItem->getMemory(addrSpace);
+      size_t signBase = PARG(1);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double logGamma = lgamma(FARGV(0, i));
+        int32_t sign = 1;
+        if (tgamma(FARGV(0, i)) < 0)
+        {
+          sign = -1;
+        }
+        memory->store((const unsigned char*)&sign, signBase + i*4, 4);
+        result.setFloat(logGamma, i);
+      }
+    }
+    // modf: element-wise split of argument 0 into integral and fractional
+    // parts. The fractional part (carrying the sign of x, and zero when x
+    // is infinite) is returned; the integral part is stored through the
+    // pointer in argument 1 (in that pointer's address space).
+    DEFINE_BUILTIN(modf_builtin)
+    {
+      const unsigned addrSpace = ARG(1)->getType()->getPointerAddressSpace();
+      Memory *memory = workItem->getMemory(addrSpace);
+      size_t intBase = PARG(1);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        double whole = trunc(x);
+        double magnitude;
+        if (::isinf(x))
+        {
+          magnitude = 0.0;
+        }
+        else
+        {
+          magnitude = x - whole;
+        }
+        double frac = copysign(magnitude, x);
+        size_t offset = i*result.size;
+        // Stage the integral part in the result buffer so it is stored with
+        // the right element encoding, then overwrite it with the fraction
+        result.setFloat(whole, i);
+        memory->store(result.data + offset, intBase + offset, result.size);
+        result.setFloat(frac, i);
+      }
+    }
+    // nan: fills every result element with nan(""). The argument value is
+    // not read.
+    DEFINE_BUILTIN(nan_builtin)
+    {
+      unsigned i = 0;
+      while (i < result.num)
+      {
+        result.setFloat(nan(""), i);
+        i++;
+      }
+    }
+    // nextafter: element-wise next representable value after argument 0 in
+    // the direction of argument 1. 4-byte results step in single precision
+    // (nextafterf); everything else steps in double precision (nextafter).
+    DEFINE_BUILTIN(nextafter_builtin)
+    {
+      const bool singlePrecision = (result.size == 4);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        if (singlePrecision)
+        {
+          result.setFloat(nextafterf(FARGV(0, i), FARGV(1, i)), i);
+        }
+        else
+        {
+          result.setFloat(nextafter(FARGV(0, i), FARGV(1, i)), i);
+        }
+      }
+    }
+    {
+      // Per element: argument 0 raised to the 32-bit integer power given by
+      // argument 1.
+      // NOTE(review): the DEFINE_BUILTIN header for this body is not visible
+      // here; presumably this is the pown builtin - confirm.
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double base = FARGV(0, i);
+        int32_t exponent = SARGV(1, i);
+        double powered = pow(base, exponent);
+        result.setFloat(powered, i);
+      }
+    }
+    // remquo: element-wise remquo(arg0, arg1).
+    // The remainder is returned; the quotient value reported by remquo() is
+    // stored as a 4-byte integer through the pointer in argument 2 (in that
+    // pointer's address space).
+    DEFINE_BUILTIN(remquo_builtin)
+    {
+      const unsigned addrSpace = ARG(2)->getType()->getPointerAddressSpace();
+      Memory *memory = workItem->getMemory(addrSpace);
+      size_t quoBase = PARG(2);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        int32_t quotient;
+        double remainder = remquo(FARGV(0, i), FARGV(1, i), &quotient);
+        memory->store((const unsigned char*)&quotient, quoBase + i*4, 4);
+        result.setFloat(remainder, i);
+      }
+    }
+    // rootn: element-wise n-th root, x^(1/n), with x from argument 0 and
+    // the integer n from argument 1.
+    // pow() returns NaN for a negative base with a non-integer exponent, so
+    // odd roots of negative values are computed on the magnitude and then
+    // negated (rootn(-8, 3) is -2). The sign-bit test also keeps the sign
+    // of negative zero for odd n.
+    DEFINE_BUILTIN(rootn)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double x = FARGV(0, i);
+        int y = SARGV(1, i);
+        double r;
+        if ((y & 1) && signbit(x))
+        {
+          // Odd root of a negative value: real and negative
+          r = -pow(-x, (double)(1.0/y));
+        }
+        else
+        {
+          r = pow(x, (double)(1.0/y));
+        }
+        result.setFloat(r, i);
+      }
+    }
+    // sincos: element-wise sine and cosine of argument 0.
+    // The sine is returned; the cosine is stored through the pointer in
+    // argument 1 (in that pointer's address space).
+    DEFINE_BUILTIN(sincos)
+    {
+      const unsigned addrSpace = ARG(1)->getType()->getPointerAddressSpace();
+      Memory *memory = workItem->getMemory(addrSpace);
+      size_t cosBase = PARG(1);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        double angle = FARGV(0, i);
+        size_t offset = i*result.size;
+        // Stage the cosine in the result buffer so it is stored with the
+        // right element encoding, then overwrite it with the sine
+        result.setFloat(cos(angle), i);
+        memory->store(result.data + offset, cosBase + offset, result.size);
+        result.setFloat(sin(angle), i);
+      }
+    }
+    ////////////////////////////
+    // Misc. Vector Functions //
+    ////////////////////////////
+    // shuffle: result element i is the element of argument 0 selected by
+    // element i of the mask in argument 1. The mask value is used as the
+    // index as-is.
+    DEFINE_BUILTIN(shuffle_builtin)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        uint64_t pick = UARGV(1, i);
+        result.setUInt(UARGV(0, pick), i);
+      }
+    }
+    // shuffle2: result element i is selected by element i of the mask in
+    // argument 2 from the concatenation of arguments 0 and 1. Mask values
+    // below the element count of argument 0 index argument 0; larger values
+    // index argument 1 after subtracting that count.
+    DEFINE_BUILTIN(shuffle2_builtin)
+    {
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        // Element count of the first source (1 when it is not a vector)
+        uint64_t firstLen = ARG(0)->getType()->isVectorTy() ?
+          ARG(0)->getType()->getVectorNumElements() : 1;
+        uint64_t index = UARGV(2, i);
+        uint64_t src = (index >= firstLen) ? 1 : 0;
+        if (src)
+        {
+          index -= firstLen;
+        }
+        result.setUInt(UARGV(src, index), i);
+      }
+    }
+    //////////////////////////
+    // Relational Functions //
+    //////////////////////////
+    // Scalar predicates for the relational builtins. Each returns the
+    // result of the underlying comparison or classification call converted
+    // to int64_t.
+    // x == y
+    static int64_t _iseq_(double x, double y)
+    {
+      return x == y;
+    }
+    // x != y
+    static int64_t _isneq_(double x, double y)
+    {
+      return x != y;
+    }
+    // isgreater(x, y)
+    static int64_t _isgt_(double x, double y)
+    {
+      return isgreater(x, y);
+    }
+    // isgreaterequal(x, y)
+    static int64_t _isge_(double x, double y)
+    {
+      return isgreaterequal(x, y);
+    }
+    // isless(x, y)
+    static int64_t _islt_(double x, double y)
+    {
+      return isless(x, y);
+    }
+    // islessequal(x, y)
+    static int64_t _isle_(double x, double y)
+    {
+      return islessequal(x, y);
+    }
+    // islessgreater(x, y)
+    static int64_t _islg_(double x, double y)
+    {
+      return islessgreater(x, y);
+    }
+    // isfinite(x)
+    static int64_t _isfin_(double x)
+    {
+      return isfinite(x);
+    }
+    // isinf(x)
+    static int64_t _isinf_(double x)
+    {
+      return ::isinf(x);
+    }
+    // isnan(x)
+    static int64_t _isnan_(double x)
+    {
+      return ::isnan(x);
+    }
+    // isnormal(x)
+    static int64_t _isnorm_(double x)
+    {
+      return isnormal(x);
+    }
+    // Negation of isunordered(x, y)
+    static int64_t _isord_(double x, double y)
+    {
+      return !isunordered(x, y);
+    }
+    // isunordered(x, y)
+    static int64_t _isuord_(double x, double y)
+    {
+      return isunordered(x, y);
+    }
+    // signbit(x)
+    static int64_t _signbit_(double x)
+    {
+      return signbit(x);
+    }
+    {
+      // Result is 1 when every element of argument 0 has the INT64_MIN bit
+      // set in its SARGV value (i.e. is negative), otherwise 0.
+      // NOTE(review): the DEFINE_BUILTIN header for this body is not visible
+      // here; presumably this is the all builtin - confirm.
+      const unsigned num = ARG(0)->getType()->isVectorTy() ?
+        ARG(0)->getType()->getVectorNumElements() : 1;
+      unsigned i = 0;
+      while (i < num)
+      {
+        if ((SARGV(0, i) & INT64_MIN) == 0)
+        {
+          // Found an element without the sign bit: not all are set
+          result.setSInt(0);
+          return;
+        }
+        i++;
+      }
+      result.setSInt(1);
+    }
+    {
+      // Result is 1 when at least one element of argument 0 has the
+      // INT64_MIN bit set in its SARGV value (i.e. is negative), otherwise 0.
+      // NOTE(review): the DEFINE_BUILTIN header for this body is not visible
+      // here; presumably this is the any builtin - confirm.
+      const unsigned num = ARG(0)->getType()->isVectorTy() ?
+        ARG(0)->getType()->getVectorNumElements() : 1;
+      unsigned i = 0;
+      while (i < num)
+      {
+        if ((SARGV(0, i) & INT64_MIN) != 0)
+        {
+          // Found an element with the sign bit: at least one is set
+          result.setSInt(1);
+          return;
+        }
+        i++;
+      }
+      result.setSInt(0);
+    }
+    // Bitwise select: each result bit is taken from b where the matching
+    // bit of c is 1, and from a where it is 0.
+    static uint64_t _ibitselect_(uint64_t a, uint64_t b, uint64_t c)
+    {
+      // (a ^ b) marks the bits that differ; flipping those bits of a where
+      // c is set turns them into the bits of b
+      return a ^ ((a ^ b) & c);
+    }
+    // Bitwise select on the 64-bit representations of three doubles: each
+    // result bit is taken from b where the matching bit of c is 1, and from
+    // a where it is 0. The selected bit pattern is returned as a double.
+    static double _fbitselect_(double a, double b, double c)
+    {
+      const uint64_t bitsA = *(uint64_t*)&a;
+      const uint64_t bitsB = *(uint64_t*)&b;
+      const uint64_t mask  = *(uint64_t*)&c;
+      uint64_t selected = ((bitsA & ~mask) | (bitsB & mask));
+      return *(double*)&selected;
+    }
+    // bitselect: dispatches on the overload argument type.
+    // 'f' and 'd' go through f3arg with _fbitselect_; the eight integer
+    // codes go through u3arg with _ibitselect_. Any other overload type
+    // invokes FATAL_ERROR.
+    DEFINE_BUILTIN(bitselect)
+    {
+      const char type = getOverloadArgType(overload);
+      const bool isFloating = (type == 'f' || type == 'd');
+      const bool isInteger =
+        (type == 'h' || type == 't' || type == 'j' || type == 'm' ||
+         type == 'c' || type == 's' || type == 'i' || type == 'l');
+      if (isFloating)
+      {
+        f3arg(workItem, callInst, fnName, overload, result, _fbitselect_);
+      }
+      else if (isInteger)
+      {
+        u3arg(workItem, callInst, fnName, overload, result, _ibitselect_);
+      }
+      else
+      {
+        FATAL_ERROR("Unsupported argument type: %c",
+                    getOverloadArgType(overload));
+      }
+    }
+    // select: element-wise choice between argument 0 and argument 1,
+    // controlled by argument 2. Argument 1 is taken when the condition
+    // holds, argument 0 otherwise.
+    // For multi-element results the condition is the INT64_MIN bit of the
+    // SARGV value of the control element; for single-element results it is
+    // whether the control value is non-zero.
+    // Any overload type other than 'f', 'd' and the eight integer codes
+    // invokes FATAL_ERROR.
+    DEFINE_BUILTIN(select_builtin)
+    {
+      char type = getOverloadArgType(overload);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        int64_t control = SARGV(2, i);
+        bool takeSecond;
+        if (result.num > 1)
+        {
+          takeSecond = (control & INT64_MIN) != 0;
+        }
+        else
+        {
+          takeSecond = (control != 0);
+        }
+        const unsigned src = takeSecond ? 1 : 0;
+        switch (type)
+        {
+          case 'f':
+          case 'd':
+            result.setFloat(FARGV(src, i), i);
+            break;
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+            result.setSInt(SARGV(src, i), i);
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+      }
+    }
+    ///////////////////////////////
+    // Synchronization Functions //
+    ///////////////////////////////
+    // barrier: marks the work-item as being in the BARRIER state and
+    // notifies its work-group, passing along the call instruction and the
+    // value of argument 0.
+    DEFINE_BUILTIN(barrier)
+    {
+      workItem->m_state = WorkItem::BARRIER;
+      uint64_t fenceFlags = UARG(0);
+      workItem->m_workGroup->notifyBarrier(workItem, callInst, fenceFlags);
+    }
+    DEFINE_BUILTIN(mem_fence)
+    {
+      // TODO: Implement? The body is currently empty, so the call is a no-op.
+    }
+    //////////////////////////////////////////
+    // Vector Data Load and Store Functions //
+    //////////////////////////////////////////
+    // vload: loads result.num elements of result.size bytes each from the
+    // pointer in argument 1 (in that pointer's address space). Argument 0
+    // is an offset counted in whole vectors of that size.
+    DEFINE_BUILTIN(vload)
+    {
+      const size_t vectorBytes = result.size*result.num;
+      const unsigned int addressSpace =
+        ARG(1)->getType()->getPointerAddressSpace();
+      size_t base = PARG(1);
+      uint64_t offset = UARG(0);
+      size_t address = base + offset*result.size*result.num;
+      Memory *memory = workItem->getMemory(addressSpace);
+      memory->load(result.data, address, vectorBytes);
+    }
+    // vstore: stores the value in argument 0 through the pointer in
+    // argument 2 (in that pointer's address space). Argument 1 is an offset
+    // counted in whole vectors.
+    DEFINE_BUILTIN(vstore)
+    {
+      const llvm::Value *value = ARG(0);
+      // 3-element vectors are same size as 4-element vectors,
+      // but vstore address offset shouldn't use this.
+      unsigned size = getTypeSize(value->getType());
+      size = isVector3(value) ? (size/4) * 3 : size;
+      const unsigned int addressSpace =
+        ARG(2)->getType()->getPointerAddressSpace();
+      size_t base = PARG(2);
+      uint64_t offset = UARG(1);
+      size_t address = base + offset*size;
+      unsigned char *bytes = workItem->getOperand(value).data;
+      Memory *memory = workItem->getMemory(addressSpace);
+      memory->store(bytes, address, size);
+    }
+    // vload_half: loads result.num half-precision values from the pointer
+    // in argument 1 (in that pointer's address space) and converts each to
+    // float. Argument 0 is an offset counted in vectors; functions whose
+    // name starts with "vloada" use a stride of four halfs for 3-element
+    // results, all other cases use a stride of result.num halfs.
+    DEFINE_BUILTIN(vload_half)
+    {
+      size_t base = PARG(1);
+      unsigned int addressSpace = ARG(1)->getType()->getPointerAddressSpace();
+      uint64_t offset = UARG(0);
+      // Number of halfs between consecutive vectors in memory
+      const bool alignedVec3 =
+        (fnName.compare(0, 6, "vloada") == 0 && result.num == 3);
+      size_t address = alignedVec3 ?
+        base + offset*sizeof(cl_half)*4 :
+        base + offset*sizeof(cl_half)*result.num;
+      size_t size = sizeof(cl_half)*result.num;
+      // Fetch the raw halfs into scratch storage
+      uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*result.num);
+      Memory *memory = workItem->getMemory(addressSpace);
+      memory->load((unsigned char*)halfData, address, size);
+      // Convert to floats
+      float *out = (float*)result.data;
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        out[i] = halfToFloat(halfData[i]);
+      }
+    }
+    // vstore_half: converts the float value(s) in argument 0 to half
+    // precision and stores them through the pointer in argument 2 (in that
+    // pointer's address space). Argument 1 is an offset counted in vectors.
+    // The rounding mode comes from a "_rtz", "_rtn" or "_rtp" marker in the
+    // function name, defaulting to Half_RTE. Functions whose name starts
+    // with "vstorea" use a stride of four halfs for 3-element values.
+    DEFINE_BUILTIN(vstore_half)
+    {
+      const llvm::Value *value = ARG(0);
+      // 3-element vectors are same size as 4-element vectors,
+      // but vstore address offset shouldn't use this.
+      unsigned size = getTypeSize(value->getType());
+      size = isVector3(value) ? (size/4) * 3 : size;
+      size_t base = PARG(2);
+      unsigned int addressSpace = ARG(2)->getType()->getPointerAddressSpace();
+      uint64_t offset = UARG(1);
+      // Convert to halfs
+      unsigned char *data = workItem->getOperand(value).data;
+      const float *floats = (float*)data;
+      size_t num = size / sizeof(float);
+      size = num*sizeof(cl_half);
+      uint16_t *halfData = (uint16_t*)workItem->m_pool.alloc(2*num);
+      HalfRoundMode rmode = Half_RTE; //  The Oclgrind device's round mode
+      if (fnName.find("_rtz") != std::string::npos)
+      {
+        rmode = Half_RTZ;
+      }
+      else if (fnName.find("_rtn") != std::string::npos)
+      {
+        rmode = Half_RTN;
+      }
+      else if (fnName.find("_rtp") != std::string::npos)
+      {
+        rmode = Half_RTP;
+      }
+      for (unsigned i = 0; i < num; i++)
+      {
+        halfData[i] = floatToHalf(floats[i], rmode);
+      }
+      // Number of halfs between consecutive vectors in memory
+      const bool alignedVec3 =
+        (fnName.compare(0, 7, "vstorea") == 0 && num == 3);
+      size_t address = alignedVec3 ?
+        base + offset*sizeof(cl_half)*4 :
+        base + offset*sizeof(cl_half)*num;
+      Memory *memory = workItem->getMemory(addressSpace);
+      memory->store((unsigned char*)halfData, address, size);
+    }
+    /////////////////////////
+    // Work-Item Functions //
+    /////////////////////////
+    // get_global_id(dim): this work-item's global ID in dimension dim, or 0
+    // when dim is not 0, 1 or 2.
+    DEFINE_BUILTIN(get_global_id)
+    {
+      const uint64_t dim = UARG(0);
+      size_t id = 0;
+      if (dim < 3)
+      {
+        id = workItem->m_globalID[dim];
+      }
+      result.setUInt(id);
+    }
+    // get_global_size(dim): number of global work-items in dimension dim.
+    // For an out-of-range dimension index the OpenCL C specification
+    // requires 1 (not 0), so that products of sizes remain meaningful.
+    DEFINE_BUILTIN(get_global_size)
+    {
+      uint64_t dim = UARG(0);
+      size_t r = dim < 3 ?
+        workItem->m_kernelInvocation->getGlobalSize()[dim] : 1;
+      result.setUInt(r);
+    }
+    // get_global_offset(dim): the kernel invocation's global offset in
+    // dimension dim, or 0 when dim is not 0, 1 or 2.
+    DEFINE_BUILTIN(get_global_offset)
+    {
+      const uint64_t dim = UARG(0);
+      size_t offset = 0;
+      if (dim < 3)
+      {
+        offset = workItem->m_kernelInvocation->getGlobalOffset()[dim];
+      }
+      result.setUInt(offset);
+    }
+    // get_group_id(dim): ID of this work-item's work-group in dimension dim,
+    // or 0 when dim is not 0, 1 or 2.
+    DEFINE_BUILTIN(get_group_id)
+    {
+      const uint64_t dim = UARG(0);
+      size_t id = 0;
+      if (dim < 3)
+      {
+        id = workItem->m_workGroup->getGroupID()[dim];
+      }
+      result.setUInt(id);
+    }
+    // get_local_id(dim): this work-item's ID within its work-group in
+    // dimension dim, or 0 when dim is not 0, 1 or 2.
+    DEFINE_BUILTIN(get_local_id)
+    {
+      const uint64_t dim = UARG(0);
+      size_t id = 0;
+      if (dim < 3)
+      {
+        id = workItem->m_localID[dim];
+      }
+      result.setUInt(id);
+    }
+    // get_local_size(dim): size of this work-item's work-group in dimension
+    // dim. For an out-of-range dimension index the OpenCL C specification
+    // requires 1 (not 0).
+    DEFINE_BUILTIN(get_local_size)
+    {
+      uint64_t dim = UARG(0);
+      size_t r = dim < 3 ? workItem->m_workGroup->getGroupSize()[dim] : 1;
+      result.setUInt(r);
+    }
+    // get_num_groups(dim): number of work-groups in dimension dim. For an
+    // out-of-range dimension index the OpenCL C specification requires 1
+    // (not 0).
+    DEFINE_BUILTIN(get_num_groups)
+    {
+      uint64_t dim = UARG(0);
+      size_t r = 1;
+      if (dim < 3)
+      {
+        r = workItem->m_kernelInvocation->getNumGroups()[dim];
+      }
+      result.setUInt(r);
+    }
+    // get_work_dim(): the work dimension count reported by the kernel
+    // invocation this work-item belongs to.
+    DEFINE_BUILTIN(get_work_dim)
+    {
+      const KernelInvocation *invocation = workItem->m_kernelInvocation;
+      result.setUInt(invocation->getWorkDim());
+    }
+    /////////////////////
+    // Other Functions //
+    /////////////////////
+    // convert_float* / convert_double*: convert every element of argument 0
+    // to floating point. The overload's argument type code selects whether
+    // the source is read as unsigned, signed or floating point.
+    DEFINE_BUILTIN(convert_float)
+    {
+      const char srcType = getOverloadArgType(overload);
+      for (unsigned lane = 0; lane < result.num; ++lane)
+      {
+        switch (srcType)
+        {
+          // Sources read through UARGV (unsigned)
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+          {
+            const float converted = (float)UARGV(0, lane);
+            result.setFloat(converted, lane);
+            break;
+          }
+          // Sources read through SARGV (signed)
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            const float converted = (float)SARGV(0, lane);
+            result.setFloat(converted, lane);
+            break;
+          }
+          // Floating-point sources are passed straight through
+          case 'f':
+          case 'd':
+            result.setFloat(FARGV(0, lane), lane);
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c", srcType);
+        }
+      }
+    }
+    // convert_half*[_rtz|_rtn|_rtp]: convert every element of argument 0 to
+    // a half-precision bit pattern, stored in the result as an unsigned
+    // integer. The rounding mode is taken from the function-name suffix and
+    // defaults to round-to-nearest-even.
+    DEFINE_BUILTIN(convert_half)
+    {
+      float f;
+      HalfRoundMode rmode = Half_RTE;
+      if (fnName.find("_rtz") != std::string::npos)
+        rmode = Half_RTZ;
+      else if (fnName.find("_rtn") != std::string::npos)
+        rmode = Half_RTN;
+      else if (fnName.find("_rtp") != std::string::npos)
+        rmode = Half_RTP;
+      const char srcType = getOverloadArgType(overload);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        switch (srcType)
+        {
+          // Sources read through UARGV (unsigned)
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+            f = (float)UARGV(0, i);
+            break;
+          // Sources read through SARGV (signed)
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+            f = (float)SARGV(0, i);
+            break;
+          case 'd':
+          case 'f':
+            f = FARGV(0, i);
+            // This break was missing: floating-point sources used to fall
+            // through into the fatal-error default below.
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c", srcType);
+        }
+        result.setUInt(floatToHalf(f, rmode), i);
+      }
+    }
+    // Set the host floating-point rounding mode from the "_rt?" suffix of a
+    // conversion builtin name: e = to nearest, z = toward zero, p = upward,
+    // n = downward. A name without any "_rt" suffix selects toward zero.
+    // The caller is responsible for saving and restoring the previous mode.
+    static void setConvertRoundingMode(const string& name)
+    {
+      size_t rpos = name.find("_rt");
+      if (rpos != string::npos)
+      {
+        // Character following "_rt" (the string terminator if the name
+        // ends right after the suffix).
+        const char mode = name[rpos+3];
+        switch (mode)
+        {
+        case 'e':
+          fesetround(FE_TONEAREST);
+          break;
+        case 'z':
+          fesetround(FE_TOWARDZERO);
+          break;
+        case 'p':
+          fesetround(FE_UPWARD);
+          break;
+        case 'n':
+          fesetround(FE_DOWNWARD);
+          break;
+        default:
+          // Previously written as name[rpos=3], which assigned 3 to rpos
+          // and reported the wrong character.
+          FATAL_ERROR("Unsupported rounding mode: %c", mode);
+        }
+      }
+      else
+      {
+        fesetround(FE_TOWARDZERO);
+      }
+    }
+    // convert_u<type>[_sat][_rt?]: convert every element of argument 0 to an
+    // unsigned integer of result.size bytes. With "_sat" the value is
+    // clamped to the destination range; floating-point sources are rounded
+    // using the mode selected by setConvertRoundingMode.
+    DEFINE_BUILTIN(convert_uint)
+    {
+      // Check for saturation modifier
+      bool sat = fnName.find("_sat") != string::npos;
+      // Largest value of the destination type. Shifting by the full width
+      // of the operand is undefined behaviour, so 8-byte results are
+      // handled separately, and the shifted constant is explicitly 64-bit.
+      const uint64_t max = (result.size >= 8) ?
+        ~(uint64_t)0 : (((uint64_t)1 << (result.size*8)) - 1);
+      // Use rounding mode
+      const int origRnd = fegetround();
+      setConvertRoundingMode(fnName);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        uint64_t r;
+        switch (getOverloadArgType(overload))
+        {
+          // Sources read through UARGV (unsigned)
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+            r = UARGV(0, i);
+            if (sat)
+            {
+              r = _min_(r, max);
+            }
+            break;
+          // Sources read through SARGV (signed)
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+          {
+            int64_t si = SARGV(0, i);
+            r = si;
+            if (sat)
+            {
+              if (si < 0)
+              {
+                r = 0;
+              }
+              // si is non-negative here, so the cast cannot change its value
+              else if ((uint64_t)si > max)
+              {
+                r = max;
+              }
+            }
+            break;
+          }
+          case 'f':
+          case 'd':
+            if (sat)
+            {
+              r = rint(_clamp_(FARGV(0, i), 0.0, (double)max));
+            }
+            else
+            {
+              r = rint(FARGV(0, i));
+            }
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+        result.setUInt(r, i);
+      }
+      // Restore the rounding mode that was active on entry
+      fesetround(origRnd);
+    }
+    // convert_<type>[_sat][_rt?]: convert every element of argument 0 to a
+    // signed integer of result.size bytes. With "_sat" the value is clamped
+    // to the destination range; floating-point sources are rounded using the
+    // mode selected by setConvertRoundingMode.
+    DEFINE_BUILTIN(convert_sint)
+    {
+      // Check for saturation modifier
+      bool sat = fnName.find("_sat") != string::npos;
+      // Destination range. Start from the widest range so the bounds are
+      // never read uninitialised if result.size is not 1, 2, 4 or 8.
+      int64_t min = INT64_MIN;
+      int64_t max = INT64_MAX;
+      switch (result.size)
+      {
+      case 1:
+        min = INT8_MIN;
+        max = INT8_MAX;
+        break;
+      case 2:
+        min = INT16_MIN;
+        max = INT16_MAX;
+        break;
+      case 4:
+        min = INT32_MIN;
+        max = INT32_MAX;
+        break;
+      case 8:
+      default:
+        break;
+      }
+      // Use rounding mode
+      const int origRnd = fegetround();
+      setConvertRoundingMode(fnName);
+      for (unsigned i = 0; i < result.num; i++)
+      {
+        int64_t r;
+        switch (getOverloadArgType(overload))
+        {
+          // Sources read through UARGV (unsigned): only the upper bound
+          // can be exceeded
+          case 'h':
+          case 't':
+          case 'j':
+          case 'm':
+            r = UARGV(0, i);
+            if (sat)
+            {
+              r = _min_((uint64_t)r, (uint64_t)max);
+            }
+            break;
+          // Sources read through SARGV (signed)
+          case 'c':
+          case 's':
+          case 'i':
+          case 'l':
+            r = SARGV(0, i);
+            if (sat)
+            {
+              r = _clamp_(r, min, max);
+            }
+            break;
+          case 'f':
+          case 'd':
+            if (sat)
+            {
+              r = rint(_clamp_(FARGV(0, i), (double)min, (double)max));
+            }
+            else
+            {
+              r = rint(FARGV(0, i));
+            }
+            break;
+          default:
+            FATAL_ERROR("Unsupported argument type: %c",
+                        getOverloadArgType(overload));
+        }
+        result.setSInt(r, i);
+      }
+      // Restore the rounding mode that was active on entry
+      fesetround(origRnd);
+    }
+    // printf: walk the format string (argument 0) one character at a time,
+    // reading it from the global address space. Ordinary characters are
+    // echoed; each '%' conversion specification is collected into a host
+    // format string and handed to the host printf together with the next
+    // kernel argument.
+    DEFINE_BUILTIN(printf_builtin)
+    {
+      // Hold printfMutex for the whole call
+      lock_guard<mutex> lck(printfMutex);
+      size_t formatPtr = workItem->getOperand(ARG(0)).getPointer();
+      Memory *memory = workItem->getMemory(AddrSpaceGlobal);
+      // Index of the next argument to consume (argument 0 is the format)
+      int arg = 1;
+      while (true)
+      {
+        char c;
+        memory->load((unsigned char*)&c, formatPtr++);
+        if (c == '\0')
+        {
+          break;
+        }
+        if (c == '%')
+        {
+          // Start of a conversion specification: accumulate its characters
+          // in `format` until a conversion character is reached.
+          unsigned vectorWidth = 1;
+          string format = "%";
+          while (true)
+          {
+            memory->load((unsigned char*)&c, formatPtr++);
+            if (c == '\0')
+            {
+              // Format string ended mid-specification: emit what was
+              // collected verbatim.
+              cout << format;
+              break;
+            }
+            if (c == 'v')
+            {
+              // Load vector width specifier
+              memory->load((unsigned char*)&c, formatPtr++);
+              vectorWidth = c - '0';
+              if (vectorWidth == 1)
+              {
+                // Assume this is 16
+                vectorWidth = 16;
+                // Skip the second digit of the width
+                formatPtr++;
+              }
+              continue;
+            }
+            // Ignore all 'h' specifiers
+            if (c == 'h')
+              continue;
+            format += c;
+            // Characters not matched below (flags, width, precision, ...)
+            // stay in `format` and the loop reads the next character.
+            bool done = false;
+            switch (c)
+            {
+              // Conversions whose elements are fetched with SARGV
+              case 'c':
+              case 'd':
+              case 'i':
+                for (unsigned i = 0; i < vectorWidth; i++)
+                {
+                  // Vector elements are separated by commas
+                  if (i > 0)
+                    printf(",");
+                  printf(format.c_str(), SARGV(arg, i));
+                }
+                arg++;
+                done = true;
+                break;
+              // Conversions whose elements are fetched with UARGV
+              case 'o':
+              case 'u':
+              case 'x':
+              case 'X':
+              case 'p':
+                for (unsigned i = 0; i < vectorWidth; i++)
+                {
+                  if (i > 0)
+                    printf(",");
+                  printf(format.c_str(), UARGV(arg, i));
+                }
+                arg++;
+                done = true;
+                break;
+              // Conversions whose elements are fetched with FARGV
+              case 'f':
+              case 'F':
+              case 'e':
+              case 'E':
+              case 'g':
+              case 'G':
+              case 'a':
+              case 'A':
+                for (unsigned i = 0; i < vectorWidth; i++)
+                {
+                  if (i > 0)
+                    printf(",");
+                  printf(format.c_str(), FARGV(arg, i));
+                }
+                arg++;
+                done = true;
+                break;
+              case 's':
+              {
+                // The argument is an address in the same memory as the
+                // format string
+                size_t ptr = UARG(arg++);
+                if (!ptr)
+                {
+                  // Special case for printing NULL pointer
+                  printf(format.c_str(), NULL);
+                }
+                else
+                {
+                  // Load string from memory
+                  // (this `c` shadows the outer format character)
+                  char c;
+                  string str = "";
+                  while (true)
+                  {
+                    // Stop at a failed load or at the terminator
+                    if (!memory->load((unsigned char*)&c, ptr++))
+                      break;
+                    if (c == '\0')
+                      break;
+                    str += c;
+                  }
+                  printf(format.c_str(), str.c_str());
+                }
+                done = true;
+                break;
+              }
+              // "%%" prints a literal percent sign and consumes no argument
+              case '%':
+                printf("%%");
+                done = true;
+                break;
+            }
+            if (done)
+            {
+              break;
+            }
+          }
+          // Propagate end-of-format out of the specification loop
+          if (c == '\0')
+          {
+            break;
+          }
+        }
+        else
+        {
+          cout << c;
+        }
+      }
+    }
+    /////////////////////
+    // LLVM Intrinsics //
+    /////////////////////
+    // llvm.dbg.declare: record in workItem->m_variables the LLVM value that
+    // holds the address of a source variable, keyed by the variable's name.
+    DEFINE_BUILTIN(llvm_dbg_declare)
+    {
+      const llvm::DbgDeclareInst *dbgInst =
+        (const llvm::DbgDeclareInst*)callInst;
+      const llvm::Value *addr = dbgInst->getAddress();
+#if LLVM_VERSION > 36
+      const llvm::DILocalVariable *var = dbgInst->getVariable();
+      workItem->m_variables[var->getName()] = addr;
+#else
+      const llvm::MDNode *var = dbgInst->getVariable();
+      llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
+      if (str)
+      {
+        // TODO: There must be a better way of getting the variable name...
+        // Keep the converted string alive: pointing into the temporary
+        // returned by str() would leave `name` dangling.
+        const std::string contents = str->getString().str();
+        unsigned length = str->getLength();
+        const char *name = contents.c_str();
+        // The name is the part after the first embedded NUL, if any
+        if (length > strlen(name) + 1)
+        {
+          name += strlen(name) + 1;
+          workItem->m_variables[name] = addr;
+        }
+      }
+#endif
+    }
+    // llvm.dbg.value: record in workItem->m_variables the LLVM value that
+    // currently represents a source variable, keyed by the variable's name.
+    DEFINE_BUILTIN(llvm_dbg_value)
+    {
+      const llvm::DbgValueInst *dbgInst = (const llvm::DbgValueInst*)callInst;
+      const llvm::Value *value = dbgInst->getValue();
+      // TODO: Use offset?
+      //uint64_t offset = dbgInst->getOffset();
+#if LLVM_VERSION > 36
+      const llvm::DILocalVariable *var = dbgInst->getVariable();
+      workItem->m_variables[var->getName()] = value;
+#else
+      const llvm::MDNode *var = dbgInst->getVariable();
+      llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(var->getOperand(0));
+      if (str)
+      {
+        // TODO: There must be a better way of getting the variable name...
+        // Keep the converted string alive: pointing into the temporary
+        // returned by str() would leave `name` dangling.
+        const std::string contents = str->getString().str();
+        unsigned length = str->getLength();
+        const char *name = contents.c_str();
+        // The name is the part after the first embedded NUL, if any
+        if (length > strlen(name) + 1)
+        {
+          name += strlen(name) + 1;
+          workItem->m_variables[name] = value;
+        }
+      }
+#endif
+    }
+    // llvm.lifetime.start: registered as a builtin but currently does
+    // nothing.
+    DEFINE_BUILTIN(llvm_lifetime_start)
+    {
+      // TODO: Implement?
+    }
+    // llvm.lifetime.end: registered as a builtin but currently does nothing.
+    DEFINE_BUILTIN(llvm_lifetime_end)
+    {
+      // TODO: Implement?
+    }
+    // llvm.memcpy / llvm.memmove: copy a byte range from the source pointer
+    // to the destination pointer, which may live in different address
+    // spaces. The whole range is staged in a pool buffer before any byte is
+    // written.
+    DEFINE_BUILTIN(llvm_memcpy)
+    {
+      const llvm::MemCpyInst *inst = (const llvm::MemCpyInst*)callInst;
+      const size_t dest = workItem->getOperand(inst->getDest()).getPointer();
+      const size_t src = workItem->getOperand(inst->getSource()).getPointer();
+      const size_t size = workItem->getOperand(inst->getLength()).getUInt();
+      const unsigned destSpace = inst->getDestAddressSpace();
+      const unsigned srcSpace = inst->getSourceAddressSpace();
+      // Stage the data: read everything, then write everything
+      unsigned char *staging = workItem->m_pool.alloc(size);
+      workItem->getMemory(srcSpace)->load(staging, src, size);
+      workItem->getMemory(destSpace)->store(staging, dest, size);
+    }
+    // llvm.memset: fill a byte range at the destination pointer with the
+    // byte value given by argument 1, via a pre-filled pool buffer.
+    DEFINE_BUILTIN(llvm_memset)
+    {
+      const llvm::MemSetInst *inst = (const llvm::MemSetInst*)callInst;
+      const size_t dest = workItem->getOperand(inst->getDest()).getPointer();
+      const size_t size = workItem->getOperand(inst->getLength()).getUInt();
+      const unsigned addressSpace = inst->getDestAddressSpace();
+      unsigned char *fill = workItem->m_pool.alloc(size);
+      // Only the low byte of the argument is used as the fill value
+      const unsigned char byte = UARG(1);
+      memset(fill, byte, size);
+      workItem->getMemory(addressSpace)->store(fill, dest, size);
+    }
+    // llvm.trap: report a fatal error through FATAL_ERROR.
+    DEFINE_BUILTIN(llvm_trap)
+    {
+      FATAL_ERROR("Encountered trap instruction");
+    }
+  public:
+    static BuiltinFunctionMap initBuiltins();
+  };
+  // Utility macros for generating builtin function map
+  // CAST: the function-pointer type every handler is cast to before being
+  // wrapped in a BuiltinFunction.
+#define CAST                                \
+  void(*)(WorkItem*, const llvm::CallInst*, \
+  const std::string&, const std::string&, TypedValue& result, void*)
+  // FnARG: cast `name` to a double-precision function pointer taking n
+  // double arguments.
+#define F1ARG(name) (double(*)(double))name
+#define F2ARG(name) (double(*)(double,double))name
+#define F3ARG(name) (double(*)(double,double,double))name
+  // ADD_BUILTIN: register handler `func` (with auxiliary pointer `op`) in
+  // the local `builtins` map under `name`.
+#define ADD_BUILTIN(name, func, op)         \
+  builtins[name] = BuiltinFunction((CAST)func, (void*)op);
+  // ADD_PREFIX_BUILTIN: append a (name, handler) pair to the global
+  // workItemPrefixBuiltins list instead of the map.
+#define ADD_PREFIX_BUILTIN(name, func, op)  \
+  workItemPrefixBuiltins.push_back(                 \
+    make_pair(name, BuiltinFunction((CAST)func, (void*)op)));
+  // Generate builtin function map
+  // workItemPrefixBuiltins is defined before workItemBuiltins because
+  // initBuiltins() appends to it while building the map.
+  BuiltinFunctionPrefixList workItemPrefixBuiltins;
+  BuiltinFunctionMap workItemBuiltins = WorkItemBuiltins::initBuiltins();
+  // Build and return the name -> handler map for builtins registered with
+  // ADD_BUILTIN. As a side effect, every ADD_PREFIX_BUILTIN entry is
+  // appended to the global workItemPrefixBuiltins list.
+  // NOTE(review): the prefix list is presumably searched in insertion
+  // order, which would explain why longer prefixes ("vload_half",
+  // "convert_u") are registered before shorter ones ("vload", "convert_")
+  // - confirm against the lookup code.
+  BuiltinFunctionMap WorkItemBuiltins::initBuiltins()
+  {
+    BuiltinFunctionMap builtins;
+    // Async Copy and Prefetch Functions
+    ADD_BUILTIN("async_work_group_copy", async_work_group_copy, NULL);
+    ADD_BUILTIN("async_work_group_strided_copy", async_work_group_copy, NULL);
+    ADD_BUILTIN("wait_group_events", wait_group_events, NULL);
+    ADD_BUILTIN("prefetch", prefetch, NULL);
+    // Atomic Functions
+    // (the atom_* and atomic_* spellings share one handler each)
+    ADD_BUILTIN("atom_add", atomic_add, NULL);
+    ADD_BUILTIN("atomic_add", atomic_add, NULL);
+    ADD_BUILTIN("atom_and", atomic_and, NULL);
+    ADD_BUILTIN("atomic_and", atomic_and, NULL);
+    ADD_BUILTIN("atom_cmpxchg", atomic_cmpxchg, NULL);
+    ADD_BUILTIN("atomic_cmpxchg", atomic_cmpxchg, NULL);
+    ADD_BUILTIN("atom_dec", atomic_dec, NULL);
+    ADD_BUILTIN("atomic_dec", atomic_dec, NULL);
+    ADD_BUILTIN("atom_inc", atomic_inc, NULL);
+    ADD_BUILTIN("atomic_inc", atomic_inc, NULL);
+    ADD_BUILTIN("atom_max", atomic_max, NULL);
+    ADD_BUILTIN("atomic_max", atomic_max, NULL);
+    ADD_BUILTIN("atom_min", atomic_min, NULL);
+    ADD_BUILTIN("atomic_min", atomic_min, NULL);
+    ADD_BUILTIN("atom_or", atomic_or, NULL);
+    ADD_BUILTIN("atomic_or", atomic_or, NULL);
+    ADD_BUILTIN("atom_sub", atomic_sub, NULL);
+    ADD_BUILTIN("atomic_sub", atomic_sub, NULL);
+    ADD_BUILTIN("atom_xchg", atomic_xchg, NULL);
+    ADD_BUILTIN("atomic_xchg", atomic_xchg, NULL);
+    ADD_BUILTIN("atom_xor", atomic_xor, NULL);
+    ADD_BUILTIN("atomic_xor", atomic_xor, NULL);
+    // Common Functions
+    ADD_BUILTIN("clamp", clamp, NULL);
+    ADD_BUILTIN("degrees", f1arg, _degrees_);
+    ADD_BUILTIN("max", max, NULL);
+    ADD_BUILTIN("min", min, NULL);
+    ADD_BUILTIN("mix", mix, NULL);
+    ADD_BUILTIN("radians", f1arg, _radians_);
+    ADD_BUILTIN("sign", f1arg, _sign_);
+    ADD_BUILTIN("smoothstep", smoothstep, NULL);
+    ADD_BUILTIN("step", step, NULL);
+    // Geometric Functions
+    // (the fast_* variants reuse the regular handlers)
+    ADD_BUILTIN("cross", cross, NULL);
+    ADD_BUILTIN("dot", dot, NULL);
+    ADD_BUILTIN("distance", distance, NULL);
+    ADD_BUILTIN("length", length, NULL);
+    ADD_BUILTIN("normalize", normalize, NULL);
+    ADD_BUILTIN("fast_distance", distance, NULL);
+    ADD_BUILTIN("fast_length", length, NULL);
+    ADD_BUILTIN("fast_normalize", normalize, NULL);
+    // Image Functions
+    ADD_BUILTIN("get_image_array_size", get_image_array_size, NULL);
+    ADD_BUILTIN("get_image_channel_data_type",
+                get_image_channel_data_type, NULL);
+    ADD_BUILTIN("get_image_channel_order", get_image_channel_order, NULL);
+    ADD_BUILTIN("get_image_dim", get_image_dim, NULL);
+    ADD_BUILTIN("get_image_depth", get_image_depth, NULL);
+    ADD_BUILTIN("get_image_height", get_image_height, NULL);
+    ADD_BUILTIN("get_image_width", get_image_width, NULL);
+    ADD_BUILTIN("read_imagef", read_imagef, NULL);
+    ADD_BUILTIN("read_imagei", read_imagei, NULL);
+    ADD_BUILTIN("read_imageui", read_imageui, NULL);
+    ADD_BUILTIN("write_imagef", write_imagef, NULL);
+    ADD_BUILTIN("write_imagei", write_imagei, NULL);
+    ADD_BUILTIN("write_imageui", write_imageui, NULL);
+    // Integer Functions
+    ADD_BUILTIN("abs", abs_builtin, NULL);
+    ADD_BUILTIN("abs_diff", abs_diff, NULL);
+    ADD_BUILTIN("add_sat", add_sat, NULL);
+    ADD_BUILTIN("clz", clz, NULL);
+    ADD_BUILTIN("hadd", hadd, NULL);
+    ADD_BUILTIN("mad24", u3arg, _mad_);
+    ADD_BUILTIN("mad_hi", mad_hi, NULL);
+    ADD_BUILTIN("mad_sat", mad_sat, NULL);
+    ADD_BUILTIN("mul24", u2arg, _mul_);
+    ADD_BUILTIN("mul_hi", mul_hi, NULL);
+    ADD_BUILTIN("popcount", u1arg, _popcount_);
+    ADD_BUILTIN("rhadd", rhadd, NULL);
+    ADD_BUILTIN("rotate", rotate, NULL);
+    ADD_BUILTIN("sub_sat", sub_sat, NULL);
+    ADD_BUILTIN("upsample", upsample, NULL);
+    // Math Functions
+    // (f1arg/f2arg/f3arg are generic handlers; the third macro argument is
+    // the function pointer they are given)
+    ADD_BUILTIN("acos", f1arg, F1ARG(acos));
+    ADD_BUILTIN("acosh", f1arg, F1ARG(acosh));
+    ADD_BUILTIN("acospi", f1arg, _acospi_);
+    ADD_BUILTIN("asin", f1arg, F1ARG(asin));
+    ADD_BUILTIN("asinh", f1arg, F1ARG(asinh));
+    ADD_BUILTIN("asinpi", f1arg, _asinpi_);
+    ADD_BUILTIN("atan", f1arg, F1ARG(atan));
+    ADD_BUILTIN("atan2", f2arg, F2ARG(atan2));
+    ADD_BUILTIN("atanh", f1arg, F1ARG(atanh));
+    ADD_BUILTIN("atanpi", f1arg, _atanpi_);
+    ADD_BUILTIN("atan2pi", f2arg, _atan2pi_);
+    ADD_BUILTIN("cbrt", f1arg, F1ARG(cbrt));
+    ADD_BUILTIN("ceil", f1arg, F1ARG(ceil));
+    ADD_BUILTIN("copysign", f2arg, F2ARG(copysign));
+    ADD_BUILTIN("cos", f1arg, F1ARG(cos));
+    ADD_BUILTIN("cosh", f1arg, F1ARG(cosh));
+    ADD_BUILTIN("cospi", f1arg, _cospi_);
+    ADD_BUILTIN("erfc", f1arg, F1ARG(erfc));
+    ADD_BUILTIN("erf", f1arg, F1ARG(erf));
+    ADD_BUILTIN("exp", f1arg, F1ARG(exp));
+    ADD_BUILTIN("exp2", f1arg, F1ARG(exp2));
+    ADD_BUILTIN("exp10", f1arg, _exp10_);
+    ADD_BUILTIN("expm1", f1arg, F1ARG(expm1));
+    ADD_BUILTIN("fabs", f1arg, F1ARG(fabs));
+    ADD_BUILTIN("fdim", f2arg, F2ARG(fdim));
+    ADD_BUILTIN("floor", f1arg, F1ARG(floor));
+    ADD_BUILTIN("fma", f3arg, F3ARG(_fma_));
+    ADD_BUILTIN("fmax", f2arg, F2ARG(fmax));
+    ADD_BUILTIN("fmin", f2arg, F2ARG(fmin));
+    ADD_BUILTIN("fmod", f2arg, F2ARG(fmod));
+    ADD_BUILTIN("fract", fract, NULL);
+    ADD_BUILTIN("frexp", frexp_builtin, NULL);
+    ADD_BUILTIN("hypot", f2arg, F2ARG(hypot));
+    ADD_BUILTIN("ilogb", ilogb_builtin, NULL);
+    ADD_BUILTIN("ldexp", ldexp_builtin, NULL);
+    ADD_BUILTIN("lgamma", f1arg, F1ARG(lgamma));
+    ADD_BUILTIN("lgamma_r", lgamma_r, NULL);
+    ADD_BUILTIN("log", f1arg, F1ARG(log));
+    ADD_BUILTIN("log2", f1arg, F1ARG(log2));
+    ADD_BUILTIN("log10", f1arg, F1ARG(log10));
+    ADD_BUILTIN("log1p", f1arg, F1ARG(log1p));
+    ADD_BUILTIN("logb", f1arg, F1ARG(logb));
+    ADD_BUILTIN("mad", f3arg, F3ARG(_fma_));
+    ADD_BUILTIN("maxmag", f2arg, _maxmag_);
+    ADD_BUILTIN("minmag", f2arg, _minmag_);
+    ADD_BUILTIN("modf", modf_builtin, NULL);
+    ADD_BUILTIN("nan", nan_builtin, NULL);
+    ADD_BUILTIN("nanf", nan_builtin, NULL);
+    ADD_BUILTIN("nextafter", nextafter_builtin, NULL);
+    ADD_BUILTIN("pow", f2arg, F2ARG(pow));
+    ADD_BUILTIN("pown", pown, NULL);
+    ADD_BUILTIN("powr", f2arg, F2ARG(pow));
+    ADD_BUILTIN("remainder", f2arg, F2ARG(remainder));
+    ADD_BUILTIN("remquo", remquo_builtin, NULL);
+    ADD_BUILTIN("rint", f1arg, F1ARG(rint));
+    ADD_BUILTIN("rootn", rootn, NULL);
+    ADD_BUILTIN("round", f1arg, F1ARG(round));
+    ADD_BUILTIN("rsqrt", f1arg, _rsqrt_);
+    ADD_BUILTIN("sin", f1arg, F1ARG(sin));
+    ADD_BUILTIN("sinh", f1arg, F1ARG(sinh));
+    ADD_BUILTIN("sinpi", f1arg, _sinpi_);
+    ADD_BUILTIN("sincos", sincos, NULL);
+    ADD_BUILTIN("sqrt", f1arg, F1ARG(sqrt));
+    ADD_BUILTIN("tan", f1arg, F1ARG(tan));
+    ADD_BUILTIN("tanh", f1arg, F1ARG(tanh));
+    ADD_BUILTIN("tanpi", f1arg, _tanpi_);
+    ADD_BUILTIN("tgamma", f1arg, F1ARG(tgamma));
+    ADD_BUILTIN("trunc", f1arg, F1ARG(trunc));
+    // Native Math Functions
+    // (half_* and native_* map to the same functions as the full versions)
+    ADD_BUILTIN("half_cos", f1arg, F1ARG(cos));
+    ADD_BUILTIN("native_cos", f1arg, F1ARG(cos));
+    ADD_BUILTIN("half_divide", f2arg, _fdivide_);
+    ADD_BUILTIN("native_divide", f2arg, _fdivide_);
+    ADD_BUILTIN("half_exp", f1arg, F1ARG(exp));
+    ADD_BUILTIN("native_exp", f1arg, F1ARG(exp));
+    ADD_BUILTIN("half_exp2", f1arg, F1ARG(exp2));
+    ADD_BUILTIN("native_exp2", f1arg, F1ARG(exp2));
+    ADD_BUILTIN("half_exp10", f1arg, _exp10_);
+    ADD_BUILTIN("native_exp10", f1arg, _exp10_);
+    ADD_BUILTIN("half_log", f1arg, F1ARG(log));
+    ADD_BUILTIN("native_log", f1arg, F1ARG(log));
+    ADD_BUILTIN("half_log2", f1arg, F1ARG(log2));
+    ADD_BUILTIN("native_log2", f1arg, F1ARG(log2));
+    ADD_BUILTIN("half_log10", f1arg, F1ARG(log10));
+    ADD_BUILTIN("native_log10", f1arg, F1ARG(log10));
+    ADD_BUILTIN("half_powr", f2arg, F2ARG(pow));
+    ADD_BUILTIN("native_powr", f2arg, F2ARG(pow));
+    ADD_BUILTIN("half_recip", f1arg, _frecip_);
+    ADD_BUILTIN("native_recip", f1arg, _frecip_);
+    ADD_BUILTIN("half_rsqrt", f1arg, _rsqrt_);
+    ADD_BUILTIN("native_rsqrt", f1arg, _rsqrt_);
+    ADD_BUILTIN("half_sin", f1arg, F1ARG(sin));
+    ADD_BUILTIN("native_sin", f1arg, F1ARG(sin));
+    ADD_BUILTIN("half_sqrt", f1arg, F1ARG(sqrt));
+    ADD_BUILTIN("native_sqrt", f1arg, F1ARG(sqrt));
+    ADD_BUILTIN("half_tan", f1arg, F1ARG(tan));
+    ADD_BUILTIN("native_tan", f1arg, F1ARG(tan));
+    // Misc. Vector Functions
+    ADD_BUILTIN("shuffle", shuffle_builtin, NULL);
+    ADD_BUILTIN("shuffle2", shuffle2_builtin, NULL);
+    // Relational Functional
+    ADD_BUILTIN("all", all, NULL);
+    ADD_BUILTIN("any", any, NULL);
+    ADD_BUILTIN("bitselect", bitselect, NULL);
+    ADD_BUILTIN("isequal", rel2arg, _iseq_);
+    ADD_BUILTIN("isnotequal", rel2arg, _isneq_);
+    ADD_BUILTIN("isgreater", rel2arg, _isgt_);
+    ADD_BUILTIN("isgreaterequal", rel2arg, _isge_);
+    ADD_BUILTIN("isless", rel2arg, _islt_);
+    ADD_BUILTIN("islessequal", rel2arg, _isle_);
+    ADD_BUILTIN("islessgreater", rel2arg, _islg_);
+    ADD_BUILTIN("isfinite", rel1arg, _isfin_);
+    ADD_BUILTIN("isinf", rel1arg, _isinf_);
+    ADD_BUILTIN("isnan", rel1arg, _isnan_);
+    ADD_BUILTIN("isnormal", rel1arg, _isnorm_);
+    ADD_BUILTIN("isordered", rel2arg, _isord_);
+    ADD_BUILTIN("isunordered", rel2arg, _isuord_);
+    ADD_BUILTIN("select", select_builtin, NULL);
+    ADD_BUILTIN("signbit", rel1arg, _signbit_);
+    // Synchronization Functions
+    ADD_BUILTIN("barrier", barrier, NULL);
+    ADD_BUILTIN("mem_fence", mem_fence, NULL);
+    ADD_BUILTIN("read_mem_fence", mem_fence, NULL);
+    ADD_BUILTIN("write_mem_fence", mem_fence, NULL);
+    // Vector Data Load and Store Functions
+    // (the *_half prefixes are registered before the plain vload/vstore
+    // prefixes that they also start with)
+    ADD_PREFIX_BUILTIN("vload_half", vload_half, NULL);
+    ADD_PREFIX_BUILTIN("vloada_half", vload_half, NULL);
+    ADD_PREFIX_BUILTIN("vstore_half", vstore_half, NULL);
+    ADD_PREFIX_BUILTIN("vstorea_half", vstore_half, NULL);
+    ADD_PREFIX_BUILTIN("vload", vload, NULL);
+    ADD_PREFIX_BUILTIN("vstore", vstore, NULL);
+    // Work-Item Functions
+    ADD_BUILTIN("get_global_id", get_global_id, NULL);
+    ADD_BUILTIN("get_global_size", get_global_size, NULL);
+    ADD_BUILTIN("get_global_offset", get_global_offset, NULL);
+    ADD_BUILTIN("get_group_id", get_group_id, NULL);
+    ADD_BUILTIN("get_local_id", get_local_id, NULL);
+    ADD_BUILTIN("get_local_size", get_local_size, NULL);
+    ADD_BUILTIN("get_num_groups", get_num_groups, NULL);
+    ADD_BUILTIN("get_work_dim", get_work_dim, NULL);
+    // Other Functions
+    // ("convert_" is the catch-all registered after the more specific
+    // conversion prefixes)
+    ADD_PREFIX_BUILTIN("convert_half",   convert_half, NULL);
+    ADD_PREFIX_BUILTIN("convert_float",  convert_float, NULL);
+    ADD_PREFIX_BUILTIN("convert_double", convert_float, NULL);
+    ADD_PREFIX_BUILTIN("convert_u",      convert_uint, NULL);
+    ADD_PREFIX_BUILTIN("convert_",       convert_sint, NULL);
+    ADD_BUILTIN("printf", printf_builtin, NULL);
+    // LLVM Intrinsics
+    // (memmove shares the memcpy handler)
+    ADD_BUILTIN("llvm.dbg.declare", llvm_dbg_declare, NULL);
+    ADD_BUILTIN("llvm.dbg.value", llvm_dbg_value, NULL);
+    ADD_BUILTIN("llvm.lifetime.start", llvm_lifetime_start, NULL);
+    ADD_BUILTIN("llvm.lifetime.end", llvm_lifetime_end, NULL);
+    ADD_PREFIX_BUILTIN("llvm.memcpy", llvm_memcpy, NULL);
+    ADD_PREFIX_BUILTIN("llvm.memmove", llvm_memcpy, NULL);
+    ADD_PREFIX_BUILTIN("llvm.memset", llvm_memset, NULL);
+    ADD_PREFIX_BUILTIN("llvm.fmuladd", f3arg, F3ARG(_fma_));
+    ADD_BUILTIN("llvm.trap", llvm_trap, NULL);
+    return builtins;
+  }
diff --git a/src/core/clc.h b/src/core/clc.h
new file mode 100644
index 0000000..320ddce
--- /dev/null
+++ b/src/core/clc.h
@@ -0,0 +1,1035 @@
+// clc.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+// Short aliases for the unsigned scalar types used throughout this header.
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+// Pointer-sized integer types: 32-bit when targeting SPIR32, 64-bit
+// otherwise. Exactly one pair of typedefs is active at a time.
+#if defined(__SPIR32__)
+  typedef uint size_t;
+  typedef int ptrdiff_t;
+#else
+  typedef ulong size_t;
+  typedef long ptrdiff_t;
+#endif
+// Pointer-sized integer aliases, expressed in terms of size_t/ptrdiff_t.
+typedef size_t uintptr_t;
+typedef ptrdiff_t intptr_t;
+// In this header event_t is simply a macro alias for size_t.
+#define event_t size_t
+// TYPEDEF_VECTOR(type): declares the ext_vector_type variants of `type`
+// with 2, 3, 4, 8 and 16 elements, named type2, type3, type4, type8 and
+// type16.
+#define TYPEDEF_VECTOR(type)                                \
+  typedef __attribute__((ext_vector_type(2))) type type##2; \
+  typedef __attribute__((ext_vector_type(3))) type type##3; \
+  typedef __attribute__((ext_vector_type(4))) type type##4; \
+  typedef __attribute__((ext_vector_type(8))) type type##8; \
+  typedef __attribute__((ext_vector_type(16))) type type##16;
+// Predefined feature macros: little-endian target, OpenCL and OpenCL C
+// version both set to 120, image support enabled.
+#define __ENDIAN_LITTLE__ 1
+#define __OPENCL_VERSION__ 120
+#define __OPENCL_C_VERSION__ 120
+#define __IMAGE_SUPPORT__ 1
+// __kernel_exec(X, typen): expands to the __kernel qualifier together with
+// a work_group_size_hint of (X, 1, 1) and a vec_type_hint of typen.
+#define __kernel_exec(X, typen) __kernel                        \
+  __attribute__((work_group_size_hint(X, 1, 1)))                \
+  __attribute__((vec_type_hint(typen)))
+// Integer limits. char is 8 bits, short 16, int 32; the long/ulong limits
+// are 64-bit values. INT_MIN and LONG_MIN are written as (-MAX-1) so the
+// literal itself never exceeds the positive range of the type.
+#define CHAR_BIT    8
+#define SCHAR_MAX 127
+#define SCHAR_MIN (-128)
+#define UCHAR_MAX 255
+#define USHRT_MAX 65535
+#define SHRT_MAX  32767
+#define SHRT_MIN  (-32768)
+#define UINT_MAX  0xffffffff
+#define INT_MAX   2147483647
+#define INT_MIN   (-2147483647-1)
+#define ULONG_MAX 0xffffffffffffffffUL
+#define LONG_MAX  ((long)0x7fffffffffffffffL)
+#define LONG_MIN  ((long)(-0x7fffffffffffffffL-1))
+// Characteristics of the float type (FLT_*). MAX/MIN/EPSILON are given as
+// hexadecimal floating-point literals with an f suffix.
+#define FLT_DIG         6
+#define FLT_MANT_DIG    24
+#define FLT_MAX_10_EXP  +38
+#define FLT_MAX_EXP     +128
+#define FLT_MIN_10_EXP  -37
+#define FLT_MIN_EXP     -125
+#define FLT_RADIX       2
+#define FLT_MAX         0x1.fffffep127f
+#define FLT_MIN         0x1.0p-126f
+#define FLT_EPSILON     0x1.0p-23f
+// Characteristics of the double type (DBL_*).
+#define DBL_DIG         15
+#define DBL_MANT_DIG    53
+#define DBL_MAX_10_EXP  +308
+#define DBL_MAX_EXP     +1024
+#define DBL_MIN_10_EXP  -307
+#define DBL_MIN_EXP     -1021
+#define DBL_RADIX       2
+#define DBL_MAX         0x1.fffffffffffffp1023
+#define DBL_MIN         0x1.0p-1022
+#define DBL_EPSILON     0x1.0p-52
+// Both ilogb special-case results are defined as INT_MIN here.
+#define FP_ILOGB0       INT_MIN
+#define FP_ILOGBNAN     INT_MIN
+// Math constants as float literals (note the trailing f on each value).
+#define M_E_F         2.71828182845904523536028747135266250f
+#define M_LOG2E_F     1.44269504088896340735992468100189214f
+#define M_LOG10E_F    0.434294481903251827651128918916605082f
+#define M_LN2_F       0.693147180559945309417232121458176568f
+#define M_LN10_F      2.3025850929940456840179914546843642f
+#define M_PI_F        3.14159265358979323846264338327950288f
+#define M_PI_2_F      1.57079632679489661923132169163975144f
+#define M_PI_4_F      0.785398163397448309615660845819875721f
+#define M_1_PI_F      0.318309886183790671537767526745028724f
+#define M_2_PI_F      0.636619772367581343075535053490057448f
+#define M_2_SQRTPI_F  1.12837916709551257389615890312154517f
+#define M_SQRT2_F     1.41421356237309504880168872420969808f
+#define M_SQRT1_2_F   0.707106781186547524400844362104849039f
+// The same constants as double literals (no suffix).
+#define M_E         2.71828182845904523536028747135266250
+#define M_LOG2E     1.44269504088896340735992468100189214
+#define M_LOG10E    0.434294481903251827651128918916605082
+#define M_LN2       0.693147180559945309417232121458176568
+#define M_LN10      2.30258509299404568401799145468436421
+#define M_PI        3.14159265358979323846264338327950288
+#define M_PI_2      1.57079632679489661923132169163975144
+#define M_PI_4      0.785398163397448309615660845819875721
+#define M_1_PI      0.318309886183790671537767526745028724
+#define M_2_PI      0.636619772367581343075535053490057448
+#define M_2_SQRTPI  1.12837916709551257389615890312154517
+#define M_SQRT2     1.41421356237309504880168872420969808
+#define M_SQRT1_2   0.707106781186547524400844362104849039
+// Special floating-point values. MAXFLOAT is a literal cast to float; the
+// others are obtained from compiler builtins.
+#define MAXFLOAT ((float)3.40282346638528860e+38)
+#define HUGE_VALF __builtin_huge_valf()
+#define HUGE_VAL __builtin_huge_val()
+#define INFINITY __builtin_inff()
+#define NAN __builtin_nanf(0)
+// Image channel data type identifiers (0x10D0 - 0x10DF).
+#define CLK_SNORM_INT8 0x10D0
+#define CLK_SNORM_INT16 0x10D1
+#define CLK_UNORM_INT8 0x10D2
+#define CLK_UNORM_INT16 0x10D3
+#define CLK_UNORM_SHORT_565 0x10D4
+#define CLK_UNORM_SHORT_555 0x10D5
+#define CLK_UNORM_INT_101010 0x10D6
+#define CLK_SIGNED_INT8 0x10D7
+#define CLK_SIGNED_INT16 0x10D8
+#define CLK_SIGNED_INT32 0x10D9
+#define CLK_UNSIGNED_INT8 0x10DA
+#define CLK_UNSIGNED_INT16 0x10DB
+#define CLK_UNSIGNED_INT32 0x10DC
+#define CLK_HALF_FLOAT 0x10DD
+#define CLK_FLOAT 0x10DE
+#define CLK_UNORM_INT24 0x10DF
+// Image channel order identifiers (0x10B0 - 0x10BD).
+#define CLK_R 0x10B0
+#define CLK_A 0x10B1
+#define CLK_RG 0x10B2
+#define CLK_RA 0x10B3
+#define CLK_RGB 0x10B4
+#define CLK_RGBA 0x10B5
+#define CLK_BGRA 0x10B6
+#define CLK_ARGB 0x10B7
+#define CLK_INTENSITY 0x10B8
+#define CLK_LUMINANCE 0x10B9
+#define CLK_Rx 0x10BA
+#define CLK_RGx 0x10BB
+#define CLK_RGBx 0x10BC
+#define CLK_DEPTH 0x10BD
+// Sampler addressing-mode flags.
+#define CLK_ADDRESS_NONE 0x0000
+#define CLK_ADDRESS_CLAMP 0x0004
+#define CLK_ADDRESS_REPEAT 0x0006
+// Sampler filter-mode flags.
+#define CLK_FILTER_NEAREST 0x0010
+#define CLK_FILTER_LINEAR 0x0020
+// Marks a declaration as overloadable (Clang's __overloadable__ attribute),
+// which lets this header declare several functions with the same name.
+#define __OVERLOAD__ __attribute__((__overloadable__))
+// BUILTIN_1ARG(rtype, type0, name): declares `name` taking one argument,
+// once for the scalar types and once for each vector width 2/3/4/8/16.
+// rtype is the return element type, type0 the argument element type.
+#define BUILTIN_1ARG(rtype, type0, name)  \
+  rtype __OVERLOAD__ name(type0 a);       \
+  rtype##2 __OVERLOAD__ name(type0##2 a); \
+  rtype##3 __OVERLOAD__ name(type0##3 a); \
+  rtype##4 __OVERLOAD__ name(type0##4 a); \
+  rtype##8 __OVERLOAD__ name(type0##8 a); \
+  rtype##16 __OVERLOAD__ name(type0##16 a);
+// BUILTIN_2ARG: as BUILTIN_1ARG, for functions taking two arguments whose
+// element types are type0 and type1.
+#define BUILTIN_2ARG(rtype, type0, type1, name)       \
+  rtype __OVERLOAD__ name(type0 a, type1 b);          \
+  rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b); \
+  rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b); \
+  rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b); \
+  rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b); \
+  rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b);
+// BUILTIN_3ARG: as BUILTIN_1ARG, for functions taking three arguments whose
+// element types are type0, type1 and type2.
+#define BUILTIN_3ARG(rtype, type0, type1, type2, name)            \
+  rtype __OVERLOAD__ name(type0 a, type1 b, type2 c);             \
+  rtype##2 __OVERLOAD__ name(type0##2 a, type1##2 b, type2##2 c); \
+  rtype##3 __OVERLOAD__ name(type0##3 a, type1##3 b, type2##3 c); \
+  rtype##4 __OVERLOAD__ name(type0##4 a, type1##4 b, type2##4 c); \
+  rtype##8 __OVERLOAD__ name(type0##8 a, type1##8 b, type2##8 c); \
+  rtype##16 __OVERLOAD__ name(type0##16 a, type1##16 b, type2##16 c);
+// BUILTIN_1ARG_INTEGERS(name): declares a one-argument `name` for every
+// integer element type (char .. ulong), with matching return type.
+#define BUILTIN_1ARG_INTEGERS(name)  \
+  BUILTIN_1ARG(char, char, name)     \
+  BUILTIN_1ARG(uchar, uchar, name)   \
+  BUILTIN_1ARG(short, short, name)   \
+  BUILTIN_1ARG(ushort, ushort, name) \
+  BUILTIN_1ARG(int, int, name)       \
+  BUILTIN_1ARG(uint, uint, name)     \
+  BUILTIN_1ARG(long, long, name)     \
+  BUILTIN_1ARG(ulong, ulong, name);
+// BUILTIN_2ARG_INTEGERS(name): two-argument form; both arguments and the
+// return value share the same integer element type.
+#define BUILTIN_2ARG_INTEGERS(name)          \
+  BUILTIN_2ARG(char, char, char, name)       \
+  BUILTIN_2ARG(uchar, uchar, uchar, name)    \
+  BUILTIN_2ARG(short, short, short, name)    \
+  BUILTIN_2ARG(ushort, ushort, ushort, name) \
+  BUILTIN_2ARG(int, int, int, name)          \
+  BUILTIN_2ARG(uint, uint, uint, name)       \
+  BUILTIN_2ARG(long, long, long, name)       \
+  BUILTIN_2ARG(ulong, ulong, ulong, name);
+// BUILTIN_3ARG_INTEGERS(name): three-argument form; all arguments and the
+// return value share the same integer element type.
+#define BUILTIN_3ARG_INTEGERS(name)                  \
+  BUILTIN_3ARG(char, char, char, char, name)         \
+  BUILTIN_3ARG(uchar, uchar, uchar, uchar, name)     \
+  BUILTIN_3ARG(short, short, short, short, name)     \
+  BUILTIN_3ARG(ushort, ushort, ushort, ushort, name) \
+  BUILTIN_3ARG(int, int, int, int, name)             \
+  BUILTIN_3ARG(uint, uint, uint, uint, name)         \
+  BUILTIN_3ARG(long, long, long, long, name)         \
+  BUILTIN_3ARG(ulong, ulong, ulong, ulong, name);
+// BUILTIN_{1,2,3}ARG_FLOATS(name): the same pattern for float and double.
+#define BUILTIN_1ARG_FLOATS(name)  \
+  BUILTIN_1ARG(float, float, name) \
+  BUILTIN_1ARG(double, double, name);
+#define BUILTIN_2ARG_FLOATS(name)         \
+  BUILTIN_2ARG(float, float, float, name) \
+  BUILTIN_2ARG(double, double, double, name);
+#define BUILTIN_3ARG_FLOATS(name)                \
+  BUILTIN_3ARG(float, float, float, float, name) \
+  BUILTIN_3ARG(double, double, double, double, name);
+// Async Copy and Prefetch Functions //
+// ASYNC_COPY_TYPE(type): declares async_work_group_copy and
+// async_work_group_strided_copy for `type`, in both directions between
+// __local and __global memory. The strided variant takes one extra size_t.
+#define ASYNC_COPY_TYPE(type)                                                                                       \
+  event_t __OVERLOAD__ async_work_group_copy(__local type*, const __global type*, size_t, event_t);                 \
+  event_t __OVERLOAD__ async_work_group_copy(__global type*, const __local type*, size_t, event_t);                 \
+  event_t __OVERLOAD__ async_work_group_strided_copy(__local type*, const __global type*, size_t, size_t, event_t); \
+  event_t __OVERLOAD__ async_work_group_strided_copy(__global type*, const __local type*, size_t, size_t, event_t);
+// ASYNC_COPY(type): applies ASYNC_COPY_TYPE to the scalar type and to its
+// 2/3/4/8/16-element vector types.
+#define ASYNC_COPY(type)   \
+  ASYNC_COPY_TYPE(type)    \
+  ASYNC_COPY_TYPE(type##2) \
+  ASYNC_COPY_TYPE(type##3) \
+  ASYNC_COPY_TYPE(type##4) \
+  ASYNC_COPY_TYPE(type##8) \
+  ASYNC_COPY_TYPE(type##16);
+// Takes an int and a pointer to event_t; not declared overloadable.
+void wait_group_events(int, event_t*);
+// PREFETCH(type): declares prefetch() for const __global pointers to `type`
+// and to each of its 2/3/4/8/16-element vector types.
+#define PREFETCH(type)                                         \
+  void __OVERLOAD__ prefetch(const __global type*, size_t);    \
+  void __OVERLOAD__ prefetch(const __global type##2*, size_t); \
+  void __OVERLOAD__ prefetch(const __global type##3*, size_t); \
+  void __OVERLOAD__ prefetch(const __global type##4*, size_t); \
+  void __OVERLOAD__ prefetch(const __global type##8*, size_t); \
+  void __OVERLOAD__ prefetch(const __global type##16*, size_t);
+// Atomic Functions //
+// ATOMIC_0ARG_DEF(name, type): declares an atomic `name` that takes only
+// the target pointer, for both __global and __local memory.
+#define ATOMIC_0ARG_DEF(name, type)                  \
+  type __OVERLOAD__ name(volatile __global type *p); \
+  type __OVERLOAD__ name(volatile __local type *p);
+// ATOMIC_0ARG(name): declares both the atom_<name> and atomic_<name>
+// spellings for int and uint.
+#define ATOMIC_0ARG(name)               \
+  ATOMIC_0ARG_DEF(atom_##name, int);    \
+  ATOMIC_0ARG_DEF(atom_##name, uint);   \
+  ATOMIC_0ARG_DEF(atomic_##name, int);  \
+  ATOMIC_0ARG_DEF(atomic_##name, uint);
+// ATOMIC_1ARG_DEF(name, type): as ATOMIC_0ARG_DEF, with one value operand.
+#define ATOMIC_1ARG_DEF(name, type)                            \
+  type __OVERLOAD__ name(volatile __global type *p, type val); \
+  type __OVERLOAD__ name(volatile __local type *p, type val);
+// ATOMIC_1ARG(name): atom_<name> and atomic_<name> for int and uint.
+#define ATOMIC_1ARG(name)               \
+  ATOMIC_1ARG_DEF(atom_##name, int);    \
+  ATOMIC_1ARG_DEF(atom_##name, uint);   \
+  ATOMIC_1ARG_DEF(atomic_##name, int);  \
+  ATOMIC_1ARG_DEF(atomic_##name, uint);
+// xchg additionally has float overloads.
+ATOMIC_1ARG_DEF(atom_xchg, float);
+ATOMIC_1ARG_DEF(atomic_xchg, float);
+// cmpxchg takes two value operands (cmp, val), so it is written out by hand
+// for int/uint, __global/__local and both name spellings.
+int __OVERLOAD__ atom_cmpxchg(volatile __global int *p, int cmp, int val);
+int __OVERLOAD__ atom_cmpxchg(volatile __local int *p, int cmp, int val);
+uint __OVERLOAD__ atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
+uint __OVERLOAD__ atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+int __OVERLOAD__ atomic_cmpxchg(volatile __global int *p, int cmp, int val);
+int __OVERLOAD__ atomic_cmpxchg(volatile __local int *p, int cmp, int val);
+uint __OVERLOAD__ atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
+uint __OVERLOAD__ atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+// Common Functions //
+// ABS(type): declares abs() for the signed `type` and its unsigned
+// counterpart u<type>; both overloads return u<type>.
+#define ABS(type)                 \
+  u##type __OVERLOAD__ abs(type); \
+  u##type __OVERLOAD__ abs(u##type);
+// ABS_DIFF(type): the same pairing for the two-argument abs_diff().
+#define ABS_DIFF(type)                       \
+  u##type __OVERLOAD__ abs_diff(type, type); \
+  u##type __OVERLOAD__ abs_diff(u##type, u##type);
+// ABS_BOTH(type): ABS and ABS_DIFF together for one type.
+#define ABS_BOTH(type) \
+  ABS(type);           \
+  ABS_DIFF(type);
+// ABS_ALL(type): ABS_BOTH for the scalar type and its 2/3/4/8/16 vectors.
+#define ABS_ALL(type) \
+  ABS_BOTH(type);     \
+  ABS_BOTH(type##2);  \
+  ABS_BOTH(type##3);  \
+  ABS_BOTH(type##4);  \
+  ABS_BOTH(type##8);  \
+  ABS_BOTH(type##16);
+// COMMON_SCALAR(type, n): declares the mixed vector/scalar overloads of
+// clamp, max, min, mix, smoothstep and step, where the n-wide vector
+// type##n is combined with scalar `type` operands.
+#define COMMON_SCALAR(type, n)                          \
+  type##n __OVERLOAD__ clamp(type##n, type, type);      \
+  type##n __OVERLOAD__ max(type##n, type);              \
+  type##n __OVERLOAD__ min(type##n, type);              \
+  type##n __OVERLOAD__ mix(type##n, type##n, type);     \
+  type##n __OVERLOAD__ smoothstep(type, type, type##n); \
+  type##n __OVERLOAD__ step(type, type##n);
+// Instantiate for every float and double vector width.
+COMMON_SCALAR(float, 2);
+COMMON_SCALAR(float, 3);
+COMMON_SCALAR(float, 4);
+COMMON_SCALAR(float, 8);
+COMMON_SCALAR(float, 16);
+COMMON_SCALAR(double, 2);
+COMMON_SCALAR(double, 3);
+COMMON_SCALAR(double, 4);
+COMMON_SCALAR(double, 8);
+COMMON_SCALAR(double, 16);
+// Geometric Functions //
+// GEOM_1ARG(type, name): declares `name` taking a scalar or a 2/3/4/8/16
+// vector of `type`; every overload returns a scalar `type`.
+#define GEOM_1ARG(type, name)     \
+ type __OVERLOAD__ name(type);    \
+ type __OVERLOAD__ name(type##2); \
+ type __OVERLOAD__ name(type##3); \
+ type __OVERLOAD__ name(type##4); \
+ type __OVERLOAD__ name(type##8); \
+ type __OVERLOAD__ name(type##16);
+// GEOM_2ARG(type, name): two-operand form, again returning a scalar.
+#define GEOM_2ARG(type, name)              \
+ type __OVERLOAD__ name(type, type);       \
+ type __OVERLOAD__ name(type##2, type##2); \
+ type __OVERLOAD__ name(type##3, type##3); \
+ type __OVERLOAD__ name(type##4, type##4); \
+ type __OVERLOAD__ name(type##8, type##8); \
+ type __OVERLOAD__ name(type##16, type##16);
+// cross() is declared only for 3- and 4-element vectors and returns a
+// vector of the same type as its operands.
+float4 __OVERLOAD__ cross(float4, float4);
+float3 __OVERLOAD__ cross(float3, float3);
+double4 __OVERLOAD__ cross(double4, double4);
+double3 __OVERLOAD__ cross(double3, double3);
+// Scalar-returning geometric functions for float and double.
+GEOM_2ARG(float, dot);
+GEOM_2ARG(double, dot);
+GEOM_2ARG(float, distance);
+GEOM_2ARG(double, distance);
+GEOM_1ARG(float, length);
+GEOM_1ARG(double, length);
+GEOM_2ARG(float, fast_distance);
+GEOM_2ARG(double, fast_distance);
+GEOM_1ARG(float, fast_length);
+GEOM_1ARG(double, fast_length);
+// Image Functions //
+// Image query functions. Each is overloaded on the image type it accepts.
+// get_image_array_size: array image types only.
+size_t __OVERLOAD__ get_image_array_size(image1d_array_t image);
+size_t __OVERLOAD__ get_image_array_size(image2d_array_t image);
+// get_image_channel_data_type / get_image_channel_order: all image types.
+int __OVERLOAD__ get_image_channel_data_type(image1d_t image);
+int __OVERLOAD__ get_image_channel_data_type(image1d_buffer_t image);
+int __OVERLOAD__ get_image_channel_data_type(image1d_array_t image);
+int __OVERLOAD__ get_image_channel_data_type(image2d_t image);
+int __OVERLOAD__ get_image_channel_data_type(image2d_array_t image);
+int __OVERLOAD__ get_image_channel_data_type(image3d_t image);
+int __OVERLOAD__ get_image_channel_order(image1d_t image);
+int __OVERLOAD__ get_image_channel_order(image1d_buffer_t image);
+int __OVERLOAD__ get_image_channel_order(image1d_array_t image);
+int __OVERLOAD__ get_image_channel_order(image2d_t image);
+int __OVERLOAD__ get_image_channel_order(image2d_array_t image);
+int __OVERLOAD__ get_image_channel_order(image3d_t image);
+// get_image_dim: int2 for 2D images, int4 for 3D images.
+int2 __OVERLOAD__ get_image_dim(image2d_t image);
+int2 __OVERLOAD__ get_image_dim(image2d_array_t image);
+int4 __OVERLOAD__ get_image_dim(image3d_t image);
+// Depth is declared for 3D images only, height for 2D and 3D images,
+// width for every image type.
+int __OVERLOAD__ get_image_depth(image3d_t image);
+int __OVERLOAD__ get_image_height(image2d_t image);
+int __OVERLOAD__ get_image_height(image2d_array_t image);
+int __OVERLOAD__ get_image_height(image3d_t image);
+int __OVERLOAD__ get_image_width(image1d_t image);
+int __OVERLOAD__ get_image_width(image1d_buffer_t image);
+int __OVERLOAD__ get_image_width(image1d_array_t image);
+int __OVERLOAD__ get_image_width(image2d_t image);
+int __OVERLOAD__ get_image_width(image2d_array_t image);
+int __OVERLOAD__ get_image_width(image3d_t image);
+// Image read functions. read_imagef returns float4, read_imagei int4 and
+// read_imageui uint4. For each, the overloads without a sampler_t take
+// integer coordinates only; the overloads with a sampler_t accept either
+// integer or float coordinates. Coordinate width follows the image type
+// (scalar for 1D, 2 components for 1D-array/2D, 4 for 2D-array/3D).
+float4 __OVERLOAD__ read_imagef(image1d_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_buffer_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, int4);
+float4 __OVERLOAD__ read_imagef(image3d_t, int4);
+float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, int);
+float4 __OVERLOAD__ read_imagef(image1d_t, sampler_t, float);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, int2);
+float4 __OVERLOAD__ read_imagef(image1d_array_t, sampler_t, float2);
+float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, int2);
+float4 __OVERLOAD__ read_imagef(image2d_t, sampler_t, float2);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, int4);
+float4 __OVERLOAD__ read_imagef(image2d_array_t, sampler_t, float4);
+float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, int4);
+float4 __OVERLOAD__ read_imagef(image3d_t, sampler_t, float4);
+int4 __OVERLOAD__ read_imagei(image1d_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_buffer_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, int4);
+int4 __OVERLOAD__ read_imagei(image3d_t, int4);
+int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, int);
+int4 __OVERLOAD__ read_imagei(image1d_t, sampler_t, float);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, int2);
+int4 __OVERLOAD__ read_imagei(image1d_array_t, sampler_t, float2);
+int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, int2);
+int4 __OVERLOAD__ read_imagei(image2d_t, sampler_t, float2);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, int4);
+int4 __OVERLOAD__ read_imagei(image2d_array_t, sampler_t, float4);
+int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, int4);
+int4 __OVERLOAD__ read_imagei(image3d_t, sampler_t, float4);
+uint4 __OVERLOAD__ read_imageui(image1d_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_buffer_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, int4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, int4);
+uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, int);
+uint4 __OVERLOAD__ read_imageui(image1d_t, sampler_t, float);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, int2);
+uint4 __OVERLOAD__ read_imageui(image1d_array_t, sampler_t, float2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, int2);
+uint4 __OVERLOAD__ read_imageui(image2d_t, sampler_t, float2);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, int4);
+uint4 __OVERLOAD__ read_imageui(image2d_array_t, sampler_t, float4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, int4);
+uint4 __OVERLOAD__ read_imageui(image3d_t, sampler_t, float4);
+// Image write functions: (image, integer coordinate, colour). The colour
+// is float4 for write_imagef, int4 for write_imagei and uint4 for
+// write_imageui. No image1d_buffer_t overloads are declared here.
+void __OVERLOAD__ write_imagef(image1d_t, int, float4);
+void __OVERLOAD__ write_imagef(image1d_array_t, int2, float4);
+void __OVERLOAD__ write_imagef(image2d_t, int2, float4);
+void __OVERLOAD__ write_imagef(image2d_array_t, int4, float4);
+void __OVERLOAD__ write_imagef(image3d_t, int4, float4);
+void __OVERLOAD__ write_imagei(image1d_t, int, int4);
+void __OVERLOAD__ write_imagei(image1d_array_t, int2, int4);
+void __OVERLOAD__ write_imagei(image2d_t, int2, int4);
+void __OVERLOAD__ write_imagei(image2d_array_t, int4, int4);
+void __OVERLOAD__ write_imagei(image3d_t, int4, int4);
+void __OVERLOAD__ write_imageui(image1d_t, int, uint4);
+void __OVERLOAD__ write_imageui(image1d_array_t, int2, uint4);
+void __OVERLOAD__ write_imageui(image2d_t, int2, uint4);
+void __OVERLOAD__ write_imageui(image2d_array_t, int4, uint4);
+void __OVERLOAD__ write_imageui(image3d_t, int4, uint4);
+// Integer Functions //
+// mad24 / mul24 are declared for int and uint only.
+BUILTIN_3ARG(int, int, int, int, mad24);
+BUILTIN_3ARG(uint, uint, uint, uint, mad24);
+BUILTIN_2ARG(int, int, int, mul24);
+BUILTIN_2ARG(uint, uint, uint, mul24);
+// UPSAMPLE_SIZES(out, in1, in2): declares upsample(in1, in2) -> out for the
+// scalar types and each 2/3/4/8/16 vector width.
+#define UPSAMPLE_SIZES(out, in1, in2)            \
+  out     __OVERLOAD__ upsample(in1, in2);       \
+  out##2  __OVERLOAD__ upsample(in1##2, in2##2); \
+  out##3  __OVERLOAD__ upsample(in1##3, in2##3); \
+  out##4  __OVERLOAD__ upsample(in1##4, in2##4); \
+  out##8  __OVERLOAD__ upsample(in1##8, in2##8); \
+  out##16 __OVERLOAD__ upsample(in1##16, in2##16);
+// UPSAMPLE(out, in): signed form upsample(in, u<in>) -> out and unsigned
+// form upsample(u<in>, u<in>) -> u<out>; the second operand is always
+// unsigned.
+#define UPSAMPLE(out, in)      \
+  UPSAMPLE_SIZES(out, in, u##in); \
+  UPSAMPLE_SIZES(u##out, u##in, u##in);
+UPSAMPLE(short, char);
+UPSAMPLE(int, short);
+UPSAMPLE(long, int);
+// Math Functions //
+// BUILTIN_2TYPE_PTR(type1, type2, name): declares
+// `type1 name(type1, <space> type2*)` once for each of the __global,
+// __local and __private address spaces.
+#define BUILTIN_2TYPE_PTR(type1, type2, name)     \
+ type1 __OVERLOAD__ name(type1, __global type2*); \
+ type1 __OVERLOAD__ name(type1, __local type2*);  \
+ type1 __OVERLOAD__ name(type1, __private type2*);
+// BUILTIN_PTR_ARG: BUILTIN_2TYPE_PTR for the scalar types and for each
+// 2/3/4/8/16 vector width.
+#define BUILTIN_PTR_ARG(type1, type2, name)  \
+ BUILTIN_2TYPE_PTR(type1, type2, name)       \
+ BUILTIN_2TYPE_PTR(type1##2, type2##2, name) \
+ BUILTIN_2TYPE_PTR(type1##3, type2##3, name) \
+ BUILTIN_2TYPE_PTR(type1##4, type2##4, name) \
+ BUILTIN_2TYPE_PTR(type1##8, type2##8, name) \
+ BUILTIN_2TYPE_PTR(type1##16, type2##16, name);
+// REMQUO(type, addrspace): declares remquo for `type` at every width, with
+// the int output pointer in the given address space.
+#define REMQUO(type, addrspace)                                   \
+  type __OVERLOAD__ remquo(type, type, addrspace int*);           \
+  type##2 __OVERLOAD__ remquo(type##2, type##2, addrspace int2*); \
+  type##3 __OVERLOAD__ remquo(type##3, type##3, addrspace int3*); \
+  type##4 __OVERLOAD__ remquo(type##4, type##4, addrspace int4*); \
+  type##8 __OVERLOAD__ remquo(type##8, type##8, addrspace int8*); \
+  type##16 __OVERLOAD__ remquo(type##16, type##16, addrspace int16*);
+// Math functions whose signatures mix element types or take an output
+// pointer, declared for float and double.
+BUILTIN_PTR_ARG(float, float, fract);
+BUILTIN_PTR_ARG(double, double, fract);
+BUILTIN_PTR_ARG(float, int, frexp);
+BUILTIN_PTR_ARG(double, int, frexp);
+BUILTIN_1ARG(int, float, ilogb);
+BUILTIN_1ARG(int, double, ilogb);
+BUILTIN_2ARG(float, float, int, ldexp);
+BUILTIN_2ARG(double, double, int, ldexp);
+BUILTIN_PTR_ARG(float, int, lgamma_r);
+BUILTIN_PTR_ARG(double, int, lgamma_r);
+BUILTIN_PTR_ARG(float, float, modf);
+BUILTIN_PTR_ARG(double, double, modf);
+BUILTIN_1ARG(float, uint, nan);
+BUILTIN_1ARG(double, ulong, nan);
+BUILTIN_2ARG(float, float, int, pown);
+BUILTIN_2ARG(double, double, int, pown);
+REMQUO(float, global);
+REMQUO(float, local);
+REMQUO(float, private);
+REMQUO(double, global);
+REMQUO(double, local);
+REMQUO(double, private);
+BUILTIN_2ARG(float, float, int, rootn);
+BUILTIN_2ARG(double, double, int, rootn);
+BUILTIN_PTR_ARG(float, float, sincos);
+BUILTIN_PTR_ARG(double, double, sincos);
+// Native math functions
+// Misc. Vector Functions //
+// SHUFFLE_TYPE(ret, type, mask): declares shuffle(type, mask<N>) -> ret<N>
+// for a fixed input type; the result width follows the mask width.
+#define SHUFFLE_TYPE(ret, type, mask)         \
+  ret __OVERLOAD__ shuffle(type, mask);       \
+  ret##2 __OVERLOAD__ shuffle(type, mask##2); \
+  ret##3 __OVERLOAD__ shuffle(type, mask##3); \
+  ret##4 __OVERLOAD__ shuffle(type, mask##4); \
+  ret##8 __OVERLOAD__ shuffle(type, mask##8); \
+  ret##16 __OVERLOAD__ shuffle(type, mask##16);
+// SHUFFLE(type, mask): SHUFFLE_TYPE for every input width of `type`, i.e.
+// every combination of input width and mask width.
+#define SHUFFLE(type, mask)          \
+  SHUFFLE_TYPE(type, type, mask);    \
+  SHUFFLE_TYPE(type, type##2, mask); \
+  SHUFFLE_TYPE(type, type##3, mask); \
+  SHUFFLE_TYPE(type, type##4, mask); \
+  SHUFFLE_TYPE(type, type##8, mask); \
+  SHUFFLE_TYPE(type, type##16, mask);
+// The mask element type is the unsigned integer type of the same size as
+// the data element type.
+SHUFFLE(char, uchar);
+SHUFFLE(uchar, uchar);
+SHUFFLE(short, ushort);
+SHUFFLE(ushort, ushort);
+SHUFFLE(int, uint);
+SHUFFLE(uint, uint);
+SHUFFLE(long, ulong);
+SHUFFLE(ulong, ulong);
+SHUFFLE(float, uint);
+SHUFFLE(double, ulong);
+// SHUFFLE2_TYPE / SHUFFLE2: the same scheme for shuffle2, which takes two
+// input vectors of the same type.
+#define SHUFFLE2_TYPE(ret, type, mask)               \
+  ret __OVERLOAD__ shuffle2(type, type, mask);       \
+  ret##2 __OVERLOAD__ shuffle2(type, type, mask##2); \
+  ret##3 __OVERLOAD__ shuffle2(type, type, mask##3); \
+  ret##4 __OVERLOAD__ shuffle2(type, type, mask##4); \
+  ret##8 __OVERLOAD__ shuffle2(type, type, mask##8); \
+  ret##16 __OVERLOAD__ shuffle2(type, type, mask##16);
+#define SHUFFLE2(type, mask)          \
+  SHUFFLE2_TYPE(type, type, mask);    \
+  SHUFFLE2_TYPE(type, type##2, mask); \
+  SHUFFLE2_TYPE(type, type##3, mask); \
+  SHUFFLE2_TYPE(type, type##4, mask); \
+  SHUFFLE2_TYPE(type, type##8, mask); \
+  SHUFFLE2_TYPE(type, type##16, mask);
+SHUFFLE2(char, uchar);
+SHUFFLE2(uchar, uchar);
+SHUFFLE2(short, ushort);
+SHUFFLE2(ushort, ushort);
+SHUFFLE2(int, uint);
+SHUFFLE2(uint, uint);
+SHUFFLE2(long, ulong);
+SHUFFLE2(ulong, ulong);
+SHUFFLE2(float, uint);
+SHUFFLE2(double, ulong);
+// Relational Functions //
+// BUILTIN_ANYALL(name, type): declares `name` for the scalar `type` and its
+// 2/3/4/8/16 vectors; every overload returns a scalar int.
+#define BUILTIN_ANYALL(name, type) \
+  int __OVERLOAD__ name(type);     \
+  int __OVERLOAD__ name(type##2);  \
+  int __OVERLOAD__ name(type##3);  \
+  int __OVERLOAD__ name(type##4);  \
+  int __OVERLOAD__ name(type##8);  \
+  int __OVERLOAD__ name(type##16);
+// REL_1ARG / REL_2ARG: relational builtins returning int(N) for float
+// operands and long(N) for double operands.
+// NOTE(review): via BUILTIN_nARG the *scalar* double overload also returns
+// long here; the OpenCL C specification gives scalar relational functions
+// an int result -- confirm whether this is intentional.
+#define REL_1ARG(name)            \
+  BUILTIN_1ARG(int, float, name); \
+  BUILTIN_1ARG(long, double, name);
+#define REL_2ARG(name)                   \
+  BUILTIN_2ARG(int, float, float, name); \
+  BUILTIN_2ARG(long, double, double, name);
+// all() / any() are declared for the signed integer types only.
+BUILTIN_ANYALL(all, char);
+BUILTIN_ANYALL(all, short);
+BUILTIN_ANYALL(all, int);
+BUILTIN_ANYALL(all, long);
+BUILTIN_ANYALL(any, char);
+BUILTIN_ANYALL(any, short);
+BUILTIN_ANYALL(any, int);
+BUILTIN_ANYALL(any, long);
+// SELECT_TYPE(type, ctype): declares select(type, type, cond) where the
+// condition is either the signed ctype or its unsigned counterpart.
+#define SELECT_TYPE(type, ctype)               \
+  type __OVERLOAD__ select(type, type, ctype); \
+  type __OVERLOAD__ select(type, type, u##ctype);
+// SELECT(type, ctype): SELECT_TYPE for the scalar and each vector width.
+#define SELECT(type, ctype)      \
+  SELECT_TYPE(type, ctype)       \
+  SELECT_TYPE(type##2, ctype##2) \
+  SELECT_TYPE(type##3, ctype##3) \
+  SELECT_TYPE(type##4, ctype##4) \
+  SELECT_TYPE(type##8, ctype##8) \
+  SELECT_TYPE(type##16, ctype##16);
+// The condition type is the signed integer of the same size as the data
+// element type.
+SELECT(char, char);
+SELECT(uchar, char);
+SELECT(short, short);
+SELECT(ushort, short);
+SELECT(int, int);
+SELECT(uint, int);
+SELECT(long, long);
+SELECT(ulong, long);
+SELECT(float, int);
+SELECT(double, long);
+// Synchronization Functions //
+// Fence flags are a uint bit mask: bit 0 selects local memory, bit 1
+// selects global memory.
+typedef uint cl_mem_fence_flags;
+#define CLK_LOCAL_MEM_FENCE  (1<<0)
+#define CLK_GLOBAL_MEM_FENCE (1<<1)
+// Synchronization builtins; each takes a cl_mem_fence_flags mask.
+void barrier(cl_mem_fence_flags);
+void mem_fence(cl_mem_fence_flags);
+void read_mem_fence(cl_mem_fence_flags);
+void write_mem_fence(cl_mem_fence_flags);
+// Vector Data Load and Store Functions //
+// VLOAD_ADDRSPACE(type, width): declares vload<width> returning
+// type<width>, reading through a const pointer in each of the __private,
+// __local, __global and __constant address spaces.
+#define VLOAD_ADDRSPACE(type, width)                                    \
+  type##width __OVERLOAD__ vload##width(size_t, const __private type*); \
+  type##width __OVERLOAD__ vload##width(size_t, const __local type*);   \
+  type##width __OVERLOAD__ vload##width(size_t, const __global type*);  \
+  type##width __OVERLOAD__ vload##width(size_t, const __constant type*);
+// VSTORE_ADDRSPACE(type, width): declares vstore<width> writing through a
+// non-const pointer in the __local, __global and __private address spaces
+// (there is no __constant overload).
+#define VSTORE_ADDRSPACE(type, width)                                   \
+  void __OVERLOAD__ vstore##width(type##width, size_t, __local type*);  \
+  void __OVERLOAD__ vstore##width(type##width, size_t, __global type*); \
+  void __OVERLOAD__ vstore##width(type##width, size_t, __private type*);
+// V_ADDRSPACE(macro, type): applies `macro` to `type` for each vector
+// width 2, 3, 4, 8 and 16.
+#define V_ADDRSPACE(macro, type) \
+  macro(type, 2)                 \
+  macro(type, 3)                 \
+  macro(type, 4)                 \
+  macro(type, 8)                 \
+  macro(type, 16);
+// VLOADSTORE(type): declares the vloadN and vstoreN overloads for `type`
+// at every vector width and address space.
+// NOTE(review): the body of this macro was missing from this copy, leaving
+// a dangling line continuation that swallowed the following #define. It has
+// been reconstructed from the V_ADDRSPACE / VLOAD_ADDRSPACE /
+// VSTORE_ADDRSPACE helpers -- confirm against the upstream header.
+#define VLOADSTORE(type)              \
+  V_ADDRSPACE(VLOAD_ADDRSPACE, type); \
+  V_ADDRSPACE(VSTORE_ADDRSPACE, type);
+// VLOAD_HALF_WIDTH(n): declares vload_half<n> and vloada_half<n>, each
+// returning float<n> and reading through a const half pointer in the
+// __private, __local, __global and __constant address spaces.
+#define VLOAD_HALF_WIDTH(n)                                            \
+  float##n __OVERLOAD__ vload_half##n(size_t, const __private half*);  \
+  float##n __OVERLOAD__ vloada_half##n(size_t, const __private half*); \
+  float##n __OVERLOAD__ vload_half##n(size_t, const __local half*);    \
+  float##n __OVERLOAD__ vloada_half##n(size_t, const __local half*);   \
+  float##n __OVERLOAD__ vload_half##n(size_t, const __global half*);   \
+  float##n __OVERLOAD__ vloada_half##n(size_t, const __global half*);  \
+  float##n __OVERLOAD__ vload_half##n(size_t, const __constant half*); \
+  float##n __OVERLOAD__ vloada_half##n(size_t, const __constant half*);
+// VSTORE_HALF_ADDRSPACE(func, type): declares the half-store function
+// `func` for each writable address space (__private, __local, __global).
+// The destination pointer is non-const and there is no __constant
+// overload, because a store cannot target read-only memory; this mirrors
+// the vstoreN declarations in this header.
+#define VSTORE_HALF_ADDRSPACE(func, type)                \
+  void __OVERLOAD__ func(type, size_t, __private half*); \
+  void __OVERLOAD__ func(type, size_t, __local half*);   \
+  void __OVERLOAD__ func(type, size_t, __global half*);
+// VSTORE_HALF_ROUND(func, type): declares `func` plus its _rte, _rtz, _rtp
+// and _rtn suffixed variants.
+#define VSTORE_HALF_ROUND(func, type)      \
+  VSTORE_HALF_ADDRSPACE(func, type);       \
+  VSTORE_HALF_ADDRSPACE(func##_rte, type); \
+  VSTORE_HALF_ADDRSPACE(func##_rtz, type); \
+  VSTORE_HALF_ADDRSPACE(func##_rtp, type); \
+  VSTORE_HALF_ADDRSPACE(func##_rtn, type);
+// VSTORE_HALF_WIDTH(n): all vstore_half<n> and vstorea_half<n> variants
+// for a float<n> source value.
+#define VSTORE_HALF_WIDTH(n)                    \
+  VSTORE_HALF_ROUND(vstore_half##n, float##n);  \
+  VSTORE_HALF_ROUND(vstorea_half##n, float##n);
+  VLOAD_HALF_WIDTH(n);           \
+// Work-Item Functions //
+// Work-item query functions. All but get_work_dim take a dimension index
+// and return size_t; get_work_dim takes no argument and returns uint.
+size_t get_global_id(uint dim);
+size_t get_global_size(uint dim);
+size_t get_global_offset(uint dim);
+size_t get_group_id(uint dim);
+size_t get_local_id(uint dim);
+size_t get_local_size(uint dim);
+size_t get_num_groups(uint dim);
+uint get_work_dim(void);
+// Other Functions //
+// Variadic printf; the format string lives in the __constant address space.
+int printf(__constant char * restrict, ...);
+// Conversions //
+// as_<type>(x): expands to __builtin_astype(x, <type>). One macro is
+// provided for every scalar type, every 2/3/4/8/16 vector type, and the
+// pointer-sized integer types.
+#define as_char( _x )  __builtin_astype( _x, char )
+#define as_char2( _x )  __builtin_astype( _x, char2 )
+#define as_char3( _x )  __builtin_astype( _x, char3 )
+#define as_char4( _x )  __builtin_astype( _x, char4 )
+#define as_char8( _x )  __builtin_astype( _x, char8 )
+#define as_char16( _x )  __builtin_astype( _x, char16 )
+#define as_uchar( _x )  __builtin_astype( _x, uchar )
+#define as_uchar2( _x )  __builtin_astype( _x, uchar2 )
+#define as_uchar3( _x )  __builtin_astype( _x, uchar3 )
+#define as_uchar4( _x )  __builtin_astype( _x, uchar4 )
+#define as_uchar8( _x )  __builtin_astype( _x, uchar8 )
+#define as_uchar16( _x )  __builtin_astype( _x, uchar16 )
+#define as_short( _x )  __builtin_astype( _x, short )
+#define as_short2( _x )  __builtin_astype( _x, short2 )
+#define as_short3( _x )  __builtin_astype( _x, short3 )
+#define as_short4( _x )  __builtin_astype( _x, short4 )
+#define as_short8( _x )  __builtin_astype( _x, short8 )
+#define as_short16( _x )  __builtin_astype( _x, short16 )
+#define as_ushort( _x )  __builtin_astype( _x, ushort )
+#define as_ushort2( _x )  __builtin_astype( _x, ushort2 )
+#define as_ushort3( _x )  __builtin_astype( _x, ushort3 )
+#define as_ushort4( _x )  __builtin_astype( _x, ushort4 )
+#define as_ushort8( _x )  __builtin_astype( _x, ushort8 )
+#define as_ushort16( _x )  __builtin_astype( _x, ushort16 )
+#define as_int( _x )  __builtin_astype( _x, int )
+#define as_int2( _x )  __builtin_astype( _x, int2 )
+#define as_int3( _x )  __builtin_astype( _x, int3 )
+#define as_int4( _x )  __builtin_astype( _x, int4 )
+#define as_int8( _x )  __builtin_astype( _x, int8 )
+#define as_int16( _x )  __builtin_astype( _x, int16 )
+#define as_uint( _x )  __builtin_astype( _x, uint )
+#define as_uint2( _x )  __builtin_astype( _x, uint2 )
+#define as_uint3( _x )  __builtin_astype( _x, uint3 )
+#define as_uint4( _x )  __builtin_astype( _x, uint4 )
+#define as_uint8( _x )  __builtin_astype( _x, uint8 )
+#define as_uint16( _x )  __builtin_astype( _x, uint16 )
+#define as_long( _x )  __builtin_astype( _x, long )
+#define as_long2( _x )  __builtin_astype( _x, long2 )
+#define as_long3( _x )  __builtin_astype( _x, long3 )
+#define as_long4( _x )  __builtin_astype( _x, long4 )
+#define as_long8( _x )  __builtin_astype( _x, long8 )
+#define as_long16( _x )  __builtin_astype( _x, long16 )
+#define as_ulong( _x )  __builtin_astype( _x, ulong )
+#define as_ulong2( _x )  __builtin_astype( _x, ulong2 )
+#define as_ulong3( _x )  __builtin_astype( _x, ulong3 )
+#define as_ulong4( _x )  __builtin_astype( _x, ulong4 )
+#define as_ulong8( _x )  __builtin_astype( _x, ulong8 )
+#define as_ulong16( _x )  __builtin_astype( _x, ulong16 )
+#define as_float( _x )  __builtin_astype( _x, float )
+#define as_float2( _x )  __builtin_astype( _x, float2 )
+#define as_float3( _x )  __builtin_astype( _x, float3 )
+#define as_float4( _x )  __builtin_astype( _x, float4 )
+#define as_float8( _x )  __builtin_astype( _x, float8 )
+#define as_float16( _x )  __builtin_astype( _x, float16 )
+#define as_double( _x )  __builtin_astype( _x, double )
+#define as_double2( _x )  __builtin_astype( _x, double2 )
+#define as_double3( _x )  __builtin_astype( _x, double3 )
+#define as_double4( _x )  __builtin_astype( _x, double4 )
+#define as_double8( _x )  __builtin_astype( _x, double8 )
+#define as_double16( _x )  __builtin_astype( _x, double16 )
+#define as_size_t( _x ) __builtin_astype( _x, size_t )
+#define as_ptrdiff_t( _x ) __builtin_astype( _x, ptrdiff_t )
+#define as_uintptr_t( _x ) __builtin_astype( _x, uintptr_t )
+#define as_intptr_t( _x ) __builtin_astype( _x, intptr_t )
+// CONVERT_TYPE_SIZE(out, in): declares convert_<out>(in) together with
+// every rounding-mode suffix (_rte, _rtz, _rtp, _rtn), the saturating _sat
+// form, and each _sat_<mode> combination.
+#define CONVERT_TYPE_SIZE(out, in)              \
+  out __OVERLOAD__ convert_##out(in);           \
+  out __OVERLOAD__ convert_##out##_rte(in);     \
+  out __OVERLOAD__ convert_##out##_rtz(in);     \
+  out __OVERLOAD__ convert_##out##_rtp(in);     \
+  out __OVERLOAD__ convert_##out##_rtn(in);     \
+  out __OVERLOAD__ convert_##out##_sat(in);     \
+  out __OVERLOAD__ convert_##out##_sat_rte(in); \
+  out __OVERLOAD__ convert_##out##_sat_rtz(in); \
+  out __OVERLOAD__ convert_##out##_sat_rtp(in); \
+  out __OVERLOAD__ convert_##out##_sat_rtn(in);
+// CONVERT_TYPE(out, in): CONVERT_TYPE_SIZE for the scalar pair and for
+// each matching 2/3/4/8/16 vector width.
+#define CONVERT_TYPE(out, in)             \
+  CONVERT_TYPE_SIZE(out, in);             \
+  CONVERT_TYPE_SIZE(out##2, in##2);       \
+  CONVERT_TYPE_SIZE(out##3, in##3);       \
+  CONVERT_TYPE_SIZE(out##4, in##4);       \
+  CONVERT_TYPE_SIZE(out##8, in##8);       \
+  CONVERT_TYPE_SIZE(out##16, in##16);
+// CONVERT(out): conversions to `out` from every integer and floating-point
+// source type.
+#define CONVERT(out)         \
+  CONVERT_TYPE(out, char);   \
+  CONVERT_TYPE(out, uchar);  \
+  CONVERT_TYPE(out, short);  \
+  CONVERT_TYPE(out, ushort); \
+  CONVERT_TYPE(out, int);    \
+  CONVERT_TYPE(out, uint);   \
+  CONVERT_TYPE(out, long);   \
+  CONVERT_TYPE(out, ulong);  \
+  CONVERT_TYPE(out, float);  \
+  CONVERT_TYPE(out, double);
diff --git a/src/core/common.cpp b/src/core/common.cpp
new file mode 100644
index 0000000..3f849fa
--- /dev/null
+++ b/src/core/common.cpp
@@ -0,0 +1,712 @@
+// common.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+// now() uses time() on native Windows builds and gettimeofday() elsewhere,
+// so pull in the matching header for each platform.
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/raw_os_ostream.h"
+using namespace oclgrind;
+using namespace std;
+namespace oclgrind
+  // Construct a size with every component set to zero.
+  _Size3_::_Size3_()
+    : x(0), y(0), z(0)
+  {
+  }
+  // Construct a size from explicit per-dimension values.
+  _Size3_::_Size3_(size_t _x, size_t _y, size_t _z)
+    : x(_x), y(_y), z(_z)
+  {
+  }
+  // Construct a 3D index by unflattening a linear index, with x varying
+  // fastest and z slowest, for a space of the given dimensions.
+  _Size3_::_Size3_(size_t linear, _Size3_ dimensions)
+    : x(linear % dimensions.x),
+      y((linear / dimensions.x) % dimensions.y),
+      z(linear / (dimensions.x * dimensions.y))
+  {
+  }
+  // Access a component by index: 0 is x, 1 is y, 2 is z.
+  // Any other index is a programming error. It trips the assertion when
+  // assertions are enabled and aborts otherwise, so control never flows off
+  // the end of this reference-returning function (undefined behaviour).
+  size_t& Size3::operator[](unsigned i)
+  {
+    switch (i)
+    {
+    case 0:
+      return x;
+    case 1:
+      return y;
+    case 2:
+      return z;
+    default:
+      assert(false && "Size3 index out of range");
+      abort();
+    }
+  }
+  // Read-only access to a component by index: 0 is x, 1 is y, 2 is z.
+  // Any other index is a programming error. It trips the assertion when
+  // assertions are enabled and aborts otherwise, so control never flows off
+  // the end of this reference-returning function (undefined behaviour).
+  const size_t& Size3::operator[](unsigned i) const
+  {
+    switch (i)
+    {
+    case 0:
+      return x;
+    case 1:
+      return y;
+    case 2:
+      return z;
+    default:
+      assert(false && "Size3 index out of range");
+      abort();
+    }
+  }
+  // Two sizes compare equal only when all three components match.
+  bool Size3::operator==(const Size3& rhs) const
+  {
+    if (x != rhs.x)
+      return false;
+    if (y != rhs.y)
+      return false;
+    return z == rhs.z;
+  }
+  // Write a size to the stream as "(x,y,z)", switching the stream to
+  // decimal output first.
+  ostream& operator<<(ostream& stream, const Size3& size)
+  {
+    stream << dec;
+    stream << "(" << size.x;
+    stream << "," << size.y;
+    stream << "," << size.z;
+    stream << ")";
+    return stream;
+  }
+  // Read element `index` as a floating point number, widened to double.
+  // Only 4-byte and 8-byte elements are supported; any other element size
+  // raises a FatalError.
+  double TypedValue::getFloat(unsigned index) const
+  {
+    if (size == 4)
+    {
+      const float *values = (const float*)data;
+      return values[index];
+    }
+    if (size == 8)
+    {
+      const double *values = (const double*)data;
+      return values[index];
+    }
+    FATAL_ERROR("Unsupported float size: %u bytes", size);
+  }
+  // Read element `index` as a pointer-sized integer. The element size must
+  // equal sizeof(size_t); otherwise a FatalError is raised.
+  size_t TypedValue::getPointer(unsigned index) const
+  {
+    if (size == sizeof(size_t))
+    {
+      const size_t *values = (const size_t*)data;
+      return values[index];
+    }
+    FATAL_ERROR("Unsupported pointer size: %u bytes", size);
+  }
+  // Read element `index` as a signed integer, sign-extended to 64 bits.
+  // Element sizes of 1, 2, 4 and 8 bytes are supported; any other size
+  // raises a FatalError.
+  int64_t TypedValue::getSInt(unsigned index) const
+  {
+    if (size == 1)
+    {
+      return ((const int8_t*)data)[index];
+    }
+    if (size == 2)
+    {
+      return ((const int16_t*)data)[index];
+    }
+    if (size == 4)
+    {
+      return ((const int32_t*)data)[index];
+    }
+    if (size == 8)
+    {
+      return ((const int64_t*)data)[index];
+    }
+    FATAL_ERROR("Unsupported signed int size: %u bytes", size);
+  }
+  // Read element `index` as an unsigned integer, zero-extended to 64 bits.
+  // Element sizes of 1, 2, 4 and 8 bytes are supported; any other size
+  // raises a FatalError.
+  uint64_t TypedValue::getUInt(unsigned index) const
+  {
+    if (size == 1)
+    {
+      return ((const uint8_t*)data)[index];
+    }
+    if (size == 2)
+    {
+      return ((const uint16_t*)data)[index];
+    }
+    if (size == 4)
+    {
+      return ((const uint32_t*)data)[index];
+    }
+    if (size == 8)
+    {
+      return ((const uint64_t*)data)[index];
+    }
+    FATAL_ERROR("Unsupported unsigned int size: %u bytes", size);
+  }
+  // Store `value` into element `index`, narrowing to float when the element
+  // size is 4 bytes. Any element size other than 4 or 8 raises a FatalError.
+  void TypedValue::setFloat(double value, unsigned index)
+  {
+    if (size == 4)
+    {
+      float *values = (float*)data;
+      values[index] = value;
+    }
+    else if (size == 8)
+    {
+      double *values = (double*)data;
+      values[index] = value;
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported float size: %u bytes", size);
+    }
+  }
+  // Store a pointer-sized integer into element `index`. The element size
+  // must equal sizeof(size_t); otherwise a FatalError is raised.
+  void TypedValue::setPointer(size_t value, unsigned index)
+  {
+    const bool pointerSized = (size == sizeof(size_t));
+    if (!pointerSized)
+    {
+      FATAL_ERROR("Unsupported pointer size: %u bytes", size);
+    }
+    size_t *values = (size_t*)data;
+    values[index] = value;
+  }
+  // Store a signed integer into element `index`, truncating it to the
+  // element size. Sizes of 1, 2, 4 and 8 bytes are supported; any other
+  // size raises a FatalError.
+  void TypedValue::setSInt(int64_t value, unsigned index)
+  {
+    if (size == 1)
+    {
+      ((int8_t*)data)[index] = value;
+    }
+    else if (size == 2)
+    {
+      ((int16_t*)data)[index] = value;
+    }
+    else if (size == 4)
+    {
+      ((int32_t*)data)[index] = value;
+    }
+    else if (size == 8)
+    {
+      ((int64_t*)data)[index] = value;
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported signed int size: %u bytes", size);
+    }
+  }
+  // Store an unsigned integer into element `index`, truncating it to the
+  // element size. Sizes of 1, 2, 4 and 8 bytes are supported; any other
+  // size raises a FatalError.
+  void TypedValue::setUInt(uint64_t value, unsigned index)
+  {
+    if (size == 1)
+    {
+      ((uint8_t*)data)[index] = value;
+    }
+    else if (size == 2)
+    {
+      ((uint16_t*)data)[index] = value;
+    }
+    else if (size == 4)
+    {
+      ((uint32_t*)data)[index] = value;
+    }
+    else if (size == 8)
+    {
+      ((uint64_t*)data)[index] = value;
+    }
+    else
+    {
+      FATAL_ERROR("Unsupported unsigned int size: %u bytes", size);
+    }
+  }
+  // Return a deep copy of this value. The copy gets its own buffer of
+  // size*num bytes allocated with new[]; nothing in this function releases
+  // it, so that is left to whoever receives the copy.
+  TypedValue TypedValue::clone() const
+  {
+    const unsigned bytes = size*num;
+    TypedValue copy;
+    copy.size = size;
+    copy.num  = num;
+    copy.data = new unsigned char[bytes];
+    memcpy(copy.data, data, bytes);
+    return copy;
+  }
+  // Returns true only when the environment variable `var` exists and its
+  // value is exactly the string "1".
+  bool checkEnv(const char *var)
+  {
+    const char *setting = getenv(var);
+    if (!setting)
+    {
+      return false;
+    }
+    return strcmp(setting, "1") == 0;
+  }
+  // Print the textual form of an LLVM instruction to a standard stream,
+  // going through an LLVM raw_os_ostream wrapper around `out`.
+  void dumpInstruction(ostream& out, const llvm::Instruction *instruction)
+  {
+    llvm::raw_os_ostream adapter(out);
+    instruction->print(adapter);
+  }
+  // Map an address space number to its human readable name. Numbers that do
+  // not correspond to an AddressSpace enumerator yield "(unknown)".
+  const char* getAddressSpaceName(unsigned addrSpace)
+  {
+    if (addrSpace == AddrSpacePrivate)
+    {
+      return "private";
+    }
+    if (addrSpace == AddrSpaceGlobal)
+    {
+      return "global";
+    }
+    if (addrSpace == AddrSpaceConstant)
+    {
+      return "constant";
+    }
+    if (addrSpace == AddrSpaceLocal)
+    {
+      return "local";
+    }
+    return "(unknown)";
+  }
+  // Write the raw bytes of an LLVM constant into `data`.
+  // Integers, floats, doubles and null pointers are stored directly;
+  // vectors, arrays and structs are handled by recursing into each element.
+  // Undef constants leave `data` untouched. Any other constant type, and any
+  // pointer constant other than null, raises a FatalError.
+  // The caller must supply a buffer of at least getTypeSize(type) bytes.
+  void getConstantData(unsigned char *data, const llvm::Constant *constant)
+  {
+    // Undef has no defined contents, so there is nothing to write
+    if (constant->getValueID() == llvm::Value::UndefValueVal)
+    {
+      return;
+    }
+    const llvm::Type *type = constant->getType();
+    unsigned size = getTypeSize(type);
+    switch (type->getTypeID())
+    {
+    case llvm::Type::IntegerTyID:
+      // Copy the first `size` bytes of the integer's raw word storage
+      memcpy(data,
+             ((llvm::ConstantInt*)constant)->getValue().getRawData(),
+             size);
+      break;
+    case llvm::Type::FloatTyID:
+    {
+      *(float*)data =
+        ((llvm::ConstantFP*)constant)->getValueAPF().convertToFloat();
+      break;
+    }
+    case llvm::Type::DoubleTyID:
+    {
+      *(double*)data =
+        ((llvm::ConstantFP*)constant)->getValueAPF().convertToDouble();
+      break;
+    }
+    case llvm::Type::VectorTyID:
+    {
+      // Elements are laid out back to back, elemSize bytes apart
+      unsigned num = type->getVectorNumElements();
+      const llvm::Type *elemType = type->getVectorElementType();
+      unsigned elemSize = getTypeSize(elemType);
+      for (unsigned i = 0; i < num; i++)
+      {
+        getConstantData(data + i*elemSize, constant->getAggregateElement(i));
+      }
+      break;
+    }
+    case llvm::Type::ArrayTyID:
+    {
+      // Same layout as vectors: elemSize bytes per element
+      unsigned num = type->getArrayNumElements();
+      const llvm::Type *elemType = type->getArrayElementType();
+      unsigned elemSize = getTypeSize(elemType);
+      for (unsigned i = 0; i < num; i++)
+      {
+        getConstantData(data + i*elemSize, constant->getAggregateElement(i));
+      }
+      break;
+    }
+    case llvm::Type::PointerTyID:
+    {
+      // Only the null pointer constant is supported
+      if (constant->getValueID() != llvm::Value::ConstantPointerNullVal)
+      {
+        FATAL_ERROR("Unsupported constant pointer value: %d",
+                    constant->getValueID());
+      }
+      *(size_t*)data = 0;
+      break;
+    }
+    case llvm::Type::StructTyID:
+    {
+      // Each member goes at the padded offset computed by
+      // getStructMemberOffset
+      unsigned num = type->getStructNumElements();
+      for (unsigned i = 0; i < num; i++)
+      {
+        unsigned offset =
+          getStructMemberOffset((const llvm::StructType*)type, i);
+        getConstantData(data + offset, constant->getAggregateElement(i));
+      }
+      break;
+    }
+    default:
+      FATAL_ERROR("Unsupported constant type: %d", type->getTypeID());
+    }
+  }
+  // Creates an instruction equivalent to a constant expression.
+  // The instruction is built from the expression's opcode and operands and
+  // is not inserted into any basic block. Opcodes without a dedicated case
+  // are treated as binary operators, carrying over the expression's
+  // nuw/nsw and exact flags where the opcode supports them.
+  const llvm::Instruction* getConstExprAsInstruction(
+    const llvm::ConstantExpr *expr)
+  {
+    // Get operands. The instruction created below keeps its own copy of its
+    // operands, so this temporary list only has to live until we return; a
+    // vector releases it on every return path (the previous new[] array was
+    // never freed).
+    unsigned numOperands = expr->getNumOperands();
+    std::vector<llvm::Value*> valueOperands(numOperands);
+    for (unsigned i = 0; i < numOperands; i++)
+    {
+      valueOperands[i] = expr->getOperand(i);
+    }
+    llvm::ArrayRef<llvm::Value*> operands(valueOperands);
+    // Create instruction
+    unsigned opcode = expr->getOpcode();
+    switch (opcode)
+    {
+    case llvm::Instruction::Trunc:
+    case llvm::Instruction::ZExt:
+    case llvm::Instruction::SExt:
+    case llvm::Instruction::FPTrunc:
+    case llvm::Instruction::FPExt:
+    case llvm::Instruction::UIToFP:
+    case llvm::Instruction::SIToFP:
+    case llvm::Instruction::FPToUI:
+    case llvm::Instruction::FPToSI:
+    case llvm::Instruction::PtrToInt:
+    case llvm::Instruction::IntToPtr:
+    case llvm::Instruction::BitCast:
+      return llvm::CastInst::Create((llvm::Instruction::CastOps)opcode,
+                                    operands[0], expr->getType());
+    case llvm::Instruction::Select:
+      return llvm::SelectInst::Create(operands[0], operands[1], operands[2]);
+    case llvm::Instruction::InsertElement:
+      return llvm::InsertElementInst::Create(operands[0], operands[1],
+                                             operands[2]);
+    case llvm::Instruction::ExtractElement:
+      return llvm::ExtractElementInst::Create(operands[0], operands[1]);
+    case llvm::Instruction::InsertValue:
+      return llvm::InsertValueInst::Create(operands[0], operands[1],
+                                           expr->getIndices());
+    case llvm::Instruction::ExtractValue:
+      return llvm::ExtractValueInst::Create(operands[0], expr->getIndices());
+    case llvm::Instruction::ShuffleVector:
+      return new llvm::ShuffleVectorInst(operands[0], operands[1],
+                                         operands[2]);
+    case llvm::Instruction::GetElementPtr:
+      if (((const llvm::GEPOperator*)expr)->isInBounds())
+      {
+        return llvm::GetElementPtrInst::CreateInBounds(operands[0],
+                                                       operands.slice(1));
+      }
+      else
+      {
+        // The Create() signature gained a leading type argument after
+        // LLVM 3.6
+#if LLVM_VERSION > 36
+        return llvm::GetElementPtrInst::Create(expr->getType(),
+                                               operands[0], operands.slice(1));
+#else
+        return llvm::GetElementPtrInst::Create(operands[0], operands.slice(1));
+#endif
+      }
+    case llvm::Instruction::ICmp:
+    case llvm::Instruction::FCmp:
+      return llvm::CmpInst::Create((llvm::Instruction::OtherOps)opcode,
+                                   expr->getPredicate(),
+                                   operands[0], operands[1]);
+    default:
+      assert(expr->getNumOperands() == 2 && "Must be binary operator?");
+      llvm::BinaryOperator *binaryOp =
+        llvm::BinaryOperator::Create((llvm::Instruction::BinaryOps)opcode,
+                                     operands[0], operands[1]);
+      // Check for overflowing operator
+      if (opcode == llvm::Instruction::Add ||
+          opcode == llvm::Instruction::Mul ||
+          opcode == llvm::Instruction::Shl ||
+          opcode == llvm::Instruction::Sub)
+      {
+        binaryOp->setHasNoUnsignedWrap(
+          expr->getRawSubclassOptionalData() &
+          llvm::OverflowingBinaryOperator::NoUnsignedWrap);
+        binaryOp->setHasNoSignedWrap(
+          expr->getRawSubclassOptionalData() &
+          llvm::OverflowingBinaryOperator::NoSignedWrap);
+      }
+      // Check for possibly exact operator
+      if (opcode == llvm::Instruction::AShr ||
+          opcode == llvm::Instruction::LShr ||
+          opcode == llvm::Instruction::SDiv ||
+          opcode == llvm::Instruction::UDiv)
+      {
+        binaryOp->setIsExact(expr->getRawSubclassOptionalData() &
+                             llvm::PossiblyExactOperator::IsExact);
+      }
+      return binaryOp;
+    }
+  }
+  // Return the ConstantInt wrapped by a metadata operand, or NULL when the
+  // operand is not constant metadata or its constant is not an integer.
+  const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op)
+  {
+    llvm::ConstantAsMetadata *constMD =
+      llvm::dyn_cast<llvm::ConstantAsMetadata>(op.get());
+    if (constMD)
+    {
+      return llvm::dyn_cast<llvm::ConstantInt>(constMD->getValue());
+    }
+    return NULL;
+  }
+  // Get the byte offset of member `index` within a struct.
+  // Members are laid out in order. Unless the struct is packed, each member
+  // is first padded up to a multiple of its own alignment (as reported by
+  // getTypeAlignment).
+  unsigned getStructMemberOffset(const llvm::StructType *type, unsigned index)
+  {
+    bool packed = ((llvm::StructType*)type)->isPacked();
+    unsigned offset = 0;
+    // Lay out every member that precedes the requested one
+    for (unsigned i = 0; i < index; i++)
+    {
+      const llvm::Type *elemType = type->getStructElementType(i);
+      unsigned align = getTypeAlignment(elemType);
+      // Add padding if necessary
+      if (!packed && offset % align)
+      {
+        offset += (align - (offset%align));
+      }
+      offset += getTypeSize(elemType);
+    }
+    // Pad up to the alignment of the requested member itself
+    unsigned align = getTypeAlignment(type->getStructElementType(index));
+    if (!packed && offset % align)
+    {
+      offset += (align - (offset%align));
+    }
+    return offset;
+  }
+  // Returns the size of a type in bytes, as laid out by this simulator.
+  unsigned getTypeSize(const llvm::Type *type)
+  {
+    // Arrays: element count times element size
+    if (type->isArrayTy())
+    {
+      const unsigned count = type->getArrayNumElements();
+      const unsigned elemBytes = getTypeSize(type->getArrayElementType());
+      return count*elemBytes;
+    }
+    // Structs: members in order, each padded to its own alignment, with the
+    // total rounded up to the largest member alignment (unless packed)
+    if (type->isStructTy())
+    {
+      const bool packed = ((llvm::StructType*)type)->isPacked();
+      const unsigned numMembers = type->getStructNumElements();
+      unsigned total = 0;
+      unsigned maxAlign = 1;
+      for (unsigned m = 0; m < numMembers; m++)
+      {
+        const llvm::Type *memberType = type->getStructElementType(m);
+        const unsigned memberBytes = getTypeSize(memberType);
+        const unsigned memberAlign = getTypeAlignment(memberType);
+        if (!packed)
+        {
+          const unsigned misalign = total % memberAlign;
+          if (misalign)
+          {
+            total += (memberAlign - misalign);
+          }
+        }
+        total += memberBytes;
+        maxAlign = max(maxAlign, memberAlign);
+      }
+      if (!packed)
+      {
+        const unsigned tail = total % maxAlign;
+        if (tail)
+        {
+          total += (maxAlign - tail);
+        }
+      }
+      return total;
+    }
+    // Vectors: 3-element vectors occupy the space of 4 elements
+    if (type->isVectorTy())
+    {
+      unsigned count = type->getVectorNumElements();
+      const unsigned elemBytes = getTypeSize(type->getVectorElementType());
+      if (count == 3)
+      {
+        count = 4;
+      }
+      return count*elemBytes;
+    }
+    // Pointers are stored as size_t
+    if (type->isPointerTy())
+    {
+      return sizeof(size_t);
+    }
+    // Scalars: bit width rounded up to whole bytes (covers widths that are
+    // not a multiple of 8, like "bool"). getScalarSizeInBits is not const,
+    // hence the cast.
+    llvm::Type* nonConstTy = const_cast<llvm::Type*>(type);
+    const unsigned bits = nonConstTy->getScalarSizeInBits();
+    unsigned bytes = bits / 8;
+    if (bits % 8)
+    {
+      bytes++;
+    }
+    return bytes;
+  }
+  /// Returns the byte alignment of this type
+  unsigned getTypeAlignment(const llvm::Type* type)
+  {
+    // An array is aligned like its element type
+    if (const llvm::ArrayType* arrayTy = llvm::dyn_cast<llvm::ArrayType>(type))
+    {
+      return getTypeAlignment(arrayTy->getElementType());
+    }
+    // A packed struct has byte alignment; any other struct takes the
+    // largest alignment found among its members (minimum 1)
+    if (const llvm::StructType* structTy =
+          llvm::dyn_cast<llvm::StructType>(type))
+    {
+      if (structTy->isPacked())
+      {
+        return 1;
+      }
+      llvm::StructType* mutableTy = const_cast<llvm::StructType*>(structTy);
+      const unsigned memberCount = structTy->getNumElements();
+      unsigned largest = 1;
+      for (unsigned m = 0; m < memberCount; m++)
+      {
+        const llvm::Type* memberTy = mutableTy->getTypeAtIndex(m);
+        const unsigned memberAlign = getTypeAlignment(memberTy);
+        if (memberAlign > largest)
+        {
+          largest = memberAlign;
+        }
+      }
+      return largest;
+    }
+    // Everything else is aligned to its own size
+    return getTypeSize(type);
+  }
+  // Returns (element size in bytes, number of elements) for a value.
+  // Vectors report their element type and count; aggregates and scalars
+  // count as a single element. Pointers are sizeof(size_t) bytes and 1-bit
+  // values are sizeof(bool) bytes.
+  pair<unsigned,unsigned> getValueSize(const llvm::Value *value)
+  {
+    const llvm::Type *type = value->getType();
+    unsigned bits;
+    unsigned numElements = 1;
+    if (type->isVectorTy())
+    {
+      bits = type->getVectorElementType()->getPrimitiveSizeInBits();
+      numElements = type->getVectorNumElements();
+    }
+    else if (type->isAggregateType())
+    {
+      bits = getTypeSize(type)<<3;
+    }
+    else
+    {
+      bits = type->getPrimitiveSizeInBits();
+    }
+    unsigned elemSize;
+    if (bits == 1)
+    {
+      // Special case for boolean results
+      elemSize = sizeof(bool);
+    }
+    else if (type->isPointerTy())
+    {
+      // Special case for pointer types
+      elemSize = sizeof(size_t);
+    }
+    else
+    {
+      elemSize = bits >> 3;
+    }
+    return pair<unsigned,unsigned>(elemSize,numElements);
+  }
+  // Returns true if the operand's value ID lies within LLVM's range of
+  // constant value kinds.
+  bool isConstantOperand(const llvm::Value *operand)
+  {
+    const unsigned id = operand->getValueID();
+    if (id < llvm::Value::ConstantFirstVal)
+    {
+      return false;
+    }
+    return id <= llvm::Value::ConstantLastVal;
+  }
+  // Returns true if the value has a vector type with exactly 3 elements.
+  bool isVector3(const llvm::Value *value)
+  {
+    const llvm::Type *type = value->getType();
+    if (!type->isVectorTy())
+    {
+      return false;
+    }
+    return type->getVectorNumElements() == 3;
+  }
+  // Return the current time in nanoseconds since the epoch.
+  // Native Windows builds only have one-second resolution (time());
+  // everywhere else gettimeofday() provides microsecond resolution.
+  double now()
+  {
+#if defined(_WIN32) && !defined(__MINGW32__)
+    return time(NULL)*1e9;
+#else
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_usec*1e3 + tv.tv_sec*1e9;
+#endif
+  }
+  // Print data to cout in a human readable format (according to its type).
+  // Floats, doubles, integers (1/2/4/8 bytes, as signed), pointers (hex)
+  // and vectors of those are interpreted; anything else is dumped as raw
+  // hex bytes. The formatting state of cout (flags and fill character) is
+  // restored before returning, so the hex output used for pointers and raw
+  // bytes does not leak into later output.
+  void printTypedData(const llvm::Type *type, const unsigned char *data)
+  {
+    // TODO: Interpret other types (array, struct)
+    unsigned size = getTypeSize(type);
+    // hex, uppercase and setfill are sticky; remember the caller's settings
+    const ios_base::fmtflags savedFlags = cout.flags();
+    const char savedFill = cout.fill();
+    switch (type->getTypeID())
+    {
+    case llvm::Type::FloatTyID:
+      cout << *(const float*)data;
+      break;
+    case llvm::Type::DoubleTyID:
+      cout << *(const double*)data;
+      break;
+    case llvm::Type::IntegerTyID:
+      // Fixed-width types keep the interpretation independent of the
+      // platform's char signedness and of the width of long
+      switch (size)
+      {
+      case 1:
+        cout << (int)*(const int8_t*)data;
+        break;
+      case 2:
+        cout << *(const int16_t*)data;
+        break;
+      case 4:
+        cout << *(const int32_t*)data;
+        break;
+      case 8:
+        cout << *(const int64_t*)data;
+        break;
+      default:
+        cout << "(invalid integer size)";
+        break;
+      }
+      break;
+    case llvm::Type::VectorTyID:
+    {
+      const llvm::Type *elemType = type->getVectorElementType();
+      cout << "(";
+      for (unsigned i = 0; i < type->getVectorNumElements(); i++)
+      {
+        if (i > 0)
+        {
+          cout << ",";
+        }
+        printTypedData(elemType, data+i*getTypeSize(elemType));
+      }
+      cout << ")";
+      break;
+    }
+    case llvm::Type::PointerTyID:
+      cout << "0x" << hex << *(const size_t*)data;
+      break;
+    default:
+      cout << "(raw) 0x" << hex << uppercase << setfill('0');
+      for (unsigned i = 0; i < size; i++)
+      {
+        cout << setw(2) << (int)data[i];
+      }
+    }
+    cout.flags(savedFlags);
+    cout.fill(savedFill);
+  }
+  FatalError::FatalError(const string& msg, const string& file, size_t line)
+    : std::runtime_error(msg)
+  {
+    m_file = file;
+    m_line = line;
+  }
+  FatalError::~FatalError() throw()
+  {
+  }
+  const string& FatalError::getFile() const
+  {
+    return m_file;
+  }
+  size_t FatalError::getLine() const
+  {
+    return m_line;
+  }
+  const char* FatalError::what() const throw()
+  {
+    return runtime_error::what();
+  }
diff --git a/src/core/common.h b/src/core/common.h
new file mode 100644
index 0000000..d908ffa
--- /dev/null
+++ b/src/core/common.h
@@ -0,0 +1,203 @@
+// common.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#ifndef __common_h_
+#define __common_h_
+#include "config.h"
+#include "CL/cl.h"
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <map>
+#include <memory>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <stack>
+#include <stdexcept>
+#include <stdint.h>
+#include <unordered_map>
+#include <vector>
+#define BIG_SEPARATOR   "================================"
+#define SMALL_SEPARATOR "--------------------------------"
+// Native Windows builds: map snprintf onto the CRT's _snprintf, and drop
+// any existing ERROR macro so it cannot clash with the ERROR enumerator of
+// MessageType declared below.
+#if defined(_WIN32) && !defined(__MINGW32__)
+#define snprintf _snprintf
+#undef ERROR
+#endif
+// Forward declarations of the LLVM types that appear in the declarations
+// below, so this header does not need to include any LLVM headers.
+namespace llvm
+{
+  class Constant;
+  class ConstantExpr;
+  class ConstantInt;
+  class Instruction;
+  class MDOperand;
+  class StructType;
+  class Type;
+  class Value;
+}
+namespace oclgrind
+  class Kernel;
+  // Enumeration for address spaces. The numeric values are the address
+  // space numbers that getAddressSpaceName() translates into names.
+  // NOTE(review): presumably these match the address space numbers used in
+  // the LLVM IR being interpreted - confirm.
+  enum AddressSpace
+  {
+    AddrSpacePrivate = 0,
+    AddrSpaceGlobal = 1,
+    AddrSpaceConstant = 2,
+    AddrSpaceLocal = 3,
+  };
+  // Enumeration of atomic operation kinds
+  enum AtomicOp
+  {
+    AtomicAdd,
+    AtomicAnd,
+    AtomicCmpXchg,
+    AtomicDec,
+    AtomicInc,
+    AtomicMax,
+    AtomicMin,
+    AtomicOr,
+    AtomicSub,
+    AtomicXchg,
+    AtomicXor,
+  };
+  // Enumeration for different log message types.
+  // The ERROR enumerator is why this header #undefs any ERROR macro on
+  // native Windows builds.
+  enum MessageType
+  {
+    DEBUG,
+    INFO,
+    ERROR,
+  };
+  // 3-dimensional size (or index), with components x, y and z
+  typedef struct _Size3_
+  {
+    size_t x, y, z;
+    // All components zero
+    _Size3_();
+    // Explicit components
+    _Size3_(size_t x, size_t y, size_t z);
+    // Unflatten a linear index within a space of the given dimensions
+    // (x varies fastest)
+    _Size3_(size_t linear, _Size3_ dimensions);
+    // Component access by index: 0 = x, 1 = y, 2 = z
+    size_t& operator[](unsigned i);
+    const size_t& operator[](unsigned i) const;
+    // Component-wise equality
+    bool operator==(const _Size3_& rhs) const;
+    // Prints as "(x,y,z)" in decimal
+    friend std::ostream& operator<<(std::ostream& stream, const _Size3_& sz);
+  } Size3;
+  // Structure for a value with a size/type: `num` elements of `size` bytes
+  // each, stored contiguously in `data`. The struct declares no destructor,
+  // so it does not release `data` itself.
+  struct _TypedValue_
+  {
+    unsigned size;        // Size of one element, in bytes
+    unsigned num;         // Number of elements
+    unsigned char *data;  // Raw storage (size*num bytes)
+    // Deep copy with a newly allocated data buffer
+    struct _TypedValue_ clone() const;
+    // Typed element accessors; each raises a FatalError if `size` is not
+    // a supported width for the requested interpretation
+    double   getFloat(unsigned index = 0) const;
+    size_t   getPointer(unsigned index = 0) const;
+    int64_t  getSInt(unsigned index = 0) const;
+    uint64_t getUInt(unsigned index = 0) const;
+    void     setFloat(double value, unsigned index = 0);
+    void     setPointer(size_t value, unsigned index = 0);
+    void     setSInt(int64_t value, unsigned index = 0);
+    void     setUInt(uint64_t value, unsigned index = 0);
+  };
+  typedef _TypedValue_ TypedValue;
+  // Private memory map type: maps an LLVM value to its current TypedValue
+  typedef std::map<const llvm::Value*,TypedValue> TypedValueMap;
+  // Image object: an address together with the OpenCL image format and
+  // image descriptor.
+  // NOTE(review): `address` presumably locates the image's pixel data in
+  // simulated memory - confirm.
+  typedef struct
+  {
+    size_t address;
+    cl_image_format format;
+    cl_image_desc desc;
+  } Image;
+  // Check if an environment variable is set to exactly "1"
+  bool checkEnv(const char *var);
+  // Output an instruction in human-readable format
+  void dumpInstruction(std::ostream& out, const llvm::Instruction *instruction);
+  // Get the human readable name of an address space
+  // (returns "(unknown)" for unrecognised values)
+  const char* getAddressSpaceName(unsigned addrSpace);
+  // Retrieve the raw data for a constant, writing it into `data`
+  // (which must hold at least getTypeSize() bytes for the constant's type)
+  void getConstantData(unsigned char *data, const llvm::Constant *constant);
+  // Creates an instruction from a constant expression
+  // (the instruction is not inserted into any basic block)
+  const llvm::Instruction* getConstExprAsInstruction(
+    const llvm::ConstantExpr *expr);
+  // Get the ConstantInt object for an MDOperand
+  // (NULL if the operand does not hold an integer constant)
+  const llvm::ConstantInt* getMDOpAsConstInt(const llvm::MDOperand& op);
+  // Get the byte offset of a struct member
+  unsigned getStructMemberOffset(const llvm::StructType *type, unsigned index);
+  // Returns the size of a type, in bytes
+  unsigned getTypeSize(const llvm::Type *type);
+  /// Returns the alignment requirements of this type, in bytes
+  unsigned getTypeAlignment(const llvm::Type* type);
+  // Returns the size of a value as (element size in bytes, element count)
+  std::pair<unsigned,unsigned> getValueSize(const llvm::Value *value);
+  // Returns true if the operand is a constant value
+  bool isConstantOperand(const llvm::Value *operand);
+  // Returns true if the value is a 3-element vector
+  bool isVector3(const llvm::Value *value);
+  // Return the current time in nanoseconds since the epoch
+  double now();
+  // Print data to standard output in a human readable format
+  // (according to its type)
+  void printTypedData(const llvm::Type *type, const unsigned char *data);
+  // Exception class for raising fatal errors. Carries the error message
+  // plus the source file and line at which the error was raised.
+  // NOTE(review): the base class is inherited privately (the `class`
+  // default), so a handler for std::runtime_error or std::exception will
+  // not catch a FatalError; it has to be caught as FatalError. Confirm
+  // whether that is intended.
+  class FatalError : std::runtime_error
+  {
+  public:
+    FatalError(const std::string& msg, const std::string& file, size_t line);
+    ~FatalError() throw();
+    // Source file in which the error was raised
+    virtual const std::string& getFile() const;
+    // Source line at which the error was raised
+    virtual size_t getLine() const;
+    // Error message
+    virtual const char* what() const throw();
+  protected:
+    std::string m_file;
+    size_t m_line;
+  };
+  // Utility macro for raising an exception with a sprintf-based message.
+  // Formats the message into an exactly-sized temporary buffer, then throws
+  // a FatalError tagged with the file and line of the call site.
+  // All names are fully qualified so the macro also works in code that has
+  // no using-directive for std or oclgrind. The second formatting pass is
+  // bounded by the size measured in the first pass.
+  #define FATAL_ERROR(format, ...)                         \
+    {                                                      \
+      int sz = snprintf(NULL, 0, format, ##__VA_ARGS__);   \
+      char *str = new char[sz+1];                          \
+      snprintf(str, sz+1, format, ##__VA_ARGS__);          \
+      std::string msg = str;                               \
+      delete[] str;                                        \
+      throw oclgrind::FatalError(msg, __FILE__, __LINE__); \
+    }
+#endif // __common_h_
diff --git a/src/core/gen_clc_h.cmake b/src/core/gen_clc_h.cmake
new file mode 100644
index 0000000..43b4fa5
--- /dev/null
+++ b/src/core/gen_clc_h.cmake
@@ -0,0 +1,11 @@
+# Generates src/core/clc_h.cpp, a C++ source file defining CLC_H_DATA as a
+# string literal that holds the text stored in the CLC_H variable.
+# NOTE(review): nothing visible here assigns CLC_H; presumably it is read
+# from the clc.h header before the substitutions below - confirm.
+set(OUTPUT src/core/clc_h.cpp)
+# Start the declaration and open the first quoted line
+file(WRITE ${OUTPUT} "extern const char CLC_H_DATA[] = \n\"")
+# Escape backslashes first, then double quotes, so the text survives
+# inside a C string literal
+string(REGEX REPLACE "\\\\" "\\\\\\\\" CLC_H "${CLC_H}")
+string(REGEX REPLACE "\"" "\\\\\"" CLC_H "${CLC_H}")
+# Replace each newline with an escaped \n, a closing quote, a real newline
+# and a new opening quote (one literal per source line)
+string(REGEX REPLACE "\n" "\\\\n\"\n\"" CLC_H "${CLC_H}")
+file(APPEND ${OUTPUT} "${CLC_H}")
+# Close the last quoted line and terminate the declaration
+file(APPEND ${OUTPUT} "\";")
diff --git a/src/core/gen_clc_h.sh b/src/core/gen_clc_h.sh
new file mode 100755
index 0000000..e9ce2b1
--- /dev/null
+++ b/src/core/gen_clc_h.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# Usage: gen_clc_h.sh INPUT OUTPUT
+# Writes OUTPUT as a C++ source file that defines CLC_H_DATA as a string
+# literal holding the contents of INPUT, one quoted line per input line.
+if [ $# -ne 2 ]
+then
+  echo "Usage: gen_clc_h.sh INPUT OUTPUT"
+  exit 1
+fi
+IN=$1
+OUT=$2
+echo "extern const char CLC_H_DATA[] =" >"$OUT"
+# Escape backslashes and double quotes, then wrap each line as "...\n"
+sed -e 's/\\/\\\\/g;s/"/\\"/g;s/^/"/;s/$/\\n"/' "$IN" >>"$OUT"
+if [ $? -ne 0 ]
+then
+  exit 1
+fi
+echo ";" >>"$OUT"
diff --git a/src/core/half.h b/src/core/half.h
new file mode 100644
index 0000000..58afcf1
--- /dev/null
+++ b/src/core/half.h
@@ -0,0 +1,160 @@
+// half.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "common.h"
+// Convert a 16-bit half precision value (given as its bit pattern) to a
+// 32-bit float. Handles zero, denormals (which become normalized floats),
+// normalized values, infinities and NaNs. A NaN's mantissa bits are moved
+// to the top of the float mantissa, so a quiet NaN stays quiet.
+static float halfToFloat(uint16_t half)
+{
+  uint16_t h_sign, h_exponent, h_mantissa;
+  uint32_t f_sign, f_exponent, f_mantissa;
+  h_sign     = half & 0x8000; // 1000 0000 0000 0000
+  h_exponent = half & 0x7C00; // 0111 1100 0000 0000
+  h_mantissa = half & 0x03FF; // 0000 0011 1111 1111
+  f_sign     = ((uint32_t)h_sign) << 16;
+  if (h_exponent == 0)
+  {
+    if (h_mantissa == 0)
+    {
+      // Zero
+      f_exponent = 0;
+      f_mantissa = 0;
+    }
+    else
+    {
+      // Denorm - convert to normalized float
+      // Shift left until the implicit leading one (bit 10) appears,
+      // counting the extra shifts so the exponent can compensate
+      int e = -1;
+      do
+      {
+        e++;
+        h_mantissa <<= 1;
+      }
+      while((h_mantissa & 0x0400) == 0);
+      f_exponent = (-15 + 127 - e) << 23;
+      f_mantissa = ((uint32_t)(h_mantissa & 0x03FF)) << 13;
+    }
+  }
+  else if (h_exponent == 0x7C00)
+  {
+    // Inf or NaN
+    f_exponent = 0xFF << 23;
+    f_mantissa = ((uint32_t)h_mantissa) << 13;
+  }
+  else
+  {
+    // Normalized - rebias the exponent from 15 to 127
+    f_exponent = (((int32_t)(h_exponent >> 10)) - 15 + 127) << 23;
+    f_mantissa = ((uint32_t)h_mantissa) << 13;
+  }
+  // Reinterpret the assembled bits as a float without aliasing through an
+  // incompatible pointer type
+  uint32_t bits = f_sign | f_exponent | f_mantissa;
+  float result;
+  memcpy(&result, &bits, sizeof(result));
+  return result;
+}
+// Rounding modes accepted by floatToHalf
+enum HalfRoundMode
+{
+  // Towards negative infinity
+  Half_RTN,
+  // Towards zero
+  Half_RTZ,
+  // Towards positive infinity
+  Half_RTP,
+  // Towards nearest even
+  Half_RTE
+};
+// Convert a 32-bit float to a 16-bit half precision bit pattern.
+// Float zeros and denormals become zero, values too large for a half become
+// infinity, and values below the normalized half range become half
+// denormals or zero. NaNs stay NaNs and infinities stay infinities.
+// The `round` argument is only consulted for results in the normalized half
+// range; the denormal path always rounds on the first discarded bit.
+static uint16_t floatToHalf(float sp, HalfRoundMode round = Half_RTZ)
+{
+  uint16_t h_sign, h_exponent, h_mantissa;
+  uint32_t f_sign, f_exponent, f_mantissa;
+  union
+  {
+    float f;
+    uint32_t ui;
+  } FtoUI;
+  FtoUI.f = sp;
+  uint32_t f = FtoUI.ui;
+  f_sign     = f & 0x80000000; // 1000 0000 0000 0000 0000 0000 0000 0000
+  f_exponent = f & 0x7F800000; // 0111 1111 1000 0000 0000 0000 0000 0000
+  f_mantissa = f & 0x007FFFFF; // 0000 0000 0111 1111 1111 1111 1111 1111
+  h_sign     = f_sign >> 16;
+  if (f_exponent == 0)
+  {
+    // Zero
+    h_exponent = 0;
+    h_mantissa = 0;
+  }
+  else if (f_exponent == 0x7F800000)
+  {
+    // Inf or NaN
+    // Keep only the top 10 mantissa bits. Copying all 23 bits into the
+    // 16-bit field would drop the high bits and let the rest spill into
+    // the exponent and sign when the fields are summed.
+    h_exponent = 0x7C00;
+    h_mantissa = f_mantissa >> 13;
+    if (f_mantissa != 0 && h_mantissa == 0)
+    {
+      // NaN whose payload sat entirely in the discarded bits: force a
+      // quiet NaN so it does not collapse to infinity
+      h_mantissa = 0x0200;
+    }
+  }
+  else
+  {
+    // Rebias the exponent from 127 to 15
+    int e = (((int32_t)(f_exponent >> 23)) - 127 + 15);
+    if (e >= 0x1F)
+    {
+      // Value will overflow
+      h_exponent = 0x7C00;
+      h_mantissa = 0;
+    }
+    else if (e <= 0)
+    {
+      // Value will underflow
+      h_exponent = 0;
+      if (14 - e > 24)
+      {
+        // Too small - flush to zero
+        h_mantissa = 0;
+      }
+      else
+      {
+        // Convert to denorm
+        // Make the implicit leading one explicit, shift into place, and
+        // round up when the first discarded bit is set
+        f_mantissa |= 0x800000;
+        h_mantissa = (f_mantissa >> (14-e));
+        if ((f_mantissa >> (13 - e)) & 0x1)
+        {
+          h_mantissa += 0x1;
+        }
+      }
+    }
+    else
+    {
+      // Normalized
+      h_exponent = e << 10;
+      h_mantissa = f_mantissa >> 13;
+      // The current f_mantissa is done in RTZ
+      if (round == Half_RTE && (f & 0x00001000) != 0)
+      {
+        // Round bit set: go up unless this is an exact tie with an even
+        // result (kept bit 13 and all lower sticky bits clear)
+        if ((f & 0x00002FFF) != 0)
+          h_mantissa += 1;
+      }
+      else if (round == Half_RTP)
+      {
+        // Truncation fell below the input, so step up
+        FtoUI.ui &= 0xFFFFE000;
+        if (FtoUI.f < sp)
+          h_mantissa += 1;
+      }
+      else if (round == Half_RTN)
+      {
+        // Truncation rose above the input (negative values), so step the
+        // magnitude up
+        FtoUI.ui &= 0xFFFFE000;
+        if (sp < FtoUI.f)
+          h_mantissa += 1;
+      }
+    }
+  }
+  // Summed, not OR-ed, so a mantissa that rounds up past 10 bits carries
+  // into the exponent
+  return h_sign + h_exponent + h_mantissa;
+}
diff --git a/src/install/INSTALL.darwin b/src/install/INSTALL.darwin
new file mode 100644
index 0000000..b3292d5
--- /dev/null
+++ b/src/install/INSTALL.darwin
@@ -0,0 +1,17 @@
+To install Oclgrind, simply copy the bin, lib and include directories
+to (for example) /usr/local/, ensuring that file modification times
+are preserved. The easiest way to do this is with the following command:
+    sudo cp -rp {bin,lib,include} /usr/local
+Alternatively, Oclgrind can be used from a non-system directory. To do
+so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
+$OCLGRIND_ROOT/lib to your DYLD_LIBRARY_PATH environment variable
+(where $OCLGRIND_ROOT is the directory containing this file). If
+copying Oclgrind to a new location, ensure that the -p flag is passed
+to cp, to ensure that file modification times are preserved.
+Information about using Oclgrind can be found on the GitHub wiki page:
+    http://github.com/jrprice/Oclgrind/wiki
diff --git a/src/install/INSTALL.linux b/src/install/INSTALL.linux
new file mode 100644
index 0000000..cf81cf9
--- /dev/null
+++ b/src/install/INSTALL.linux
@@ -0,0 +1,20 @@
+To install Oclgrind, simply copy the bin, lib and include directories
+to (for example) /usr/local/, ensuring that file modification times
+are preserved. The easiest way to do this is with the following command:
+    sudo cp -rp {bin,lib,include} /usr/local
+Alternatively, Oclgrind can be used from a non-system directory. To do
+so, add $OCLGRIND_ROOT/bin to your PATH environment variable, and
+$OCLGRIND_ROOT/lib to your LD_LIBRARY_PATH environment variable (where
+$OCLGRIND_ROOT is the directory containing this file). If copying
+Oclgrind to a new location, ensure that the -p flag is passed to cp,
+to ensure that file modification times are preserved.
+To use Oclgrind with the OpenCL ICD loader (optional), copy
+oclgrind.icd to /etc/OpenCL/vendors/.
+Information about using Oclgrind can be found on the GitHub wiki page:
+    http://github.com/jrprice/Oclgrind/wiki
diff --git a/src/install/INSTALL.windows b/src/install/INSTALL.windows
new file mode 100644
index 0000000..2b02636
--- /dev/null
+++ b/src/install/INSTALL.windows
@@ -0,0 +1,8 @@
+To install Oclgrind, run 'install.bat' as an Administrator. This will
+install Oclgrind to 'C:\Program Files\Oclgrind' and create a registry
+entry for the OpenCL ICD loader. Oclgrind can be uninstalled by
+running 'uninstall.bat' as an Administrator.
+Alternatively, Oclgrind can be run from any other directory. You will
+need to manually create OpenCL ICD loading points by editing the
+registry (see oclgrind-icd.reg).
diff --git a/src/install/install.bat b/src/install/install.bat
new file mode 100644
index 0000000..cea2457
--- /dev/null
+++ b/src/install/install.bat
@@ -0,0 +1,23 @@
+rem Install Oclgrind into "%programfiles%\Oclgrind" and register the ICD.
+rem Every step jumps to the :error label on failure.
+rem NOTE(review): the :error label itself is not visible in this excerpt;
+rem presumably it precedes the final echo -- confirm.
+
+rem Work from the directory containing this script. /d is required so that
+rem the drive is switched too; without it the relative xcopy sources below
+rem are not found when the script is started from a different drive.
+cd /d "%~dp0"
+set "ROOT=%programfiles%\Oclgrind"
+mkdir               "%ROOT%"                  || goto :error
+xcopy include       "%ROOT%\include" /S /Y /I || goto :error
+xcopy x86           "%ROOT%\x86"     /S /Y /I || goto :error
+xcopy x64           "%ROOT%\x64"     /S /Y /I || goto :error
+xcopy uninstall.bat "%ROOT%\"           /Y    || goto :error
+rem Silently import the OpenCL ICD loader registry entry
+regedit /S oclgrind-icd.reg                   || goto :error
+goto :EOF
+echo Did you run as Administrator?
diff --git a/src/install/oclgrind-icd.reg b/src/install/oclgrind-icd.reg
new file mode 100644
index 0000000..89af7a6
Binary files /dev/null and b/src/install/oclgrind-icd.reg differ
diff --git a/src/install/uninstall.bat b/src/install/uninstall.bat
new file mode 100644
index 0000000..660e8d3
--- /dev/null
+++ b/src/install/uninstall.bat
@@ -0,0 +1 @@
+rem Remove the Oclgrind installation directory and everything below it
+rem (/S = recurse, /Q = do not prompt).
+rem NOTE(review): presumably run through 'start /B "" cmd /C' so that the
+rem removal happens in a separate command process, since install.bat copies
+rem this script into the directory being deleted -- confirm.
+start /B "" cmd /C rmdir "%programfiles%\Oclgrind" /S /Q
diff --git a/src/kernel/Simulation.cpp b/src/kernel/Simulation.cpp
new file mode 100644
index 0000000..208ed77
--- /dev/null
+++ b/src/kernel/Simulation.cpp
@@ -0,0 +1,764 @@
+// Simulation.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "config.h"
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "kernel/Simulation.h"
+using namespace oclgrind;
+using namespace std;
+// Record what is currently being parsed; load() prints m_parsing in its
+// error messages.
+#define PARSING(parsing) m_parsing = parsing;
+// Convert an integer to char/uchar, checking if the value is valid.
+// Assigns intval to result and throws "Invalid char value" (a const char*)
+// if the value does not survive the narrowing conversion.
+// Wrapped in do { } while (0) so that the macro behaves as one statement,
+// with its arguments parenthesised against operator-precedence surprises.
+#define INT_TO_CHAR(intval, result) \
+  do                                \
+  {                                 \
+    (result) = (intval);            \
+    if ((result) != (intval))       \
+    {                               \
+      throw "Invalid char value";   \
+    }                               \
+  } while (0)
+// Utility to read a typed value from a stream
+template<typename T> T readValue(istream& stream);
+  m_context = new Context();
+  m_kernel = NULL;
+  m_program = NULL;
+  delete m_kernel;
+  delete m_program;
+  delete m_context;
+// Print every element of a dumped argument buffer to stdout.
+// The arg.size bytes at arg.address are read back from global memory and
+// shown as an array of T, one "name[i] = value" line per element.
+template<typename T>
+void Simulation::dumpArgument(DumpArg& arg)
+  const size_t count = arg.size / sizeof(T);
+  T *elements = new T[count];
+  Memory *memory = m_context->getGlobalMemory();
+  memory->load((uint8_t*)elements, arg.address, arg.size);
+  for (size_t idx = 0; idx < count; ++idx)
+  {
+    cout << "  " << arg.name << "[" << idx << "] = ";
+    // Widen byte-sized elements so they print as numbers, not characters
+    if (sizeof(T) == 1)
+    {
+      cout << (int)elements[idx];
+    }
+    else
+    {
+      cout << elements[idx];
+    }
+    cout << endl;
+  }
+  cout << endl;
+  delete[] elements;
+// Read the next value from the simulator file into 'result'.
+//
+// Input is consumed through m_lineBuffer, which holds the current line of
+// m_simfile with any '#' comment removed. When the buffer has no further
+// token, the next line is fetched and m_lineNumber is incremented.
+//
+// Throws ifstream::failbit if a token is present but cannot be extracted
+// as a T, and ifstream::eofbit or ifstream::failbit if the file stops
+// being readable before a value is found.
+template<typename T>
+void Simulation::get(T& result)
+  do
+  {
+    // Check if line buffer has content
+    streampos pos = m_lineBuffer.tellg();
+    string token;
+    m_lineBuffer >> token;
+    if (!m_lineBuffer.fail())
+    {
+      // Line has content, rewind line buffer
+      m_lineBuffer.clear();
+      m_lineBuffer.seekg(pos);
+      // Read value from line buffer
+      m_lineBuffer >> result;
+      if (m_lineBuffer.fail())
+      {
+        throw ifstream::failbit;
+      }
+      return;
+    }
+    // Get next line
+    string line;
+    getline(m_simfile, line);
+    m_lineNumber++;
+    // Remove comments
+    size_t comment = line.find_first_of('#');
+    if (comment != string::npos)
+    {
+      line = line.substr(0, comment);
+    }
+    // Update line buffer
+    m_lineBuffer.clear();
+    m_lineBuffer.str(line);
+  }
+  // NOTE(review): the new line is only examined on the next iteration, so a
+  // final line without a trailing newline (getline sets eofbit) appears to
+  // be dropped here -- confirm whether that is intended.
+  while (m_simfile.good());
+  // Couldn't read data from file, throw exception
+  throw m_simfile.eof() ? ifstream::eofbit : ifstream::failbit;
+// Load a simulator file and prepare the kernel launch it describes.
+//
+// The file supplies, in order: the program file name, the kernel name, the
+// three NDRange dimensions, the three work-group dimensions, and one
+// argument description per kernel argument. The program is loaded as LLVM
+// bitcode if it starts with the bytes 0x42 0x43, otherwise it is built
+// from source.
+//
+// Returns true on success. On failure a diagnostic is written to stderr
+// and false is returned; iostate exceptions other than eofbit/failbit are
+// rethrown.
+bool Simulation::load(const char *filename)
+  // Open simulator file
+  m_lineNumber = 0;
+  m_lineBuffer.setstate(ios_base::eofbit);
+  m_simfile.open(filename);
+  if (m_simfile.fail())
+  {
+    cerr << "Unable to open simulator file." << endl;
+    return false;
+  }
+  try
+  {
+    // Read simulation parameters
+    string progFileName;
+    string kernelName;
+    PARSING("program file");
+    get(progFileName);
+    PARSING("kernel");
+    get(kernelName);
+    PARSING("NDRange");
+    get(m_ndrange.x);
+    get(m_ndrange.y);
+    get(m_ndrange.z);
+    PARSING("work-group size");
+    get(m_wgsize.x);
+    get(m_wgsize.y);
+    get(m_wgsize.z);
+    // Reject zero-sized work-groups before they are used as a divisor
+    if (!m_wgsize.x || !m_wgsize.y || !m_wgsize.z)
+    {
+      cerr << "Work group size must be non-zero." << endl;
+      return false;
+    }
+    // Ensure work-group size exactly divides NDRange
+    if (m_ndrange.x % m_wgsize.x ||
+        m_ndrange.y % m_wgsize.y ||
+        m_ndrange.z % m_wgsize.z)
+    {
+      cerr << "Work group size must divide NDRange exactly." << endl;
+      return false;
+    }
+    // Open program file
+    ifstream progFile;
+    progFile.open(progFileName.c_str(), ios_base::in | ios_base::binary);
+    if (!progFile.good())
+    {
+      cerr << "Unable to open " << progFileName << endl;
+      return false;
+    }
+    // Check for LLVM bitcode magic numbers
+    char magic[2] = {0,0};
+    progFile.read(magic, 2);
+    if (magic[0] == 0x42 && magic[1] == 0x43)
+    {
+      // Load bitcode
+      progFile.close();
+      m_program = Program::createFromBitcodeFile(m_context, progFileName);
+      if (!m_program)
+      {
+        cerr << "Failed to load bitcode from " << progFileName << endl;
+        return false;
+      }
+    }
+    else
+    {
+      // The magic number read leaves eof/fail set on files shorter than two
+      // bytes; clear it so that the seeks below can succeed
+      progFile.clear();
+      // Get size of file
+      progFile.seekg(0, ios_base::end);
+      streamoff length = progFile.tellg();
+      progFile.seekg(0, ios_base::beg);
+      if (length < 0 || progFile.fail())
+      {
+        cerr << "Unable to read " << progFileName << endl;
+        return false;
+      }
+      size_t sz = (size_t)length;
+      // Load source (one extra byte for the terminating NUL)
+      char *data = new char[sz + 1];
+      progFile.read(data, sz);
+      progFile.close();
+      data[sz] = '\0';
+      m_program = new Program(m_context, data);
+      delete[] data;
+      // Build program
+      if (!m_program->build(""))
+      {
+        cerr << "Build failure:" << endl << m_program->getBuildLog() << endl;
+        return false;
+      }
+    }
+    // Get kernel
+    m_kernel = m_program->createKernel(kernelName);
+    if (!m_kernel)
+    {
+      cerr << "Failed to create kernel " << kernelName << endl;
+      return false;
+    }
+    // Clear global memory
+    Memory *globalMemory = m_context->getGlobalMemory();
+    globalMemory->clear();
+    // Parse kernel arguments
+    m_dumpArguments.clear();
+    for (unsigned index = 0; index < m_kernel->getNumArguments(); index++)
+    {
+      parseArgument(index);
+    }
+    // Make sure there is no more input
+    string next;
+    m_simfile >> next;
+    if (m_simfile.good() || !m_simfile.eof())
+    {
+      cerr << "Unexpected token '" << next << "' (expected EOF)" << endl;
+      return false;
+    }
+  }
+  catch (const char *err)
+  {
+    // Message thrown by the argument parsing helpers
+    cerr << "Line " << m_lineNumber << ": " << err
+         << " (" << m_parsing << ")" << endl;
+    return false;
+  }
+  catch (ifstream::iostate e)
+  {
+    // Stream state thrown by get()
+    if (e == ifstream::eofbit)
+    {
+      cerr << "Unexpected EOF when parsing " << m_parsing << endl;
+      return false;
+    }
+    else if (e == ifstream::failbit)
+    {
+      cerr << "Line " << m_lineNumber
+           << ": Failed to parse " << m_parsing << endl;
+      return false;
+    }
+    else
+    {
+      throw e;
+    }
+  }
+  return true;
+// Parse the description of kernel argument 'index' from the simulator file
+// and set the resulting value on m_kernel.
+//
+// An argument is written as a header "<...>" of whitespace-separated
+// tokens, optionally followed by the argument data. Recognised tokens are a
+// data type (char, uchar, short, ushort, int, uint, long, ulong, float,
+// double), dump, fill=VALUE, hex, null, range=START:INC:END, ro, wo and
+// size=N.
+//
+// Throws a const char* message on malformed input; the get() calls may
+// also throw an ifstream::iostate value.
+void Simulation::parseArgument(size_t index)
+  // Argument parsing parameters
+  size_t size = -1;
+  cl_mem_flags flags = 0;
+  ArgDataType type = TYPE_NONE;
+  size_t typeSize = 0;
+  bool null = false;
+  bool dump = false;
+  string fill = "";
+  string range = "";
+  string name = m_kernel->getArgumentName(index).str();
+  // Set meaningful parsing status for error messages
+  ostringstream stringstream;
+  stringstream << "argument " << index << ": " << name;
+  string formatted = stringstream.str();
+  PARSING(formatted.c_str());
+  // Get argument info
+  size_t argSize = m_kernel->getArgumentSize(index);
+  unsigned int addrSpace = m_kernel->getArgumentAddressQualifier(index);
+  const llvm::StringRef argType = m_kernel->getArgumentTypeName(index);
+  // Ensure we have an argument header
+  char c;
+  get(c);
+  if (c != '<')
+  {
+    throw "Expected argument header <...>";
+  }
+  // Get header
+  streampos startpos = m_lineBuffer.tellg();
+  string headerStr;
+  getline(m_lineBuffer, headerStr);
+  size_t end = headerStr.find_last_of('>');
+  if (end == string::npos)
+  {
+    throw "Missing '>' at end of argument header";
+  }
+  headerStr = headerStr.substr(0, end);
+  // Move line buffer to end of header
+  m_lineBuffer.clear();
+  m_lineBuffer.seekg((int)startpos + headerStr.size() + 1);
+  // Save format flags
+  ios_base::fmtflags previousFormat = m_lineBuffer.flags();
+  // Parse header
+  istringstream header(headerStr);
+  while (!header.eof())
+  {
+    // Get next header token
+    string token;
+    header >> token;
+    if (header.fail())
+    {
+      break;
+    }
+#define MATCH_TYPE(str, value, sz)                  \
+  else if (token == str)                            \
+  {                                                 \
+    if (type != TYPE_NONE)                          \
+    {                                               \
+      throw "Argument type defined multiple times"; \
+    }                                               \
+    type = value;                                   \
+    typeSize = sz;                                  \
+  }
+    // Parse token
+    if (false);
+    MATCH_TYPE("char", TYPE_CHAR, 1)
+    MATCH_TYPE("uchar", TYPE_UCHAR, 1)
+    MATCH_TYPE("short", TYPE_SHORT, 2)
+    MATCH_TYPE("ushort", TYPE_USHORT, 2)
+    MATCH_TYPE("int", TYPE_INT, 4)
+    MATCH_TYPE("uint", TYPE_UINT, 4)
+    MATCH_TYPE("long", TYPE_LONG, 8)
+    MATCH_TYPE("ulong", TYPE_ULONG, 8)
+    MATCH_TYPE("float", TYPE_FLOAT, 4)
+    MATCH_TYPE("double", TYPE_DOUBLE, 8)
+    else if (token.compare(0, 4, "dump") == 0)
+    {
+      dump = true;
+    }
+    else if (token.compare(0, 4, "fill") == 0)
+    {
+      if (token.size() < 6 || token[4] != '=')
+      {
+        throw "Expected =VALUE after 'fill'";
+      }
+      fill = token.substr(5);
+    }
+    else if (token == "hex")
+    {
+      m_lineBuffer.setf(ios_base::hex);
+      m_lineBuffer.unsetf(ios_base::dec | ios_base::oct);
+    }
+    else if (token == "null")
+    {
+      if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
+          addrSpace != CL_KERNEL_ARG_ADDRESS_CONSTANT)
+      {
+        throw "'null' only valid for buffer arguments";
+      }
+      null = true;
+    }
+    else if (token.compare(0, 5, "range") == 0)
+    {
+      if (token.size() < 7 || token[5] != '=')
+      {
+        throw "Expected =START:INC:END after 'range'";
+      }
+      range = token.substr(6);
+    }
+    else if (token == "ro")
+    {
+      if (flags & CL_MEM_WRITE_ONLY)
+      {
+        throw "'ro' and 'wo' are mutually exclusive";
+      }
+      if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL)
+      {
+        throw "'ro' only valid for global memory buffers";
+      }
+      flags |= CL_MEM_READ_ONLY;
+    }
+    else if (token.compare(0, 4, "size") == 0)
+    {
+      istringstream value(token.substr(4));
+      char equals = 0;
+      value >> equals;
+      if (equals != '=')
+      {
+        throw "Expected = after 'size'";
+      }
+      value >> dec >> size;
+      if (value.fail() || !value.eof())
+      {
+        throw "Invalid value for 'size'";
+      }
+    }
+    else if (token == "wo")
+    {
+      if (flags & CL_MEM_READ_ONLY)
+      {
+        throw "'ro' and 'wo' are mutually exclusive";
+      }
+      if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL)
+      {
+        throw "'wo' only valid for global memory buffers";
+      }
+      flags |= CL_MEM_WRITE_ONLY;
+    }
+    else
+    {
+      // The message is thrown as a const char*, so it must outlive this
+      // scope: a pointer into a local string would dangle once the stack
+      // has been unwound to the handler. Static storage keeps it valid.
+      static string err;
+      err = "Unrecognised header token '";
+      err += token;
+      err += "'";
+      throw err.c_str();
+    }
+  }
+  // Ensure size given
+  if (null)
+  {
+    if (size != -1 || !fill.empty() || !range.empty())
+    {
+      throw "'null' not valid with other argument descriptors";
+    }
+    size = 0;
+  }
+  else if (size == -1)
+  {
+    throw "size required";
+  }
+  if (type == TYPE_NONE)
+  {
+#define MATCH_TYPE_PREFIX(str, value, sz)       \
+  else if (argType.startswith(str))             \
+  {                                             \
+    type = value;                               \
+    typeSize = sz;                              \
+  }
+    // Set default type using kernel introspection
+    if (false);
+    else
+    {
+      throw "Invalid default kernel argument type";
+    }
+  }
+  // Ensure argument data size is a multiple of format type size
+  if (size % typeSize)
+  {
+    throw "Initialiser type must exactly divide argument size";
+  }
+  // Ensure 'dump' only used with non-null buffers
+  if (dump)
+  {
+    if (addrSpace != CL_KERNEL_ARG_ADDRESS_GLOBAL &&
+    {
+      throw "'dump' only valid for memory objects";
+    }
+    if (null)
+    {
+      throw "'dump' not valid with 'null' specifier";
+    }
+  }
+  // Generate argument data
+  TypedValue value;
+  value.size = argSize;
+  value.num = 1;
+  if (addrSpace == CL_KERNEL_ARG_ADDRESS_LOCAL)
+  {
+    // Local memory arguments carry a size but no data
+    value.size = size;
+    value.data = NULL;
+  }
+  else if (null)
+  {
+    // Null buffers are passed as an all-zero argument value
+    value.data = new unsigned char[value.size];
+    memset(value.data, 0, value.size);
+  }
+  else
+  {
+    // Parse argument data
+    unsigned char *data = new unsigned char[size];
+    if (!fill.empty())
+    {
+      istringstream fillStream(fill);
+      fillStream.copyfmt(m_lineBuffer);
+  #define FILL_TYPE(type, T)                \
+    case type:                              \
+      parseFill<T>(data, size, fillStream); \
+      break;
+      switch (type)
+      {
+        FILL_TYPE(TYPE_CHAR, int8_t);
+        FILL_TYPE(TYPE_UCHAR, uint8_t);
+        FILL_TYPE(TYPE_SHORT, int16_t);
+        FILL_TYPE(TYPE_USHORT, uint16_t);
+        FILL_TYPE(TYPE_INT, int32_t);
+        FILL_TYPE(TYPE_UINT, uint32_t);
+        FILL_TYPE(TYPE_LONG, int64_t);
+        FILL_TYPE(TYPE_ULONG, uint64_t);
+        FILL_TYPE(TYPE_FLOAT, float);
+        FILL_TYPE(TYPE_DOUBLE, double);
+        default:
+          throw "Invalid argument data type";
+      }
+    }
+    else if (!range.empty())
+    {
+      istringstream rangeStream(range);
+      rangeStream.copyfmt(m_lineBuffer);
+  #define RANGE_TYPE(type, T)                 \
+    case type:                                \
+      parseRange<T>(data, size, rangeStream); \
+      break;
+      switch (type)
+      {
+        RANGE_TYPE(TYPE_CHAR, int8_t);
+        RANGE_TYPE(TYPE_UCHAR, uint8_t);
+        RANGE_TYPE(TYPE_SHORT, int16_t);
+        RANGE_TYPE(TYPE_USHORT, uint16_t);
+        RANGE_TYPE(TYPE_INT, int32_t);
+        RANGE_TYPE(TYPE_UINT, uint32_t);
+        RANGE_TYPE(TYPE_LONG, int64_t);
+        RANGE_TYPE(TYPE_ULONG, uint64_t);
+        RANGE_TYPE(TYPE_FLOAT, float);
+        RANGE_TYPE(TYPE_DOUBLE, double);
+        default:
+          throw "Invalid argument data type";
+      }
+    }
+    else if (addrSpace != CL_KERNEL_ARG_ADDRESS_LOCAL)
+    {
+  #define PARSE_TYPE(type, T)           \
+    case type:                          \
+      parseArgumentData<T>(data, size); \
+      break;
+      switch (type)
+      {
+        PARSE_TYPE(TYPE_CHAR, int8_t);
+        PARSE_TYPE(TYPE_UCHAR, uint8_t);
+        PARSE_TYPE(TYPE_SHORT, int16_t);
+        PARSE_TYPE(TYPE_USHORT, uint16_t);
+        PARSE_TYPE(TYPE_INT, int32_t);
+        PARSE_TYPE(TYPE_UINT, uint32_t);
+        PARSE_TYPE(TYPE_LONG, int64_t);
+        PARSE_TYPE(TYPE_ULONG, uint64_t);
+        PARSE_TYPE(TYPE_FLOAT, float);
+        PARSE_TYPE(TYPE_DOUBLE, double);
+        default:
+          throw "Invalid argument data type";
+      }
+    }
+    if (addrSpace == CL_KERNEL_ARG_ADDRESS_PRIVATE)
+    {
+      // Private arguments are passed by value; 'data' is released below
+      // through value.data
+      value.data = data;
+    }
+    else
+    {
+      // Allocate buffer and store content
+      Memory *globalMemory = m_context->getGlobalMemory();
+      size_t address = globalMemory->allocateBuffer(size, flags);
+      globalMemory->store((unsigned char*)&data[0], address, size);
+      value.data = new unsigned char[value.size];
+      value.setPointer(address);
+      delete[] data;
+      if (dump)
+      {
+        // Remember the buffer so that run() can print it after execution
+        DumpArg dump =
+        {
+          address,
+          size,
+          type,
+          name,
+        };
+        m_dumpArguments.push_back(dump);
+      }
+    }
+  }
+  // Set argument value
+  m_kernel->setArgument(index, value);
+  if (value.data)
+  {
+    delete[] value.data;
+  }
+  // Reset parsing format
+  m_lineBuffer.flags(previousFormat);
+// Read size / sizeof(T) values of type T from the simulator file and copy
+// them into 'result', which must have room for that many elements.
+// Single-byte types are read as integers and range-checked by INT_TO_CHAR.
+// Propagates whatever get() and INT_TO_CHAR throw on bad input; 'result'
+// is only written once every value has been read.
+template<typename T>
+void Simulation::parseArgumentData(unsigned char *result, size_t size)
+  const size_t count = size / sizeof(T);
+  vector<T> data;
+  data.reserve(count);
+  for (size_t i = 0; i < count; i++)
+  {
+    T value;
+    if (sizeof(T) == 1)
+    {
+      int intval;
+      get(intval);
+      INT_TO_CHAR(intval, value);
+    }
+    else
+    {
+      get(value);
+    }
+    data.push_back(value);
+  }
+  // Indexing an empty vector is undefined, so skip the copy when no values
+  // were requested (size smaller than one element)
+  if (!data.empty())
+  {
+    memcpy(result, &data[0], data.size() * sizeof(T));
+  }
+// Fill 'result' with size / sizeof(T) copies of the single value parsed
+// from 'fill'. Throws "Invalid fill value" if the stream does not contain
+// exactly one well-formed value; readValue may also throw for out-of-range
+// single-byte values.
+template<typename T>
+void Simulation::parseFill(unsigned char *result, size_t size,
+                           istringstream& fill)
+  T value = readValue<T>(fill);
+  // Validate before writing, so that a value which failed to parse is
+  // never copied into the output buffer
+  if (fill.fail() || !fill.eof())
+  {
+    throw "Invalid fill value";
+  }
+  const size_t count = size / sizeof(T);
+  for (size_t i = 0; i < count; i++)
+  {
+    ((T*)result)[i] = value;
+  }
+// Fill 'result' with an arithmetic sequence described by 'range', written
+// as START:INC:END. The sequence START, START+INC, ... must contain a whole
+// number of elements whose total byte size equals 'size'.
+// Throws a const char* message if the format is malformed or the range
+// does not produce exactly 'size' bytes.
+template<typename T>
+void Simulation::parseRange(unsigned char *result, size_t size,
+                            istringstream& range)
+  // Parse range format
+  T values[3];
+  for (int i = 0; i < 3; i++)
+  {
+    values[i] = readValue<T>(range);
+    if (i < 2)
+    {
+      char colon = 0;
+      range >> colon;
+      if (range.fail() || colon != ':')
+      {
+        throw "Invalid range format";
+      }
+    }
+  }
+  if (range.fail() || !range.eof())
+  {
+    throw "Invalid range format";
+  }
+  // Ensure range is valid
+  // Element count is (END - START + INC) / INC, evaluated as a double
+  double num = (values[2] - values[0] + values[1]) / (double)values[1];
+  if (ceil(num) != num || num*sizeof(T) != size)
+  {
+    throw "Range doesn't produce correct buffer size";
+  }
+  // Produce range values
+  T value = values[0];
+  for (size_t i = 0; i < num; i++)
+  {
+    ((T*)result)[i] = value;
+    value += values[1];
+  }
+// Run the loaded kernel over m_ndrange with work-groups of m_wgsize and a
+// zero global offset, then print every argument that was marked for
+// dumping. If dumpGlobalMemory is true, global memory is dumped as well.
+// Requires load() to have succeeded (asserts that the kernel and program
+// exist and that all arguments are set).
+void Simulation::run(bool dumpGlobalMemory)
+  assert(m_kernel && m_program);
+  assert(m_kernel->allArgumentsSet());
+  Size3 offset(0, 0, 0);
+  KernelInvocation::run(m_context, m_kernel, 3, offset, m_ndrange, m_wgsize);
+  // Dump individual arguments
+  cout << dec;
+  list<DumpArg>::iterator itr;
+  for (itr = m_dumpArguments.begin(); itr != m_dumpArguments.end(); itr++)
+  {
+    cout << endl
+         << "Argument '" << itr->name << "': "
+         << itr->size << " bytes" << endl;
+// Dispatch to the dumpArgument instantiation matching the element type
+#define DUMP_TYPE(type, T) \
+  case type:               \
+    dumpArgument<T>(*itr); \
+    break;
+    switch (itr->type)
+    {
+      DUMP_TYPE(TYPE_CHAR, char);
+      DUMP_TYPE(TYPE_UCHAR, uint8_t);
+      DUMP_TYPE(TYPE_SHORT, int16_t);
+      DUMP_TYPE(TYPE_USHORT, uint16_t);
+      DUMP_TYPE(TYPE_INT, int32_t);
+      DUMP_TYPE(TYPE_UINT, uint32_t);
+      DUMP_TYPE(TYPE_LONG, int64_t);
+      DUMP_TYPE(TYPE_ULONG, uint64_t);
+      DUMP_TYPE(TYPE_FLOAT, float);
+      DUMP_TYPE(TYPE_DOUBLE, double);
+      default:
+        throw "Invalid argument data type";
+    }
+  }
+  // Dump global memory if required
+  if (dumpGlobalMemory)
+  {
+    cout << endl << "Global Memory:" << endl;
+    m_context->getGlobalMemory()->dump();
+  }
+// Extract one value of type T from 'stream' and return it.
+// Single-byte types are extracted as an int and then narrowed, so that
+// they are parsed as numbers; INT_TO_CHAR throws if the number does not
+// fit. The caller is responsible for checking the stream state.
+template<typename T>
+T readValue(istream& stream)
+  T parsed;
+  if (sizeof(T) != 1)
+  {
+    stream >> parsed;
+    return parsed;
+  }
+  // Byte-sized type: go through int to avoid character extraction
+  int wide;
+  stream >> wide;
+  INT_TO_CHAR(wide, parsed);
+  return parsed;
diff --git a/src/kernel/Simulation.h b/src/kernel/Simulation.h
new file mode 100644
index 0000000..19b6e9b
--- /dev/null
+++ b/src/kernel/Simulation.h
@@ -0,0 +1,82 @@
+// Simulation.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include <fstream>
+#include <list>
+#include <sstream>
+#include <string>
+namespace oclgrind
+  class Context;
+  class Kernel;
+  class Program;
+// Loads a simulator file describing a single kernel launch (program file,
+// kernel name, NDRange, work-group size and argument data) and runs it.
+class Simulation
+  // Element type used when parsing and dumping argument data
+  enum ArgDataType
+  {
+    TYPE_INT,
+  };
+  public:
+    Simulation();
+    virtual ~Simulation();
+    // Parse the simulator file; returns false after printing a diagnostic
+    // if it cannot be loaded
+    bool load(const char *filename);
+    // Execute the loaded kernel, optionally dumping global memory afterwards
+    void run(bool dumpGlobalMemory=false);
+  private:
+    oclgrind::Context *m_context;
+    oclgrind::Kernel *m_kernel;
+    oclgrind::Program *m_program;
+    oclgrind::Size3 m_ndrange;  // Global work size
+    oclgrind::Size3 m_wgsize;   // Work-group size
+    std::ifstream m_simfile;    // Simulator file being parsed
+    std::string m_parsing;      // What is being parsed, for error messages
+    size_t m_lineNumber;        // Number of lines read from m_simfile
+    std::istringstream m_lineBuffer;  // Current line, comments removed
+    // A buffer argument to be printed after the kernel has run
+    typedef struct
+    {
+      size_t address;
+      size_t size;
+      ArgDataType type;
+      std::string name;
+    } DumpArg;
+    std::list<DumpArg> m_dumpArguments;
+    // Print the contents of a dumped argument as an array of T
+    template<typename T>
+    void dumpArgument(DumpArg& arg);
+    // Read the next value from the simulator file
+    template<typename T>
+    void get(T& result);
+    // Parse the header and data of one kernel argument and set it
+    void parseArgument(size_t index);
+    // Read explicit argument data values from the simulator file
+    template<typename T>
+    void parseArgumentData(unsigned char *result, size_t size);
+    // Fill the argument data with a single repeated value
+    template<typename T>
+    void parseFill(unsigned char *result, size_t size,
+                   std::istringstream& fill);
+    // Fill the argument data from a START:INC:END range
+    template<typename T>
+    void parseRange(unsigned char *result, size_t size,
+                    std::istringstream& range);
diff --git a/src/kernel/oclgrind-kernel.cpp b/src/kernel/oclgrind-kernel.cpp
new file mode 100644
index 0000000..4ac1d31
--- /dev/null
+++ b/src/kernel/oclgrind-kernel.cpp
@@ -0,0 +1,233 @@
+// oclgrind-kernel.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "config.h"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include "kernel/Simulation.h"
+using namespace oclgrind;
+using namespace std;
+static bool outputGlobalMemory = false;
+static const char *simfile = NULL;
+static bool parseArguments(int argc, char *argv[]);
+static void printUsage();
+static void setEnvironment(const char *name, const char *value);
+// Entry point of oclgrind-kernel: parse the command line, load the
+// simulator file and run the simulation. Exits with status 1 if either the
+// command line or the simulator file is rejected.
+int main(int argc, char *argv[])
+  // Handle command-line options
+  const bool argsOk = parseArguments(argc, argv);
+  if (!argsOk)
+  {
+    return 1;
+  }
+  // Load the simulation described by the simulator file
+  Simulation simulation;
+  const bool loaded = simulation.load(simfile);
+  if (!loaded)
+  {
+    return 1;
+  }
+  // Execute the kernel
+  simulation.run(outputGlobalMemory);
+// Parse the command line.
+//
+// Most options are forwarded by setting an OCLGRIND_* environment variable.
+// -g/--global-mem sets outputGlobalMemory, and the single positional
+// argument is stored in simfile. --help and --version print their output
+// and exit the process with status 0.
+//
+// Returns false, after printing a message, if an option is unrecognised,
+// an option value is missing, more than one positional argument is given,
+// or no simulator file was named.
+static bool parseArguments(int argc, char *argv[])
+  for (int i = 1; i < argc; i++)
+  {
+    if (!strcmp(argv[i], "--build-options"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --build-options" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_BUILD_OPTIONS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--data-races"))
+    {
+      setEnvironment("OCLGRIND_DATA_RACES", "1");
+    }
+    else if (!strcmp(argv[i], "--disable-pch"))
+    {
+      setEnvironment("OCLGRIND_DISABLE_PCH", "1");
+    }
+    else if (!strcmp(argv[i], "--dump-spir"))
+    {
+      setEnvironment("OCLGRIND_DUMP_SPIR", "1");
+    }
+    else if (!strcmp(argv[i], "-g") || !strcmp(argv[i], "--global-mem"))
+    {
+      outputGlobalMemory = true;
+    }
+    else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
+    {
+      printUsage();
+      exit(0);
+    }
+    else if (!strcmp(argv[i], "--inst-counts"))
+    {
+      setEnvironment("OCLGRIND_INST_COUNTS", "1");
+    }
+    else if (!strcmp(argv[i], "-i") || !strcmp(argv[i], "--interactive"))
+    {
+      setEnvironment("OCLGRIND_INTERACTIVE", "1");
+    }
+    else if (!strcmp(argv[i], "--log"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --log" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_LOG", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--max-errors"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --max-errors" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_MAX_ERRORS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--num-threads"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --num-threads" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_NUM_THREADS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--pch-dir"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --pch-dir" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_PCH_DIR", argv[i]);
+    }
+    else if (!strcmp(argv[i], "--plugins"))
+    {
+      if (++i >= argc)
+      {
+        cerr << "Missing argument to --plugins" << endl;
+        return false;
+      }
+      setEnvironment("OCLGRIND_PLUGINS", argv[i]);
+    }
+    else if (!strcmp(argv[i], "-q") || !strcmp(argv[i], "--quick"))
+    {
+      setEnvironment("OCLGRIND_QUICK", "1");
+    }
+    else if (!strcmp(argv[i], "--uniform-writes"))
+    {
+      setEnvironment("OCLGRIND_UNIFORM_WRITES", "1");
+    }
+    else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version"))
+    {
+      cout << endl;
+      cout << "Oclgrind " PACKAGE_VERSION << endl;
+      cout << endl;
+      cout << "Copyright (c) 2013-2015" << endl;
+      cout << "James Price and Simon McIntosh-Smith, University of Bristol"
+           << endl;
+      cout << "https://github.com/jrprice/Oclgrind" << endl;
+      cout << endl;
+      exit(0);
+    }
+    else if (argv[i][0] == '-')
+    {
+      // Anything else starting with '-' is an unknown option
+      cerr << "Unrecognised option '" << argv[i] << "'" << endl;
+      return false;
+    }
+    else
+    {
+      // Positional argument: only one simulator file is accepted
+      if (simfile == NULL)
+      {
+        simfile = argv[i];
+      }
+      else
+      {
+        cerr << "Unexpected positional argument '" << argv[i] << "'" << endl;
+        return false;
+      }
+    }
+  }
+  if (simfile == NULL)
+  {
+    printUsage();
+    return false;
+  }
+  return true;
+// Print the command-line usage summary for oclgrind-kernel to stdout.
+static void printUsage()
+  cout
+    << "Usage: oclgrind-kernel [OPTIONS] simfile" << endl
+    << "       oclgrind-kernel [--help | --version]" << endl
+    << endl
+    << "Options:" << endl
+    << "     --build-options  OPTIONS  "
+             "Additional options to pass to the OpenCL compiler" << endl
+    << "     --data-races              "
+             "Enable data-race detection" << endl
+    << "     --disable-pch             "
+             "Don't use precompiled headers" << endl
+    << "     --dump-spir               "
+             "Dump SPIR to /tmp/oclgrind_*.{ll,bc}" << endl
+    << "  -g --global-mem              "
+             "Output global memory at exit" << endl
+    << "  -h --help                    "
+             "Display usage information" << endl
+    << "     --inst-counts             "
+             "Output histograms of instructions executed" << endl
+    << "  -i --interactive             "
+             "Enable interactive mode" << endl
+    << "     --log            LOGFILE  "
+             "Redirect log/error messages to a file" << endl
+    << "     --max-errors     NUM      "
+             "Limit the number of error/warning messages" << endl
+    << "     --num-threads    NUM      "
+             "Set the number of worker threads to use" << endl
+    << "     --pch-dir        DIR      "
+             "Override directory containing precompiled headers" << endl
+    << "     --plugins        PLUGINS  "
+             "Load colon separated list of plugin libraries" << endl
+    << "  -q --quick                   "
+             "Only run first and last work-group" << endl
+    << "     --uniform-writes          "
+             "Don't suppress uniform write-write data-races" << endl
+    << "  -v --version                 "
+             "Display version information" << endl
+    << endl
+    << "For more information, please visit the Oclgrind wiki page:" << endl
+    << "-> https://github.com/jrprice/Oclgrind/wiki" << endl
+    << endl;
+// Set the environment variable 'name' to 'value' for this process.
+// NOTE(review): the #else/#endif separating the two calls below are not
+// visible in this excerpt; presumably _putenv_s is the non-MinGW Windows
+// path and setenv (overwrite flag 1) is used everywhere else -- confirm.
+static void setEnvironment(const char *name, const char *value)
+#if defined(_WIN32) && !defined(__MINGW32__)
+  _putenv_s(name, value);
+  setenv(name, value, 1);
diff --git a/src/plugins/InstructionCounter.cpp b/src/plugins/InstructionCounter.cpp
new file mode 100644
index 0000000..ce680f4
--- /dev/null
+++ b/src/plugins/InstructionCounter.cpp
@@ -0,0 +1,184 @@
+// InstructionCounter.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include <sstream>
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "InstructionCounter.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+using namespace oclgrind;
+using namespace std;
+#define COUNTED_LOAD_BASE  (llvm::Instruction::OtherOpsEnd + 4)
+// Ordering predicate for (name, count) pairs: larger counts sort first.
+static bool compareNamedCount(pair<string,size_t> a, pair<string,size_t> b)
+  return a.second > b.second;
+// Return a printable name for a counted opcode.
+// Opcodes at or above COUNTED_CALL_BASE name the called function; opcodes
+// at or above COUNTED_LOAD_BASE name a load or store together with its
+// address space and total byte count; anything else is the LLVM opcode
+// name.
+// NOTE(review): COUNTED_STORE_BASE and COUNTED_CALL_BASE are not defined in
+// the visible excerpt -- the ordering LOAD < STORE < CALL is assumed from
+// the comparisons below.
+string InstructionCounter::getOpcodeName(unsigned opcode) const
+  if (opcode >= COUNTED_CALL_BASE)
+  {
+    // Get function name
+    unsigned index = opcode - COUNTED_CALL_BASE;
+    assert(index < m_functions.size());
+    return "call " + m_functions[index]->getName().str() + "()";
+  }
+  else if (opcode >= COUNTED_LOAD_BASE)
+  {
+    // Create stream using default locale
+    ostringstream name;
+    locale defaultLocale("");
+    name.imbue(defaultLocale);
+    // Get number of bytes
+    // (read before 'opcode' is rebased to an address space index below)
+    size_t bytes = m_memopBytes[opcode-COUNTED_LOAD_BASE];
+    // Get name of operation
+    if (opcode >= COUNTED_STORE_BASE)
+    {
+      opcode -= COUNTED_STORE_BASE;
+      name << "store";
+    }
+    else
+    {
+      opcode -= COUNTED_LOAD_BASE;
+      name << "load";
+    }
+    // Add address space to name
+    name << " " << getAddressSpaceName(opcode);
+    // Add number of bytes to name
+    name << " (" << bytes << " bytes)";
+    return name.str();
+  }
+  return llvm::Instruction::getOpcodeName(opcode);
+// Count one executed instruction.
+// Loads and stores are counted per address space, and their byte totals
+// are accumulated in m_memopBytes. Calls with a known callee are counted
+// per distinct function. Every other instruction is counted under its LLVM
+// opcode.
+void InstructionCounter::instructionExecuted(
+  const WorkItem *workItem, const llvm::Instruction *instruction,
+  const TypedValue& result)
+  unsigned opcode = instruction->getOpcode();
+  // Check for loads and stores
+  if (opcode == llvm::Instruction::Load || opcode == llvm::Instruction::Store)
+  {
+    // Track operations in separate address spaces
+    bool load = (opcode == llvm::Instruction::Load);
+    // The pointer is operand 0 of a load and operand 1 of a store
+    const llvm::Type *type = instruction->getOperand(load?0:1)->getType();
+    unsigned addrSpace = type->getPointerAddressSpace();
+    opcode = (load ? COUNTED_LOAD_BASE : COUNTED_STORE_BASE) + addrSpace;
+    // Count total number of bytes loaded/stored
+    unsigned bytes = getTypeSize(type->getPointerElementType());
+    m_memopBytes[opcode-COUNTED_LOAD_BASE] += bytes;
+  }
+  else if (opcode == llvm::Instruction::Call)
+  {
+    // Track distinct function calls
+    const llvm::CallInst *callInst = (const llvm::CallInst*)instruction;
+    const llvm::Function *function = callInst->getCalledFunction();
+    // Calls without a known callee keep the plain Call opcode
+    if (function)
+    {
+      vector<const llvm::Function*>::iterator itr =
+        find(m_functions.begin(), m_functions.end(), function);
+      if (itr == m_functions.end())
+      {
+        opcode = COUNTED_CALL_BASE + m_functions.size();
+        m_functions.push_back(function);
+      }
+      else
+      {
+        opcode = COUNTED_CALL_BASE + (itr - m_functions.begin());
+      }
+    }
+  }
+  // Grow the count table on demand (new call opcodes extend it)
+  if (opcode >= m_instructionCounts.size())
+  {
+    m_instructionCounts.resize(opcode+1);
+  }
+  m_instructionCounts[opcode]++;
+// Report that this plugin is not thread-safe.
+bool InstructionCounter::isThreadSafe() const
+  return false;
+// Reset all counters at the start of a kernel invocation.
+void InstructionCounter::kernelBegin(const KernelInvocation *kernelInvocation)
+  m_instructionCounts.clear();
+  m_instructionCounts.resize(COUNTED_CALL_BASE);
+  // One byte total per counted load/store opcode
+  m_memopBytes.clear();
+  m_memopBytes.resize(16);
+  m_functions.clear();
+// Print the instruction counts gathered for the finished kernel to stdout,
+// largest count first. Opcodes that never executed and calls whose name
+// starts with "llvm.dbg." are omitted. The user's default locale is used
+// for the output and the previous locale of cout is restored afterwards.
+void InstructionCounter::kernelEnd(const KernelInvocation *kernelInvocation)
+  // Load default locale
+  locale previousLocale = cout.getloc();
+  locale defaultLocale("");
+  cout.imbue(defaultLocale);
+  cout << "Instructions executed for kernel '"
+       << kernelInvocation->getKernel()->getName() << "':";
+  cout << endl;
+  // Generate list of named instructions and their counts
+  vector< pair<string,size_t> > namedCounts;
+  for (unsigned i = 0; i < m_instructionCounts.size(); i++)
+  {
+    if (m_instructionCounts[i] == 0)
+    {
+      continue;
+    }
+    string name = getOpcodeName(i);
+    if (name.compare(0, 14, "call llvm.dbg.") == 0)
+    {
+      continue;
+    }
+    namedCounts.push_back(make_pair(name, m_instructionCounts[i]));
+  }
+  // Sort named counts
+  sort(namedCounts.begin(), namedCounts.end(), compareNamedCount);
+  // Output sorted instruction counts
+  for (unsigned i = 0; i < namedCounts.size(); i++)
+  {
+    cout << setw(16) << dec << namedCounts[i].second << " - "
+         << namedCounts[i].first << endl;
+  }
+  cout << endl;
+  // Restore locale
+  cout.imbue(previousLocale);
diff --git a/src/plugins/InstructionCounter.h b/src/plugins/InstructionCounter.h
new file mode 100644
index 0000000..f12c33a
--- /dev/null
+++ b/src/plugins/InstructionCounter.h
@@ -0,0 +1,38 @@
+// InstructionCounter.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/Plugin.h"
+namespace llvm
+  class Function;
+namespace oclgrind
+  // Plugin that counts the instructions executed by each kernel invocation
+  // and prints the totals when the kernel ends.
+  class InstructionCounter : public Plugin
+  {
+  public:
+    InstructionCounter(const Context *context) : Plugin(context){};
+    // Plugin callbacks overridden by this plugin
+    virtual void instructionExecuted(const WorkItem *workItem,
+                                     const llvm::Instruction *instruction,
+                                     const TypedValue& result) override;
+    virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+    virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+    virtual bool isThreadSafe() const override;
+  private:
+    // Execution count per opcode; grown on demand in instructionExecuted()
+    std::vector<size_t> m_instructionCounts;
+    // Sized to 16 entries in kernelBegin()
+    // NOTE(review): presumably bytes moved per memory-operation kind - confirm
+    std::vector<size_t> m_memopBytes;
+    // Called functions that have been given their own counter; a function at
+    // index i is counted under opcode COUNTED_CALL_BASE + i
+    std::vector<const llvm::Function*> m_functions;
+    // Map an opcode (as used to index m_instructionCounts) to a printable name
+    std::string getOpcodeName(unsigned opcode) const;
+  };
diff --git a/src/plugins/InteractiveDebugger.cpp b/src/plugins/InteractiveDebugger.cpp
new file mode 100644
index 0000000..a088338
--- /dev/null
+++ b/src/plugins/InteractiveDebugger.cpp
@@ -0,0 +1,1024 @@
+// InteractiveDebugger.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include <csignal>
+#include <iterator>
+#include <sstream>
+#if !defined(_WIN32) || defined(__MINGW32__)
+#include <signal.h>
+#include <readline/readline.h>
+#include <readline/history.h>
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "InteractiveDebugger.h"
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "core/WorkGroup.h"
+#include "core/WorkItem.h"
+using namespace oclgrind;
+using namespace std;
+#define LIST_LENGTH 10
+// Set asynchronously by handleSignal() when SIGINT arrives and polled by
+// shouldShowPrompt(). It is declared volatile sig_atomic_t because that is
+// the only non-atomic object type a signal handler may portably write to.
+static volatile sig_atomic_t sigintBreak = 0;
+#if !defined(_WIN32) || defined(__MINGW32__)
+// SIGINT disposition that was active before cont() installed handleSignal()
+static struct sigaction m_oldSignalHandler;
+// SIGINT handler: request a break back to the debugger prompt.
+void handleSignal(int s)
+{
+  if (s == SIGINT)
+    sigintBreak = 1;
+}
+#endif
+// Create the debugger in its running state, with no kernel attached, and
+// register every interactive command under its long and short names.
+InteractiveDebugger::InteractiveDebugger(const Context *context)
+  : Plugin(context),
+    m_running(true),
+    m_forceBreak(false),
+    m_nextBreakpoint(1),
+    m_program(NULL),
+    m_kernelInvocation(NULL)
+{
+  // Bind one handler to both spellings of a command
+  auto registerCommand = [this](const char *name, const char *shortName,
+                                Command handler)
+  {
+    m_commands[name]      = handler;
+    m_commands[shortName] = handler;
+  };
+  registerCommand("backtrace", "bt", &InteractiveDebugger::backtrace);
+  registerCommand("break",     "b",  &InteractiveDebugger::brk);
+  registerCommand("continue",  "c",  &InteractiveDebugger::cont);
+  registerCommand("delete",    "d",  &InteractiveDebugger::del);
+  registerCommand("gmem",      "gm", &InteractiveDebugger::mem);
+  registerCommand("help",      "h",  &InteractiveDebugger::help);
+  registerCommand("info",      "i",  &InteractiveDebugger::info);
+  registerCommand("list",      "l",  &InteractiveDebugger::list);
+  registerCommand("lmem",      "lm", &InteractiveDebugger::mem);
+  registerCommand("next",      "n",  &InteractiveDebugger::next);
+  registerCommand("pmem",      "pm", &InteractiveDebugger::mem);
+  registerCommand("print",     "p",  &InteractiveDebugger::print);
+  registerCommand("quit",      "q",  &InteractiveDebugger::quit);
+  registerCommand("step",      "s",  &InteractiveDebugger::step);
+  registerCommand("workitem",  "wi", &InteractiveDebugger::workitem);
+}
+// Called after each instruction executes. If shouldShowPrompt() says the
+// debugger should stop here, print the current location and run the
+// interactive command loop until a command asks for execution to resume.
+// NOTE(review): in this listing the conditional-compilation directives are
+// incomplete (the #if matching the "#else" below and several #endif lines are
+// not visible), so the readline and non-readline prompt paths appear
+// interleaved.
+void InteractiveDebugger::instructionExecuted(
+  const WorkItem *workItem, const llvm::Instruction *instruction,
+  const TypedValue& result)
+  if (!shouldShowPrompt(workItem))
+    return;
+#if !defined(_WIN32) || defined(__MINGW32__)
+  // Restore old signal handler
+  sigaction(SIGINT, &m_oldSignalHandler, NULL);
+  // We are stopping now, so clear any pending break requests
+  m_forceBreak = false;
+  sigintBreak  = false;
+  // Print function if changed
+  if (m_previousDepth != workItem->getCallStack().size() &&
+      workItem->getState() != WorkItem::FINISHED)
+  {
+    cout << "In function ";
+    printFunction(workItem->getCurrentInstruction());
+  }
+  printCurrentLine();
+  // Reset the per-stop state consumed by list/continue/next
+  m_listPosition = 0;
+  m_continue     = false;
+  m_next         = false;
+  while (true)
+  {
+    // Prompt for command
+    bool eof = false;
+    string cmd;
+    // readline() returns a malloc'd line, or NULL on end of input
+    char *line = readline("(oclgrind) ");
+    if (line)
+    {
+      cmd = line;
+      free(line);
+    }
+    else
+    {
+      eof = true;
+    }
+  #else
+    cout << "(oclgrind) " << flush;
+    getline(cin, cmd);
+    eof = cin.eof();
+  #endif
+    // Quit on EOF
+    if (eof)
+    {
+      cout << "(quit)" << endl;
+      quit(vector<string>());
+      return;
+    }
+    // Split command into tokens
+    vector<string> tokens;
+    istringstream iss(cmd);
+    copy(istream_iterator<string>(iss),
+         istream_iterator<string>(),
+         back_inserter< vector<string> >(tokens));
+    // Skip empty lines
+    if (tokens.size() == 0)
+    {
+      continue;
+    }
+    add_history(cmd.c_str());
+  #endif
+    // Find command in map and execute
+    map<string,Command>::iterator itr = m_commands.find(tokens[0]);
+    if (itr != m_commands.end())
+    {
+      // A command handler returns true when execution should resume
+      if ((this->*itr->second)(tokens))
+        break;
+    }
+    else
+    {
+      cout << "Unrecognized command '" << tokens[0] << "'" << endl;
+    }
+  }
+// Tell the plugin host whether this plugin's callbacks may run concurrently.
+// The interactive debugger reports that they may not.
+bool InteractiveDebugger::isThreadSafe() const
+{
+  return false;
+}
+// Attach to a kernel that is about to run and reset all stepping state.
+void InteractiveDebugger::kernelBegin(const KernelInvocation *kernelInvocation)
+{
+  // Remember which invocation and program we are debugging
+  m_kernelInvocation = kernelInvocation;
+  m_program          = kernelInvocation->getKernel()->getProgram();
+  // No stepping mode is active yet
+  m_continue = false;
+  m_next     = false;
+  // Forget every position recorded during the previous kernel
+  m_lastBreakLine = 0;
+  m_listPosition  = 0;
+  m_previousDepth = 0;
+  m_previousLine  = 0;
+}
+// Detach from the finished kernel and reinstate the saved SIGINT disposition.
+void InteractiveDebugger::kernelEnd(const KernelInvocation *kernelInvocation)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+  // Put back whatever handler was saved in m_oldSignalHandler
+  sigaction(SIGINT, &m_oldSignalHandler, NULL);
+#endif
+  m_kernelInvocation = NULL;
+}
+void InteractiveDebugger::log(MessageType type, const char *message)
+  if (type == ERROR)
+    m_forceBreak = true;
+//// Utility Functions ////
+// Source line of the current work-item's next instruction, or 0 when there
+// is no current work-item or it has already finished.
+size_t InteractiveDebugger::getCurrentLineNumber() const
+{
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  const bool active = current && current->getState() != WorkItem::FINISHED;
+  return active ? getLineNumber(current->getCurrentInstruction()) : 0;
+}
+// Return the source line recorded in an instruction's "dbg" metadata, or 0
+// if the instruction carries no such metadata.
+// NOTE(review): the two reads below are alternatives selected by
+// LLVM_VERSION; the #else/#endif lines separating them are not visible in
+// this listing.
+size_t InteractiveDebugger::getLineNumber(
+  const llvm::Instruction *instruction) const
+  llvm::MDNode *md = instruction->getMetadata("dbg");
+  if (md)
+  {
+#if LLVM_VERSION > 36
+    // LLVM_VERSION > 36: the metadata node is used directly as a DILocation
+    llvm::DILocation *loc = (llvm::DILocation*)md;
+    return loc->getLine();
+    // Otherwise: DILocation is a wrapper constructed around the metadata node
+    llvm::DILocation loc((llvm::MDLocation*)md);
+    return loc.getLineNumber();
+  }
+  return 0;
+// Check whether the current work-item has reached a breakpoint of the current
+// program. On a hit, announces it on stdout, records the line so the same
+// breakpoint is not reported again until execution leaves that line, and
+// resets the list position.
+bool InteractiveDebugger::hasHitBreakpoint()
+{
+  if (m_breakpoints.empty())
+    return false;
+  // Stay quiet while we are still on the line of the last reported hit
+  if (m_lastBreakLine)
+  {
+    if (getCurrentLineNumber() == m_lastBreakLine)
+      return false;
+    m_lastBreakLine = 0;
+  }
+  // Look for a breakpoint on the line we are at now
+  const size_t currentLine = getCurrentLineNumber();
+  map<size_t, size_t>& programBreakpoints = m_breakpoints[m_program];
+  for (map<size_t, size_t>::iterator bp = programBreakpoints.begin();
+       bp != programBreakpoints.end(); ++bp)
+  {
+    if (bp->second != currentLine)
+      continue;
+    cout << "Breakpoint " << bp->first
+         << " hit at line " << bp->second
+         << " by work-item "
+         << m_kernelInvocation->getCurrentWorkItem()->getGlobalID()
+         << endl;
+    m_lastBreakLine = currentLine;
+    m_listPosition = 0;
+    return true;
+  }
+  return false;
+}
+// Show where the current work-item is: the source line when one is known,
+// otherwise a notice followed by the instruction itself. Prints nothing if
+// there is no current work-item or it has finished.
+void InteractiveDebugger::printCurrentLine() const
+{
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  if (!current || current->getState() == WorkItem::FINISHED)
+    return;
+  const size_t lineNum = getCurrentLineNumber();
+  const bool haveSource = m_program->getNumSourceLines() && lineNum > 0;
+  if (haveSource)
+  {
+    printSourceLine(lineNum);
+    return;
+  }
+  // No source text for this position, so fall back to the instruction
+  cout << "Source line not available." << endl;
+  dumpInstruction(cout, current->getCurrentInstruction());
+  cout << endl;
+}
+// Print "name(arg=value, ...) at line N" for the function that contains the
+// given instruction. Argument values are taken from the current work-item.
+void InteractiveDebugger::printFunction(
+  const llvm::Instruction *instruction) const
+{
+  // The instruction's parent is its basic block, whose parent is the function
+  const llvm::Function *function = instruction->getParent()->getParent();
+  cout << function->getName().str() << "(";
+  // Arguments, comma-separated
+  const char *separator = "";
+  for (llvm::Function::const_arg_iterator arg = function->arg_begin();
+       arg != function->arg_end(); arg++)
+  {
+    cout << separator << arg->getName().str() << "=";
+    m_kernelInvocation->getCurrentWorkItem()->printValue(arg);
+    separator = ", ";
+  }
+  cout << ") at line " << dec << getLineNumber(instruction) << endl;
+}
+void InteractiveDebugger::printSourceLine(size_t lineNum) const
+  const char *line = m_program->getSourceLine(lineNum);
+  if (line)
+  {
+    cout << dec << lineNum << "\t" << line << endl;
+  }
+  else
+  {
+    cout << "Invalid line number: " << lineNum << endl;
+  }
+// Decide whether execution should stop and show the prompt after the
+// instruction that was just executed. The checks are ordered by priority and
+// the first one that matches decides the result.
+bool InteractiveDebugger::shouldShowPrompt(const WorkItem *workItem)
+  // After 'quit' the debugger never stops again
+  if (!m_running)
+    return false;
+  // A logged error or a SIGINT always stops
+  if (m_forceBreak || sigintBreak)
+    return true;
+  // Note: hasHitBreakpoint() prints a message and updates state on a hit
+  if (hasHitBreakpoint())
+    return true;
+  // While continuing, only the conditions above can stop execution
+  if (m_continue)
+    return false;
+  if (workItem->getState() == WorkItem::BARRIER)
+    return true;
+  if (workItem->getState() == WorkItem::FINISHED)
+    return true;
+  // Without source lines, stop after every instruction
+  if (!m_program->getNumSourceLines())
+    return true;
+  size_t line = getCurrentLineNumber();
+  // 'next': keep running while inside a deeper call than where we started
+  if (m_next && workItem->getCallStack().size() > m_previousDepth)
+    return false;
+  // Keep running until we reach a different, known source line
+  if (!line || line == m_previousLine)
+    return false;
+  return true;
+//// Interactive Commands ////
+// 'backtrace' command: print the current function followed by each entry of
+// the work-item's call stack, innermost first. Never resumes execution.
+bool InteractiveDebugger::backtrace(vector<string> args)
+{
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  if (!current || current->getState() == WorkItem::FINISHED)
+    return false;
+  // Work on a copy so the frames can be popped while printing
+  stack<const llvm::Instruction*> frames = current->getCallStack();
+  // Innermost frame: the instruction about to execute
+  cout << "#" << frames.size() << " ";
+  printFunction(current->getCurrentInstruction());
+  // Remaining frames, numbered down to #0
+  for (; !frames.empty(); frames.pop())
+  {
+    cout << "#" << (frames.size()-1) << " ";
+    printFunction(frames.top());
+  }
+  return false;
+}
+// 'break' command: set a breakpoint at the given source line, or at the
+// current line when no argument is supplied. Only available when the program
+// has source lines. Never resumes execution.
+bool InteractiveDebugger::brk(vector<string> args)
+{
+  if (!m_program->getNumSourceLines())
+  {
+    cout << "Breakpoints only valid when source is available." << endl;
+    return false;
+  }
+  size_t lineNum = getCurrentLineNumber();
+  if (args.size() > 1)
+  {
+    // Parse argument as a target line number
+    istringstream ss(args[1]);
+    ss >> lineNum;
+    // Valid lines are 1..getNumSourceLines(), the same range 'list' displays;
+    // anything past the last line could never be hit
+    if (!ss.eof() || !lineNum || lineNum > m_program->getNumSourceLines())
+    {
+      cout << "Invalid line number." << endl;
+      return false;
+    }
+  }
+  if (lineNum)
+  {
+    // Breakpoints are kept per program, keyed by an increasing id
+    m_breakpoints[m_program][m_nextBreakpoint++] = lineNum;
+  }
+  else
+  {
+    cout << "Not currently on a line." << endl;
+  }
+  return false;
+}
+// 'continue' command: resume execution until something forces a stop. Where
+// sigaction is available, a SIGINT handler is installed first so the user can
+// interrupt the run; the previous disposition is saved for later restoration.
+bool InteractiveDebugger::cont(vector<string> args)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+  // Install our interrupt handler, remembering the one it replaces
+  struct sigaction sigHandler;
+  sigemptyset(&sigHandler.sa_mask);
+  sigHandler.sa_flags   = 0;
+  sigHandler.sa_handler = handleSignal;
+  sigaction(SIGINT, &sigHandler, &m_oldSignalHandler);
+#endif
+  m_continue = true;
+  return true;
+}
+// 'delete' command: remove one breakpoint of the current program by number,
+// or, with no argument, remove all breakpoints after asking for confirmation.
+// Never resumes execution.
+bool InteractiveDebugger::del(vector<string> args)
+{
+  if (args.size() <= 1)
+  {
+    // Prompt for confirmation
+    string confirm;
+    cout << "Delete all breakpoints? (y/n) " << flush;
+    cin >> confirm;
+    cin.ignore();
+    if (confirm == "y")
+    {
+      m_breakpoints.clear();
+    }
+    return false;
+  }
+  // Parse argument as a target breakpoint
+  size_t bpNum = 0;
+  istringstream ss(args[1]);
+  ss >> bpNum;
+  if (!ss.eof())
+  {
+    cout << "Invalid breakpoint number." << endl;
+    return false;
+  }
+  // erase() reports how many entries it removed; zero means no such breakpoint
+  if (m_breakpoints[m_program].erase(bpNum) == 0)
+  {
+    cout << "Breakpoint not found." << endl;
+  }
+  return false;
+}
+// 'help' command: with no argument list all commands; with a command name
+// (long or short form) print that command's usage. Never resumes execution.
+bool InteractiveDebugger::help(vector<string> args)
+{
+  if (args.size() < 2)
+  {
+    cout << "Command list:" << endl;
+    cout << "  backtrace    (bt)" << endl;
+    cout << "  break        (b)" << endl;
+    cout << "  continue     (c)" << endl;
+    cout << "  delete       (d)" << endl;
+    cout << "  gmem         (gm)" << endl;
+    cout << "  help         (h)" << endl;
+    cout << "  info         (i)" << endl;
+    cout << "  list         (l)" << endl;
+    cout << "  next         (n)" << endl;
+    cout << "  lmem         (lm)" << endl;
+    cout << "  pmem         (pm)" << endl;
+    cout << "  print        (p)" << endl;
+    cout << "  quit         (q)" << endl;
+    cout << "  step         (s)" << endl;
+    cout << "  workitem     (wi)" << endl;
+    cout << "(type 'help command' for more information)" << endl;
+    return false;
+  }
+  if (args[1] == "backtrace" || args[1] == "bt")
+  {
+    cout << "Print function call stack." << endl;
+  }
+  else if (args[1] == "break" || args[1] == "b")
+  {
+    cout << "Set a breakpoint"
+         << " (only functional when source is available)." << endl
+         << "With no arguments, sets a breakpoint at the current line." << endl
+         << "Use a numeric argument to set a breakpoint at a specific line."
+         << endl;
+  }
+  else if (args[1] == "continue" || args[1] == "c")
+  {
+    cout << "Continue kernel execution until next breakpoint." << endl;
+  }
+  else if (args[1] == "delete" || args[1] == "d")
+  {
+    cout << "Delete a breakpoint." << endl
+         << "With no arguments, deletes all breakpoints." << endl;
+  }
+  else if (args[1] == "help" || args[1] == "h")
+  {
+    cout << "Display usage information for a command." << endl;
+  }
+  else if (args[1] == "info" || args[1] == "i")
+  {
+    cout << "Display information about current debugging context." << endl
+         << "With no arguments, displays general information." << endl
+         << "'info break' lists breakpoints."
+         << endl;
+  }
+  else if (args[1] == "list" || args[1] == "l")
+  {
+    cout << "List source lines." << endl
+         << "With no argument, lists " << LIST_LENGTH
+         << " lines after previous listing." << endl
+         << "Use - to list " << LIST_LENGTH
+         << " lines before the previous listing" << endl
+         << "Use a numeric argument to list around a specific line number."
+         << endl;
+  }
+  else if (args[1] == "gmem" || args[1] == "lmem" || args[1] == "pmem" ||
+           args[1] == "gm"   || args[1] == "lm"   || args[1] == "pm")
+  {
+    // Long and short forms start with the same letter, so the first
+    // character identifies the memory kind for both spellings
+    cout << "Examine contents of ";
+    if (args[1][0] == 'g') cout << "global";
+    if (args[1][0] == 'l') cout << "local";
+    if (args[1][0] == 'p') cout << "private";
+    cout << " memory." << endl
+         << "With no arguments, dumps entire contents of memory." << endl
+         << "'" << args[1] << " address [size]'" << endl
+         << "address is hexadecimal and 4-byte aligned." << endl;
+  }
+  else if (args[1] == "next" || args[1] == "n")
+  {
+    cout << "Step forward,"
+         << " treating function calls as single instruction." << endl;
+  }
+  else if (args[1] == "print" || args[1] == "p")
+  {
+    cout << "Print the values of one or more variables." << endl
+         << "'print x y' prints the values of x and y" << endl
+         << "'print foo[i]' prints a value at a constant array index" << endl;
+  }
+  else if (args[1] == "quit" || args[1] == "q")
+  {
+    cout << "Quit interactive debugger." << endl;
+  }
+  else if (args[1] == "step" || args[1] == "s")
+  {
+    cout << "Step forward a single source line,"
+         << " or an instruction if no source available." << endl;
+  }
+  else if (args[1] == "workitem" || args[1] == "wi")
+  {
+    cout << "Switch to a different work-item." << endl
+         << "Up to three (space separated) arguments allowed,"
+         << " specifying the global ID of the work-item." << endl;
+  }
+  else
+  {
+    cout << "Unrecognized command '" << args[1] << "'" << endl;
+  }
+  return false;
+}
+// 'info' command: 'info break' lists the current program's breakpoints; with
+// no argument, prints the kernel's work sizes and the state of the current
+// work-item. Never resumes execution.
+bool InteractiveDebugger::info(vector<string> args)
+{
+  if (args.size() > 1)
+  {
+    if (args[1] != "break")
+    {
+      cout << "Invalid info command: " << args[1] << endl;
+      return false;
+    }
+    // List breakpoints
+    map<size_t, size_t>& programBreakpoints = m_breakpoints[m_program];
+    for (map<size_t, size_t>::iterator bp = programBreakpoints.begin();
+         bp != programBreakpoints.end(); ++bp)
+    {
+      cout << "Breakpoint " << bp->first << ": Line " << bp->second << endl;
+    }
+    return false;
+  }
+  // Kernel invocation information
+  cout << dec;
+  cout << "Running kernel '" << m_kernelInvocation->getKernel()->getName()
+       << "'" << endl;
+  cout << "-> Global work size:   " << m_kernelInvocation->getGlobalSize()
+       << endl;
+  cout << "-> Global work offset: " << m_kernelInvocation->getGlobalOffset()
+       << endl;
+  cout << "-> Local work size:    " << m_kernelInvocation->getLocalSize()
+       << endl;
+  // Current work-item
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  if (!current)
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  cout << endl << "Current work-item: " << current->getGlobalID() << endl;
+  if (current->getState() == WorkItem::FINISHED)
+  {
+    cout << "Work-item has finished." << endl;
+    return false;
+  }
+  cout << "In function ";
+  printFunction(current->getCurrentInstruction());
+  printCurrentLine();
+  return false;
+}
+// 'list' command: print up to LIST_LENGTH source lines. With no argument the
+// listing follows on from the previous one (or from the current line), "-"
+// lists the lines before it, and a number lists around that line. The start
+// of the listing is remembered in m_listPosition. Never resumes execution.
+bool InteractiveDebugger::list(vector<string> args)
+{
+  if (!m_kernelInvocation->getCurrentWorkItem())
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  const size_t numLines = m_program->getNumSourceLines();
+  if (!numLines)
+  {
+    cout << "No source code available." << endl;
+    return false;
+  }
+  // Interpret the optional argument
+  size_t first = 0;
+  bool backwards = false;
+  if (args.size() > 1)
+  {
+    if (args[1] == "-")
+    {
+      backwards = true;
+    }
+    else
+    {
+      // Parse argument as a target line number
+      istringstream ss(args[1]);
+      ss >> first;
+      if (!ss.eof())
+      {
+        cout << "Invalid line number." << endl;
+        return false;
+      }
+      // Centre the listing on the requested line, clamped to line 1
+      first = first > LIST_LENGTH/2 ? first - LIST_LENGTH/2 : 1;
+    }
+  }
+  // No explicit target: continue relative to the previous listing
+  if (!first)
+  {
+    if (backwards)
+    {
+      // One page before the previous listing, clamped to line 1
+      first = m_listPosition ? m_listPosition : getCurrentLineNumber();
+      first = first > LIST_LENGTH ? first - LIST_LENGTH : 1;
+    }
+    else
+    {
+      // One page after the previous listing, or just after the current line
+      first = m_listPosition ?
+        m_listPosition + LIST_LENGTH : getCurrentLineNumber() + 1;
+      if (first >= numLines + 1)
+      {
+        // Already past the end: park the position there and print nothing
+        m_listPosition = numLines + 1;
+        return false;
+      }
+    }
+  }
+  // Display lines, stopping at the end of the source
+  for (int offset = 0; offset < LIST_LENGTH; offset++)
+  {
+    if (first + offset >= numLines + 1)
+    {
+      break;
+    }
+    printSourceLine(first + offset);
+  }
+  m_listPosition = first;
+  return false;
+}
+// 'gmem' / 'lmem' / 'pmem' commands: examine global, local or private memory.
+// With no argument the whole memory object is dumped; otherwise
+// 'address [size]' prints size bytes (default 8) starting at a hexadecimal,
+// 4-byte aligned address. Never resumes execution.
+bool InteractiveDebugger::mem(vector<string> args)
+{
+  // Get target memory object; the first letter of the command selects it
+  Memory *memory = NULL;
+  if (args[0][0] == 'g')
+  {
+    memory = m_context->getGlobalMemory();
+  }
+  else if (args[0][0] == 'l')
+  {
+    // NOTE(review): assumes getCurrentWorkGroup() may return NULL once no
+    // work-group is active, as getCurrentWorkItem() does - confirm
+    if (m_kernelInvocation->getCurrentWorkGroup())
+    {
+      memory = m_kernelInvocation->getCurrentWorkGroup()->getLocalMemory();
+    }
+  }
+  else if (args[0][0] == 'p')
+  {
+    const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+    if (workItem)
+    {
+      memory = workItem->getPrivateMemory();
+    }
+  }
+  // Without an active work-item/work-group there is nothing to examine
+  if (!memory)
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  // If no arguments, dump memory
+  if (args.size() == 1)
+  {
+    memory->dump();
+    return false;
+  }
+  else if (args.size() > 3)
+  {
+    cout << "Invalid number of arguments." << endl;
+    return false;
+  }
+  // Get target address
+  size_t address = 0;
+  stringstream ss(args[1]);
+  ss >> hex >> address;
+  // fail() catches input such as "0x" that reaches EOF without a valid number
+  if (ss.fail() || !ss.eof() || address%4 != 0)
+  {
+    cout << "Invalid address." << endl;
+    return false;
+  }
+  // Get optional size
+  size_t size = 8;
+  if (args.size() == 3)
+  {
+    stringstream ss(args[2]);
+    ss >> dec >> size;
+    if (!ss.eof() || !size)
+    {
+      cout << "Invalid size" << endl;
+      return false;
+    }
+  }
+  // Check address is valid
+  if (!memory->isAddressValid(address, size))
+  {
+    cout << "Invalid memory address." << endl;
+    return false;
+  }
+  // hex, uppercase and the fill character are sticky on cout; remember the
+  // current settings so later output is not printed as zero-padded hex
+  const ios_base::fmtflags savedFlags = cout.flags();
+  const char savedFill = cout.fill();
+  // Output data, four bytes per row, each row prefixed with its address
+  unsigned char *data = (unsigned char*)memory->getPointer(address);
+  for (unsigned i = 0; i < size; i++)
+  {
+    if (i%4 == 0)
+    {
+      cout << endl << hex << uppercase
+           << setw(16) << setfill(' ') << right
+           << (address + i) << ":";
+    }
+    cout << " " << hex << uppercase << setw(2) << setfill('0') << (int)data[i];
+  }
+  cout << endl << endl;
+  // Restore stream formatting
+  cout.flags(savedFlags);
+  cout.fill(savedFill);
+  return false;
+}
+// 'next' command: resume until a new source line is reached at the current
+// call depth or shallower. Refuses (returns false) when there is no current
+// work-item or it is finished or waiting at a barrier.
+bool InteractiveDebugger::next(vector<string> args)
+{
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  if (!current)
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  const auto state = current->getState();
+  if (state == WorkItem::FINISHED)
+  {
+    cout << "Work-item has finished." << endl;
+    return false;
+  }
+  if (state == WorkItem::BARRIER)
+  {
+    cout << "Work-item is at barrier." << endl;
+    return false;
+  }
+  // Record where we are so deeper calls can be skipped over
+  m_previousDepth = current->getCallStack().size();
+  m_previousLine  = getCurrentLineNumber();
+  m_next          = true;
+  return true;
+}
+// 'print' command: print the value of each named variable in the current
+// work-item. 'name[index]' reads one element through a pointer variable,
+// using a constant index. Never resumes execution.
+bool InteractiveDebugger::print(vector<string> args)
+{
+  if (args.size() < 2)
+  {
+    cout << "Variable name(s) required." << endl;
+    return false;
+  }
+  const WorkItem *workItem = m_kernelInvocation->getCurrentWorkItem();
+  // There is no work-item to read variables from once everything has finished
+  if (!workItem)
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  for (unsigned i = 1; i < args.size(); i++)
+  {
+    cout << args[i] << " = ";
+    // Check for subscript operator
+    size_t start = args[i].find("[");
+    if (start != string::npos)
+    {
+      // Find end of subscript
+      size_t end = args[i].find(']');
+      if (end == string::npos)
+      {
+        cout << "missing ']'" << endl;
+        return false;
+      }
+      if (end != args[i].length() - 1)
+      {
+        cout << "invalid variable" << endl;
+        return false;
+      }
+      // Parse index value
+      size_t index = 0;
+      string var = args[i].substr(0, start);
+      stringstream ss(args[i].substr(start+1, end-start-1));
+      ss >> index;
+      if (!ss.eof())
+      {
+        cout << "invalid index" << endl;
+        return false;
+      }
+      // Get variable value and type
+      const llvm::Value *ptr = workItem->getVariable(var);
+      if (!ptr)
+      {
+        cout << "not found" << endl;
+        return false;
+      }
+      const llvm::Type *ptrType = ptr->getType();
+      // Check for alloca instruction, in which case look at allocated type
+      bool alloca = false;
+      if (ptr->getValueID() >= llvm::Value::InstructionVal &&
+          ((llvm::Instruction*)ptr)->getOpcode() == llvm::Instruction::Alloca)
+      {
+        ptrType = ((const llvm::AllocaInst*)ptr)->getAllocatedType();
+        alloca = true;
+      }
+      // Ensure type is a pointer
+      if (!ptrType->isPointerTy())
+      {
+        cout << "not a pointer" << endl;
+        return false;
+      }
+      // Get base address
+      size_t base = *(size_t*)workItem->getValueData(ptr);
+      if (alloca)
+      {
+        // Load base address from private memory
+        workItem->getPrivateMemory()->load((unsigned char*)&base,
+                                                    base, sizeof(size_t));
+      }
+      // Get target memory object
+      Memory *memory = NULL;
+      switch (ptrType->getPointerAddressSpace())
+      {
+      case AddrSpacePrivate:
+        memory = workItem->getPrivateMemory();
+        break;
+      case AddrSpaceGlobal:
+      case AddrSpaceConstant:
+        memory = m_context->getGlobalMemory();
+        break;
+      case AddrSpaceLocal:
+        memory = m_kernelInvocation->getCurrentWorkGroup()->getLocalMemory();
+        break;
+      default:
+        cout << "invalid address space" << endl;
+        return false;
+      }
+      // Get element type
+      const llvm::Type *elemType = ptrType->getPointerElementType();
+      unsigned elemSize = getTypeSize(elemType);
+      // Load data
+      if (!memory->isAddressValid(base + index*elemSize, elemSize))
+      {
+        cout << "invalid memory address" << endl;
+      }
+      else
+      {
+        // Print data
+        void *data = (void*)memory->getPointer(base+index*elemSize);
+        printTypedData(elemType, (unsigned char*)data);
+        cout << endl;
+      }
+    }
+    else
+    {
+      if (!workItem->printVariable(args[i]))
+      {
+        cout << "not found";
+      }
+      cout << endl;
+    }
+  }
+  return false;
+}
+// 'quit' command: stop debugging for good. Reinstates the saved SIGINT
+// disposition, clears m_running so no further prompts appear, and resumes
+// execution.
+bool InteractiveDebugger::quit(vector<string> args)
+{
+#if !defined(_WIN32) || defined(__MINGW32__)
+  // Put back whatever handler was saved in m_oldSignalHandler
+  sigaction(SIGINT, &m_oldSignalHandler, NULL);
+#endif
+  m_running = false;
+  return true;
+}
+// 'step' command: resume until a different source line is reached. Refuses
+// (returns false) when there is no current work-item or it is finished or
+// waiting at a barrier.
+bool InteractiveDebugger::step(vector<string> args)
+{
+  const WorkItem *current = m_kernelInvocation->getCurrentWorkItem();
+  if (!current)
+  {
+    cout << "All work-items finished." << endl;
+    return false;
+  }
+  const auto state = current->getState();
+  if (state == WorkItem::FINISHED)
+  {
+    cout << "Work-item has finished." << endl;
+    return false;
+  }
+  if (state == WorkItem::BARRIER)
+  {
+    cout << "Work-item is at barrier." << endl;
+    return false;
+  }
+  // Remember the position we are stepping away from
+  m_previousDepth = current->getCallStack().size();
+  m_previousLine  = getCurrentLineNumber();
+  return true;
+}
+// 'workitem' command: switch the debugger to the work-item with the given
+// global ID. Up to three space-separated components may be given; missing
+// components default to 0. Never resumes execution.
+bool InteractiveDebugger::workitem(vector<string> args)
+{
+  // A global ID has at most three components; any more would index past the
+  // end of gid below
+  if (args.size() > 4)
+  {
+    cout << "Invalid number of arguments." << endl;
+    return false;
+  }
+  // TODO: Take offsets into account?
+  Size3 gid(0,0,0);
+  for (unsigned i = 1; i < args.size(); i++)
+  {
+    // Parse argument as one component of the global ID
+    istringstream ss(args[i]);
+    ss >> gid[i-1];
+    if (!ss.eof() || gid[i-1] >= m_kernelInvocation->getGlobalSize()[i-1])
+    {
+      cout << "Invalid global ID." << endl;
+      return false;
+    }
+  }
+  // Ugly const_cast since this operation actually changes something about
+  // the simulation. This goes against the idea that plugins are entirely
+  // passive.
+  if (!const_cast<KernelInvocation*>(m_kernelInvocation)->switchWorkItem(gid))
+  {
+    cout << "Work-item has already finished, unable to load state." << endl;
+    return false;
+  }
+  // Print new WI id
+  cout << "Switched to work-item: (" << gid[0] << ","
+                                     << gid[1] << ","
+                                     << gid[2] << ")" << endl;
+  if (m_kernelInvocation->getCurrentWorkItem()->getState() ==
+      WorkItem::FINISHED)
+  {
+    cout << "Work-item has finished execution." << endl;
+  }
+  else
+  {
+    printCurrentLine();
+  }
+  return false;
+}
diff --git a/src/plugins/InteractiveDebugger.h b/src/plugins/InteractiveDebugger.h
new file mode 100644
index 0000000..2b5db65
--- /dev/null
+++ b/src/plugins/InteractiveDebugger.h
@@ -0,0 +1,72 @@
+// InteractiveDebugger.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/Plugin.h"
+namespace oclgrind
+  class Program;
+  // Plugin providing an interactive, command-driven debugger for kernels:
+  // breakpoints, stepping, and inspection of variables and memory.
+  class InteractiveDebugger : public Plugin
+  {
+  public:
+    InteractiveDebugger(const Context *context);
+    // Plugin callbacks overridden by this plugin
+    virtual void instructionExecuted(const WorkItem *workItem,
+                                     const llvm::Instruction *instruction,
+                                     const TypedValue& result) override;
+    virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+    virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+    virtual void log(MessageType type, const char *message) override;
+    virtual bool isThreadSafe() const override;
+  private:
+    // Set by 'continue': run until a breakpoint or a forced break
+    bool m_continue;
+    // Cleared by 'quit': once false the prompt is never shown again
+    bool m_running;
+    // Set when an ERROR message is logged: stop at the next instruction
+    bool m_forceBreak;
+    // First line of the most recent 'list' output (0 = no listing yet)
+    size_t m_listPosition;
+    // Set by 'next': do not stop inside calls deeper than m_previousDepth
+    bool m_next;
+    // Line of the breakpoint last reported, to avoid reporting it repeatedly
+    size_t m_lastBreakLine;
+    // Number that will be given to the next breakpoint created
+    size_t m_nextBreakpoint;
+    // Call-stack depth and source line recorded by 'step' / 'next'
+    size_t m_previousDepth;
+    size_t m_previousLine;
+    // Per program: breakpoint number -> source line
+    std::map<const Program*, std::map<size_t, size_t> > m_breakpoints;
+    // Program and invocation of the kernel currently being debugged
+    const Program *m_program;
+    const KernelInvocation *m_kernelInvocation;
+    size_t getCurrentLineNumber() const;
+    size_t getLineNumber(const llvm::Instruction *instruction) const;
+    bool hasHitBreakpoint();
+    void printCurrentLine() const;
+    void printFunction(const llvm::Instruction *instruction) const;
+    void printSourceLine(size_t lineNum) const;
+    bool shouldShowPrompt(const WorkItem *workItem);
+    // Interactive commands
+    // A command receives the tokenised input line (args[0] is the command
+    // name) and returns true if kernel execution should resume
+    typedef bool (InteractiveDebugger::*Command)(std::vector<std::string>);
+    // Command name (long or short form) -> handler
+    std::map<std::string, Command> m_commands;
+#define CMD(name) bool name(std::vector<std::string> args);
+    CMD(backtrace);
+    CMD(brk);
+    CMD(cont);
+    CMD(del);
+    CMD(help);
+    CMD(info);
+    CMD(list);
+    CMD(mem);
+    CMD(next);
+    CMD(print);
+    CMD(quit);
+    CMD(step);
+    CMD(workitem);
+#undef CMD
+  };
diff --git a/src/plugins/Logger.cpp b/src/plugins/Logger.cpp
new file mode 100644
index 0000000..7b73296
--- /dev/null
+++ b/src/plugins/Logger.cpp
@@ -0,0 +1,81 @@
+// Logger.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include <fstream>
+#include <mutex>
+#include "Logger.h"
+using namespace oclgrind;
+using namespace std;
+#define DEFAULT_MAX_ERRORS 1000
+unsigned Logger::m_numErrors = 0;
+static mutex logMutex;
+// Configure the logger from the environment:
+//   OCLGRIND_LOG         path of a file to log to (default: stderr)
+//   OCLGRIND_MAX_ERRORS  number of errors/warnings printed before further
+//                        ones are suppressed (default: DEFAULT_MAX_ERRORS)
+Logger::Logger(const Context *context)
+ : Plugin(context)
+{
+  m_log = &cerr;
+  const char *logfile = getenv("OCLGRIND_LOG");
+  if (logfile)
+  {
+    ofstream *file = new ofstream(logfile);
+    if (file->good())
+    {
+      m_log = file;
+    }
+    else
+    {
+      cerr << "Oclgrind: Unable to open log file '" << logfile << "'" << endl;
+      // Release the unusable stream; m_log keeps pointing at stderr
+      delete file;
+    }
+  }
+  m_maxErrors = DEFAULT_MAX_ERRORS;
+  const char *maxErrors = getenv("OCLGRIND_MAX_ERRORS");
+  if (maxErrors)
+  {
+    char *next;
+    unsigned long parsed = strtoul(maxErrors, &next, 10);
+    if (*next != '\0')
+    {
+      // Trailing garbage: report it and keep the default limit instead of
+      // using a partially parsed value (which would be 0 for "abc")
+      cerr << "Oclgrind: Invalid value for OCLGRIND_MAX_ERRORS" << endl;
+    }
+    else
+    {
+      m_maxErrors = parsed;
+    }
+  }
+}
+// Close and release the log file, if one was opened by the constructor.
+Logger::~Logger()
+{
+  if (m_log != &cerr)
+  {
+    ((ofstream*)m_log)->close();
+    delete m_log;
+  }
+}
+// Write a message to the log. Errors and warnings are counted; once the
+// count reaches m_maxErrors a single notice is written and every further
+// error or warning is dropped. Calls are serialised by logMutex.
+void Logger::log(MessageType type, const char *message)
+{
+  lock_guard<mutex> lock(logMutex);
+  // Limit number of errors/warning printed
+  if (type == ERROR || type == WARNING)
+  {
+    // Count this message, keeping the total as it stood before it arrived
+    const unsigned previous = m_numErrors++;
+    if (previous == m_maxErrors)
+    {
+      *m_log << endl << "Oclgrind: "
+             << previous << " errors generated - suppressing further errors"
+             << endl << endl;
+    }
+    if (previous >= m_maxErrors)
+      return;
+  }
+  *m_log << endl << message << endl;
+}
diff --git a/src/plugins/Logger.h b/src/plugins/Logger.h
new file mode 100644
index 0000000..294bc67
--- /dev/null
+++ b/src/plugins/Logger.h
@@ -0,0 +1,27 @@
+// Logger.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/Plugin.h"
+namespace oclgrind
+  // Plugin that writes diagnostic messages to stderr or to a log file, and
+  // limits how many errors/warnings are printed.
+  class Logger : public Plugin
+  {
+  public:
+    Logger(const Context *context);
+    virtual ~Logger();
+    virtual void log(MessageType type, const char *message) override;
+  private:
+    // Destination stream: &cerr, or a heap-allocated file stream owned here
+    std::ostream *m_log;
+    // Number of errors/warnings printed before further ones are suppressed
+    unsigned m_maxErrors;
+    // Errors/warnings seen so far; static, so shared by all Logger instances
+    static unsigned m_numErrors;
+  };
diff --git a/src/plugins/MemCheck.cpp b/src/plugins/MemCheck.cpp
new file mode 100644
index 0000000..fb04e57
--- /dev/null
+++ b/src/plugins/MemCheck.cpp
@@ -0,0 +1,107 @@
+// MemCheck.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include "core/Context.h"
+#include "core/Memory.h"
+#include "MemCheck.h"
+using namespace oclgrind;
+using namespace std;
+// Construct the memory-checking plugin; the context is passed to Plugin.
+MemCheck::MemCheck(const Context *context)
+ : Plugin(context)
+// Atomic load callback: validated exactly like an ordinary load.
+// workItem and op are not used by the check.
+void MemCheck::memoryAtomicLoad(const Memory *memory,
+                                const WorkItem *workItem,
+                                AtomicOp op, size_t address, size_t size)
+  checkLoad(memory, address, size);
+// Atomic store callback: validated exactly like an ordinary store.
+// workItem and op are not used by the check.
+void MemCheck::memoryAtomicStore(const Memory *memory,
+                                 const WorkItem *workItem,
+                                 AtomicOp op, size_t address, size_t size)
+  checkStore(memory, address, size);
+// Work-item load callback: checks the accessed range; workItem is unused.
+void MemCheck::memoryLoad(const Memory *memory, const WorkItem *workItem,
+                          size_t address, size_t size)
+  checkLoad(memory, address, size);
+// Work-group load callback: checks the accessed range; workGroup is unused.
+void MemCheck::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+                          size_t address, size_t size)
+  checkLoad(memory, address, size);
+// Work-item store callback: checks the accessed range; workItem and the
+// stored data are unused.
+void MemCheck::memoryStore(const Memory *memory, const WorkItem *workItem,
+                           size_t address, size_t size,
+                           const uint8_t *storeData)
+  checkStore(memory, address, size);
+// Work-group store callback: checks the accessed range; workGroup and the
+// stored data are unused.
+void MemCheck::memoryStore(const Memory *memory, const WorkGroup *workGroup,
+                           size_t address, size_t size,
+                           const uint8_t *storeData)
+  checkStore(memory, address, size);
+// Validate a read of `size` bytes starting at `address`.
+// Logs an invalid-access error if the range is not valid in `memory`;
+// otherwise logs an error if the buffer containing `address` carries the
+// CL_MEM_WRITE_ONLY flag.
+void MemCheck::checkLoad(const Memory *memory,
+                         size_t address, size_t size) const
+  if (!memory->isAddressValid(address, size))
+  {
+    logInvalidAccess(true, memory->getAddressSpace(), address, size);
+    // Return early: the buffer lookup below is only done for valid ranges
+    return;
+  }
+  if (memory->getBuffer(address)->flags & CL_MEM_WRITE_ONLY)
+  {
+    m_context->logError("Invalid read from write-only buffer");
+  }
+// Validate a write of `size` bytes starting at `address`.
+// Logs an invalid-access error if the range is not valid in `memory`;
+// otherwise logs an error if the buffer containing `address` carries the
+// CL_MEM_READ_ONLY flag.
+void MemCheck::checkStore(const Memory *memory,
+                          size_t address, size_t size) const
+  if (!memory->isAddressValid(address, size))
+  {
+    logInvalidAccess(false, memory->getAddressSpace(), address, size);
+    // Return early: the buffer lookup below is only done for valid ranges
+    return;
+  }
+  if (memory->getBuffer(address)->flags & CL_MEM_READ_ONLY)
+  {
+    m_context->logError("Invalid write to read-only buffer");
+  }
+// Build and send an ERROR message describing an invalid memory access.
+//   read      - true for a read, false for a write
+//   addrSpace - address space identifier, printed via getAddressSpaceName()
+//   address   - address of the access, printed in hexadecimal
+//   size      - size of the access as passed by the caller
+// The message also includes the current kernel, entity and location markers
+// provided by Context::Message.
+void MemCheck::logInvalidAccess(bool read, unsigned addrSpace,
+                                size_t address, size_t size) const
+  Context::Message msg(ERROR, m_context);
+  msg << "Invalid " << (read ? "read" : "write")
+      << " of size " << size
+      << " at " << getAddressSpaceName(addrSpace)
+      << " memory address 0x" << hex << address << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << "Entity: " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl;
+  msg.send();
\ No newline at end of file
diff --git a/src/plugins/MemCheck.h b/src/plugins/MemCheck.h
new file mode 100644
index 0000000..9e685bf
--- /dev/null
+++ b/src/plugins/MemCheck.h
@@ -0,0 +1,43 @@
+// MemCheck.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/Plugin.h"
+namespace oclgrind
+  // Plugin that checks every load and store (atomic or not, issued by a
+  // work-item or a work-group) for invalid addresses and for accesses that
+  // contradict the buffer's read-only / write-only flags.
+  class MemCheck : public Plugin
+  {
+  public:
+    MemCheck(const Context *context);
+    // Memory-access callbacks; each forwards to checkLoad() or checkStore().
+    virtual void memoryAtomicLoad(const Memory *memory,
+                                  const WorkItem *workItem,
+                                  AtomicOp op,
+                                  size_t address, size_t size) override;
+    virtual void memoryAtomicStore(const Memory *memory,
+                                   const WorkItem *workItem,
+                                   AtomicOp op,
+                                   size_t address, size_t size) override;
+    virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+                            size_t address, size_t size) override;
+    virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+                            size_t address, size_t size) override;
+    virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+                             size_t address, size_t size,
+                             const uint8_t *storeData) override;
+    virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+                             size_t address, size_t size,
+                             const uint8_t *storeData) override;
+  private:
+    // Validate a read / write of `size` bytes at `address`
+    void checkLoad(const Memory *memory, size_t address, size_t size) const;
+    void checkStore(const Memory *memory, size_t address, size_t size) const;
+    // Emit the error message for an access to an invalid address range
+    void logInvalidAccess(bool read, unsigned addrSpace,
+                          size_t address, size_t size) const;
+  };
diff --git a/src/plugins/RaceDetector.cpp b/src/plugins/RaceDetector.cpp
new file mode 100644
index 0000000..10f417e
--- /dev/null
+++ b/src/plugins/RaceDetector.cpp
@@ -0,0 +1,336 @@
+// RaceDetector.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/common.h"
+#include "core/Context.h"
+#include "core/KernelInvocation.h"
+#include "core/Memory.h"
+#include "core/WorkGroup.h"
+#include "core/WorkItem.h"
+#include "RaceDetector.h"
+using namespace oclgrind;
+using namespace std;
+// Key used to index m_state: the memory object paired with
+// EXTRACT_BUFFER(address). EXTRACT_BUFFER is defined elsewhere; presumably it
+// yields the buffer identifier encoded in the address - TODO confirm.
+#define KEY(memory,address) make_pair(memory, EXTRACT_BUFFER(address))
+// Construct the race detector with no active kernel invocation.
+// m_allowUniformWrites is the negation of checkEnv("OCLGRIND_UNIFORM_WRITES")
+// (checkEnv is defined elsewhere; presumably true when the variable is set).
+RaceDetector::RaceDetector(const Context *context)
+ : Plugin(context)
+  m_kernelInvocation = NULL;
+  m_allowUniformWrites = !checkEnv("OCLGRIND_UNIFORM_WRITES");
+// Reports that this plugin is not thread-safe (always returns false).
+bool RaceDetector::isThreadSafe() const
+  // TODO: Improve DRD efficiency for multi-threaded case instead.
+  return false;
+// Remember the kernel invocation that is starting; registerLoadStore()
+// ignores accesses while no invocation is recorded.
+void RaceDetector::kernelBegin(const KernelInvocation *kernelInvocation)
+  m_kernelInvocation = kernelInvocation;
+// At the end of a kernel, fully reset the tracked state of global memory
+// (workGroup == false resets the work-group information as well) and forget
+// the kernel invocation.
+void RaceDetector::kernelEnd(const KernelInvocation *kernelInvocation)
+  synchronize(m_context->getGlobalMemory(), false);
+  m_kernelInvocation = NULL;
+// Start tracking a new allocation: one State per byte is allocated and
+// stored, together with the allocation size, under KEY(memory, address).
+// Private and constant address spaces are not tracked.
+void RaceDetector::memoryAllocated(const Memory *memory, size_t address,
+                                   size_t size, cl_mem_flags flags)
+  if (memory->getAddressSpace() == AddrSpacePrivate ||
+      memory->getAddressSpace() == AddrSpaceConstant)
+    return;
+  m_state[KEY(memory,address)] = make_pair(new State[size], size);
+// Atomic load callback: recorded as a non-store atomic access; op is unused.
+void RaceDetector::memoryAtomicLoad(const Memory *memory,
+                                    const WorkItem *workItem,
+                                    AtomicOp op, size_t address, size_t size)
+  registerAtomic(memory, workItem, address, size, false);
+// Atomic store callback: recorded as a storing atomic access; op is unused.
+void RaceDetector::memoryAtomicStore(const Memory *memory,
+                                     const WorkItem *workItem,
+                                     AtomicOp op, size_t address, size_t size)
+  registerAtomic(memory, workItem, address, size, true);
+// Stop tracking an allocation: frees the per-byte State array and removes
+// the map entry. Private and constant address spaces are never tracked, so
+// they are ignored here too.
+void RaceDetector::memoryDeallocated(const Memory *memory, size_t address)
+  if (memory->getAddressSpace() == AddrSpacePrivate ||
+      memory->getAddressSpace() == AddrSpaceConstant)
+    return;
+  delete[] m_state[KEY(memory,address)].first;
+  m_state.erase(KEY(memory,address));
+// Work-item load callback: recorded as a load (NULL store data) performed by
+// the work-item and its work-group.
+void RaceDetector::memoryLoad(const Memory *memory, const WorkItem *workItem,
+                              size_t address, size_t size)
+  registerLoadStore(memory, workItem, workItem->getWorkGroup(),
+                    address, size, NULL);
+// Work-group load callback: recorded as a load (NULL store data) with no
+// associated work-item.
+void RaceDetector::memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+                              size_t address, size_t size)
+  registerLoadStore(memory, NULL, workGroup, address, size, NULL);
+// Work-item store callback: recorded as a store performed by the work-item
+// and its work-group. A non-NULL storeData is what marks the access as a
+// store inside registerLoadStore().
+void RaceDetector::memoryStore(const Memory *memory, const WorkItem *workItem,
+                               size_t address, size_t size,
+                               const uint8_t *storeData)
+  registerLoadStore(memory, workItem, workItem->getWorkGroup(),
+                    address, size, storeData);
+// Work-group store callback: recorded as a store with no associated
+// work-item. A non-NULL storeData is what marks the access as a store inside
+// registerLoadStore().
+void RaceDetector::memoryStore(const Memory *memory, const WorkGroup *workGroup,
+                               size_t address, size_t size,
+                               const uint8_t *storeData)
+  registerLoadStore(memory, NULL, workGroup, address, size, storeData);
+// Build and send an ERROR message describing a data race.
+//   type            - read-write or write-write race
+//   addrSpace       - address space, printed via getAddressSpaceName()
+//   address         - address of the race, printed in hexadecimal
+//   lastWorkGroup   - index of the work-group of the earlier access, or -1
+//   lastWorkItem    - global index of the work-item of the earlier access,
+//                     or -1
+//   lastInstruction - instruction of the earlier access
+// Note the parameter order: the work-group index comes BEFORE the work-item
+// index.
+// The "first entity" is whatever entity is current when the message is built;
+// the "second entity" is the one described by the last* parameters.
+void RaceDetector::logRace(DataRaceType type,
+                           unsigned int addrSpace,
+                           size_t address,
+                           size_t lastWorkGroup,
+                           size_t lastWorkItem,
+                           const llvm::Instruction *lastInstruction) const
+  const char *raceType = NULL;
+  switch (type)
+  {
+    case ReadWriteRace:
+      raceType = "Read-write";
+      break;
+    case WriteWriteRace:
+      raceType = "Write-write";
+      break;
+  }
+  Context::Message msg(ERROR, m_context);
+  msg << raceType << " data race at "
+      << getAddressSpaceName(addrSpace)
+      << " memory address 0x" << hex << address << endl
+      << msg.INDENT
+      << "Kernel: " << msg.CURRENT_KERNEL << endl
+      << endl
+      << "First entity:  " << msg.CURRENT_ENTITY << endl
+      << msg.CURRENT_LOCATION << endl
+      << endl
+      << "Second entity: ";
+  // Show details of other entity involved in race
+  // (-1 converts to the largest size_t value, used as the "unset" marker)
+  if (lastWorkItem != -1)
+  {
+    // Size3 is built from the linear index and the global size; presumably
+    // this decomposes the index into a 3D global ID - TODO confirm.
+    Size3 global(lastWorkItem, m_kernelInvocation->getGlobalSize());
+    Size3 local, group;
+    // Derive local ID and group ID from the global ID and the local size
+    local.x = global.x % m_kernelInvocation->getLocalSize().x;
+    local.y = global.y % m_kernelInvocation->getLocalSize().y;
+    local.z = global.z % m_kernelInvocation->getLocalSize().z;
+    group.x = global.x / m_kernelInvocation->getLocalSize().x;
+    group.y = global.y / m_kernelInvocation->getLocalSize().y;
+    group.z = global.z / m_kernelInvocation->getLocalSize().z;
+    msg << "Global" << global << " Local" << local << " Group" << group;
+  }
+  else if (lastWorkGroup != -1)
+  {
+    msg << "Group"
+        << Size3(lastWorkGroup, m_kernelInvocation->getNumGroups());
+  }
+  else
+  {
+    msg << "(unknown)";
+  }
+  msg << endl
+      << lastInstruction << endl;
+  msg.send();
+// Record an atomic access and report a race if it conflicts with an earlier
+// non-atomic access recorded for a different work-item.
+//   memory   - memory object being accessed
+//   workItem - work-item performing the atomic operation
+//   address  - address of the first byte accessed
+//   size     - number of bytes accessed
+//   store    - true for an atomic store, false for an atomic load
+// At most one race is reported per call. Accesses to invalid address ranges
+// are ignored.
+void RaceDetector::registerAtomic(const Memory *memory,
+                                  const WorkItem *workItem,
+                                  size_t address, size_t size,
+                                  bool store)
+  if (!memory->isAddressValid(address, size))
+    return;
+  // Per-byte state of the first accessed byte
+  State *state = m_state[KEY(memory,address)].first + EXTRACT_OFFSET(address);
+  // Get work-item index
+  size_t workItemIndex = workItem->getGlobalIndex();
+  bool race = false;
+  for (size_t offset = 0; offset < size; offset++, state++)
+  {
+    // Check for races with non-atomic operations
+    bool conflict = store ? !state->canAtomicStore : !state->canAtomicLoad;
+    if (!race && conflict && workItemIndex != state->workItem)
+    {
+      // logRace() expects the previous work-group index BEFORE the previous
+      // work-item index; passing them the other way round makes the report
+      // decode a group index as a work-item index.
+      logRace(ReadWriteRace,
+              memory->getAddressSpace(),
+              address,
+              state->workGroup,
+              state->workItem,
+              state->instruction);
+      race = true;
+    }
+    // Update state
+    if (store)
+      state->canLoad = false;
+    state->canStore = false;
+    // Only the first work-item to touch the byte is remembered
+    if (!state->wasWorkItem)
+    {
+      state->instruction = workItem->getCurrentInstruction();
+      state->workItem = workItemIndex;
+      state->wasWorkItem = true;
+    }
+  }
+// Record a non-atomic load or store and report a race if it conflicts with
+// an earlier access recorded for a different entity.
+//   memory    - memory object being accessed
+//   workItem  - work-item performing the access, or NULL for a work-group
+//               level access
+//   workGroup - work-group performing the access (may be NULL)
+//   address   - address of the first byte accessed
+//   size      - number of bytes accessed
+//   storeData - data being stored, or NULL when the access is a load
+// Ignored when no kernel is running, for private/constant memory, and for
+// invalid address ranges. At most one race is reported per call.
+void RaceDetector::registerLoadStore(const Memory *memory,
+                                     const WorkItem *workItem,
+                                     const WorkGroup *workGroup,
+                                     size_t address, size_t size,
+                                     const uint8_t *storeData)
+  if (!m_kernelInvocation)
+    return;
+  if (memory->getAddressSpace() == AddrSpacePrivate ||
+      memory->getAddressSpace() == AddrSpaceConstant)
+    return;
+  if (!memory->isAddressValid(address, size))
+    return;
+  // The presence of store data is what distinguishes a store from a load
+  bool load = !storeData;
+  bool store = storeData;
+  // Get index of work-item and work-group performing access
+  // (-1, i.e. the largest size_t value, means "not applicable")
+  size_t workItemIndex = -1, workGroupIndex = -1;
+  if (workItem)
+  {
+    workItemIndex = workItem->getGlobalIndex();
+  }
+  if (workGroup)
+  {
+    workGroupIndex = workGroup->getGroupIndex();
+  }
+  bool race = false;
+  size_t base = EXTRACT_OFFSET(address);
+  State *state = m_state[KEY(memory, address)].first + base;
+  for (size_t offset = 0; offset < size; offset++, state++)
+  {
+    bool conflict = store ? !state->canStore : !state->canLoad;
+    if (m_allowUniformWrites && storeData)
+    {
+      // A store that writes the value already in memory is not a conflict
+      uint8_t *ptr = (uint8_t*)(memory->getPointer(address));
+      conflict &= (ptr[offset] != storeData[offset]);
+    }
+    if (!race && conflict &&
+        (state->wasWorkItem ?                // If state set by work-item,
+         state->workItem != workItemIndex :  // must be same work-item,
+         state->workGroup != workGroupIndex) // otherwise must be same group
+        )
+    {
+      // Report data-race
+      DataRaceType type = load|state->canLoad ? ReadWriteRace : WriteWriteRace;
+      // logRace() expects the previous work-group index BEFORE the previous
+      // work-item index; passing them the other way round makes the report
+      // decode a group index as a work-item index.
+      logRace(type, memory->getAddressSpace(),
+              address + offset,
+              state->workGroup,
+              state->workItem,
+              state->instruction);
+      race = true;
+    }
+    else
+    {
+      // Only update WI info if this operation is stronger than previous one
+      bool updateWI = store || (load && state->canStore);
+      // Update state
+      if (store)
+        state->canAtomicLoad = false;
+      state->canAtomicStore = false;
+      state->canLoad &= load;
+      state->canStore = false;
+      if (updateWI)
+      {
+        state->workGroup = workGroupIndex;
+        if (workItem)
+        {
+          state->instruction = workItem->getCurrentInstruction();
+          state->workItem = workItemIndex;
+          state->wasWorkItem = true;
+        }
+      }
+    }
+  }
+// Reset the tracked state of every allocation belonging to `memory`.
+//   workGroup - when true, only the atomic permissions and the work-item
+//               information are reset; when false, the work-group
+//               information and the load/store permissions are reset too.
+void RaceDetector::synchronize(const Memory *memory, bool workGroup)
+  StateMap::iterator itr;
+  for (itr = m_state.begin(); itr != m_state.end(); itr++)
+  {
+    // Skip allocations that belong to other memory objects
+    if (itr->first.first != memory)
+      continue;
+    // obj = (per-byte state array, number of bytes)
+    pair<State*,size_t> obj = itr->second;
+    for (State *state = obj.first; state < obj.first+obj.second; state++)
+    {
+      // TODO: atomic_intergroup_race test failure
+      state->canAtomicLoad = true;
+      state->canAtomicStore = true;
+      state->workItem = -1;
+      state->wasWorkItem = false;
+      if (!workGroup)
+      {
+        state->workGroup = -1;
+        state->canLoad = true;
+        state->canStore = true;
+      }
+    }
+  }
+// Barrier callback. A local fence fully resets the state of the group's
+// local memory; a global fence resets global memory state with
+// workGroup == true, which keeps the work-group information and the
+// load/store permissions.
+void RaceDetector::workGroupBarrier(const WorkGroup *workGroup, uint32_t flags)
+  if (flags & CLK_LOCAL_MEM_FENCE)
+    synchronize(workGroup->getLocalMemory(), false);
+  if (flags & CLK_GLOBAL_MEM_FENCE)
+    synchronize(m_context->getGlobalMemory(), true);
+  // NOTE(review): the signature line for these statements is not visible
+  // here; they assign the members of RaceDetector::State, so this is
+  // presumably the body of State::State() - confirm.
+  // Initial state: no recorded access, every kind of access permitted.
+  instruction = NULL;
+  workItem = -1;
+  workGroup = -1;
+  canAtomicLoad = true;
+  canAtomicStore = true;
+  canLoad = true;
+  canStore = true;
+  wasWorkItem = false;
diff --git a/src/plugins/RaceDetector.h b/src/plugins/RaceDetector.h
new file mode 100644
index 0000000..2442b56
--- /dev/null
+++ b/src/plugins/RaceDetector.h
@@ -0,0 +1,94 @@
+// RaceDetector.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "core/Plugin.h"
+namespace oclgrind
+  // Plugin that detects data races. For every byte of tracked memory it keeps
+  // a State recording which kinds of access are still permitted and which
+  // entity performed the last recorded access.
+  class RaceDetector : public Plugin
+  {
+  public:
+    RaceDetector(const Context *context);
+    // Plugin callbacks
+    virtual void kernelBegin(const KernelInvocation *kernelInvocation) override;
+    virtual void kernelEnd(const KernelInvocation *kernelInvocation) override;
+    virtual void memoryAllocated(const Memory *memory, size_t address,
+                                 size_t size, cl_mem_flags flags) override;
+    virtual void memoryAtomicLoad(const Memory *memory,
+                                  const WorkItem *workItem,
+                                  AtomicOp op,
+                                  size_t address, size_t size) override;
+    virtual void memoryAtomicStore(const Memory *memory,
+                                   const WorkItem *workItem,
+                                   AtomicOp op,
+                                   size_t address, size_t size) override;
+    // Marked override like every other callback so that the compiler checks
+    // the signature against the base class.
+    virtual void memoryDeallocated(const Memory *memory,
+                                   size_t address) override;
+    virtual void memoryLoad(const Memory *memory, const WorkItem *workItem,
+                            size_t address, size_t size) override;
+    virtual void memoryLoad(const Memory *memory, const WorkGroup *workGroup,
+                            size_t address, size_t size) override;
+    virtual void memoryStore(const Memory *memory, const WorkItem *workItem,
+                             size_t address, size_t size,
+                             const uint8_t *storeData) override;
+    virtual void memoryStore(const Memory *memory, const WorkGroup *workGroup,
+                             size_t address, size_t size,
+                             const uint8_t *storeData) override;
+    virtual void workGroupBarrier(const WorkGroup *workGroup,
+                                  uint32_t flags) override;
+    virtual bool isThreadSafe() const override;
+  private:
+    // Tracking information kept for a single byte of memory
+    struct State
+    {
+      // Instruction of the last recorded access
+      const llvm::Instruction *instruction;
+      // Global index of the work-item of the last recorded access, or -1
+      size_t workItem;
+      // Index of the work-group of the last recorded access, or -1
+      size_t workGroup;
+      // Kinds of access that can still be made without conflicting
+      bool canAtomicLoad;
+      bool canAtomicStore;
+      bool canLoad;
+      bool canStore;
+      // True when the last recorded access came from a work-item
+      bool wasWorkItem;
+      State();
+    };
+    // Enumeration for types of data-race
+    enum DataRaceType
+    {
+      ReadWriteRace,
+      WriteWriteRace
+    };
+    // Maps (memory object, buffer key) to (per-byte state array, size)
+    typedef std::map<
+                      std::pair<const Memory*, size_t>,
+                      std::pair<State*, size_t>
+                    > StateMap;
+    StateMap m_state;
+    // When true, a store of the value already in memory is not a conflict
+    bool m_allowUniformWrites;
+    // Kernel invocation currently running, or NULL
+    const KernelInvocation *m_kernelInvocation;
+    // Report a race; note that the work-group index precedes the work-item
+    // index in the parameter list.
+    void logRace(DataRaceType type,
+                 unsigned int addrSpace,
+                 size_t address,
+                 size_t lastWorkGroup,
+                 size_t lastWorkItem,
+                 const llvm::Instruction *lastInstruction) const;
+    // Record an atomic access and check it against earlier accesses
+    void registerAtomic(const Memory *memory,
+                        const WorkItem *workItem,
+                        size_t address, size_t size,
+                        bool store);
+    // Record a non-atomic access; storeData is NULL for a load
+    void registerLoadStore(const Memory *memory,
+                           const WorkItem *workItem,
+                           const WorkGroup *workGroup,
+                           size_t address, size_t size,
+                           const uint8_t *storeData);
+    // Reset tracked state for a memory object (see workGroup flag)
+    void synchronize(const Memory *memory, bool workGroup);
+  };
diff --git a/src/runtime/async_queue.cpp b/src/runtime/async_queue.cpp
new file mode 100644
index 0000000..cc5f41c
--- /dev/null
+++ b/src/runtime/async_queue.cpp
@@ -0,0 +1,136 @@
+// async_queue.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "async_queue.h"
+#include <cassert>
+#include <iostream>
+#include <list>
+#include <map>
+#include "core/Kernel.h"
+#include "core/Queue.h"
+using namespace oclgrind;
+using namespace std;
+// Maps to keep track of retained objects
+// Each is keyed by the command; the entries are filled in when a command is
+// enqueued/retained and removed again in asyncQueueRelease().
+static map< Queue::Command*, list<cl_mem> > memObjectMap;
+static map< Queue::Command*, cl_kernel > kernelMap;
+static map< Queue::Command*, cl_event > eventMap;
+static map< Queue::Command*, list<cl_event> > waitListMap;
+// Enqueue a command on a queue and create the cl_event that represents it.
+//   queue     - command queue to enqueue on
+//   type      - command type stored in the event
+//   cmd       - command to enqueue
+//   numEvents - number of entries in waitList
+//   waitList  - events the command must wait for; each one is retained
+//   eventOut  - if non-NULL, receives the new event (retained once more)
+// The new event starts with refCount 1 and is recorded in eventMap.
+void asyncEnqueue(cl_command_queue queue,
+                  cl_command_type type,
+                  Queue::Command *cmd,
+                  cl_uint numEvents,
+                  const cl_event *waitList,
+                  cl_event *eventOut)
+  // Add event wait list to command
+  for (unsigned i = 0; i < numEvents; i++)
+  {
+    cmd->waitList.push_back(waitList[i]->event);
+    waitListMap[cmd].push_back(waitList[i]);
+    clRetainEvent(waitList[i]);
+  }
+  // Enqueue command
+  Event *event = queue->queue->enqueue(cmd);
+  // Create event objects
+  cl_event _event = new _cl_event;
+  _event->dispatch = m_dispatchTable;
+  _event->context = queue->context;
+  _event->queue = queue;
+  _event->type = type;
+  _event->event = event;
+  _event->refCount = 1;
+  // Add event to map
+  eventMap[cmd] = _event;
+  // Pass event as output and retain (if required)
+  if (eventOut)
+  {
+    clRetainEvent(_event);
+    *eventOut = _event;
+  }
+// Retain a memory object on behalf of a command; it is released again in
+// asyncQueueRelease().
+void asyncQueueRetain(Queue::Command *cmd, cl_mem mem)
+  // Retain object and add to map
+  clRetainMemObject(mem);
+  memObjectMap[cmd].push_back(mem);
+// Retain a kernel, and every memory object set as one of its arguments, on
+// behalf of a command. A command may have at most one kernel retained
+// (asserted).
+void asyncQueueRetain(Queue::Command *cmd, cl_kernel kernel)
+  assert(kernelMap.find(cmd) == kernelMap.end());
+  // Retain kernel and add to map
+  clRetainKernel(kernel);
+  kernelMap[cmd] = kernel;
+  // Retain memory objects arguments
+  map<cl_uint,cl_mem>::const_iterator itr;
+  for (itr = kernel->memArgs.begin(); itr != kernel->memArgs.end(); itr++)
+  {
+    asyncQueueRetain(cmd, itr->second);
+  }
+// Release everything that was retained for a command: its memory objects,
+// its kernel (for KERNEL commands), the events in its wait list and finally
+// its own event. The event's callbacks are invoked before the releases of
+// the wait-list events and of the event itself.
+void asyncQueueRelease(Queue::Command *cmd)
+  // Release memory objects
+  if (memObjectMap.find(cmd) != memObjectMap.end())
+  {
+    // Works on a copy of the list; the map entry is erased afterwards
+    list<cl_mem> memObjects = memObjectMap[cmd];
+    while (!memObjects.empty())
+    {
+      clReleaseMemObject(memObjects.front());
+      memObjects.pop_front();
+    }
+    memObjectMap.erase(cmd);
+  }
+  // Release kernel
+  if (cmd->type == Queue::KERNEL)
+  {
+    assert(kernelMap.find(cmd) != kernelMap.end());
+    clReleaseKernel(kernelMap[cmd]);
+    kernelMap.erase(cmd);
+    // The kernel object held by the command itself is deleted here as well
+    delete ((Queue::KernelCommand*)cmd)->kernel;
+  }
+  // Remove event from map
+  cl_event event = eventMap[cmd];
+  eventMap.erase(cmd);
+  // Perform callbacks
+  // Each callback receives the event, its current state and its user data
+  list< pair<void (CL_CALLBACK *)(cl_event, cl_int, void *),
+             void*> >::iterator callItr;
+  for (callItr = event->callbacks.begin();
+       callItr != event->callbacks.end();
+       callItr++)
+  {
+    callItr->first(event, event->event->state, callItr->second);
+  }
+  // Release events
+  list<cl_event>::iterator waitItr;
+  for (waitItr = waitListMap[cmd].begin();
+       waitItr != waitListMap[cmd].end();
+       waitItr++)
+  {
+    clReleaseEvent(*waitItr);
+  }
+  waitListMap.erase(cmd);
+  clReleaseEvent(event);
diff --git a/src/runtime/async_queue.h b/src/runtime/async_queue.h
new file mode 100644
index 0000000..5ff4f4a
--- /dev/null
+++ b/src/runtime/async_queue.h
@@ -0,0 +1,21 @@
+// async_queue.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include "icd.h"
+#include "core/Queue.h"
+// Enqueue a command, retaining the events in waitList and creating the
+// event for the command; the event is returned through eventOut when that
+// pointer is non-NULL.
+extern void asyncEnqueue(cl_command_queue queue,
+                         cl_command_type type,
+                         oclgrind::Queue::Command *cmd,
+                         cl_uint numEvents,
+                         const cl_event *waitList,
+                         cl_event *eventOut);
+// Retain a memory object, or a kernel together with its memory-object
+// arguments, on behalf of a command.
+extern void asyncQueueRetain(oclgrind::Queue::Command *cmd, cl_mem mem);
+extern void asyncQueueRetain(oclgrind::Queue::Command *cmd, cl_kernel);
+// Release everything retained for a command and run its event callbacks.
+extern void asyncQueueRelease(oclgrind::Queue::Command *cmd);
diff --git a/src/runtime/icd.def b/src/runtime/icd.def
new file mode 100644
index 0000000..7e017c6
--- /dev/null
+++ b/src/runtime/icd.def
@@ -0,0 +1,5 @@
+; Make ICD initialisation functions visible
diff --git a/src/runtime/icd.h b/src/runtime/icd.h
new file mode 100644
index 0000000..7059cf9
--- /dev/null
+++ b/src/runtime/icd.h
@@ -0,0 +1,235 @@
+// icd.h (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#ifndef _ICD_H_
+#define _ICD_H_
+// Rename OpenCL API functions to avoid clashes with ICD library
+#define clGetPlatformIDs _clGetPlatformIDs
+#define clGetPlatformInfo _clGetPlatformInfo
+#define clGetDeviceIDs _clGetDeviceIDs
+#define clGetDeviceInfo _clGetDeviceInfo
+#define clCreateSubDevices _clCreateSubDevices
+#define clRetainDevice _clRetainDevice
+#define clReleaseDevice _clReleaseDevice
+#define clCreateContext _clCreateContext
+#define clCreateContextFromType _clCreateContextFromType
+#define clRetainContext _clRetainContext
+#define clReleaseContext _clReleaseContext
+#define clGetContextInfo _clGetContextInfo
+#define clCreateCommandQueue _clCreateCommandQueue
+#define clSetCommandQueueProperty _clSetCommandQueueProperty
+#define clRetainCommandQueue _clRetainCommandQueue
+#define clReleaseCommandQueue _clReleaseCommandQueue
+#define clGetCommandQueueInfo _clGetCommandQueueInfo
+#define clCreateBuffer _clCreateBuffer
+#define clCreateSubBuffer _clCreateSubBuffer
+#define clCreateImage _clCreateImage
+#define clCreateImage2D _clCreateImage2D
+#define clCreateImage3D _clCreateImage3D
+#define clRetainMemObject _clRetainMemObject
+#define clReleaseMemObject _clReleaseMemObject
+#define clGetSupportedImageFormats _clGetSupportedImageFormats
+#define clGetMemObjectInfo _clGetMemObjectInfo
+#define clGetImageInfo _clGetImageInfo
+#define clSetMemObjectDestructorCallback _clSetMemObjectDestructorCallback
+#define clCreateSampler _clCreateSampler
+#define clRetainSampler _clRetainSampler
+#define clReleaseSampler _clReleaseSampler
+#define clGetSamplerInfo _clGetSamplerInfo
+#define clCreateProgramWithSource _clCreateProgramWithSource
+#define clCreateProgramWithBinary _clCreateProgramWithBinary
+#define clCreateProgramWithBuiltInKernels _clCreateProgramWithBuiltInKernels
+#define clRetainProgram _clRetainProgram
+#define clReleaseProgram _clReleaseProgram
+#define clBuildProgram _clBuildProgram
+#define clUnloadCompiler _clUnloadCompiler
+#define clCompileProgram _clCompileProgram
+#define clLinkProgram _clLinkProgram
+#define clUnloadPlatformCompiler _clUnloadPlatformCompiler
+#define clGetProgramInfo _clGetProgramInfo
+#define clGetProgramBuildInfo _clGetProgramBuildInfo
+#define clCreateKernel _clCreateKernel
+#define clCreateKernelsInProgram _clCreateKernelsInProgram
+#define clRetainKernel _clRetainKernel
+#define clReleaseKernel _clReleaseKernel
+#define clSetKernelArg _clSetKernelArg
+#define clGetKernelInfo _clGetKernelInfo
+#define clGetKernelArgInfo _clGetKernelArgInfo
+#define clGetKernelWorkGroupInfo _clGetKernelWorkGroupInfo
+#define clWaitForEvents _clWaitForEvents
+#define clGetEventInfo _clGetEventInfo
+#define clCreateUserEvent _clCreateUserEvent
+#define clRetainEvent _clRetainEvent
+#define clReleaseEvent _clReleaseEvent
+#define clSetUserEventStatus _clSetUserEventStatus
+#define clSetEventCallback _clSetEventCallback
+#define clGetEventProfilingInfo _clGetEventProfilingInfo
+#define clFlush _clFlush
+#define clFinish _clFinish
+#define clEnqueueReadBuffer _clEnqueueReadBuffer
+#define clEnqueueReadBufferRect _clEnqueueReadBufferRect
+#define clEnqueueWriteBuffer _clEnqueueWriteBuffer
+#define clEnqueueWriteBufferRect _clEnqueueWriteBufferRect
+#define clEnqueueCopyBuffer _clEnqueueCopyBuffer
+#define clEnqueueCopyBufferRect _clEnqueueCopyBufferRect
+#define clEnqueueFillBuffer _clEnqueueFillBuffer
+#define clEnqueueFillImage _clEnqueueFillImage
+#define clEnqueueReadImage _clEnqueueReadImage
+#define clEnqueueWriteImage _clEnqueueWriteImage
+#define clEnqueueCopyImage _clEnqueueCopyImage
+#define clEnqueueCopyImageToBuffer _clEnqueueCopyImageToBuffer
+#define clEnqueueCopyBufferToImage _clEnqueueCopyBufferToImage
+#define clEnqueueMapBuffer _clEnqueueMapBuffer
+#define clEnqueueMapImage _clEnqueueMapImage
+#define clEnqueueUnmapMemObject _clEnqueueUnmapMemObject
+#define clEnqueueMigrateMemObjects _clEnqueueMigrateMemObjects
+#define clEnqueueNDRangeKernel _clEnqueueNDRangeKernel
+#define clEnqueueTask _clEnqueueTask
+#define clEnqueueNativeKernel _clEnqueueNativeKernel
+#define clGetExtensionFunctionAddressForPlatform _clGetExtensionFunctionAddressForPlatform
+#define clEnqueueMarkerWithWaitList _clEnqueueMarkerWithWaitList
+#define clEnqueueBarrierWithWaitList _clEnqueueBarrierWithWaitList
+#define clSetPrintfCallback _clSetPrintfCallback
+#define clEnqueueMarker _clEnqueueMarker
+#define clEnqueueWaitForEvents _clEnqueueWaitForEvents
+#define clEnqueueBarrier _clEnqueueBarrier
+#define clCreateFromGLBuffer _clCreateFromGLBuffer
+#define clCreateFromGLTexture _clCreateFromGLTexture
+#define clCreateFromGLTexture2D _clCreateFromGLTexture2D
+#define clCreateFromGLTexture3D _clCreateFromGLTexture3D
+#define clCreateFromGLRenderbuffer _clCreateFromGLRenderbuffer
+#define clGetGLObjectInfo _clGetGLObjectInfo
+#define clGetGLTextureInfo _clGetGLTextureInfo
+#define clEnqueueAcquireGLObjects _clEnqueueAcquireGLObjects
+#define clEnqueueReleaseGLObjects _clEnqueueReleaseGLObjects
+#define clGetGLContextInfoKHR _clGetGLContextInfoKHR
+#define clCreateEventFromGLsyncKHR _clCreateEventFromGLsyncKHR
+#endif // OCLGRIND_ICD
+#include <list>
+#include <map>
+#include <stack>
+#include <stdint.h>
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_gl.h"
+#include "CL/cl_gl_ext.h"
+#if defined(_WIN32) && !defined(__MINGW32__)
+#include "CL/cl_d3d11.h"
+#include "CL/cl_d3d10.h"
+#include "CL/cl_dx9_media_sharing.h"
+namespace oclgrind
+  class Context;
+  class Kernel;
+  class Program;
+  class Queue;
+  struct Event;
+// Object behind a cl_platform_id handle; holds only the dispatch pointer.
+struct _cl_platform_id
+  void *dispatch;
+// Object behind a cl_device_id handle; holds only the dispatch pointer.
+// NOTE(review): dispatch is void** here but void* in the other handle
+// structs - confirm this difference is intentional.
+struct _cl_device_id
+  void **dispatch;
+// Object behind a cl_context handle: wraps an oclgrind::Context together
+// with a notification callback and its user data, a properties array with
+// its size, and a reference count.
+struct _cl_context
+  void *dispatch;
+  oclgrind::Context *context;
+  void (CL_CALLBACK *notify)(const char *, const void *, size_t, void *);
+  void *data;
+  cl_context_properties *properties;
+  size_t szProperties;
+  unsigned int refCount;
+// Object behind a cl_command_queue handle: wraps an oclgrind::Queue and
+// records the queue properties, the owning context and a reference count.
+struct _cl_command_queue
+  void *dispatch;
+  cl_command_queue_properties properties;
+  cl_context context;
+  oclgrind::Queue *queue;
+  unsigned int refCount;
+// Object behind a cl_mem handle: records the owning context, an optional
+// parent object, address/size/offset, the creation flags, whether the object
+// is an image, the host pointer, a stack of (callback, user data) pairs and
+// a reference count.
+struct _cl_mem
+  void *dispatch;
+  cl_context context;
+  cl_mem parent;
+  size_t address;
+  size_t size;
+  size_t offset;
+  cl_mem_flags flags;
+  bool isImage;
+  void *hostPtr;
+  std::stack< std::pair<void (CL_CALLBACK*)(cl_mem, void *), void*> > callbacks;
+  unsigned int refCount;
+// Memory object that additionally carries an image format and descriptor.
+struct cl_image : _cl_mem
+  cl_image_format format;
+  cl_image_desc desc;
+// Object behind a cl_program handle: wraps an oclgrind::Program and records
+// the owning context and a reference count.
+struct _cl_program
+  void *dispatch;
+  oclgrind::Program *program;
+  cl_context context;
+  unsigned int refCount;
+// Object behind a cl_kernel handle: wraps an oclgrind::Kernel and records
+// the program it belongs to, a map from cl_uint keys (presumably argument
+// indices) to the memory objects set as arguments, and a reference count.
+struct _cl_kernel
+  void *dispatch;
+  oclgrind::Kernel *kernel;
+  cl_program program;
+  std::map<cl_uint, cl_mem> memArgs;
+  unsigned int refCount;
+// Object behind a cl_event handle: wraps an oclgrind::Event and records the
+// owning context and queue, the command type, a list of
+// (callback, user data) pairs and a reference count.
+struct _cl_event
+  void *dispatch;
+  cl_context context;
+  cl_command_queue queue;
+  cl_command_type type;
+  oclgrind::Event *event;
+  std::list< std::pair<void (CL_CALLBACK*)(cl_event, cl_int, void*), void*> > callbacks;
+  unsigned int refCount;
+// Object behind a cl_sampler handle: records the owning context, the
+// normalized-coordinates flag, addressing and filter modes, a 32-bit sampler
+// value and a reference count.
+struct _cl_sampler
+  void *dispatch;
+  cl_context context;
+  cl_bool normCoords;
+  cl_addressing_mode addressMode;
+  cl_filter_mode filterMode;
+  uint32_t sampler;
+  unsigned int refCount;
+extern void *m_dispatchTable[256];
+#endif // _ICD_H_
diff --git a/src/runtime/oclgrind b/src/runtime/oclgrind
new file mode 100755
index 0000000..4925be4
--- /dev/null
+++ b/src/runtime/oclgrind
@@ -0,0 +1,145 @@
+# oclgrind (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+function usage
+  echo "Usage: "
+  echo "  oclgrind [OPTIONS] COMMAND"
+  echo "  oclgrind [--help | --version]"
+  echo
+  echo "Options:"
+  echo -n "     --build-options  OPTIONS  "
+  echo          "Additional options to pass to the OpenCL compiler"
+  echo -n "     --check-api               "
+  echo          "Reports errors on API calls"
+  echo -n "     --data-races              "
+  echo          "Enable data-race detection"
+  echo -n "     --disable-pch             "
+  echo          "Don't use precompiled headers"
+  echo -n "     --dump-spir               "
+  echo          "Dump SPIR to /tmp/oclgrind_*.{ll,bc}"
+  echo -n "  -h --help                    "
+  echo          "Display usage information"
+  echo -n "     --inst-counts             "
+  echo          "Output histograms of instructions executed"
+  echo -n "  -i --interactive             "
+  echo          "Enable interactive mode"
+  echo -n "     --log            LOGFILE  "
+  echo          "Redirect log/error messages to a file"
+  echo -n "     --max-errors     NUM      "
+  echo          "Limit the number of error/warning messages"
+  echo -n "     --num-threads    NUM      "
+  echo          "Set the number of worker threads to use"
+  echo -n "     --pch-dir        DIR      "
+  echo          "Override directory containing precompiled headers"
+  echo -n "     --plugins        PLUGINS  "
+  echo          "Load colon separated list of plugin libraries"
+  echo -n "  -q --quick                   "
+  echo          "Only run first and last work-group"
+  echo -n "     --uniform-writes          "
+  echo          "Don't suppress uniform write-write data-races"
+  echo -n "  -v --version                 "
+  echo          "Display version information"
+  echo
+  echo "For more information, please visit the Oclgrind wiki page:"
+  echo "-> https://github.com/jrprice/Oclgrind/wiki"
+  echo
+# Parse arguments
+while [ $# -gt 0 -a "${1:0:1}" == "-" ]
+  if [ "$1" == "--build-options" ]
+  then
+    shift
+  elif [ "$1" == "--check-api" ]
+  then
+    export OCLGRIND_CHECK_API=1
+  elif [ "$1" == "--data-races" ]
+  then
+  elif [ "$1" == "--disable-pch" ]
+  then
+  elif [ "$1" == "--dump-spir" ]
+  then
+    export OCLGRIND_DUMP_SPIR=1
+  elif [ "$1" == "-h" -o "$1" == "--help" ]
+  then
+    usage
+    exit 0
+  elif [ "$1" == "--inst-counts" ]
+  then
+  elif [ "$1" == "-i" -o "$1" == "--interactive" ]
+  then
+  elif [ "$1" == "--log" ]
+  then
+    shift
+    export OCLGRIND_LOG="$1"
+  elif [ "$1" == "--max-errors" ]
+  then
+    shift
+    export OCLGRIND_MAX_ERRORS="$1"
+  elif [ "$1" == "--num-threads" ]
+  then
+    shift
+    export OCLGRIND_NUM_THREADS="$1"
+  elif [ "$1" == "--pch-dir" ]
+  then
+    shift
+    export OCLGRIND_PCH_DIR="$1"
+  elif [ "$1" == "--plugins" ]
+  then
+    shift
+    export OCLGRIND_PLUGINS="$1"
+  elif [ "$1" == "-q" -o "$1" == "--quick" ]
+  then
+    export OCLGRIND_QUICK=1
+  elif [ "$1" == "--uniform-writes" ]
+  then
+  elif [ "$1" == "-v" -o "$1" == "--version" ]
+  then
+    echo
+    echo "Oclgrind __VERSION__"
+    echo
+    echo "Copyright (c) 2013-2015"
+    echo "James Price and Simon McIntosh-Smith, University of Bristol"
+    echo "https://github.com/jrprice/Oclgrind"
+    echo
+    exit 0
+  else
+    echo "Unrecognized argument '$1'"
+    usage
+    exit 1
+  fi
+  shift
+# Ensure target command supplied
+if [ $# -lt 1 ]
+  usage
+  exit 1
+# Inject liboclgrind.{so,dylib} and run command
+LIBDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/../lib"
+if [ "$(uname -s)" == "Darwin" ]
+    DYLD_INSERT_LIBRARIES=$LIBDIR/liboclgrind-rt.dylib \
+    LD_PRELOAD=$LIBDIR/liboclgrind-rt.so "$@"
diff --git a/src/runtime/runtime.cpp b/src/runtime/runtime.cpp
new file mode 100644
index 0000000..1cf7338
--- /dev/null
+++ b/src/runtime/runtime.cpp
@@ -0,0 +1,5594 @@
+// runtime.cpp (Oclgrind)
+// Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+// University of Bristol. All rights reserved.
+// This program is provided under a three-clause BSD license. For full
+// license terms please see the LICENSE file distributed with this
+// source code.
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include "async_queue.h"
+#include "icd.h"
+#include "core/Context.h"
+#include "core/Kernel.h"
+#include "core/half.h"
+#include "core/Memory.h"
+#include "core/Program.h"
+#include "core/Queue.h"
+using namespace std;
+#define MAX_GLOBAL_MEM_SIZE      (128 * 1048576)
+#define MAX_CONSTANT_BUFFER_SIZE (1048576)
+#define MAX_LOCAL_MEM_SIZE       (32768)
+#define MAX_WI_SIZE              (65536)
+#define PLATFORM_NAME       "Oclgrind"
+#define PLATFORM_VENDOR     "University of Bristol"
+#define PLATFORM_VERSION    "OpenCL 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define PLATFORM_SUFFIX     "oclg"
+#define PLATFORM_EXTENSIONS "cl_khr_icd"
+#define DEVICE_NAME          "Oclgrind Simulator"
+#define DEVICE_VENDOR        "University of Bristol"
+#define DEVICE_VENDOR_ID     0x0042
+#define DEVICE_VERSION       "OpenCL 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define DEVICE_LANG_VERSION  "OpenCL C 1.2 (Oclgrind " PACKAGE_VERSION ")"
+#define DRIVER_VERSION       "Oclgrind " PACKAGE_VERSION
+#define DEVICE_EXTENSIONS    "         \
+  cl_khr_spir                          \
+  cl_khr_3d_image_writes               \
+  cl_khr_global_int32_base_atomics     \
+  cl_khr_global_int32_extended_atomics \
+  cl_khr_local_int32_base_atomics      \
+  cl_khr_local_int32_extended_atomics  \
+  cl_khr_byte_addressable_store        \
+  cl_khr_fp64"
+#define CASE(X) case X: return #X;
+  const char* CLErrorToString(cl_int err)
+  {
+    switch (err)
+    {
+    }
+    return "Unknown";
+  }
+#undef CASE
+  void notifyAPIError(cl_context context, cl_int err,
+                      const char* function, string info = "")
+  {
+    // Remove leading underscore from function name if necessary
+    if (!strncmp(function, "_cl", 3))
+    {
+      function++;
+    }
+    // Build error message
+    ostringstream oss;
+    oss << endl
+        << "Oclgrind - OpenCL runtime error detected" << endl
+        << "\tFunction: " << function << endl
+        << "\tError:    " << CLErrorToString(err) << endl;
+    if (!info.empty())
+    {
+      oss << "\t" << info << endl;
+    }
+    string error = oss.str();
+    // Output message to stderr if required
+    const char *checkAPI = getenv("OCLGRIND_CHECK_API");
+    if (checkAPI && strcmp(checkAPI, "1") == 0)
+    {
+      cerr << error << endl;
+    }
+    // Fire context callback if set
+    if (context && context->notify)
+    {
+      context->notify(error.c_str(), context->data, 0, NULL);
+    }
+  }
+#if defined(_WIN32) && !defined(__MINGW32__)
+#define __func__ __FUNCTION__
+#define ReturnErrorInfo(context, err, info)          \
+{                                                    \
+  ostringstream oss;                                 \
+  oss << info;                                       \
+  notifyAPIError(context, err, __func__, oss.str()); \
+  return err;                                        \
+#define ReturnErrorArg(context, err, arg) \
+  ReturnErrorInfo(context, err, "For argument '" #arg "'")
+#define ReturnError(context, err) \
+  ReturnErrorInfo(context, err, "")
+#define SetErrorInfo(context, err, info)               \
+  if (err != CL_SUCCESS)                               \
+  {                                                    \
+    ostringstream oss;                                 \
+    oss << info;                                       \
+    notifyAPIError(context, err, __func__, oss.str()); \
+  }                                                    \
+  if (errcode_ret)                                     \
+  {                                                    \
+    *errcode_ret = err;                                \
+  }
+#define SetErrorArg(context, err, arg) \
+  SetErrorInfo(context, err, "For argument '" #arg "'")
+#define SetError(context, err) \
+  SetErrorInfo(context, err, "")
+#define ParamValueSizeTooSmall                        \
+  "param_value_size is " << param_value_size <<       \
+  ", but result requires " << result_size << " bytes"
+static struct _cl_platform_id *m_platform = NULL;
+static struct _cl_device_id *m_device = NULL;
+  cl_uint num_entries,
+  cl_platform_id *platforms,
+  cl_uint *num_platforms
+  if (!m_platform)
+  {
+    m_platform = new _cl_platform_id;
+    m_platform->dispatch = m_dispatchTable;
+    m_device = new _cl_device_id;
+    m_device->dispatch = m_dispatchTable;
+  }
+  if (num_entries > 0)
+  {
+    platforms[0] = m_platform;
+  }
+  if (num_platforms)
+  {
+    *num_platforms = 1;
+  }
+  return CL_SUCCESS;
+// OpenCL Runtime API Definitions //
+  const char *  funcname
+  if (strcmp(funcname, "clIcdGetPlatformIDsKHR") == 0)
+  {
+    return (void*)clIcdGetPlatformIDsKHR;
+  }
+  else
+  {
+    return NULL;
+  }
+  cl_uint           num_entries,
+  cl_platform_id *  platforms,
+  cl_uint *         num_platforms
+  return clIcdGetPlatformIDsKHR(num_entries, platforms, num_platforms);
+  cl_platform_id    platform,
+  cl_platform_info  param_name,
+  size_t            param_value_size,
+  void *            param_value,
+  size_t *          param_value_size_ret
+  // Select platform info string
+  const char *result = NULL;
+  switch(param_name)
+  {
+    result = PLATFORM_PROFILE;
+    break;
+    result = PLATFORM_VERSION;
+    break;
+    result = PLATFORM_NAME;
+    break;
+    result = PLATFORM_VENDOR;
+    break;
+    break;
+    result = PLATFORM_SUFFIX;
+    break;
+  default:
+    ReturnErrorArg(NULL, CL_INVALID_VALUE, param_name);
+  }
+  // Compute size of result
+  size_t result_size = strlen(result) + 1;
+  if (param_value_size_ret)
+  {
+    *param_value_size_ret = result_size;
+  }
+  // Return result
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(NULL, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, result, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  cl_platform_id  platform,
+  cl_device_type  device_type,
+  cl_uint         num_entries,
+  cl_device_id *  devices,
+  cl_uint *       num_devices
+  // Check parameters
+  if (devices && num_entries < 1)
+  {
+    ReturnError(NULL, CL_INVALID_VALUE);
+  }
+  if (device_type != CL_DEVICE_TYPE_CPU &&
+      device_type != CL_DEVICE_TYPE_DEFAULT &&
+      device_type != CL_DEVICE_TYPE_ALL)
+  {
+    ReturnError(NULL, CL_DEVICE_NOT_FOUND);
+  }
+  if (devices)
+  {
+    *devices = m_device;
+  }
+  if (num_devices)
+  {
+    *num_devices = 1;
+  }
+  return CL_SUCCESS;
+  cl_device_id    device,
+  cl_device_info  param_name,
+  size_t          param_value_size,
+  void *          param_value,
+  size_t *        param_value_size_ret
+  // Check device is valid
+  if (device != m_device)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_DEVICE, device);
+  }
+  size_t dummy;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  // All possible return types
+  union
+  {
+    cl_uint cluint;
+    size_t sizet;
+    size_t sizet3[3];
+    cl_ulong clulong;
+    cl_bool clbool;
+    cl_device_id cldeviceid;
+    cl_device_type cldevicetype;
+    cl_device_fp_config devicefpconfig;
+    cl_device_mem_cache_type devicememcachetype;
+    cl_device_local_mem_type devicelocalmemtype;
+    cl_device_exec_capabilities cldevexeccap;
+    cl_command_queue_properties clcmdqprop;
+    cl_platform_id clplatid;
+    cl_device_partition_property cldevpartprop;
+    cl_device_affinity_domain cldevaffdom;
+  } result_data;
+  // The result is actually a string that needs copying
+  const char* str = 0;
+  switch (param_name)
+  {
+    result_size = sizeof(cl_device_type);
+    result_data.cldevicetype = CL_DEVICE_TYPE_CPU;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = DEVICE_VENDOR_ID;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 3;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = MAX_WI_SIZE;
+    break;
+    result_size = 3*sizeof(size_t);
+    result_data.sizet3[0] = MAX_WI_SIZE;
+    result_data.sizet3[1] = MAX_WI_SIZE;
+    result_data.sizet3[2] = MAX_WI_SIZE;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = sizeof(size_t)<<3;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 128;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 8;
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 8192;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 2048;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 1024;
+    break;
+    result_size = sizeof(cl_uint);
+    // Write through the cl_uint member so the bytes copied out match
+    // result_size regardless of size_t width or byte order.
+    result_data.cluint = 16;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = sizeof(cl_long16)<<3;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_device_fp_config);
+    result_data.devicefpconfig =
+    break;
+    result_size = sizeof(cl_device_mem_cache_type);
+    result_data.devicememcachetype = CL_NONE;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = 0;
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = MAX_GLOBAL_MEM_SIZE;
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = MAX_CONSTANT_BUFFER_SIZE;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1024;
+    break;
+    result_size = sizeof(cl_device_local_mem_type);
+    result_data.devicelocalmemtype = CL_LOCAL;
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = MAX_LOCAL_MEM_SIZE;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_FALSE;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 1000;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = sizeof(cl_device_exec_capabilities);
+    result_data.cldevexeccap =  CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;
+    break;
+    result_size = sizeof(cl_command_queue_properties);
+    result_data.clcmdqprop = CL_QUEUE_PROFILING_ENABLE;
+    break;
+    result_size = sizeof(DEVICE_NAME);
+    str = DEVICE_NAME;
+    break;
+    result_size = sizeof(DEVICE_VENDOR);
+    str = DEVICE_VENDOR;
+    break;
+    result_size = sizeof(DRIVER_VERSION);
+    str = DRIVER_VERSION;
+    break;
+    result_size = sizeof(DEVICE_PROFILE);
+    str = DEVICE_PROFILE;
+    break;
+    result_size = sizeof(DEVICE_VERSION);
+    str = DEVICE_VERSION;
+    break;
+    result_size = sizeof(DEVICE_EXTENSIONS);
+    break;
+    result_size = sizeof(cl_platform_id);
+    result_data.clplatid = m_platform;
+    break;
+    result_size = sizeof(cl_device_fp_config);
+    result_data.devicefpconfig =
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_FALSE;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+    result_size = sizeof(DEVICE_LANG_VERSION);
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = 1;
+    str = "";
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 65536;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 2048;
+    break;
+    result_size = sizeof(cl_device_id);
+    result_data.cldeviceid = NULL;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+    result_size = sizeof(cl_device_partition_property);
+    result_data.cldevpartprop = 0;
+    break;
+    result_size = sizeof(cl_device_affinity_domain);
+    result_data.cldevaffdom = 0;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_bool);
+    result_data.clbool = CL_TRUE;
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 1024;
+    break;
+    result_size = sizeof(DEVICE_SPIR_VERSIONS);
+    break;
+  default:
+    ReturnErrorArg(NULL, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(NULL, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      const void* src = str ? (const void*)str : (const void*)&result_data;
+      memcpy(param_value, src, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  cl_device_id                          in_device,
+  const cl_device_partition_property *  properties,
+  cl_uint                               num_entries,
+  cl_device_id *                        out_devices,
+  cl_uint *                             num_devices
+  ReturnErrorInfo(NULL, CL_INVALID_VALUE, "Not yet implemented");
+  cl_device_id  device
+  return CL_SUCCESS;
+  cl_device_id  device
+  return CL_SUCCESS;
+  const cl_context_properties *  properties,
+  cl_uint                        num_devices,
+  const cl_device_id *           devices,
+  void (CL_CALLBACK *            pfn_notify)(const char *,
+                                             const void *,
+                                             size_t,
+                                             void *),
+  void *                         user_data,
+  cl_int *                       errcode_ret
+  // Check parameters
+  if (num_devices != 1)
+  {
+    SetErrorArg(NULL, CL_INVALID_VALUE, num_devices);
+    return NULL;
+  }
+  if (!devices)
+  {
+    SetErrorArg(NULL, CL_INVALID_VALUE, devices);
+    return NULL;
+  }
+  if (devices[0] != m_device)
+  {
+    // Report the failure through errcode_ret as well as the NULL return
+    SetErrorArg(NULL, CL_INVALID_DEVICE, devices);
+    return NULL;
+  }
+  if (!pfn_notify && user_data)
+  {
+                 "pfn_notify NULL but user_data non-NULL");
+    return NULL;
+  }
+  // Create context object
+  cl_context context = new _cl_context;
+  context->dispatch = m_dispatchTable;
+  context->context = new oclgrind::Context();
+  context->notify = pfn_notify;
+  context->data = user_data;
+  context->properties = NULL;
+  context->szProperties = 0;
+  context->refCount = 1;
+  if (properties)
+  {
+    // The list is a sequence of (name, value) pairs ended by a single 0
+    // name, so step over whole pairs: a value may legitimately be 0, and
+    // an empty list has no second element to inspect.
+    size_t num = 0;
+    while (properties[num])
+    {
+      num += 2;
+    }
+    // +1 so the terminating 0 is copied too
+    size_t sz = (num+1)*sizeof(cl_context_properties);
+    context->szProperties = sz;
+    context->properties = (cl_context_properties*)malloc(sz);
+    memcpy(context->properties, properties, sz);
+  }
+  SetError(NULL, CL_SUCCESS);
+  return context;
+  const cl_context_properties *  properties,
+  cl_device_type                 device_type,
+  void (CL_CALLBACK *            pfn_notify)(const char *,
+                                             const void *,
+                                             size_t,
+                                             void *),
+  void *                         user_data,
+  cl_int *                       errcode_ret
+  // Check parameters
+  if (!pfn_notify && user_data)
+  {
+                 "pfn_notify NULL but user_data non-NULL");
+    return NULL;
+  }
+  if (device_type != CL_DEVICE_TYPE_CPU &&
+      device_type != CL_DEVICE_TYPE_DEFAULT &&
+      device_type != CL_DEVICE_TYPE_ALL)
+  {
+    SetErrorArg(NULL, CL_DEVICE_NOT_FOUND, device_type);
+    return NULL;
+  }
+  // Create context object
+  cl_context context = new _cl_context;
+  context->dispatch = m_dispatchTable;
+  context->context = new oclgrind::Context();
+  context->notify = pfn_notify;
+  context->data = user_data;
+  context->properties = NULL;
+  context->szProperties = 0;
+  context->refCount = 1;
+  if (properties)
+  {
+    // The list is a sequence of (name, value) pairs ended by a single 0
+    // name, so step over whole pairs: a value may legitimately be 0 and
+    // must not be mistaken for the terminator.
+    size_t num = 0;
+    while (properties[num])
+    {
+      num += 2;
+    }
+    // +1 so the terminating 0 is copied too
+    size_t sz = (num+1)*sizeof(cl_context_properties);
+    context->szProperties = sz;
+    context->properties = (cl_context_properties*)malloc(sz);
+    memcpy(context->properties, properties, sz);
+  }
+  SetError(NULL, CL_SUCCESS);
+  return context;
+  cl_context  context
+  if (!context)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+  }
+  context->refCount++;
+  return CL_SUCCESS;
+  cl_context  context
+  if (!context)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+  }
+  if (--context->refCount == 0)
+  {
+    // The property list is a malloc() copy made when the context was
+    // created (NULL when none was supplied; free(NULL) is a no-op).
+    free(context->properties);
+    delete context->context;
+    delete context;
+  }
+  return CL_SUCCESS;
+  cl_context       context,
+  cl_context_info  param_name,
+  size_t           param_value_size,
+  void *           param_value,
+  size_t *         param_value_size_ret
+  // Check context is valid
+  if (!context)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+  }
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_uint cluint;
+    cl_device_id cldevid;
+  } result_data;
+  cl_context_properties* properties = NULL;
+  switch (param_name)
+  {
+    result_size = sizeof(cl_uint);
+    result_data.cluint = context->refCount;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 1;
+    break;
+    result_size = sizeof(cl_device_id);
+    result_data.cldevid = m_device;
+    break;
+    result_size = context->szProperties;
+    properties = context->properties;
+    break;
+  default:
+    ReturnErrorArg(context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      if (properties)
+        memcpy(param_value, properties, result_size);
+      else
+        memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+CL_API_ENTRY cl_command_queue CL_API_CALL
+  cl_context                   context,
+  cl_device_id                 device,
+  cl_command_queue_properties  properties,
+  cl_int *                     errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (device != m_device)
+  {
+    SetErrorArg(context, CL_INVALID_DEVICE, device);
+    return NULL;
+  }
+  {
+    SetErrorInfo(context, CL_INVALID_QUEUE_PROPERTIES,
+                 "Out-of-order command queues not supported");
+    return NULL;
+  }
+  // Create command-queue object
+  cl_command_queue queue;
+  queue = new _cl_command_queue;
+  queue->queue = new oclgrind::Queue(context->context);
+  queue->dispatch = m_dispatchTable;
+  queue->properties = properties;
+  queue->context = context;
+  queue->refCount = 1;
+  clRetainContext(context);
+  SetError(context, CL_SUCCESS);
+  return queue;
+  cl_command_queue               command_queue,
+  cl_command_queue_properties    properties,
+  cl_bool                        enable,
+  cl_command_queue_properties *  old_properties
+  return CL_SUCCESS;
+  cl_command_queue  command_queue
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  command_queue->refCount++;
+  return CL_SUCCESS;
+  cl_command_queue  command_queue
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (--command_queue->refCount == 0)
+  {
+    // TODO: Retain/release queue from async thread
+    // TODO: Spec states that this function performs an implicit flush,
+    // so maybe we are OK to delete queue here?
+    clFinish(command_queue);
+    delete command_queue->queue;
+    clReleaseContext(command_queue->context);
+    delete command_queue;
+  }
+  return CL_SUCCESS;
+  cl_command_queue       command_queue,
+  cl_command_queue_info  param_name,
+  size_t                 param_value_size,
+  void *                 param_value,
+  size_t *               param_value_size_ret
+  // Check queue is valid
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_uint cluint;
+    cl_context context;
+    cl_device_id cldevid;
+    cl_command_queue_properties properties;
+  } result_data;
+  switch (param_name)
+  {
+    result_size = sizeof(cl_context);
+    result_data.context = command_queue->context;
+    break;
+    result_size = sizeof(cl_device_id);
+    result_data.cldevid = m_device;
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = command_queue->refCount;
+    break;
+    result_size = sizeof(cl_command_queue_properties);
+    result_data.properties = command_queue->properties;
+    break;
+  default:
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  cl_context    context,
+  cl_mem_flags  flags,
+  size_t        size,
+  void *        host_ptr,
+  cl_int *      errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (size == 0)
+  {
+    SetErrorArg(context, CL_INVALID_BUFFER_SIZE, size);
+    return NULL;
+  }
+  if ((host_ptr == NULL) ==
+      ((flags & CL_MEM_COPY_HOST_PTR) ||
+        flags & CL_MEM_USE_HOST_PTR))
+  {
+    SetErrorInfo(context, CL_INVALID_HOST_PTR,
+                 "host_ptr NULL but CL_MEM_{COPY,USE}_HOST_PTR used");
+    return NULL;
+  }
+  if ((flags & CL_MEM_USE_HOST_PTR) &&
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "CL_MEM_USE_HOST_PTR cannot be used with "
+                 "CL_MEM_{COPY,ALLOC}_HOST_PTR");
+    return NULL;
+  }
+  // Create memory object
+  oclgrind::Memory *globalMemory = context->context->getGlobalMemory();
+  cl_mem mem = new _cl_mem;
+  mem->dispatch = m_dispatchTable;
+  mem->context = context;
+  mem->parent = NULL;
+  mem->size = size;
+  mem->offset = 0;
+  mem->flags = flags;
+  mem->isImage = false;
+  mem->refCount = 1;
+  if (flags & CL_MEM_USE_HOST_PTR)
+  {
+    mem->address = globalMemory->createHostBuffer(size, host_ptr, flags);
+    mem->hostPtr = host_ptr;
+  }
+  else
+  {
+    mem->address = globalMemory->allocateBuffer(size, flags);
+    mem->hostPtr = NULL;
+  }
+  if (!mem->address)
+  {
+    // Report the failure through errcode_ret as well as the NULL return
+    SetError(context, CL_MEM_OBJECT_ALLOCATION_FAILURE);
+    delete mem;
+    return NULL;
+  }
+  clRetainContext(context);
+  if (flags & CL_MEM_COPY_HOST_PTR)
+  {
+    context->context->getGlobalMemory()->store((const unsigned char*)host_ptr,
+                                               mem->address, size);
+  }
+  SetError(context, CL_SUCCESS);
+  return mem;
+  cl_mem                 buffer,
+  cl_mem_flags           flags,
+  cl_buffer_create_type  buffer_create_type,
+  const void *           buffer_create_info,
+  cl_int *               errcode_ret
+  // Check parameters
+  if (!buffer)
+  {
+    SetErrorArg(NULL, CL_INVALID_MEM_OBJECT, buffer);
+    return NULL;
+  }
+  if (buffer->parent)
+  {
+    SetErrorInfo(buffer->context, CL_INVALID_MEM_OBJECT,
+                 "Parent buffer cannot be a sub-buffer");
+    return NULL;
+  }
+  if (buffer_create_type != CL_BUFFER_CREATE_TYPE_REGION)
+  {
+    SetErrorArg(buffer->context, CL_INVALID_VALUE, buffer_create_type);
+    return NULL;
+  }
+  if (!buffer_create_info)
+  {
+    SetErrorArg(buffer->context, CL_INVALID_VALUE, buffer_create_info);
+    return NULL;
+  }
+  _cl_buffer_region region = *(_cl_buffer_region*)buffer_create_info;
+  // Compare without forming origin+size: that sum can wrap around size_t
+  // and let a region that lies outside the parent pass the check.
+  if (region.origin > buffer->size ||
+      region.size > buffer->size - region.origin)
+  {
+    SetErrorInfo(buffer->context, CL_INVALID_VALUE,
+                 "Region doesn't fit inside parent buffer");
+    return NULL;
+  }
+  if (region.size == 0)
+  {
+    SetErrorInfo(buffer->context, CL_INVALID_VALUE, "Region size cannot be 0");
+    return NULL;
+  }
+  // Inherit flags from parent where appropriate
+  cl_mem_flags memFlags = 0;
+  cl_mem_flags rwFlags = CL_MEM_READ_ONLY | CL_MEM_READ_WRITE |
+                         CL_MEM_WRITE_ONLY;
+  cl_mem_flags hostAccess = CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY |
+                            CL_MEM_HOST_WRITE_ONLY;
+  cl_mem_flags hostPtr = CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
+                         CL_MEM_COPY_HOST_PTR;
+  if ((flags & rwFlags) == 0)
+  {
+    memFlags |= buffer->flags & rwFlags;
+  }
+  else
+  {
+    memFlags |= flags & rwFlags;
+  }
+  if ((flags & hostAccess) == 0)
+  {
+    memFlags |= buffer->flags & hostAccess;
+  }
+  else
+  {
+    memFlags |= flags & hostAccess;
+  }
+  memFlags |= buffer->flags & hostPtr;
+  // Create memory object
+  cl_mem mem = new _cl_mem;
+  mem->dispatch = m_dispatchTable;
+  mem->context = buffer->context;
+  mem->parent = buffer;
+  mem->size = region.size;
+  mem->offset = region.origin;
+  mem->isImage = false;
+  mem->flags = memFlags;
+  // Only offset a real host pointer; a parent without one keeps NULL
+  // rather than a meaningless address derived from NULL.
+  mem->hostPtr = buffer->hostPtr ?
+    (unsigned char*)buffer->hostPtr + region.origin : NULL;
+  mem->refCount = 1;
+  mem->address = buffer->address + region.origin;
+  clRetainMemObject(buffer);
+  SetError(buffer->context, CL_SUCCESS);
+  return mem;
+// Utility function for getting number of dimensions in image
+size_t getNumDimensions(cl_mem_object_type type)
+  switch (type)
+  {
+    return 1;
+    return 2;
+    return 3;
+  default:
+    return 0;
+  }
+// Utility function for getting number of channels in an image
+size_t getNumChannels(const cl_image_format *format)
+  // Map the channel order to the number of stored components per pixel;
+  // 0 signals an order this runtime does not recognise.
+  switch (format->image_channel_order)
+  {
+  case CL_R:
+  case CL_Rx:
+  case CL_A:
+  case CL_INTENSITY:
+  case CL_LUMINANCE:
+    return 1;
+  case CL_RG:
+  case CL_RGx:
+  case CL_RA:
+    return 2;
+  case CL_RGB:
+  case CL_RGBx:
+    return 3;
+  case CL_RGBA:
+  case CL_ARGB:
+  case CL_BGRA:
+    return 4;
+  default:
+    return 0;
+  }
+// Utility function for computing an image format's pixel size (in bytes)
+size_t getPixelSize(const cl_image_format *format)
+  // Get number of channels
+  size_t numChannels = getNumChannels(format);
+  // Get size of each pixel (in bytes)
+  // Per-channel types scale with the channel count; packed types have a
+  // fixed pixel size. 0 signals a data type this runtime does not recognise.
+  switch (format->image_channel_data_type)
+  {
+  case CL_SNORM_INT8:
+  case CL_UNORM_INT8:
+  case CL_SIGNED_INT8:
+  case CL_UNSIGNED_INT8:
+    return numChannels;
+  case CL_SNORM_INT16:
+  case CL_UNORM_INT16:
+  case CL_SIGNED_INT16:
+  case CL_UNSIGNED_INT16:
+  case CL_HALF_FLOAT:
+    return 2*numChannels;
+  case CL_SIGNED_INT32:
+  case CL_UNSIGNED_INT32:
+  case CL_FLOAT:
+    return 4*numChannels;
+  case CL_UNORM_SHORT_565:
+  case CL_UNORM_SHORT_555:
+    return 2;
+  case CL_UNORM_INT_101010:
+    return 4;
+  default:
+    return 0;
+  }
+bool isImageArray(cl_mem_object_type type)
+  if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+  {
+    return true;
+  }
+  return false;
+  cl_context               context,
+  cl_mem_flags             flags,
+  const cl_image_format *  image_format,
+  const cl_image_desc *    image_desc,
+  void *                   host_ptr,
+  cl_int *                 errcode_ret
+  // Reject a NULL context, format or descriptor before any of them is used
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (!image_format)
+  {
+    SetErrorArg(context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, image_format);
+    return NULL;
+  }
+  if (!image_desc)
+  {
+    SetErrorArg(context, CL_INVALID_IMAGE_DESCRIPTOR, image_desc);
+    return NULL;
+  }
+  // Bytes per pixel; getPixelSize() yields 0 for formats it does not recognise
+  size_t pixelSize = getPixelSize(image_format);
+  if (!pixelSize)
+  {
+    SetErrorArg(context, CL_INVALID_VALUE, image_format);
+    return NULL;
+  }
+  // Image extents: height, depth and array size stay 1 unless the type uses them
+  size_t dims = getNumDimensions(image_desc->image_type);
+  size_t width = image_desc->image_width;
+  size_t height = 1, depth = 1;
+  size_t arraySize = 1;
+  if (dims > 1)
+  {
+    height = image_desc->image_height;
+  }
+  if (dims > 2)
+  {
+    depth = image_desc->image_depth;
+  }
+  if (isImageArray(image_desc->image_type))
+  {
+    arraySize = image_desc->image_array_size;
+  }
+  // Total byte size assuming tightly packed data (row/slice pitches are ignored)
+  size_t size = width * height * depth * arraySize * pixelSize;
+  cl_mem mem;
+  if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+  {
+    // 1D buffer image: share the caller-supplied buffer and take a reference on it
+    if (!image_desc->buffer)
+    {
+      SetErrorInfo(context, CL_INVALID_VALUE,
+                   "image_desc->buffer cannot be NULL "
+                   "when using CL_MEM_OBJECT_IMAGE1D_BUFFER");
+      return NULL;
+    }
+    mem = image_desc->buffer;
+    clRetainMemObject(image_desc->buffer);
+  }
+  else
+  {
+    // Allocate fresh backing storage through the regular buffer path
+    // TODO: Use pitches
+    mem = clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
+    if (!mem)
+    {
+      return NULL;
+    }
+  }
+  // Build the cl_image wrapper, starting from a copy of the backing object's fields
+  cl_image *image = new cl_image;
+  *(cl_mem)image = *mem;  // struct copy of the base memory-object fields
+  image->isImage = true;
+  image->format = *image_format;
+  image->desc = *image_desc;
+  image->desc.image_width = width;
+  image->desc.image_height = height;
+  image->desc.image_depth = depth;
+  image->desc.image_array_size = arraySize;
+  image->refCount = 1;
+  if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
+  {
+    delete mem;  // temporary wrapper only; its fields were copied into image above
+  }
+  SetError(context, CL_SUCCESS);
+  return image;
+  // Create a 2D image by describing it with a cl_image_desc and forwarding
+  // to clCreateImage(). Depth and array size are fixed at 1, and the slice
+  // pitch, mip level count, sample count and buffer are left at zero/NULL.
+  // Returns whatever clCreateImage() returns (NULL on failure, with the
+  // error reported through errcode_ret).
+  cl_context               context,
+  cl_mem_flags             flags,
+  const cl_image_format *  image_format,
+  size_t                   image_width,
+  size_t                   image_height,
+  size_t                   image_row_pitch,
+  void *                   host_ptr,
+  cl_int *                 errcode_ret
+  // Initializer order follows cl_image_desc: type, width, height, depth,
+  // array size, row pitch, slice pitch, mip levels, samples, buffer
+  cl_image_desc desc =
+  {
+    CL_MEM_OBJECT_IMAGE2D,
+    image_width,
+    image_height,
+    1,
+    1,
+    image_row_pitch,
+    0,
+    0,
+    0,
+    NULL
+  };
+  return clCreateImage(context, flags,
+                       image_format, &desc,
+                       host_ptr, errcode_ret);
+  // Create a 3D image by describing it with a cl_image_desc and forwarding
+  // to clCreateImage(). The array size is fixed at 1, and the mip level
+  // count, sample count and buffer are left at zero/NULL.
+  // Returns whatever clCreateImage() returns (NULL on failure, with the
+  // error reported through errcode_ret).
+  cl_context               context,
+  cl_mem_flags             flags,
+  const cl_image_format *  image_format,
+  size_t                   image_width,
+  size_t                   image_height,
+  size_t                   image_depth,
+  size_t                   image_row_pitch,
+  size_t                   image_slice_pitch,
+  void *                   host_ptr,
+  cl_int *                 errcode_ret
+  // Initializer order follows cl_image_desc: type, width, height, depth,
+  // array size, row pitch, slice pitch, mip levels, samples, buffer
+  cl_image_desc desc =
+  {
+    CL_MEM_OBJECT_IMAGE3D,
+    image_width,
+    image_height,
+    image_depth,
+    1,
+    image_row_pitch,
+    image_slice_pitch,
+    0,
+    0,
+    NULL
+  };
+  return clCreateImage(context, flags,
+                       image_format, &desc,
+                       host_ptr, errcode_ret);
+  cl_mem  memobj
+  // A NULL handle cannot be retained and is reported as an invalid object
+  if (memobj == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+  }
+  // Record one additional reference on the memory object
+  memobj->refCount += 1;
+  return CL_SUCCESS;
+  cl_mem  memobj
+  if (!memobj)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+  }
+  if (--memobj->refCount == 0)  // last reference dropped: tear the object down
+  {
+    if (memobj->isImage &&
+        ((cl_image*)memobj)->desc.image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    {
+      clReleaseMemObject(((cl_image*)memobj)->desc.buffer);  // drop the image's reference on its backing buffer
+    }
+    else  // NOTE: the destructor callbacks below run only on this branch
+    {
+      if (memobj->parent)
+      {
+        clReleaseMemObject(memobj->parent);  // has a parent: release the parent rather than freeing storage
+      }
+      else
+      {
+        memobj->context->context->getGlobalMemory()->deallocateBuffer(
+          memobj->address);  // no parent: hand the allocation back to global memory
+        clReleaseContext(memobj->context);
+      }
+      while (!memobj->callbacks.empty())  // invoke registered callbacks, top of the stack first
+      {
+        pair<void (CL_CALLBACK *)(cl_mem, void *), void*> callback =
+          memobj->callbacks.top();
+        callback.first(memobj, callback.second);  // (function, user data) pair as stored at registration
+        memobj->callbacks.pop();
+      }
+    }
+    delete memobj;
+  }
+  return CL_SUCCESS;
+  // Enumerate the image formats this implementation reports as supported.
+  //  context           - must be non-NULL, otherwise CL_INVALID_CONTEXT
+  //  flags, image_type - not consulted; the same list is produced for all
+  //  num_entries       - capacity of image_formats; must be >0 if it is given
+  //  image_formats     - optional output array, filled with at most
+  //                      num_entries formats
+  //  num_image_formats - optional, receives the total number of formats
+  // Formats are the cross product of each channel-order category with the
+  // channel types allowed for that category.
+  cl_context          context,
+  cl_mem_flags        flags,
+  cl_mem_object_type  image_type,
+  cl_uint             num_entries,
+  cl_image_format *   image_formats,
+  cl_uint *           num_image_formats
+  // Check parameters
+  if (!context)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_CONTEXT, context);
+  }
+  if (num_entries == 0 && image_formats)
+  {
+    ReturnErrorInfo(context, CL_INVALID_VALUE,
+                    "num_entries should be >0 if image_formats non-NULL");
+  }
+  // TODO: Add support for packed image types
+  // Channel orders
+  const cl_channel_order ordersAll[] =
+  {
+    CL_R, CL_Rx, CL_A,
+    CL_RG, CL_RGx, CL_RA,
+    CL_RGBA,
+  };
+  const cl_channel_order ordersNormalized[] = {CL_INTENSITY, CL_LUMINANCE};
+  const cl_channel_order ordersByte[] = {CL_ARGB, CL_BGRA};
+  const cl_channel_order ordersPacked[] = {CL_RGB, CL_RGBx};
+  const cl_channel_order *orders[] =
+  {
+    ordersAll, ordersNormalized, ordersByte //, ordersPacked
+  };
+  // Element counts are derived from each array's own element type so they
+  // stay correct if the order/type typedefs ever diverge
+  const size_t numOrders[] =
+  {
+    sizeof(ordersAll)        / sizeof(ordersAll[0]),
+    sizeof(ordersNormalized) / sizeof(ordersNormalized[0]),
+    sizeof(ordersByte)       / sizeof(ordersByte[0]),
+    //sizeof(ordersPacked)     / sizeof(ordersPacked[0]),
+  };
+  // Channel types
+  // NOTE(review): the initializer lists below are empty in this copy of the
+  // source; the channel type entries appear to have been lost and need to be
+  // restored from the upstream file.
+  const cl_channel_type typesAll[] =
+  {
+  };
+  const cl_channel_type typesNormalized[] =
+  {
+  };
+  const cl_channel_type typesByte[] =
+  {
+  };
+  const cl_channel_type typesPacked[] =
+  {
+  };
+  const cl_channel_type *types[] =
+  {
+    typesAll, typesNormalized, typesByte //, typesPacked,
+  };
+  const size_t numTypes[] =
+  {
+    sizeof(typesAll)        / sizeof(typesAll[0]),
+    sizeof(typesNormalized) / sizeof(typesNormalized[0]),
+    sizeof(typesByte)       / sizeof(typesByte[0]),
+    //sizeof(typesPacked)     / sizeof(typesPacked[0]),
+  };
+  // Calculate total number of formats
+  size_t numCatagories = sizeof(orders)/sizeof(orders[0]);
+  size_t numFormats = 0;
+  for (size_t c = 0; c < numCatagories; c++)
+  {
+    numFormats += numOrders[c] * numTypes[c];
+  }
+  if (num_image_formats)
+  {
+    *num_image_formats = (cl_uint)numFormats;
+  }
+  // Generate list of all valid order/type combinations
+  if (image_formats)
+  {
+    unsigned i = 0;
+    for (size_t c = 0; c < numCatagories; c++)
+    {
+      for (size_t o = 0; o < numOrders[c]; o++)
+      {
+        for (size_t t = 0; t < numTypes[c]; t++)
+        {
+          // Stop quietly once the caller's array is full
+          if (i >= num_entries)
+          {
+            return CL_SUCCESS;
+          }
+          cl_image_format format = {orders[c][o], types[c][t]};
+          image_formats[i++] = format;
+        }
+      }
+    }
+  }
+  return CL_SUCCESS;
+  // Query a property of a memory object.
+  //  memobj               - object to query; NULL yields CL_INVALID_MEM_OBJECT
+  //  param_name           - cl_mem_info selector; unknown values yield
+  //                         CL_INVALID_VALUE
+  //  param_value_size     - size in bytes of the param_value buffer
+  //  param_value          - optional destination for the property value
+  //  param_value_size_ret - optional, receives the size of the property
+  // If param_value is given but too small, CL_INVALID_VALUE is returned.
+  cl_mem       memobj,
+  cl_mem_info  param_name,
+  size_t       param_value_size,
+  void *       param_value,
+  size_t *     param_value_size_ret
+  // Check mem object is valid
+  if (!memobj)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+  }
+  // Write the size straight through to the caller when requested
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_mem_object_type clmemobjty;
+    cl_mem_flags clmemflags;
+    cl_context context;
+    cl_mem clmem;
+    size_t sizet;
+    cl_uint cluint;
+    void* ptr;
+  } result_data;
+  switch (param_name)
+  {
+  case CL_MEM_TYPE:
+    result_size = sizeof(cl_mem_object_type);
+    result_data.clmemobjty = memobj->isImage ?
+      ((cl_image*)memobj)->desc.image_type : CL_MEM_OBJECT_BUFFER;
+    break;
+  case CL_MEM_FLAGS:
+    result_size = sizeof(cl_mem_flags);
+    result_data.clmemflags = memobj->flags;
+    break;
+  case CL_MEM_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet = memobj->size;
+    break;
+  case CL_MEM_HOST_PTR:
+    result_size = sizeof(void*);
+    result_data.ptr = memobj->hostPtr;
+    break;
+  case CL_MEM_MAP_COUNT:
+    // Map count is not tracked; always reported as zero
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+  case CL_MEM_REFERENCE_COUNT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = memobj->refCount;
+    break;
+  case CL_MEM_CONTEXT:
+    result_size = sizeof(cl_context);
+    result_data.context = memobj->context;
+    break;
+  case CL_MEM_ASSOCIATED_MEMOBJECT:
+    result_size = sizeof(cl_mem);
+    result_data.clmem = memobj->parent;
+    break;
+  case CL_MEM_OFFSET:
+    result_size = sizeof(size_t);
+    result_data.sizet = memobj->offset;
+    break;
+  default:
+    ReturnErrorArg(memobj->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(memobj->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  // Query a property of an image object.
+  //  image                - image to query; NULL yields CL_INVALID_MEM_OBJECT
+  //  param_name           - cl_image_info selector; unknown values yield
+  //                         CL_INVALID_VALUE
+  //  param_value_size     - size in bytes of the param_value buffer
+  //  param_value          - optional destination for the property value
+  //  param_value_size_ret - optional, receives the size of the property
+  // Height, depth and array size are reported as 0 when the image type does
+  // not use them. If param_value is given but too small, CL_INVALID_VALUE is
+  // returned.
+  cl_mem         image,
+  cl_image_info  param_name,
+  size_t         param_value_size,
+  void *         param_value,
+  size_t *       param_value_size_ret
+  // Check mem object is valid
+  if (!image)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, image);
+  }
+  cl_image *img = (cl_image*)image;
+  // Write the size straight through to the caller when requested
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_image_format climgfmt;
+    size_t sizet;
+    cl_mem clmem;
+    cl_uint cluint;
+  } result_data;
+  switch (param_name)
+  {
+  case CL_IMAGE_FORMAT:
+    result_size = sizeof(cl_image_format);
+    result_data.climgfmt = img->format;
+    break;
+  case CL_IMAGE_ELEMENT_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet = getPixelSize(&img->format);
+    break;
+  case CL_IMAGE_ROW_PITCH:
+    result_size = sizeof(size_t);
+    result_data.sizet = img->desc.image_row_pitch;
+    break;
+  case CL_IMAGE_SLICE_PITCH:
+    result_size = sizeof(size_t);
+    result_data.sizet = img->desc.image_slice_pitch;
+    break;
+  case CL_IMAGE_WIDTH:
+    result_size = sizeof(size_t);
+    result_data.sizet = img->desc.image_width;
+    break;
+  case CL_IMAGE_HEIGHT:
+    result_size = sizeof(size_t);
+    result_data.sizet =
+      getNumDimensions(img->desc.image_type) > 1 ? img->desc.image_height : 0;
+    break;
+  case CL_IMAGE_DEPTH:
+    result_size = sizeof(size_t);
+    result_data.sizet =
+      getNumDimensions(img->desc.image_type) > 2 ? img->desc.image_depth : 0;
+    break;
+  case CL_IMAGE_ARRAY_SIZE:
+    result_size = sizeof(size_t);
+    result_data.sizet =
+      isImageArray(img->desc.image_type) ? img->desc.image_array_size : 0;
+    break;
+  case CL_IMAGE_BUFFER:
+    result_size = sizeof(cl_mem);
+    result_data.clmem = img->desc.buffer;
+    break;
+  case CL_IMAGE_NUM_MIP_LEVELS:
+    // Always reported as zero
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+  case CL_IMAGE_NUM_SAMPLES:
+    // Always reported as zero
+    result_size = sizeof(cl_uint);
+    result_data.cluint = 0;
+    break;
+  default:
+    ReturnErrorArg(image->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(image->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  cl_mem               memobj,
+  void (CL_CALLBACK *  pfn_notify)(cl_mem, void*),
+  void *               user_data
+  // Both the memory object and the callback function are mandatory
+  if (memobj == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_MEM_OBJECT, memobj);
+  }
+  if (pfn_notify == NULL)
+  {
+    ReturnErrorArg(memobj->context, CL_INVALID_VALUE, pfn_notify);
+  }
+  // Push the (function, user data) pair onto the object's callback stack;
+  // user_data is stored exactly as given
+  memobj->callbacks.push(make_pair(pfn_notify, user_data));
+  return CL_SUCCESS;
+  // Create a sampler object.
+  //  context           - must be non-NULL, otherwise CL_INVALID_CONTEXT
+  //  normalized_coords - non-zero if coordinates are normalized
+  //  addressing_mode   - one of the CL_ADDRESS_* values
+  //  filter_mode       - CL_FILTER_NEAREST or CL_FILTER_LINEAR
+  //  errcode_ret       - receives the error code via the SetError macros
+  // Returns the new sampler with a reference count of 1, or NULL on error
+  // (CL_INVALID_VALUE for an unknown addressing or filter mode).
+  cl_context          context,
+  cl_bool             normalized_coords,
+  cl_addressing_mode  addressing_mode,
+  cl_filter_mode      filter_mode,
+  cl_int *            errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  // Create sampler bitfield: bit 0 holds the normalized-coordinates flag,
+  // bits 1-3 the addressing mode and bits 4-5 the filter mode
+  uint32_t bitfield = 0;
+  if (normalized_coords)
+  {
+    bitfield |= 0x0001;
+  }
+  switch (addressing_mode)
+  {
+    case CL_ADDRESS_NONE:
+      break;
+    case CL_ADDRESS_CLAMP_TO_EDGE:
+      bitfield |= 0x0002;
+      break;
+    case CL_ADDRESS_CLAMP:
+      bitfield |= 0x0004;
+      break;
+    case CL_ADDRESS_REPEAT:
+      bitfield |= 0x0006;
+      break;
+    case CL_ADDRESS_MIRRORED_REPEAT:
+      bitfield |= 0x0008;
+      break;
+    default:
+      SetErrorArg(context, CL_INVALID_VALUE, addressing_mode);
+      return NULL;
+  }
+  switch (filter_mode)
+  {
+    case CL_FILTER_NEAREST:
+      bitfield |= 0x0010;
+      break;
+    case CL_FILTER_LINEAR:
+      bitfield |= 0x0020;
+      break;
+    default:
+      SetErrorArg(context, CL_INVALID_VALUE, filter_mode);
+      return NULL;
+  }
+  // Create sampler
+  cl_sampler sampler = new _cl_sampler;
+  sampler->dispatch = m_dispatchTable;
+  sampler->context = context;
+  sampler->normCoords = normalized_coords;
+  sampler->addressMode = addressing_mode;
+  sampler->filterMode = filter_mode;
+  sampler->sampler = bitfield;
+  // The retain/release/info entry points all read refCount, so it must
+  // start at one like every other object created in this file
+  sampler->refCount = 1;
+  SetError(context, CL_SUCCESS);
+  return sampler;
+  cl_sampler  sampler
+  // A NULL handle cannot be retained and is reported as an invalid sampler
+  if (sampler == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+  }
+  // Record one additional reference on the sampler
+  sampler->refCount += 1;
+  return CL_SUCCESS;
+  cl_sampler  sampler
+  // A NULL handle cannot be released and is reported as an invalid sampler
+  if (sampler == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+  }
+  // Drop one reference; destroy the sampler when none remain
+  sampler->refCount -= 1;
+  if (sampler->refCount == 0)
+  {
+    delete sampler;
+  }
+  return CL_SUCCESS;
+  // Query a property of a sampler object.
+  //  sampler              - sampler to query; NULL yields CL_INVALID_SAMPLER
+  //  param_name           - cl_sampler_info selector; unknown values yield
+  //                         CL_INVALID_VALUE
+  //  param_value_size     - size in bytes of the param_value buffer
+  //  param_value          - optional destination for the property value
+  //  param_value_size_ret - optional, receives the size of the property
+  // If param_value is given but too small, CL_INVALID_VALUE is returned.
+  cl_sampler       sampler,
+  cl_sampler_info  param_name,
+  size_t           param_value_size,
+  void *           param_value,
+  size_t *         param_value_size_ret
+  // Check sampler is valid
+  if (!sampler)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_SAMPLER, sampler);
+  }
+  // Write the size straight through to the caller when requested
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_uint cluint;
+    cl_context clcontext;
+    cl_bool clbool;
+    cl_addressing_mode claddrmode;
+    cl_filter_mode clfiltmode;
+  } result_data;
+  switch (param_name)
+  {
+  case CL_SAMPLER_REFERENCE_COUNT:
+    result_size = sizeof(cl_uint);
+    result_data.cluint = sampler->refCount;
+    break;
+  case CL_SAMPLER_CONTEXT:
+    result_size = sizeof(cl_context);
+    result_data.clcontext = sampler->context;
+    break;
+  case CL_SAMPLER_NORMALIZED_COORDS:
+    result_size = sizeof(cl_bool);
+    result_data.clbool = sampler->normCoords;
+    break;
+  case CL_SAMPLER_ADDRESSING_MODE:
+    result_size = sizeof(cl_addressing_mode);
+    result_data.claddrmode = sampler->addressMode;
+    break;
+  case CL_SAMPLER_FILTER_MODE:
+    result_size = sizeof(cl_filter_mode);
+    result_data.clfiltmode = sampler->filterMode;
+    break;
+  default:
+    ReturnErrorArg(sampler->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(sampler->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  // Create a program object from one or more source strings.
+  //  context     - must be non-NULL, otherwise CL_INVALID_CONTEXT
+  //  count       - number of entries in strings; must be non-zero
+  //  strings     - array of source strings; no entry may be NULL
+  //  lengths     - optional per-string lengths; a NULL array or a zero
+  //                entry means the string is NUL-terminated
+  //  errcode_ret - receives the error code via the SetError macros
+  // The strings are concatenated in order into a single source. Returns the
+  // new program (reference count 1, context retained) or NULL on error.
+  cl_context      context,
+  cl_uint         count,
+  const char **   strings,
+  const size_t *  lengths,
+  cl_int *        errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (count == 0)
+  {
+    SetErrorArg(context, CL_INVALID_VALUE, count);
+    return NULL;
+  }
+  if (!strings)
+  {
+    SetErrorArg(context, CL_INVALID_VALUE, strings);
+    return NULL;
+  }
+  // Every entry is dereferenced below, so every entry must be checked
+  for (unsigned i = 0; i < count; i++)
+  {
+    if (!strings[i])
+    {
+      SetErrorArg(context, CL_INVALID_VALUE, strings);
+      return NULL;
+    }
+  }
+  // Concatenate sources into a single string
+  std::string source;
+  for (unsigned i = 0; i < count; i++)
+  {
+    size_t length = (lengths && lengths[i]) ? lengths[i] : strlen(strings[i]);
+    source.append(strings[i], length);
+  }
+  // Create program object
+  cl_program prog = new _cl_program;
+  prog->dispatch = m_dispatchTable;
+  prog->program = new oclgrind::Program(context->context, source);
+  prog->context = context;
+  prog->refCount = 1;
+  if (!prog->program)
+  {
+    SetError(context, CL_OUT_OF_HOST_MEMORY);
+    delete prog;
+    return NULL;
+  }
+  clRetainContext(context);
+  SetError(context, CL_SUCCESS);
+  return prog;
+  // Create a program object from a bitcode binary for the single device.
+  //  context       - must be non-NULL, otherwise CL_INVALID_CONTEXT
+  //  num_devices   - must be exactly 1
+  //  device_list   - must contain the one device of this implementation
+  //  lengths       - lengths[0] is the size of the binary; must be non-zero
+  //  binaries      - binaries[0] is the binary data; must be non-NULL
+  //  binary_status - optional, receives the load status of the binary
+  //  errcode_ret   - receives the error code via the SetError macros
+  // Returns the new program (reference count 1, context retained) or NULL
+  // on error.
+  cl_context              context,
+  cl_uint                 num_devices,
+  const cl_device_id *    device_list,
+  const size_t *          lengths,
+  const unsigned char **  binaries,
+  cl_int *                binary_status,
+  cl_int *                errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (num_devices != 1 || !device_list)
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE, "Invalid device list");
+    return NULL;
+  }
+  if (!lengths)
+  {
+    SetErrorArg(context, CL_INVALID_VALUE, lengths);
+    return NULL;
+  }
+  if (!binaries)
+  {
+    SetErrorArg(context, CL_INVALID_VALUE, binaries);
+    return NULL;
+  }
+  if (device_list[0] != m_device)
+  {
+    SetErrorArg(context, CL_INVALID_DEVICE, device_list);
+    return NULL;
+  }
+  // The single binary is handed to the bitcode loader below, so reject a
+  // missing or empty one up front
+  if (!binaries[0] || lengths[0] == 0)
+  {
+    if (binary_status)
+    {
+      binary_status[0] = CL_INVALID_VALUE;
+    }
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "binaries[0] is NULL or lengths[0] is zero");
+    return NULL;
+  }
+  // Create program object
+  cl_program prog = new _cl_program;
+  prog->dispatch = m_dispatchTable;
+  prog->program = oclgrind::Program::createFromBitcode(context->context,
+                                                       binaries[0], lengths[0]);
+  prog->context = context;
+  prog->refCount = 1;
+  if (!prog->program)
+  {
+    SetError(context, CL_INVALID_BINARY);
+    if (binary_status)
+    {
+      binary_status[0] = CL_INVALID_BINARY;
+    }
+    delete prog;
+    return NULL;
+  }
+  if (binary_status)
+  {
+    binary_status[0] = CL_SUCCESS;
+  }
+  clRetainContext(context);
+  SetError(context, CL_SUCCESS);
+  return prog;
+  // Built-in kernels are not provided, so this entry point always fails:
+  // CL_INVALID_CONTEXT for a NULL context, CL_INVALID_VALUE otherwise.
+  // Always returns NULL.
+  cl_context            context,
+  cl_uint               num_devices,
+  const cl_device_id *  device_list,
+  const char *          kernel_names,
+  cl_int *              errcode_ret
+  if (!context)
+  {
+    // Report the failure the same way the other creation entry points do
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  SetErrorInfo(context, CL_INVALID_VALUE, "No built-in kernels available");
+  return NULL;
+  cl_program  program
+  // A NULL handle cannot be retained and is reported as an invalid program
+  if (program == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  // Record one additional reference on the program
+  program->refCount += 1;
+  return CL_SUCCESS;
+  cl_program  program
+  // A NULL handle cannot be released and is reported as an invalid program
+  if (program == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  // Drop one reference; on the last one destroy the underlying program,
+  // give back the context reference and free the handle
+  program->refCount -= 1;
+  if (program->refCount == 0)
+  {
+    delete program->program;
+    clReleaseContext(program->context);
+    delete program;
+  }
+  return CL_SUCCESS;
+  // Build (compile and link) a program.
+  //  program     - program to build; NULL (or one without an underlying
+  //                program object) yields CL_INVALID_PROGRAM
+  //  num_devices - number of entries in device_list; must agree with
+  //                whether device_list is NULL
+  //  device_list - optional device list; its first entry must be non-NULL
+  //  options     - build options passed through to the program
+  //  pfn_notify  - optional callback invoked after a successful build
+  //  user_data   - passed to pfn_notify; must be NULL if pfn_notify is NULL
+  // Returns CL_SUCCESS, or CL_BUILD_PROGRAM_FAILURE if the build fails.
+  cl_program            program,
+  cl_uint               num_devices,
+  const cl_device_id *  device_list,
+  const char *          options,
+  void (CL_CALLBACK *   pfn_notify)(cl_program, void*),
+  void *                user_data
+  // Check parameters
+  if (!program || !program->program)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  if (num_devices > 0 && !device_list)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "num_devices >0 but device_list is NULL");
+  }
+  if (num_devices == 0 && device_list)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "num_devices == 0 but device_list non-NULL");
+  }
+  if (!pfn_notify && user_data)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "pfn_notify NULL but user_data non-NULL");
+  }
+  if (device_list && !device_list[0])
+  {
+    // Name the actual parameter, matching the link entry point
+    ReturnErrorArg(program->context, CL_INVALID_DEVICE, device_list);
+  }
+  // Build program
+  if (!program->program->build(options))
+  {
+    ReturnError(program->context, CL_BUILD_PROGRAM_FAILURE);
+  }
+  // Fire callback
+  if (pfn_notify)
+  {
+    pfn_notify(program, user_data);
+  }
+  return CL_SUCCESS;
+  void  // takes no arguments
+  return CL_SUCCESS;  // no work is performed; success is reported unconditionally
+  // Compile a program, optionally making other programs available to it as
+  // embedded headers.
+  //  program              - program to compile; NULL yields
+  //                         CL_INVALID_PROGRAM
+  //  num_devices          - number of entries in device_list; must agree
+  //                         with whether device_list is NULL
+  //  device_list          - optional device list; first entry non-NULL
+  //  options              - build options passed through to the program
+  //  num_input_headers    - number of entries in the two header arrays
+  //  input_headers        - programs providing the header contents
+  //  header_include_names - include name for each header
+  //  pfn_notify           - optional callback invoked after success
+  //  user_data            - passed to pfn_notify; must be NULL if
+  //                         pfn_notify is NULL
+  // Returns CL_SUCCESS, or CL_BUILD_PROGRAM_FAILURE if the build fails.
+  cl_program            program,
+  cl_uint               num_devices,
+  const cl_device_id *  device_list,
+  const char *          options,
+  cl_uint               num_input_headers,
+  const cl_program *    input_headers,
+  const char **         header_include_names,
+  void (CL_CALLBACK *   pfn_notify)(cl_program, void*),
+  void *                user_data
+  // Check parameters
+  if (!program)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  if (num_devices > 0 && !device_list)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "num_devices >0 but device_list is NULL");
+  }
+  if (num_devices == 0 && device_list)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "num_devices == 0 but device_list non-NULL");
+  }
+  if (!pfn_notify && user_data)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "pfn_notify NULL but user_data non-NULL");
+  }
+  if (device_list && !device_list[0])
+  {
+    // Name the actual parameter, matching the link entry point
+    ReturnErrorArg(program->context, CL_INVALID_DEVICE, device_list);
+  }
+  // Both header arrays are indexed below, so they must be present and
+  // fully populated whenever headers are requested
+  if (num_input_headers > 0 && (!input_headers || !header_include_names))
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                    "num_input_headers >0 but input_headers or "
+                    "header_include_names is NULL");
+  }
+  for (unsigned i = 0; i < num_input_headers; i++)
+  {
+    if (!input_headers[i] || !header_include_names[i])
+    {
+      ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                      "NULL entry in input_headers or header_include_names");
+    }
+  }
+  // Prepare headers
+  list<oclgrind::Program::Header> headers;
+  for (unsigned i = 0; i < num_input_headers; i++)
+  {
+    headers.push_back(make_pair(header_include_names[i],
+                                input_headers[i]->program));
+  }
+  // Build program
+  if (!program->program->build(options, headers))
+  {
+    ReturnError(program->context, CL_BUILD_PROGRAM_FAILURE);
+  }
+  // Fire callback
+  if (pfn_notify)
+  {
+    pfn_notify(program, user_data);
+  }
+  return CL_SUCCESS;
+  // Link a set of compiled programs into a new program object.
+  //  context            - must be non-NULL, otherwise CL_INVALID_CONTEXT
+  //  num_devices        - number of entries in device_list; must agree
+  //                       with whether device_list is NULL
+  //  device_list        - optional device list; first entry non-NULL
+  //  options            - not consulted by this body
+  //  num_input_programs - number of entries in input_programs
+  //  input_programs     - programs to link; no entry may be NULL
+  //  pfn_notify         - optional callback invoked with the new program
+  //  user_data          - passed to pfn_notify; must be NULL if
+  //                       pfn_notify is NULL
+  //  errcode_ret        - receives the error code via the SetError macros
+  // Returns the new program (reference count 1, context retained) or NULL
+  // on error (CL_INVALID_BINARY if the programs cannot be combined).
+  cl_context            context,
+  cl_uint               num_devices,
+  const cl_device_id *  device_list,
+  const char *          options,
+  cl_uint               num_input_programs,
+  const cl_program *    input_programs,
+  void (CL_CALLBACK *   pfn_notify)(cl_program, void*),
+  void *                user_data,
+  cl_int *              errcode_ret
+  // Check parameters
+  if (!context)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  if (num_devices > 0 && !device_list)
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "num_devices >0 but device_list is NULL");
+    return NULL;
+  }
+  if (num_devices == 0 && device_list)
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "num_devices == 0 but device_list non-NULL");
+    return NULL;
+  }
+  if (!pfn_notify && user_data)
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "pfn_notify NULL but user_data non-NULL");
+    return NULL;
+  }
+  if (device_list && !device_list[0])
+  {
+    SetErrorArg(context, CL_INVALID_DEVICE, device_list);
+    return NULL;
+  }
+  // The program array is indexed below, so it must be present and fully
+  // populated whenever input programs are requested
+  if (num_input_programs > 0 && !input_programs)
+  {
+    SetErrorInfo(context, CL_INVALID_VALUE,
+                 "num_input_programs >0 but input_programs is NULL");
+    return NULL;
+  }
+  for (unsigned i = 0; i < num_input_programs; i++)
+  {
+    if (!input_programs[i])
+    {
+      SetErrorArg(context, CL_INVALID_PROGRAM, input_programs);
+      return NULL;
+    }
+  }
+  // Prepare programs
+  list<const oclgrind::Program*> programs;
+  for (unsigned i = 0; i < num_input_programs; i++)
+  {
+    programs.push_back(input_programs[i]->program);
+  }
+  // Create program object
+  cl_program prog = new _cl_program;
+  prog->dispatch = m_dispatchTable;
+  prog->program = oclgrind::Program::createFromPrograms(context->context,
+                                                        programs);
+  prog->context = context;
+  prog->refCount = 1;
+  if (!prog->program)
+  {
+    SetError(context, CL_INVALID_BINARY);
+    delete prog;
+    return NULL;
+  }
+  // Fire callback
+  if (pfn_notify)
+  {
+    pfn_notify(prog, user_data);
+  }
+  clRetainContext(context);
+  SetError(context, CL_SUCCESS);
+  return prog;
+  cl_platform_id  platform  // not examined by this body
+  return CL_SUCCESS;  // no work is performed; success is reported unconditionally
+  // Query a property of a program object.
+  //  program              - program to query; NULL yields CL_INVALID_PROGRAM
+  //  param_name           - cl_program_info selector; unknown values yield
+  //                         CL_INVALID_VALUE
+  //  param_value_size     - size in bytes of the param_value buffer
+  //  param_value          - optional destination for the property value
+  //  param_value_size_ret - optional, receives the size of the property
+  // The kernel count and kernel names are only available after a successful
+  // build (CL_INVALID_PROGRAM_EXECUTABLE otherwise). For the binaries query,
+  // param_value is an array holding one caller-allocated buffer pointer and
+  // the binary is copied into that buffer.
+  cl_program       program,
+  cl_program_info  param_name,
+  size_t           param_value_size,
+  void *           param_value,
+  size_t *         param_value_size_ret
+  size_t result_size = 0;
+  void *result_data = NULL;
+  // Check program is valid
+  if (!program)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  if ((param_name == CL_PROGRAM_NUM_KERNELS ||
+       param_name == CL_PROGRAM_KERNEL_NAMES) &&
+      program->program->getBuildStatus() != CL_BUILD_SUCCESS)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_PROGRAM_EXECUTABLE,
+                    "Program not successfully built");
+  }
+  // Each case stages its answer in a heap buffer that is freed on exit
+  switch (param_name)
+  {
+  case CL_PROGRAM_REFERENCE_COUNT:
+    result_size = sizeof(cl_uint);
+    result_data = malloc(result_size);
+    *(cl_uint*)result_data = program->refCount;
+    break;
+  case CL_PROGRAM_CONTEXT:
+    result_size = sizeof(cl_context);
+    result_data = malloc(result_size);
+    *(cl_context*)result_data = program->context;
+    break;
+  case CL_PROGRAM_NUM_DEVICES:
+    // There is exactly one device
+    result_size = sizeof(cl_uint);
+    result_data = malloc(result_size);
+    *(cl_uint*)result_data = 1;
+    break;
+  case CL_PROGRAM_DEVICES:
+    result_size = sizeof(cl_device_id);
+    result_data = malloc(result_size);
+    *(cl_device_id*)result_data = m_device;
+    break;
+  case CL_PROGRAM_SOURCE:
+    result_size = strlen(program->program->getSource().c_str()) + 1;
+    result_data = malloc(result_size);
+    strcpy((char*)result_data, program->program->getSource().c_str());
+    break;
+  case CL_PROGRAM_BINARY_SIZES:
+    result_size = sizeof(size_t);
+    result_data = malloc(result_size);
+    *(size_t*)result_data = program->program->getBinarySize();
+    break;
+  case CL_PROGRAM_BINARIES:
+    // NOTE(review): this buffer is later passed to free(); confirm that
+    // getBinary() returns malloc-compatible storage owned by the caller
+    result_size = sizeof(unsigned char*);
+    result_data = program->program->getBinary();
+    break;
+  case CL_PROGRAM_NUM_KERNELS:
+    result_size = sizeof(size_t);
+    result_data = malloc(result_size);
+    *(size_t*)result_data = program->program->getNumKernels();
+    break;
+  case CL_PROGRAM_KERNEL_NAMES:
+  {
+    // Join the kernel names with ';' and no trailing separator
+    list<string> names = program->program->getKernelNames();
+    string ret;
+    for (list<string>::iterator itr = names.begin(); itr != names.end(); itr++)
+    {
+      ret += *itr;
+      ret += ";";
+    }
+    if (!ret.empty())
+    {
+      ret.erase(ret.length()-1);
+    }
+    result_size = strlen(ret.c_str()) + 1;
+    result_data = malloc(result_size);
+    strcpy((char*)result_data, ret.c_str());
+    break;
+  }
+  default:
+    ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
+  }
+  cl_int return_value = CL_SUCCESS;
+  if (param_value)
+  {
+    if (param_name == CL_PROGRAM_BINARIES)
+    {
+      // Copy the binary into the buffer the caller's first pointer refers to
+      memcpy(((unsigned char**)param_value)[0],
+             result_data, program->program->getBinarySize());
+    }
+    else
+    {
+      // Check destination is large enough
+      if (param_value_size < result_size)
+      {
+        // TODO: Use API error reporting mechanism
+        return_value = CL_INVALID_VALUE;
+      }
+      else
+      {
+        memcpy(param_value, result_data, result_size);
+      }
+    }
+  }
+  if (param_value_size_ret)
+  {
+    *param_value_size_ret = result_size;
+  }
+  free(result_data);
+  return return_value;
+  // Query build information for a program.
+  //  program              - program to query; NULL yields CL_INVALID_PROGRAM
+  //  device               - not consulted by this body
+  //  param_name           - cl_program_build_info selector; unknown values
+  //                         yield CL_INVALID_VALUE
+  //  param_value_size     - size in bytes of the param_value buffer
+  //  param_value          - optional destination for the property value
+  //  param_value_size_ret - optional, receives the size of the property
+  // String results include the terminating NUL in their size. If
+  // param_value is given but too small, CL_INVALID_VALUE is returned.
+  cl_program             program,
+  cl_device_id           device,
+  cl_program_build_info  param_name,
+  size_t                 param_value_size,
+  void *                 param_value,
+  size_t *               param_value_size_ret
+  // Check program is valid
+  if (!program)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  // Write the size straight through to the caller when requested
+  size_t dummy = 0;
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union
+  {
+    cl_build_status status;
+    cl_program_binary_type type;
+  } result_data;
+  // Non-NULL when the result is a string rather than a value in result_data
+  const char* str = 0;
+  switch (param_name)
+  {
+  case CL_PROGRAM_BUILD_STATUS:
+    result_size = sizeof(cl_build_status);
+    result_data.status = program->program->getBuildStatus();
+    break;
+  case CL_PROGRAM_BUILD_OPTIONS:
+    // NOTE(review): str is only valid if getBuildOptions() returns a
+    // reference to a string that outlives this call - confirm
+    str = program->program->getBuildOptions().c_str();
+    result_size = strlen(str) + 1;
+    break;
+  case CL_PROGRAM_BUILD_LOG:
+    // NOTE(review): same lifetime assumption as for the build options
+    str = program->program->getBuildLog().c_str();
+    result_size = strlen(str) + 1;
+    break;
+  case CL_PROGRAM_BINARY_TYPE:
+    result_size = sizeof(cl_program_binary_type);
+    // Previously left unassigned, so uninitialized bytes were copied out.
+    // NOTE(review): COMPILED_OBJECT is assumed to be the intended value -
+    // confirm against upstream
+    result_data.type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+    break;
+  default:
+    ReturnErrorArg(program->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      if (str)
+        memcpy(param_value, str, result_size);
+      else
+        memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  // Create a kernel object for a named kernel in a program.
+  //  program     - program containing the kernel; must be non-NULL and
+  //                belong to this implementation (CL_INVALID_PROGRAM)
+  //  kernel_name - name of the kernel; NULL yields CL_INVALID_VALUE
+  //  errcode_ret - receives the error code via the SetError macros
+  // Returns the new kernel (reference count 1, program retained) or NULL on
+  // error (CL_INVALID_KERNEL_NAME if the program has no such kernel).
+  cl_program    program,
+  const char *  kernel_name,
+  cl_int *      errcode_ret
+  // Check parameters
+  // program must be tested before its dispatch table can be inspected
+  if (!program || program->dispatch != m_dispatchTable)
+  {
+    SetErrorArg(NULL, CL_INVALID_PROGRAM, program);
+    return NULL;
+  }
+  if (!kernel_name)
+  {
+    SetErrorArg(program->context, CL_INVALID_VALUE, kernel_name);
+    return NULL;
+  }
+  // Create kernel object
+  cl_kernel kernel = new _cl_kernel;
+  kernel->dispatch = m_dispatchTable;
+  kernel->kernel = program->program->createKernel(kernel_name);
+  kernel->program = program;
+  kernel->refCount = 1;
+  if (!kernel->kernel)
+  {
+    SetErrorInfo(program->context, CL_INVALID_KERNEL_NAME,
+                 "Kernel '" << kernel_name << "' not found");
+    delete kernel;
+    return NULL;
+  }
+  clRetainProgram(program);
+  SetError(program->context, CL_SUCCESS);
+  return kernel;
+  cl_program   program,
+  cl_uint      num_kernels,
+  cl_kernel *  kernels,
+  cl_uint *    num_kernels_ret
+  // The program must exist and must have been built successfully
+  if (program == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_PROGRAM, program);
+  }
+  if (program->program->getBuildStatus() != CL_BUILD_SUCCESS)
+  {
+    ReturnErrorInfo(program->context, CL_INVALID_PROGRAM_EXECUTABLE,
+                    "Program not built");
+  }
+  unsigned int available = program->program->getNumKernels();
+  if (kernels != NULL)
+  {
+    // The caller's array must have room for every kernel in the program
+    if (num_kernels < available)
+    {
+      ReturnErrorInfo(program->context, CL_INVALID_VALUE,
+                      "num_kernels is " << num_kernels <<
+                      ", but " << available << " kernels found");
+    }
+    // Create one kernel object per kernel name; each one holds its own
+    // reference on the program
+    int slot = 0;
+    list<string> kernelNames = program->program->getKernelNames();
+    list<string>::iterator name = kernelNames.begin();
+    while (name != kernelNames.end())
+    {
+      cl_kernel created = new _cl_kernel;
+      created->dispatch = m_dispatchTable;
+      created->kernel = program->program->createKernel(*name);
+      created->program = program;
+      created->refCount = 1;
+      kernels[slot] = created;
+      slot++;
+      clRetainProgram(program);
+      name++;
+    }
+  }
+  // Report the kernel count whether or not an output array was supplied
+  if (num_kernels_ret != NULL)
+  {
+    *num_kernels_ret = available;
+  }
+  return CL_SUCCESS;
+  cl_kernel  kernel
+  // A NULL handle cannot be retained and is reported as an invalid kernel
+  if (kernel == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  // Record one additional reference on the kernel
+  kernel->refCount += 1;
+  return CL_SUCCESS;
+  cl_kernel  kernel
+  // A NULL handle cannot be released and is reported as an invalid kernel
+  if (kernel == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  // Drop one reference; on the last one destroy the underlying kernel,
+  // give back the program reference and free the handle
+  kernel->refCount -= 1;
+  if (kernel->refCount == 0)
+  {
+    delete kernel->kernel;
+    clReleaseProgram(kernel->program);
+    delete kernel;
+  }
+  return CL_SUCCESS;
+  // Set the value of one kernel argument.
+  //  kernel    - kernel to modify; NULL yields CL_INVALID_KERNEL
+  //  arg_index - zero-based argument index (CL_INVALID_ARG_INDEX if out of
+  //              range)
+  //  arg_size  - size of the argument in bytes; must match the kernel's
+  //              declared argument size except for samplers and local
+  //              memory arguments (CL_INVALID_ARG_SIZE)
+  //  arg_value - pointer to the argument data: the raw value for private
+  //              arguments, a cl_sampler for sampler_t, a cl_mem for
+  //              global/constant arguments; not read for local arguments
+  // Returns CL_SUCCESS, or CL_INVALID_ARG_VALUE for an unsupported address
+  // space.
+  cl_kernel     kernel,
+  cl_uint       arg_index,
+  size_t        arg_size,
+  const void *  arg_value
+  // Check parameters
+  if (!kernel)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  if (arg_index >= kernel->kernel->getNumArguments())
+  {
+    ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_INDEX,
+                    "arg_index is " << arg_index <<
+                    ", but kernel has " << kernel->kernel->getNumArguments()
+                    << " arguments");
+  }
+  unsigned int addr = kernel->kernel->getArgumentAddressQualifier(arg_index);
+  bool isSampler =
+    kernel->kernel->getArgumentTypeName(arg_index) == "sampler_t";
+  if (kernel->kernel->getArgumentSize(arg_index) != arg_size
+      && !isSampler
+      && addr != CL_KERNEL_ARG_ADDRESS_LOCAL)
+  {
+    ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_SIZE,
+                    "arg_size is " << arg_size << ", but argument should be "
+                    << kernel->kernel->getArgumentSize(arg_index) << " bytes");
+  }
+  // Prepare argument value
+  oclgrind::TypedValue value;
+  value.data = new unsigned char[arg_size];
+  value.size = arg_size;
+  value.num = 1;
+  switch (addr)
+  {
+  case CL_KERNEL_ARG_ADDRESS_PRIVATE:
+    if (isSampler)
+    {
+      // Samplers are passed to the kernel as their 32-bit bitfield
+      memcpy(value.data, &(*(cl_sampler*)arg_value)->sampler, 4);
+    }
+    else
+    {
+      memcpy(value.data, arg_value, arg_size);
+    }
+    break;
+  case CL_KERNEL_ARG_ADDRESS_LOCAL:
+    // Local arguments carry only a size; the buffer came from new[]
+    delete[] value.data;
+    value.data = NULL;
+    break;
+  case CL_KERNEL_ARG_ADDRESS_GLOBAL:
+  case CL_KERNEL_ARG_ADDRESS_CONSTANT:
+    if (arg_value && *(cl_mem*)arg_value)
+    {
+      cl_mem mem = *(cl_mem*)arg_value;
+      if (mem->isImage)
+      {
+        // Create Image struct
+        oclgrind::Image *image = new oclgrind::Image;
+        image->address = mem->address;
+        image->format = ((cl_image*)mem)->format;
+        image->desc = ((cl_image*)mem)->desc;
+        *(oclgrind::Image**)value.data = image;
+      }
+      else
+      {
+        memcpy(value.data, &mem->address, arg_size);
+      }
+      // Remember which memory object is bound to this argument
+      kernel->memArgs[arg_index] = mem;
+    }
+    else
+    {
+      // NULL memory object: pass a null pointer and forget any binding
+      value.setPointer(0);
+      kernel->memArgs.erase(arg_index);
+    }
+    break;
+  default:
+    // Release the staging buffer before bailing out
+    delete[] value.data;
+    ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_VALUE,
+                    "Unsupported address space");
+  }
+  // Set argument
+  kernel->kernel->setArgument(arg_index, value);
+  delete[] value.data;
+  return CL_SUCCESS;
+  cl_kernel       kernel,  // kernel handle to query (NOTE(review): signature line is not in this excerpt; parameters match OpenCL's clGetKernelInfo)
+  cl_kernel_info  param_name,  // selects the property to return
+  size_t          param_value_size,  // capacity of param_value in bytes
+  void *          param_value,  // destination buffer; nothing is copied when null
+  size_t *        param_value_size_ret  // optional; receives the size of the result
+  // Check kernel is valid
+  if (!kernel)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  size_t dummy;  // receives the size when param_value_size_ret is null
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union  // storage for the non-string results
+  {
+    cl_uint cluint;
+    cl_context context;
+    cl_program program;
+  } result_data;
+  const char* str = 0;  // non-null only after a string-valued branch
+  switch (param_name)  // NOTE(review): the case labels are missing from this excerpt
+  {
+    result_size = kernel->kernel->getName().size() + 1;  // + 1 so the copy below includes the terminator from c_str()
+    str = kernel->kernel->getName().c_str();  // NOTE(review): only safe if getName() returns a reference - confirm
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = kernel->kernel->getNumArguments();
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = kernel->refCount;
+    break;
+    result_size = sizeof(cl_context);
+    result_data.context = kernel->program->context;
+    break;
+    result_size = sizeof(cl_program);
+    result_data.program = kernel->program;
+    break;
+    result_size = kernel->kernel->getAttributes().size() + 1;  // + 1 for the terminator, as above
+    str = kernel->kernel->getAttributes().c_str();  // NOTE(review): same lifetime question as getName() above
+    break;
+  default:
+    ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      if (str)  // string results are copied from str, everything else from the union
+        memcpy(param_value, str, result_size);
+      else
+        memcpy(param_value, &result_data, result_size);
+    }
+  }
+  return CL_SUCCESS;
+  cl_kernel           kernel,  // kernel handle to query (NOTE(review): signature line is not in this excerpt; parameters match OpenCL's clGetKernelArgInfo)
+  cl_uint             arg_indx,  // must be below kernel->kernel->getNumArguments()
+  cl_kernel_arg_info  param_name,  // selects the argument property to return
+  size_t              param_value_size,  // capacity of param_value in bytes
+  void *              param_value,  // destination buffer; nothing is copied when null
+  size_t *            param_value_size_ret  // optional; receives the size of the result
+  // Check parameters are valid
+  if (!kernel)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  if (arg_indx >= kernel->kernel->getNumArguments())
+  {
+    ReturnErrorInfo(kernel->program->context, CL_INVALID_ARG_INDEX,
+                    "arg_indx is " << arg_indx <<
+                    ", but kernel has " << kernel->kernel->getNumArguments()
+                    << " arguments");
+  }
+  size_t dummy = 0;  // receives the size when param_value_size_ret is null
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union  // storage for the qualifier results
+  {
+    cl_kernel_arg_address_qualifier addressQual;
+    cl_kernel_arg_access_qualifier accessQual;
+    cl_kernel_arg_type_qualifier typeQual;
+  } result_data;
+  std::string str_data;  // filled only by the string-valued branches
+  switch (param_name)  // NOTE(review): the case labels are missing from this excerpt
+  {
+    result_size = sizeof(cl_kernel_arg_address_qualifier);
+    result_data.addressQual =
+      kernel->kernel->getArgumentAddressQualifier(arg_indx);
+    break;
+    result_size = sizeof(cl_kernel_arg_access_qualifier);
+    result_data.accessQual =
+      kernel->kernel->getArgumentAccessQualifier(arg_indx);
+    break;
+    str_data = kernel->kernel->getArgumentTypeName(arg_indx).str();
+    result_size = str_data.size() + 1;  // + 1 so the copy below includes the terminator from c_str()
+    break;
+    result_size = sizeof(cl_kernel_arg_type_qualifier);
+    result_data.typeQual = kernel->kernel->getArgumentTypeQualifier(arg_indx);
+    break;
+    str_data = kernel->kernel->getArgumentName(arg_indx).str();
+    result_size = str_data.size() + 1;  // + 1 for the terminator, as above
+    break;
+  default:
+    ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }  // NOTE(review): no else here, so the copy below relies on ReturnErrorInfo leaving the function - confirm
+    if (str_data.size())  // NOTE(review): an empty string result takes the result_data path instead
+      memcpy(param_value, str_data.c_str(), result_size);
+    else
+      memcpy(param_value, &result_data, result_size);
+  }
+  return CL_SUCCESS;
+  cl_kernel                  kernel,  // kernel handle to query (NOTE(review): signature line is not in this excerpt; parameters match OpenCL's clGetKernelWorkGroupInfo)
+  cl_device_id               device,  // must be non-null and equal to m_device
+  cl_kernel_work_group_info  param_name,  // selects the property to return
+  size_t                     param_value_size,  // capacity of param_value in bytes
+  void *                     param_value,  // destination buffer; nothing is copied when null
+  size_t *                   param_value_size_ret  // optional; receives the size of the result
+  // Check parameters are valid
+  if (!kernel)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_KERNEL, kernel);
+  }
+  if (!device || device != m_device)
+  {
+    ReturnErrorArg(kernel->program->context, CL_INVALID_DEVICE, device);
+  }
+  size_t dummy;  // receives the size when param_value_size_ret is null
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union  // storage for every result kind produced below
+  {
+    size_t sizet;
+    size_t sizet3[3];
+    cl_ulong clulong;
+  } result_data;
+  switch (param_name)  // NOTE(review): the case labels are missing from this excerpt
+  {
+    ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+                    "CL_KERNEL_GLOBAL_SIZE only valid on custom devices");
+    result_size = sizeof(size_t);
+    result_data.sizet = MAX_WI_SIZE;
+    break;
+    result_size = sizeof(size_t[3]);
+    kernel->kernel->getRequiredWorkGroupSize(result_data.sizet3);  // fills the three-element array in place
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = kernel->kernel->getLocalMemorySize();
+    break;
+    result_size = sizeof(size_t);
+    result_data.sizet = 1;  // fixed value reported for this query
+    break;
+    result_size = sizeof(cl_ulong);
+    result_data.clulong = 0;  // fixed value reported for this query
+    break;
+  default:
+    ReturnErrorArg(kernel->program->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(kernel->program->context, CL_INVALID_VALUE,
+                      ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);  // copy only the bytes of the selected result
+    }
+  }
+  return CL_SUCCESS;
+/* Event Object APIs  */
+  // Tells whether an event is finished: its state is either CL_COMPLETE
+  // or negative (a terminated event).
+  inline bool isComplete(cl_event event)
+  {
+    if (event->event->state == CL_COMPLETE)
+    {
+      return true;
+    }
+    return event->event->state < 0;
+  }
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clWaitForEvents - confirm against the file.
+  // Polls until every listed event is complete or terminated, updating the
+  // owning queues in the meantime, then reports any event that failed.
+  cl_uint           num_events,  // number of entries in event_list; must not be 0
+  const cl_event *  event_list   // events to wait for; must not be null
+  // Check parameters
+  if (!num_events)
+  {
+    ReturnErrorInfo(NULL, CL_INVALID_VALUE, "num_events cannot be 0");
+  }
+  if (!event_list)
+  {
+    ReturnErrorInfo(NULL, CL_INVALID_VALUE, "event_list cannot be NULL");
+  }
+  // Loop until all events complete
+  bool complete = false;
+  while (!complete)
+  {
+    complete = true;
+    for (unsigned i = 0; i < num_events; i++)
+    {
+      // Skip event if already complete
+      if (isComplete(event_list[i]))
+      {
+        continue;
+      }
+      // If it's not a user event, update the queue
+      if (event_list[i]->queue)
+      {
+        oclgrind::Queue::Command *cmd = event_list[i]->queue->queue->update();
+        if (cmd)
+        {
+          asyncQueueRelease(cmd);
+          delete cmd;
+        }
+        // If it's still not complete, update flag
+        if (!isComplete(event_list[i]))
+        {
+          complete = false;
+        }
+      }
+      else
+      {
+        // No queue to update for this event: keep polling until its state
+        // is changed elsewhere
+        complete = false;
+      }
+    }
+  }
+  // Check if any command terminated unsuccessfully
+  for (unsigned i = 0; i < num_events; i++)
+  {
+    if (event_list[i]->event->state < 0)
+    {
+      // OpenCL reports a failed event in the wait list with this error code;
+      // ReturnErrorInfo takes (context, error code, message)
+      ReturnErrorInfo(event_list[i]->context,
+                      CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST,
+                      "Event " << i <<
+                      " terminated with error " << event_list[i]->event->state);
+    }
+  }
+  return CL_SUCCESS;
+  cl_event       event,  // event handle to query (NOTE(review): signature line is not in this excerpt; parameters match OpenCL's clGetEventInfo)
+  cl_event_info  param_name,  // selects the property to return
+  size_t         param_value_size,  // capacity of param_value in bytes
+  void *         param_value,  // destination buffer; nothing is copied when null
+  size_t *       param_value_size_ret  // optional; receives the size of the result
+  // Check event is valid
+  if (!event)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  size_t dummy;  // receives the size when param_value_size_ret is null
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  union  // storage for the result; only some members are written below
+  {
+    cl_command_queue queue;
+    cl_context context;
+    cl_command_type type;
+    cl_int clint;
+    cl_uint cluint;
+    size_t sizet;
+    size_t sizet3[3];
+  } result_data;
+  switch (param_name)  // NOTE(review): the case labels are missing from this excerpt
+  {
+    result_size = sizeof(cl_command_queue);
+    result_data.queue = event->queue;
+    break;
+    result_size = sizeof(cl_context);
+    result_data.context = event->context;
+    break;
+    result_size = sizeof(cl_command_type);
+    result_data.type = event->type;
+    break;
+    result_size = sizeof(cl_int);
+    result_data.clint = event->event->state;  // current execution state of the event
+    break;
+    result_size = sizeof(cl_uint);
+    result_data.cluint = event->refCount;
+    break;
+  default:
+    ReturnErrorArg(event->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(event->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      memcpy(param_value, &result_data, result_size);  // copy only the bytes of the selected result
+    }
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clCreateUserEvent.
+  // Builds an event handle with no queue, type CL_COMMAND_USER, initial state
+  // CL_SUBMITTED and a reference count of 1. Returns NULL for a null context.
+  cl_context  context,
+  cl_int *    errcode_ret
+  // A user event must belong to a context
+  if (context == NULL)
+  {
+    SetErrorArg(NULL, CL_INVALID_CONTEXT, context);
+    return NULL;
+  }
+  // Allocate the handle first, then the underlying event object
+  cl_event user_event = new _cl_event;
+  oclgrind::Event *inner = new oclgrind::Event();
+  inner->state = CL_SUBMITTED;
+  // Fill in the handle; queue stays 0 because no command queue owns it
+  user_event->dispatch = m_dispatchTable;
+  user_event->context = context;
+  user_event->queue = 0;
+  user_event->type = CL_COMMAND_USER;
+  user_event->event = inner;
+  user_event->refCount = 1;
+  SetError(context, CL_SUCCESS);
+  return user_event;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clRetainEvent.
+  // Adds one reference to the event handle; a null handle is rejected with
+  // CL_INVALID_EVENT.
+  cl_event  event
+  if (event == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  ++event->refCount;
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clReleaseEvent.
+  // Drops one reference; when the count reaches zero the underlying event
+  // object and the handle itself are destroyed.
+  cl_event  event
+  if (event == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  // Other references remain: nothing to destroy yet
+  if (--event->refCount != 0)
+  {
+    return CL_SUCCESS;
+  }
+  // Deleting a null pointer is a no-op, so no separate check is needed
+  delete event->event;
+  delete event;
+  return CL_SUCCESS;
+  cl_event  event,
+  cl_int    execution_status
+  // Check parameters
+  if (!event)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  if (event->queue)
+  {
+    ReturnErrorInfo(event->context, CL_INVALID_EVENT, "Not a user event");
+  }
+  if (execution_status != CL_COMPLETE && execution_status >= 0)
+  {
+    ReturnErrorArg(event->context, CL_INVALID_VALUE, execution_status);
+  }
+  if (event->event->state == CL_COMPLETE || event->event->state < 0)
+  {
+    ReturnErrorInfo(event->context, CL_INVALID_OPERATION,
+                    "Event status already set");
+  }
+  event->event->state = execution_status;
+  // Perform callbacks
+  list< pair<void (CL_CALLBACK *)(cl_event, cl_int, void *), void*> >::iterator itr;
+  for (itr = event->callbacks.begin(); itr != event->callbacks.end(); itr++)
+  {
+    itr->first(event, execution_status, itr->second);
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clSetEventCallback.
+  // Appends (pfn_notify, user_data) to the event's callback list after
+  // validating the arguments.
+  cl_event             event,
+  cl_int               command_exec_callback_type,
+  void (CL_CALLBACK *  pfn_notify)(cl_event, cl_int, void*),
+  void *               user_data
+  // Validate the handle and the callback pointer
+  if (event == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  if (pfn_notify == NULL)
+  {
+    ReturnErrorArg(event->context, CL_INVALID_VALUE, pfn_notify);
+  }
+  // Only these three execution states are accepted as callback types
+  if (!(command_exec_callback_type == CL_COMPLETE ||
+        command_exec_callback_type == CL_SUBMITTED ||
+        command_exec_callback_type == CL_RUNNING))
+  {
+    ReturnErrorArg(event->context, CL_INVALID_VALUE,
+                   command_exec_callback_type);
+  }
+  event->callbacks.push_back(make_pair(pfn_notify, user_data));
+  return CL_SUCCESS;
+  cl_event           event,  // event handle to query (NOTE(review): signature line is not in this excerpt; parameters match OpenCL's clGetEventProfilingInfo)
+  cl_profiling_info  param_name,  // selects which timestamp to return
+  size_t             param_value_size,  // capacity of param_value in bytes
+  void *             param_value,  // destination for the cl_ulong result; skipped when null
+  size_t *           param_value_size_ret  // optional; receives the size of the result
+  // Check event is valid
+  if (!event)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_EVENT, event);
+  }
+  if (!event->queue)  // events without a queue have no profiling data
+  {
+    ReturnError(event->context, CL_PROFILING_INFO_NOT_AVAILABLE);
+  }
+  size_t dummy = 0;  // receives the size when param_value_size_ret is null
+  size_t& result_size = param_value_size_ret ? *param_value_size_ret : dummy;
+  cl_ulong result;
+  switch (param_name)  // NOTE(review): the case labels are missing from this excerpt
+  {
+    result_size = sizeof(cl_ulong);
+    result = event->event->queueTime;
+    break;
+    result_size = sizeof(cl_ulong);
+    result = event->event->startTime;
+    break;
+    result_size = sizeof(cl_ulong);
+    result = event->event->startTime;  // same field as the previous branch
+    break;
+    result_size = sizeof(cl_ulong);
+    result = event->event->endTime;
+    break;
+  default:
+    ReturnErrorArg(event->context, CL_INVALID_VALUE, param_name);
+  }
+  if (param_value)
+  {
+    // Check destination is large enough
+    if (param_value_size < result_size)
+    {
+      ReturnErrorInfo(event->context, CL_INVALID_VALUE, ParamValueSizeTooSmall);
+    }
+    else
+    {
+      *(cl_ulong*)param_value = result;  // every branch yields a cl_ulong, so a direct store is used
+    }
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clFlush.
+  // Currently implemented by running clFinish on the queue; the result of
+  // that call is not propagated.
+  cl_command_queue  command_queue
+  // A null queue handle is rejected up front
+  if (command_queue == NULL)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // TODO: Implement properly?
+  (void)clFinish(command_queue);
+  return CL_SUCCESS;
+  cl_command_queue  command_queue
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  while (!command_queue->queue->isEmpty())
+  {
+    // TODO: Move this update to async thread?
+    oclgrind::Queue::Command *cmd = command_queue->queue->update();
+    if (cmd)
+    {
+      asyncQueueRelease(cmd);
+      delete cmd;
+    }
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueReadBuffer.
+  // Validates the request and enqueues a READ BufferCommand covering cb bytes
+  // at buffer->address + offset, with ptr as the host pointer. When
+  // blocking_read is set, returns the result of clFinish on the queue.
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  cl_bool           blocking_read,
+  size_t            offset,
+  size_t            cb,
+  void *            ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!buffer)
+  {
+    // Name the parameter that was actually checked
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+  }
+  if (!ptr)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+  }
+  // Compared without forming offset + cb, so a wrapped-around sum cannot
+  // slip past the bounds check
+  if (offset > buffer->size || cb > buffer->size - offset)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "offset + cb (" << offset << " + " << cb <<
+                    ") exceeds buffer size (" << buffer->size << " bytes)");
+  }
+  if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                    "Buffer flags specify host will not read data");
+  }
+  // Enqueue command
+  oclgrind::Queue::BufferCommand *cmd =
+    new oclgrind::Queue::BufferCommand(oclgrind::Queue::READ);
+  cmd->ptr = (unsigned char*)ptr;
+  cmd->address = buffer->address + offset;
+  cmd->size = cb;
+  asyncQueueRetain(cmd, buffer);
+  asyncEnqueue(command_queue, CL_COMMAND_READ_BUFFER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  if (blocking_read)
+  {
+    return clFinish(command_queue);
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueReadBufferRect.
+  // Validates the request, resolves default pitches, checks that the region
+  // fits in the buffer and enqueues a READ_RECT BufferRectCommand. When
+  // blocking_read is set, returns the result of clFinish on the queue.
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  cl_bool           blocking_read,
+  const size_t *    buffer_origin,
+  const size_t *    host_origin,
+  const size_t *    region,
+  size_t            buffer_row_pitch,
+  size_t            buffer_slice_pitch,
+  size_t            host_row_pitch,
+  size_t            host_slice_pitch,
+  void *            ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!buffer)
+  {
+    // Name the parameter that was actually checked
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+  }
+  if (!ptr)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+  }
+  if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                    "Buffer flags specify host will not read data");
+  }
+  // Compute pitches if necessary (a pitch of 0 means tightly packed)
+  if (buffer_row_pitch == 0)
+  {
+    buffer_row_pitch = region[0];
+  }
+  if (buffer_slice_pitch == 0)
+  {
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+  }
+  if (host_row_pitch == 0)
+  {
+    host_row_pitch = region[0];
+  }
+  if (host_slice_pitch == 0)
+  {
+    host_slice_pitch = region[1] * host_row_pitch;
+  }
+  // Compute origin offsets
+  size_t buffer_offset =
+    buffer_origin[2] * buffer_slice_pitch +
+    buffer_origin[1] * buffer_row_pitch +
+    buffer_origin[0];
+  size_t host_offset =
+    host_origin[2] * host_slice_pitch +
+    host_origin[1] * host_row_pitch +
+    host_origin[0];
+  // Ensure buffer region valid: end is one past the last byte touched
+  size_t end =
+    buffer_offset + region[0] +
+    (region[1]-1) * buffer_row_pitch +
+    (region[2]-1) * buffer_slice_pitch;
+  if (end > buffer->size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "Region exceeds buffer size (" <<
+                    buffer->size << " bytes)");
+  }
+  // Enqueue command; each *_offset array holds
+  // {linear origin offset, row pitch, slice pitch}
+  oclgrind::Queue::BufferRectCommand *cmd =
+    new oclgrind::Queue::BufferRectCommand(oclgrind::Queue::READ_RECT);
+  cmd->ptr = (unsigned char*)ptr;
+  cmd->address = buffer->address;
+  cmd->buffer_offset[0] = buffer_offset;
+  cmd->buffer_offset[1] = buffer_row_pitch;
+  cmd->buffer_offset[2] = buffer_slice_pitch;
+  cmd->host_offset[0] = host_offset;
+  cmd->host_offset[1] = host_row_pitch;
+  cmd->host_offset[2] = host_slice_pitch;
+  memcpy(cmd->region, region, 3*sizeof(size_t));
+  asyncQueueRetain(cmd, buffer);
+  // Rectangular reads have their own command type in OpenCL
+  asyncEnqueue(command_queue, CL_COMMAND_READ_BUFFER_RECT, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  if (blocking_read)
+  {
+    return clFinish(command_queue);
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueWriteBuffer.
+  // Validates the request and enqueues a WRITE BufferCommand covering cb
+  // bytes at buffer->address + offset, with ptr as the host pointer. When
+  // blocking_write is set, returns the result of clFinish on the queue.
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  cl_bool           blocking_write,
+  size_t            offset,
+  size_t            cb,
+  const void *      ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!buffer)
+  {
+    // Name the parameter that was actually checked
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+  }
+  if (!ptr)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+  }
+  // Compared without forming offset + cb, so a wrapped-around sum cannot
+  // slip past the bounds check
+  if (offset > buffer->size || cb > buffer->size - offset)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "offset + cb (" << offset << " + " << cb <<
+                    ") exceeds buffer size (" << buffer->size << " bytes)");
+  }
+  if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                    "Buffer flags specify host will not write data");
+  }
+  // Enqueue command
+  oclgrind::Queue::BufferCommand *cmd =
+    new oclgrind::Queue::BufferCommand(oclgrind::Queue::WRITE);
+  cmd->ptr = (unsigned char*)ptr;
+  cmd->address = buffer->address + offset;
+  cmd->size = cb;
+  asyncQueueRetain(cmd, buffer);
+  asyncEnqueue(command_queue, CL_COMMAND_WRITE_BUFFER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  if (blocking_write)
+  {
+    return clFinish(command_queue);
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueWriteBufferRect.
+  // Validates the request, resolves default pitches, checks that the region
+  // fits in the buffer and enqueues a WRITE_RECT BufferRectCommand. When
+  // blocking_write is set, returns the result of clFinish on the queue.
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  cl_bool           blocking_write,
+  const size_t *    buffer_origin,
+  const size_t *    host_origin,
+  const size_t *    region,
+  size_t            buffer_row_pitch,
+  size_t            buffer_slice_pitch,
+  size_t            host_row_pitch,
+  size_t            host_slice_pitch,
+  const void *      ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!buffer)
+  {
+    // Name the parameter that was actually checked
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+  }
+  if (!ptr)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, ptr);
+  }
+  if (buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                    "Buffer flags specify host will not write data");
+  }
+  // Compute pitches if necessary (a pitch of 0 means tightly packed)
+  if (buffer_row_pitch == 0)
+  {
+    buffer_row_pitch = region[0];
+  }
+  if (buffer_slice_pitch == 0)
+  {
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+  }
+  if (host_row_pitch == 0)
+  {
+    host_row_pitch = region[0];
+  }
+  if (host_slice_pitch == 0)
+  {
+    host_slice_pitch = region[1] * host_row_pitch;
+  }
+  // Compute origin offsets
+  size_t buffer_offset =
+    buffer_origin[2] * buffer_slice_pitch +
+    buffer_origin[1] * buffer_row_pitch +
+    buffer_origin[0];
+  size_t host_offset =
+    host_origin[2] * host_slice_pitch +
+    host_origin[1] * host_row_pitch +
+    host_origin[0];
+  // Ensure buffer region valid: end is one past the last byte touched
+  size_t end =
+    buffer_offset + region[0] +
+    (region[1]-1) * buffer_row_pitch +
+    (region[2]-1) * buffer_slice_pitch;
+  if (end > buffer->size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "Region exceeds buffer size (" <<
+                    buffer->size << " bytes)");
+  }
+  // Enqueue command; each *_offset array holds
+  // {linear origin offset, row pitch, slice pitch}
+  oclgrind::Queue::BufferRectCommand *cmd =
+    new oclgrind::Queue::BufferRectCommand(oclgrind::Queue::WRITE_RECT);
+  cmd->ptr = (unsigned char*)ptr;
+  cmd->address = buffer->address;
+  cmd->buffer_offset[0] = buffer_offset;
+  cmd->buffer_offset[1] = buffer_row_pitch;
+  cmd->buffer_offset[2] = buffer_slice_pitch;
+  cmd->host_offset[0] = host_offset;
+  cmd->host_offset[1] = host_row_pitch;
+  cmd->host_offset[2] = host_slice_pitch;
+  memcpy(cmd->region, region, 3*sizeof(size_t));
+  asyncQueueRetain(cmd, buffer);
+  // Rectangular writes have their own command type in OpenCL
+  asyncEnqueue(command_queue, CL_COMMAND_WRITE_BUFFER_RECT, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  if (blocking_write)
+  {
+    return clFinish(command_queue);
+  }
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueCopyBuffer.
+  // Validates both ranges and enqueues a CopyCommand of cb bytes from
+  // src_buffer->address + src_offset to dst_buffer->address + dst_offset.
+  cl_command_queue  command_queue,
+  cl_mem            src_buffer,
+  cl_mem            dst_buffer,
+  size_t            src_offset,
+  size_t            dst_offset,
+  size_t            cb,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!src_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+  }
+  if (!dst_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+  }
+  // Both range checks avoid forming offset + cb, so a wrapped-around sum
+  // cannot slip past them
+  if (dst_offset > dst_buffer->size || cb > dst_buffer->size - dst_offset)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "dst_offset + cb (" << dst_offset << " + " << cb <<
+                    ") exceeds buffer size (" << dst_buffer->size << " bytes)");
+  }
+  if (src_offset > src_buffer->size || cb > src_buffer->size - src_offset)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "src_offset + cb (" << src_offset << " + " << cb <<
+                    ") exceeds buffer size (" << src_buffer->size << " bytes)");
+  }
+  // Enqueue command
+  oclgrind::Queue::CopyCommand *cmd = new oclgrind::Queue::CopyCommand();
+  cmd->dst = dst_buffer->address + dst_offset;
+  cmd->src = src_buffer->address + src_offset;
+  cmd->size = cb;
+  // Both buffers are retained for the command
+  asyncQueueRetain(cmd, src_buffer);
+  asyncQueueRetain(cmd, dst_buffer);
+  asyncEnqueue(command_queue, CL_COMMAND_COPY_BUFFER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueCopyBufferRect.
+  // Validates the request, resolves default pitches, checks that the region
+  // fits in both buffers and enqueues a CopyRectCommand.
+  cl_command_queue  command_queue,
+  cl_mem            src_buffer,
+  cl_mem            dst_buffer,
+  const size_t *    src_origin,
+  const size_t *    dst_origin,
+  const size_t *    region,
+  size_t            src_row_pitch,
+  size_t            src_slice_pitch,
+  size_t            dst_row_pitch,
+  size_t            dst_slice_pitch,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!src_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+  }
+  if (!dst_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+  }
+  // Compute pitches if necessary (a pitch of 0 means tightly packed)
+  if (src_row_pitch == 0)
+  {
+    src_row_pitch = region[0];
+  }
+  if (src_slice_pitch == 0)
+  {
+    src_slice_pitch = region[1] * src_row_pitch;
+  }
+  if (dst_row_pitch == 0)
+  {
+    dst_row_pitch = region[0];
+  }
+  if (dst_slice_pitch == 0)
+  {
+    dst_slice_pitch = region[1] * dst_row_pitch;
+  }
+  // Compute origin offsets
+  size_t src_offset =
+    src_origin[2] * src_slice_pitch +
+    src_origin[1] * src_row_pitch +
+    src_origin[0];
+  size_t dst_offset =
+    dst_origin[2] * dst_slice_pitch +
+    dst_origin[1] * dst_row_pitch +
+    dst_origin[0];
+  // Ensure buffer region valid: each end is one past the last byte touched
+  size_t src_end =
+    src_offset + region[0] +
+    (region[1]-1) * src_row_pitch +
+    (region[2]-1) * src_slice_pitch;
+  size_t dst_end =
+    dst_offset + region[0] +
+    (region[1]-1) * dst_row_pitch +
+    (region[2]-1) * dst_slice_pitch;
+  if (src_end > src_buffer->size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "Region exceeds source buffer size (" <<
+                    src_buffer->size << " bytes)");
+  }
+  if (dst_end > dst_buffer->size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "Region exceeds destination buffer size (" <<
+                    dst_buffer->size << " bytes)");
+  }
+  // Enqueue command; each *_offset array holds
+  // {linear origin offset, row pitch, slice pitch}
+  oclgrind::Queue::CopyRectCommand *cmd = new oclgrind::Queue::CopyRectCommand();
+  cmd->src = src_buffer->address;
+  cmd->dst = dst_buffer->address;
+  cmd->src_offset[0] = src_offset;
+  cmd->src_offset[1] = src_row_pitch;
+  cmd->src_offset[2] = src_slice_pitch;
+  cmd->dst_offset[0] = dst_offset;
+  cmd->dst_offset[1] = dst_row_pitch;
+  cmd->dst_offset[2] = dst_slice_pitch;
+  memcpy(cmd->region, region, 3*sizeof(size_t));
+  asyncQueueRetain(cmd, src_buffer);
+  asyncQueueRetain(cmd, dst_buffer);
+  // Rectangular copies have their own command type in OpenCL
+  asyncEnqueue(command_queue, CL_COMMAND_COPY_BUFFER_RECT, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // NOTE(review): the signature line is missing from this excerpt; the
+  // parameter list matches OpenCL's clEnqueueFillBuffer.
+  // Validates the request and enqueues a FillBufferCommand that covers cb
+  // bytes at buffer->address + offset using the given pattern.
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  const void *      pattern,       // must not be null
+  size_t            pattern_size,  // must be non-zero and divide offset and cb
+  size_t            offset,
+  size_t            cb,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+  }
+  // Compared without forming offset + cb, so a wrapped-around sum cannot
+  // slip past the bounds check
+  if (offset > buffer->size || cb > buffer->size - offset)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "offset + cb (" << offset << " + " << cb <<
+                    ") exceeds buffer size (" << buffer->size << " bytes)");
+  }
+  if (!pattern)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, pattern);
+  }
+  // The zero check must come before the modulo operations below
+  if (pattern_size == 0)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, pattern_size);
+  }
+  if (offset%pattern_size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "offset (" << offset << ")" <<
+                    " not a multiple of pattern_size (" << pattern_size << ")");
+  }
+  if (cb%pattern_size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "cb (" << cb << ")" <<
+                    " not a multiple of pattern_size (" << pattern_size << ")");
+  }
+  // Enqueue command
+  oclgrind::Queue::FillBufferCommand *cmd =
+    new oclgrind::Queue::FillBufferCommand((const unsigned char*)pattern,
+                                          pattern_size);
+  cmd->address = buffer->address + offset;
+  cmd->size = cb;
+  asyncQueueRetain(cmd, buffer);
+  asyncEnqueue(command_queue, CL_COMMAND_FILL_BUFFER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  cl_command_queue  command_queue,
+  cl_mem            image,
+  const void *      fill_color,
+  const size_t *    origin,
+  const size_t *    region,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+  }
+  if (!fill_color)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, fill_color);
+  }
+  if (!region[0] || !region[1] || !region[2])
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "Values in region cannot be 0");
+  }
+  // Get image dimensions
+  cl_image *img = (cl_image*)image;
+  size_t width = img->desc.image_width;
+  size_t height = img->desc.image_height;
+  size_t depth = img->desc.image_depth;
+  size_t arraySize = img->desc.image_array_size;
+  size_t pixelSize = getPixelSize(&img->format);
+  size_t row_pitch = width * pixelSize;
+  size_t slice_pitch = height * row_pitch;
+  if (img->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    height = arraySize;
+  if (img->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+    depth = arraySize;
+  // Ensure region is within image bounds
+  if (origin[0] + region[0] > width)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "origin[0] + region[0] > width ("
+                    << origin[0] << " + " << region[0] << " > " << width
+                    << " )");
+  }
+  if (origin[1] + region[1] > height)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "origin[1] + region[1] > height ("
+                    << origin[1] << " + " << region[1] << " > " << height
+                    << " )");
+  }
+  if (origin[2] + region[2] > depth)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "origin[2] + region[2] > depth ("
+                    << origin[2] << " + " << region[2] << " > " << depth
+                    << " )");
+  }
+  // Generate color data with correct order and data type
+  unsigned char *color = new unsigned char[pixelSize];
+  for (unsigned output = 0; output < getNumChannels(&img->format); output++)
+  {
+    // Get input channel index
+    int input = output;
+    switch (img->format.image_channel_order)
+    {
+      case CL_R:
+      case CL_Rx:
+      case CL_RG:
+      case CL_RGx:
+      case CL_RGB:
+      case CL_RGBx:
+      case CL_RGBA:
+        break;
+      case CL_BGRA:
+        if (output == 0) input = 2;
+        if (output == 2) input = 0;
+        break;
+      case CL_ARGB:
+        if (output == 0) input = 3;
+        if (output == 1) input = 0;
+        if (output == 2) input = 1;
+        if (output == 3) input = 2;
+        break;
+      case CL_A:
+        if (output == 0) input = 3;
+        break;
+      case CL_RA:
+        if (output == 1) input = 3;
+        break;
+      case CL_INTENSITY:
+      case CL_LUMINANCE:
+        input = 0;
+        break;
+      default:
+        ReturnError(command_queue->context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+    }
+    // Interpret data
+    switch (img->format.image_channel_data_type)
+    {
+    case CL_SNORM_INT8:
+      ((int8_t*)color)[output] =
+        rint(min(max(((float*)fill_color)[input]*127.f, -127.f), 128.f));
+      break;
+    case CL_UNORM_INT8:
+      ((uint8_t*)color)[output] =
+        rint(min(max(((float*)fill_color)[input]*255.f, 0.f), 255.f));
+      break;
+    case CL_SNORM_INT16:
+      ((int16_t*)color)[output] =
+        rint(min(max(((float*)fill_color)[input]*32767.f, -32768.f), 32767.f));
+      break;
+    case CL_UNORM_INT16:
+      ((uint16_t*)color)[output] =
+        rint(min(max(((float*)fill_color)[input]*65535.f, 0.f), 65535.f));
+      break;
+    case CL_FLOAT:
+      ((float*)color)[output] = ((float*)fill_color)[input];
+      break;
+    case CL_HALF_FLOAT:
+      ((uint16_t*)color)[output] = floatToHalf(((float*)fill_color)[input]);
+      break;
+    case CL_SIGNED_INT8:
+      ((int8_t*)color)[output] = ((int32_t*)fill_color)[input];
+      break;
+    case CL_SIGNED_INT16:
+      ((int16_t*)color)[output] = ((int32_t*)fill_color)[input];
+      break;
+    case CL_SIGNED_INT32:
+      ((int32_t*)color)[output] = ((int32_t*)fill_color)[input];
+      break;
+    case CL_UNSIGNED_INT8:
+      ((uint8_t*)color)[output] = ((uint32_t*)fill_color)[input];
+      break;
+    case CL_UNSIGNED_INT16:
+      ((uint16_t*)color)[output] = ((uint32_t*)fill_color)[input];
+      break;
+    case CL_UNSIGNED_INT32:
+      ((uint32_t*)color)[output] = ((uint32_t*)fill_color)[input];
+      break;
+    default:
+      ReturnError(command_queue->context, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+    }
+  }
+  // Enqueue command
+  oclgrind::Queue::FillImageCommand *cmd =
+    new oclgrind::Queue::FillImageCommand(image->address, origin, region,
+                                         row_pitch, slice_pitch,
+                                         pixelSize, color);
+  asyncQueueRetain(cmd, image);
+  asyncEnqueue(command_queue, CL_COMMAND_FILL_IMAGE, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  delete[] color;
+  return CL_SUCCESS;
+  // Parameter list and body of the image read entry point (presumably
+  // clEnqueueReadImage; the signature line is not visible in this excerpt).
+  // The read is expressed as a rectangular buffer read: the x components of
+  // origin/region are converted from pixels to bytes, and the image pitches
+  // are derived from the image descriptor.
+  cl_command_queue  command_queue,
+  cl_mem            image,
+  cl_bool           blocking_read,
+  const size_t *    origin,
+  const size_t *    region,
+  size_t            row_pitch,
+  size_t            slice_pitch,
+  void *            ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+  }
+  cl_image *img = (cl_image*)image;
+  // Convert the x origin/extent from pixels to bytes; y and z stay in rows
+  // and slices.
+  size_t pixelSize = getPixelSize(&img->format);
+  size_t buffer_origin[3] = {origin[0]*pixelSize, origin[1], origin[2]};
+  size_t pixel_region[3] = {region[0]*pixelSize, region[1], region[2]};
+  size_t host_origin[3] = {0, 0, 0};
+  size_t img_row_pitch = img->desc.image_width * pixelSize;
+  size_t img_slice_pitch = img->desc.image_height * img_row_pitch;
+  // A host pitch of 0 means "tightly packed": derive it from the region.
+  if (row_pitch == 0)
+  {
+    row_pitch = pixel_region[0];
+  }
+  if (slice_pitch == 0)
+  {
+    slice_pitch = pixel_region[1] * row_pitch;
+  }
+  // Enqueue read
+  cl_int ret = clEnqueueReadBufferRect(
+    command_queue, image, blocking_read,
+    buffer_origin, host_origin, pixel_region,
+    img_row_pitch, img_slice_pitch, row_pitch, slice_pitch,
+    ptr, num_events_in_wait_list, event_wait_list, event);
+  // Retag the event only when the read was actually enqueued; on failure
+  // *event may never have been written and must not be dereferenced.
+  if (event && ret == CL_SUCCESS)
+  {
+    (*event)->type = CL_COMMAND_READ_IMAGE;
+  }
+  return ret;
+  // Parameter list and body of the image write entry point (presumably
+  // clEnqueueWriteImage; the signature line is not visible in this excerpt).
+  // The write is expressed as a rectangular buffer write: the x components
+  // of origin/region are converted from pixels to bytes, and the image
+  // pitches are derived from the image descriptor.
+  cl_command_queue  command_queue,
+  cl_mem            image,
+  cl_bool           blocking_write,
+  const size_t *    origin,
+  const size_t *    region,
+  size_t            input_row_pitch,
+  size_t            input_slice_pitch,
+  const void *      ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+  }
+  cl_image *img = (cl_image*)image;
+  // Convert the x origin/extent from pixels to bytes; y and z stay in rows
+  // and slices.
+  size_t pixelSize = getPixelSize(&img->format);
+  size_t buffer_origin[3] = {origin[0]*pixelSize, origin[1], origin[2]};
+  size_t pixel_region[3] = {region[0]*pixelSize, region[1], region[2]};
+  size_t host_origin[3] = {0, 0, 0};
+  size_t img_row_pitch = img->desc.image_width * pixelSize;
+  size_t img_slice_pitch = img->desc.image_height * img_row_pitch;
+  // A host pitch of 0 means "tightly packed": derive it from the region.
+  if (input_row_pitch == 0)
+  {
+    input_row_pitch = pixel_region[0];
+  }
+  if (input_slice_pitch == 0)
+  {
+    input_slice_pitch = pixel_region[1] * input_row_pitch;
+  }
+  // Enqueue write
+  cl_int ret = clEnqueueWriteBufferRect(
+    command_queue, image, blocking_write,
+    buffer_origin, host_origin, pixel_region,
+    img_row_pitch, img_slice_pitch, input_row_pitch, input_slice_pitch,
+    ptr, num_events_in_wait_list, event_wait_list, event);
+  // Retag the event only when the write was actually enqueued; on failure
+  // *event may never have been written and must not be dereferenced.
+  if (event && ret == CL_SUCCESS)
+  {
+    (*event)->type = CL_COMMAND_WRITE_IMAGE;
+  }
+  return ret;
+  // Parameter list and body of the image-to-image copy entry point
+  // (presumably clEnqueueCopyImage; the signature line is not visible in
+  // this excerpt). Both images must share channel order and data type; the
+  // copy itself is delegated to a rectangular buffer copy with x
+  // coordinates converted from pixels to bytes.
+  cl_command_queue  command_queue,
+  cl_mem            src_image,
+  cl_mem            dst_image,
+  const size_t *    src_origin,
+  const size_t *    dst_origin,
+  const size_t *    region,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!src_image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_image);
+  }
+  if (!dst_image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_image);
+  }
+  cl_image *src = (cl_image*)src_image;
+  cl_image *dst = (cl_image*)dst_image;
+  // Source and destination formats must match exactly
+  if (src->format.image_channel_order != dst->format.image_channel_order)
+  {
+    ReturnErrorInfo(command_queue->context, CL_IMAGE_FORMAT_MISMATCH,
+                    "Channel orders do not match");
+  }
+  if (src->format.image_channel_data_type != dst->format.image_channel_data_type)
+  {
+    ReturnErrorInfo(command_queue->context, CL_IMAGE_FORMAT_MISMATCH,
+                    "Channel data types do not match");
+  }
+  // Convert x origins/extent from pixels to bytes and derive the pitches of
+  // each image from its descriptor.
+  size_t srcPixelSize = getPixelSize(&src->format);
+  size_t dstPixelSize = getPixelSize(&dst->format);
+  size_t src_pixel_origin[3] = {src_origin[0]*srcPixelSize,
+                                src_origin[1], src_origin[2]};
+  size_t dst_pixel_origin[3] = {dst_origin[0]*dstPixelSize,
+                                dst_origin[1], dst_origin[2]};
+  size_t pixel_region[3] = {region[0]*srcPixelSize, region[1], region[2]};
+  size_t src_row_pitch = src->desc.image_width * srcPixelSize;
+  size_t src_slice_pitch = src->desc.image_height * src_row_pitch;
+  size_t dst_row_pitch = dst->desc.image_width * dstPixelSize;
+  size_t dst_slice_pitch = dst->desc.image_height * dst_row_pitch;
+  // Enqueue copy
+  cl_int ret = clEnqueueCopyBufferRect(
+    command_queue, src_image, dst_image,
+    src_pixel_origin, dst_pixel_origin, pixel_region,
+    src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
+    num_events_in_wait_list, event_wait_list, event);
+  // Retag the event only when the copy was actually enqueued; on failure
+  // *event may never have been written and must not be dereferenced.
+  if (event && ret == CL_SUCCESS)
+  {
+    (*event)->type = CL_COMMAND_COPY_IMAGE;
+  }
+  return ret;
+  // Parameter list and body of the image-to-buffer copy entry point
+  // (presumably clEnqueueCopyImageToBuffer; the signature line is not
+  // visible in this excerpt). The copy is delegated to a rectangular buffer
+  // copy; the destination is addressed as {dst_offset, 0, 0} with pitches
+  // of 0 passed through to the rect copy.
+  cl_command_queue  command_queue,
+  cl_mem            src_image,
+  cl_mem            dst_buffer,
+  const size_t *    src_origin,
+  const size_t *    region,
+  size_t            dst_offset,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!src_image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_image);
+  }
+  if (!dst_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_buffer);
+  }
+  cl_image *src = (cl_image*)src_image;
+  // Convert the x origin/extent from pixels to bytes and derive the image
+  // pitches from its descriptor.
+  size_t pixel_size = getPixelSize(&src->format);
+  size_t src_pixel_origin[3] = {src_origin[0]*pixel_size,
+                                src_origin[1], src_origin[2]};
+  size_t src_row_pitch = src->desc.image_width * pixel_size;
+  size_t src_slice_pitch = src->desc.image_height * src_row_pitch;
+  size_t pixel_region[3] = {region[0]*pixel_size, region[1], region[2]};
+  size_t dst_origin[3] = {dst_offset, 0, 0};
+  // Enqueue copy
+  cl_int ret = clEnqueueCopyBufferRect(
+    command_queue, src_image, dst_buffer,
+    src_pixel_origin, dst_origin, pixel_region,
+    src_row_pitch, src_slice_pitch, 0, 0,
+    num_events_in_wait_list, event_wait_list, event);
+  // Retag the event only when the copy was actually enqueued; on failure
+  // *event may never have been written and must not be dereferenced.
+  if (event && ret == CL_SUCCESS)
+  {
+    (*event)->type = CL_COMMAND_COPY_IMAGE_TO_BUFFER;
+  }
+  return ret;
+  // Parameter list and body of the buffer-to-image copy entry point
+  // (presumably clEnqueueCopyBufferToImage; the signature line is not
+  // visible in this excerpt). The copy is delegated to a rectangular buffer
+  // copy; the source is addressed as {src_offset, 0, 0} with pitches of 0
+  // passed through to the rect copy.
+  cl_command_queue  command_queue,
+  cl_mem            src_buffer,
+  cl_mem            dst_image,
+  size_t            src_offset,
+  const size_t *    dst_origin,
+  const size_t *    region,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!src_buffer)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, src_buffer);
+  }
+  if (!dst_image)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, dst_image);
+  }
+  cl_image *dst = (cl_image*)dst_image;
+  // Convert the x origin/extent from pixels to bytes and derive the image
+  // pitches from its descriptor.
+  size_t pixel_size = getPixelSize(&dst->format);
+  size_t dst_pixel_origin[3] = {dst_origin[0]*pixel_size,
+                                dst_origin[1], dst_origin[2]};
+  size_t dst_row_pitch = dst->desc.image_width * pixel_size;
+  size_t dst_slice_pitch = dst->desc.image_height * dst_row_pitch;
+  size_t pixel_region[3] = {region[0]*pixel_size, region[1], region[2]};
+  size_t src_origin[3] = {src_offset, 0, 0};
+  // Enqueue copy
+  cl_int ret = clEnqueueCopyBufferRect(
+    command_queue, src_buffer, dst_image,
+    src_origin, dst_pixel_origin, pixel_region,
+    0, 0, dst_row_pitch, dst_slice_pitch,
+    num_events_in_wait_list, event_wait_list, event);
+  // Retag the event only when the copy was actually enqueued; on failure
+  // *event may never have been written and must not be dereferenced.
+  if (event && ret == CL_SUCCESS)
+  {
+    (*event)->type = CL_COMMAND_COPY_BUFFER_TO_IMAGE;
+  }
+  return ret;
+  // Parameter list and body of the buffer mapping entry point (presumably
+  // clEnqueueMapBuffer; the signature line is not visible in this excerpt).
+  // Returns the pointer obtained from mapBuffer() for [offset, offset+cb),
+  // or NULL after reporting an error through the SetError* macros
+  // (presumably via errcode_ret -- the macros are defined elsewhere).
+  cl_command_queue  command_queue,
+  cl_mem            buffer,
+  cl_bool           blocking_map,
+  cl_map_flags      map_flags,
+  size_t            offset,
+  size_t            cb,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event,
+  cl_int *          errcode_ret
+  // Check parameters
+  if (!command_queue)
+  {
+    SetErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+    return NULL;
+  }
+  if (!buffer)
+  {
+    SetErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, buffer);
+    return NULL;
+  }
+  // Reject mappings that contradict the host-access flags of the buffer
+  if (map_flags & CL_MAP_WRITE &&
+      buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                 "Buffer flags specify host will not write data");
+    return NULL;
+  }
+  if (map_flags & CL_MAP_READ &&
+      buffer->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                 "Buffer flags specify host will not read data");
+    return NULL;
+  }
+  // Check map region
+  if (offset + cb > buffer->size)
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                 "offset + cb (" << offset << " + " << cb <<
+                 ") exceeds buffer size (" << buffer->size << " bytes)");
+    return NULL;
+  }
+  // Map buffer
+  void *ptr = buffer->context->context->getGlobalMemory()->mapBuffer(
+    buffer->address, offset, cb);
+  if (ptr == NULL)
+  {
+    SetError(command_queue->context, CL_INVALID_VALUE);
+    return NULL;
+  }
+  // Enqueue command (an empty command that retains the buffer and carries
+  // the wait list / output event)
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncQueueRetain(cmd, buffer);
+  asyncEnqueue(command_queue, CL_COMMAND_MAP_BUFFER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  SetError(command_queue->context, CL_SUCCESS);
+  // For a blocking map the reported status is the result of clFinish()
+  if (blocking_map)
+  {
+    SetError(command_queue->context, clFinish(command_queue));
+  }
+  return ptr;
+  // Parameter list and body of the image mapping entry point (presumably
+  // clEnqueueMapImage; the signature line is not visible in this excerpt).
+  // Validates the request, maps the byte range covering the requested
+  // region, writes the image pitches to image_row_pitch (required) and
+  // image_slice_pitch (optional), and returns the mapped pointer. Every
+  // error path reports through the SetError* macros and returns NULL.
+  cl_command_queue  command_queue,
+  cl_mem            image,
+  cl_bool           blocking_map,
+  cl_map_flags      map_flags,
+  const size_t *    origin,
+  const size_t *    region,
+  size_t *          image_row_pitch,
+  size_t *          image_slice_pitch,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event,
+  cl_int *          errcode_ret
+  // Check parameters
+  if (!command_queue)
+  {
+    SetErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+    return NULL;
+  }
+  if (!image)
+  {
+    SetErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, image);
+    return NULL;
+  }
+  if (!image_row_pitch)
+  {
+    SetErrorArg(command_queue->context, CL_INVALID_VALUE, image_row_pitch);
+    return NULL;
+  }
+  // Reject mappings that contradict the host-access flags of the image
+  if (map_flags & CL_MAP_WRITE &&
+      image->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_READ_ONLY))
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                 "Image flags specify host will not write data");
+    return NULL;
+  }
+  if (map_flags & CL_MAP_READ &&
+      image->flags & (CL_MEM_HOST_NO_ACCESS | CL_MEM_HOST_WRITE_ONLY))
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_OPERATION,
+                 "Image flags specify host will not read data");
+    return NULL;
+  }
+  // A zero extent must stop here: the size computation below evaluates
+  // (region[i]-1), which would wrap around for 0.
+  if (!region[0] || !region[1] || !region[2])
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                 "Values in region cannot be 0");
+    return NULL;
+  }
+  // Get image dimensions
+  cl_image *img = (cl_image*)image;
+  size_t width = img->desc.image_width;
+  size_t height = img->desc.image_height;
+  size_t depth = img->desc.image_depth;
+  size_t arraySize = img->desc.image_array_size;
+  size_t pixelSize = getPixelSize(&img->format);
+  size_t row_pitch = width * pixelSize;
+  size_t slice_pitch = height * row_pitch;
+  // For array images the array size takes the place of the outermost
+  // dimension in the bounds checks below.
+  if (img->desc.image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    height = arraySize;
+  if (img->desc.image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+    depth = arraySize;
+  // Ensure region is within image bounds; bail out on the first violation
+  // so an out-of-range region is never mapped.
+  if (origin[0] + region[0] > width)
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                 "origin[0] + region[0] > width ("
+                 << origin[0] << " + " << region[0] << " > " << width
+                 << " )");
+    return NULL;
+  }
+  if (origin[1] + region[1] > height)
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                 "origin[1] + region[1] > height ("
+                 << origin[1] << " + " << region[1] << " > " << height
+                 << " )");
+    return NULL;
+  }
+  if (origin[2] + region[2] > depth)
+  {
+    SetErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                 "origin[2] + region[2] > depth ("
+                 << origin[2] << " + " << region[2] << " > " << depth
+                 << " )");
+    return NULL;
+  }
+  // Compute byte offset and size: the span runs from the first byte of the
+  // first pixel to the last byte of the last pixel of the region.
+  size_t offset = origin[0] * pixelSize
+                + origin[1] * row_pitch
+                + origin[2] * slice_pitch;
+  size_t size = region[0] * pixelSize
+              + (region[1]-1) * row_pitch
+              + (region[2]-1) * slice_pitch;
+  // Map image
+  void *ptr = image->context->context->getGlobalMemory()->mapBuffer(
+        image->address, offset, size);
+  if (ptr == NULL)
+  {
+    SetError(command_queue->context, CL_INVALID_VALUE);
+    return NULL;
+  }
+  *image_row_pitch = row_pitch;
+  if (image_slice_pitch)
+  {
+    *image_slice_pitch = slice_pitch;
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncQueueRetain(cmd, image);
+  asyncEnqueue(command_queue, CL_COMMAND_MAP_IMAGE, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  SetError(command_queue->context, CL_SUCCESS);
+  // For a blocking map the reported status is the result of clFinish()
+  if (blocking_map)
+  {
+    SetError(command_queue->context, clFinish(command_queue));
+  }
+  return ptr;
+  // Parameter list and body of the unmap entry point (presumably
+  // clEnqueueUnmapMemObject; the signature line is not visible in this
+  // excerpt). Enqueues an empty command that retains memobj; mapped_ptr is
+  // not inspected by the statements visible here.
+  cl_command_queue  command_queue,
+  cl_mem            memobj,
+  void *            mapped_ptr,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!memobj)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_MEM_OBJECT, memobj);
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncQueueRetain(cmd, memobj);
+  asyncEnqueue(command_queue, CL_COMMAND_UNMAP_MEM_OBJECT, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list and body of the memory migration entry point (presumably
+  // clEnqueueMigrateMemObjects; the signature line is not visible in this
+  // excerpt). Only an empty command is enqueued: num_mem_objects,
+  // mem_objects and flags are not used by the statements visible here.
+  cl_command_queue        command_queue,
+  cl_uint                 num_mem_objects,
+  const cl_mem *          mem_objects,
+  cl_mem_migration_flags  flags,
+  cl_uint                 num_events_in_wait_list,
+  const cl_event *        event_wait_list,
+  cl_event *              event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncEnqueue(command_queue, CL_COMMAND_MIGRATE_MEM_OBJECTS, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list and body of the kernel launch entry point (presumably
+  // clEnqueueNDRangeKernel; the signature line is not visible in this
+  // excerpt). Validates the launch configuration, snapshots the kernel
+  // into a KernelCommand together with the global size/offset and local
+  // size, and enqueues it.
+  cl_command_queue  command_queue,
+  cl_kernel         kernel,
+  cl_uint           work_dim,
+  const size_t *    global_work_offset,
+  const size_t *    global_work_size,
+  const size_t *    local_work_size,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // kernel is dereferenced below, so it must be validated first
+  if (!kernel)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_KERNEL, kernel);
+  }
+  if (work_dim < 1 || work_dim > 3)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_DIMENSION,
+                    "Kernels must be 1, 2 or 3 dimensional (work_dim = "
+                    << work_dim << ")");
+  }
+  if (!global_work_size)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_GLOBAL_WORK_SIZE,
+                    "global_work_size cannot be NULL");
+  }
+  // Check global and local sizes are valid
+  for (unsigned i = 0; i < work_dim; i++)
+  {
+    if (!global_work_size[i])
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_GLOBAL_WORK_SIZE,
+                      "global_work_size[" << i << "] = 0");
+    }
+    // A zero local size would make the modulo below divide by zero
+    if (local_work_size && !local_work_size[i])
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
+                      "local_work_size[" << i << "] = 0");
+    }
+    if (local_work_size && global_work_size[i] % local_work_size[i])
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_WORK_GROUP_SIZE,
+                      "Dimension " << i <<
+                      ": local_work_size (" << local_work_size[i] <<
+                      ") does not divide global_work_size (" <<
+                      global_work_size[i] << ")");
+    }
+  }
+  // Ensure all arguments have been set
+  if (!kernel->kernel->allArgumentsSet())
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_KERNEL_ARGS,
+                    "Not all kernel arguments set");
+  }
+  // Set-up offsets and sizes. Unused dimensions keep the defaults
+  // (size 1, offset 0); only the first work_dim entries are overwritten.
+  oclgrind::Queue::KernelCommand *cmd = new oclgrind::Queue::KernelCommand();
+  // The command gets its own copy of the kernel object
+  cmd->kernel = new oclgrind::Kernel(*kernel->kernel);
+  cmd->work_dim = work_dim;
+  cmd->globalSize   = oclgrind::Size3(1, 1, 1);
+  cmd->globalOffset = oclgrind::Size3(0, 0, 0);
+  cmd->localSize    = oclgrind::Size3(1, 1, 1);
+  memcpy(&cmd->globalSize, global_work_size, work_dim*sizeof(size_t));
+  if (global_work_offset)
+  {
+    memcpy(&cmd->globalOffset, global_work_offset, work_dim*sizeof(size_t));
+  }
+  if (local_work_size)
+  {
+    memcpy(&cmd->localSize, local_work_size, work_dim*sizeof(size_t));
+  }
+  // Enqueue command
+  asyncQueueRetain(cmd, kernel);
+  asyncEnqueue(command_queue, CL_COMMAND_NDRANGE_KERNEL, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list and body of the single-task launch entry point
+  // (presumably clEnqueueTask; the signature line is not visible in this
+  // excerpt). Forwards to clEnqueueNDRangeKernel as a 1-dimensional launch
+  // with global and local size both equal to 1 and no global offset.
+  cl_command_queue  command_queue,
+  cl_kernel         kernel,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  size_t work = 1;
+  return clEnqueueNDRangeKernel(command_queue, kernel, 1,
+                                NULL, &work, &work,
+                                num_events_in_wait_list,
+                                event_wait_list,
+                                event);
+  // Parameter list and body of the native kernel entry point (presumably
+  // clEnqueueNativeKernel; the signature line is not visible in this
+  // excerpt). Validates the argument combination, writes the host pointer
+  // of each memory object to the location given by args_mem_loc, then
+  // enqueues a NativeKernelCommand built from user_func/args/cb_args.
+  cl_command_queue  command_queue,
+  void (CL_CALLBACK *user_func)(void *),
+  void *            args,
+  size_t            cb_args,
+  cl_uint           num_mem_objects,
+  const cl_mem *    mem_list,
+  const void **     args_mem_loc,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  if (!user_func)
+  {
+    ReturnErrorArg(command_queue->context, CL_INVALID_VALUE, user_func);
+  }
+  // args and cb_args must be either both provided or both empty
+  if (!args && (cb_args > 0 || num_mem_objects > 0))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                   "args is NULL but cb_args|num_mem_objects >0");
+  }
+  if (args && cb_args == 0)
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "args is non-NULL but cb_args is 0");
+  }
+  // mem_list/args_mem_loc must be consistent with num_mem_objects
+  if (num_mem_objects > 0 && (!mem_list || !args_mem_loc))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "num_mem_objects >0 but mem_list|args_mem_loc is NULL");
+  }
+  if (num_mem_objects == 0 && (mem_list || args_mem_loc))
+  {
+    ReturnErrorInfo(command_queue->context, CL_INVALID_VALUE,
+                    "num_mem_objects is 0 but mem_list|args_mem_loc not NULL");
+  }
+  // Replace mem objects with real pointers
+  oclgrind::Memory *memory = command_queue->context->context->getGlobalMemory();
+  for (unsigned i = 0; i < num_mem_objects; i++)
+  {
+    if (!mem_list[i])
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_MEM_OBJECT,
+                      "Memory object " << i << " is NULL");
+    }
+    void *addr = memory->getPointer(mem_list[i]->address);
+    if (addr == NULL)
+    {
+      ReturnErrorInfo(command_queue->context, CL_INVALID_MEM_OBJECT,
+                      "Memory object " << i << " not valid");
+    }
+    // Store the resolved pointer at the caller-designated location
+    memcpy((void*)args_mem_loc[i], &addr, sizeof(void*));
+  }
+  // Create command (done after the pointer patching above)
+  oclgrind::Queue::NativeKernelCommand *cmd =
+    new oclgrind::Queue::NativeKernelCommand(user_func, args, cb_args);
+  // Retain memory objects
+  for (unsigned i = 0; i < num_mem_objects; i++)
+  {
+    asyncQueueRetain(cmd, mem_list[i]);
+  }
+  // Enqueue commands
+  asyncEnqueue(command_queue, CL_COMMAND_NATIVE_KERNEL, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list and body of a platform-scoped function lookup
+  // (presumably clGetExtensionFunctionAddressForPlatform; the signature
+  // line is not visible in this excerpt). Always returns NULL.
+  cl_platform_id  platform,
+  const char *    func_name
+  return NULL;
+  // Parameter list and body of the marker entry point (presumably
+  // clEnqueueMarkerWithWaitList; the signature line is not visible in this
+  // excerpt). Enqueues an empty command tagged CL_COMMAND_MARKER that
+  // carries the wait list and output event.
+  cl_command_queue  command_queue,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncEnqueue(command_queue, CL_COMMAND_MARKER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list and body of the barrier entry point (presumably
+  // clEnqueueBarrierWithWaitList; the signature line is not visible in this
+  // excerpt). Enqueues an empty command tagged CL_COMMAND_BARRIER that
+  // carries the wait list and output event.
+  cl_command_queue  command_queue,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  // Check parameters
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncEnqueue(command_queue, CL_COMMAND_BARRIER, cmd,
+               num_events_in_wait_list, event_wait_list, event);
+  return CL_SUCCESS;
+  // Parameter list of an entry point that registers a notification callback
+  // on a context. NOTE(review): neither the signature line nor any body
+  // statement is visible for this definition in this excerpt -- confirm
+  // against the full source before relying on its behaviour.
+  cl_context           context,
+  void (CL_CALLBACK *  pfn_notify)(cl_context, cl_uint, char*, void*),
+  void *               user_data
+  // Parameter list and body of the legacy marker entry point (presumably
+  // clEnqueueMarker; the signature line is not visible in this excerpt).
+  // Forwards to clEnqueueMarkerWithWaitList with an empty wait list.
+  cl_command_queue  command_queue,
+  cl_event *        event
+  return clEnqueueMarkerWithWaitList(command_queue, 0, NULL, event);
+  // Parameter list and body of the legacy event-wait entry point (presumably
+  // clEnqueueWaitForEvents; the signature line is not visible in this
+  // excerpt). Enqueues an empty command tagged CL_COMMAND_BARRIER that
+  // waits on event_list; no output event is requested (NULL is passed).
+  cl_command_queue  command_queue,
+  cl_uint           num_events,
+  const cl_event *  event_list
+  if (!command_queue)
+  {
+    ReturnErrorArg(NULL, CL_INVALID_COMMAND_QUEUE, command_queue);
+  }
+  // Enqueue command
+  oclgrind::Queue::Command *cmd = new oclgrind::Queue::Command();
+  asyncEnqueue(command_queue, CL_COMMAND_BARRIER, cmd,
+               num_events, event_list, NULL);
+  return CL_SUCCESS;
+  // Parameter list and body of the legacy barrier entry point (presumably
+  // clEnqueueBarrier; the signature line is not visible in this excerpt).
+  // Forwards to clEnqueueBarrierWithWaitList with no wait list and no
+  // output event.
+  cl_command_queue  command_queue
+  return clEnqueueBarrierWithWaitList(command_queue, 0, NULL, NULL);
+  // CL/GL interop object-creation stubs (signature lines are not visible in
+  // this excerpt). Each one reports CL_INVALID_CONTEXT with a
+  // "not implemented" message and returns NULL; none of the parameters
+  // other than errcode_ret (presumably written by SetErrorInfo) is used.
+  //
+  // Create from a GL buffer object.
+  cl_context    context,
+  cl_mem_flags  flags,
+  cl_GLuint     bufret_mem,
+  int *         errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+  // Create from a GL texture (target/miplevel/texture).
+  cl_context    context,
+  cl_mem_flags  flags,
+  cl_GLenum     target,
+  cl_GLint      miplevel,
+  cl_GLuint     texture,
+  cl_int *      errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+  // Create from a GL texture (second variant with the same parameters).
+  cl_context    context,
+  cl_mem_flags  flags,
+  cl_GLenum     target,
+  cl_GLint      miplevel,
+  cl_GLuint     texture,
+  cl_int *      errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+  // Create from a GL texture (third variant with the same parameters).
+  cl_context    context,
+  cl_mem_flags  flags,
+  cl_GLenum     target,
+  cl_GLint      miplevel,
+  cl_GLuint     texture,
+  cl_int *      errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+  // Create from a GL renderbuffer.
+  cl_context    context,
+  cl_mem_flags  flags,
+  cl_GLuint     renderbuffer,
+  cl_int *      errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+  // CL/GL interop query stub (presumably clGetGLObjectInfo; the signature
+  // line is not visible in this excerpt). Always reports
+  // CL_INVALID_MEM_OBJECT; the output parameters are not written.
+  cl_mem               memobj,
+  cl_gl_object_type *  gl_object_type,
+  cl_GLuint *          gl_object_name
+  ReturnErrorInfo(NULL, CL_INVALID_MEM_OBJECT, "CL/GL interop not implemented");
+  // Remaining CL/GL interop stubs (signature lines are not visible in this
+  // excerpt). Each one only reports an error with a "not implemented"
+  // message; none of the parameters is otherwise used.
+  //
+  // GL texture info query: reports CL_INVALID_MEM_OBJECT.
+  cl_mem              memobj,
+  cl_gl_texture_info  param_name,
+  size_t              param_value_size,
+  void *              param_value,
+  size_t *            param_value_size_ret
+  ReturnErrorInfo(NULL, CL_INVALID_MEM_OBJECT, "CL/GL interop not implemented");
+  // Enqueue operation on GL objects: reports CL_INVALID_CONTEXT.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  // Second enqueue operation on GL objects: reports CL_INVALID_CONTEXT.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  // GL context info query: reports CL_INVALID_OPERATION.
+  const cl_context_properties *  properties,
+  cl_gl_context_info             param_name,
+  size_t                         param_value_size,
+  void *                         param_value,
+  size_t *                       param_value_size_ret
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/GL interop not implemented");
+  // Event creation from a GL sync object: reports CL_INVALID_CONTEXT and
+  // returns NULL.
+  cl_context  context,
+  cl_GLsync   cl_GLsync,
+  cl_int *    errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/GL interop not implemented");
+  return NULL;
+#if defined(_WIN32) && !defined(__MINGW32__) // DX extension functions
+  // CL/DirectX interop stubs, compiled only for non-MinGW Windows builds
+  // (signature lines are not visible in this excerpt). Every stub only
+  // reports an error with a "not implemented" message; object-creation
+  // stubs additionally return NULL. No parameter is otherwise used.
+  //
+  // --- Direct3D 10 ---
+  // Device enumeration: reports CL_INVALID_OPERATION.
+  cl_platform_id              platform,
+  cl_d3d10_device_source_khr  d3d_device_source,
+  void *                      d3d_object,
+  cl_d3d10_device_set_khr     d3d_device_set,
+  cl_uint                     num_entries,
+  cl_device_id *              devices,
+  cl_uint *                   num_devices
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Create from D3D10 buffer: reports CL_INVALID_CONTEXT, returns NULL.
+  cl_context      context,
+  cl_mem_flags    flags,
+  ID3D10Buffer *  resource,
+  cl_int *        errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+  return NULL;
+  // Create from D3D10 2D texture: reports CL_INVALID_OPERATION, returns NULL.
+  cl_context         context,
+  cl_mem_flags       flags,
+  ID3D10Texture2D *  resource,
+  UINT               subresource,
+  cl_int *           errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  return NULL;
+  // Create from D3D10 3D texture: reports CL_INVALID_OPERATION, returns NULL.
+  cl_context         context,
+  cl_mem_flags       flags,
+  ID3D10Texture3D *  resource,
+  UINT               subresource,
+  cl_int *           errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  return NULL;
+  // Enqueue operation on D3D10 objects: reports CL_INVALID_OPERATION.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Second enqueue operation on D3D10 objects: reports CL_INVALID_OPERATION.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // --- Direct3D 11 ---
+  // Device enumeration: reports CL_INVALID_OPERATION.
+  cl_platform_id              platform,
+  cl_d3d11_device_source_khr  d3d_device_source,
+  void *                      d3d_object,
+  cl_d3d11_device_set_khr     d3d_device_set,
+  cl_uint                     num_entries,
+  cl_device_id *              devices,
+  cl_uint *                   num_devices
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Create from D3D11 buffer: reports CL_INVALID_CONTEXT, returns NULL.
+  cl_context      context,
+  cl_mem_flags    flags,
+  ID3D11Buffer *  resource,
+  cl_int *        errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+  return NULL;
+  // Create from D3D11 2D texture: reports CL_INVALID_OPERATION, returns NULL.
+  cl_context         context,
+  cl_mem_flags       flags,
+  ID3D11Texture2D *  resource,
+  UINT               subresource,
+  cl_int *           errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  return NULL;
+  // Create from D3D11 3D texture: reports CL_INVALID_OPERATION, returns NULL.
+  cl_context         context,
+  cl_mem_flags       flags,
+  ID3D11Texture3D *  resource,
+  UINT               subresource,
+  cl_int *           errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  return NULL;
+  // Enqueue operation on D3D11 objects: reports CL_INVALID_OPERATION.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Second enqueue operation on D3D11 objects: reports CL_INVALID_OPERATION.
+  cl_command_queue  command_queue,
+  cl_uint           num_objects,
+  const cl_mem *    mem_objects,
+  cl_uint           num_events_in_wait_list,
+  const cl_event *  event_wait_list,
+  cl_event *        event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // --- DirectX 9 media sharing ---
+  // Device enumeration: reports CL_INVALID_OPERATION.
+  cl_platform_id                   platform,
+  cl_uint                          num_media_adapters,
+  cl_dx9_media_adapter_type_khr *  media_adapter_type,
+  void *                           media_adapters,
+  cl_dx9_media_adapter_set_khr     media_adapter_set,
+  cl_uint                          num_entries,
+  cl_device_id *                   devices,
+  cl_uint *                        num_devices
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Create from media surface: reports CL_INVALID_CONTEXT, returns NULL.
+  cl_context                     context,
+  cl_mem_flags                   flags,
+  cl_dx9_media_adapter_type_khr  adapter_type,
+  void *                         surface_info,
+  cl_uint                        plane,
+  cl_int *                       errcode_ret
+  SetErrorInfo(NULL, CL_INVALID_CONTEXT, "CL/DX interop not implemented");
+  return NULL;
+  // Enqueue operation on media surfaces: reports CL_INVALID_OPERATION.
+  cl_command_queue command_queue,
+  cl_uint          num_objects,
+  const cl_mem *   mem_objects,
+  cl_uint          num_events_in_wait_list,
+  const cl_event * event_wait_list,
+  cl_event *       event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+  // Second enqueue operation on media surfaces: reports CL_INVALID_OPERATION.
+  cl_command_queue command_queue,
+  cl_uint          num_objects,
+  const cl_mem *   mem_objects,
+  cl_uint          num_events_in_wait_list,
+  const cl_event * event_wait_list,
+  cl_event *       event
+  ReturnErrorInfo(NULL, CL_INVALID_OPERATION, "CL/DX interop not implemented");
+#endif // DX extension functions
+// Dispatch Table //
+#define _NULL_ NULL
+void *m_dispatchTable[] =
+  DISPATCH_TABLE_ENTRY(clGetPlatformInfo),
+  DISPATCH_TABLE_ENTRY(clCreateContext),
+  DISPATCH_TABLE_ENTRY(clCreateContextFromType),
+  DISPATCH_TABLE_ENTRY(clRetainContext),
+  DISPATCH_TABLE_ENTRY(clReleaseContext),
+  DISPATCH_TABLE_ENTRY(clGetContextInfo),
+  DISPATCH_TABLE_ENTRY(clCreateCommandQueue),
+  DISPATCH_TABLE_ENTRY(clRetainCommandQueue),
+  DISPATCH_TABLE_ENTRY(clReleaseCommandQueue),
+  DISPATCH_TABLE_ENTRY(clGetCommandQueueInfo),
+  DISPATCH_TABLE_ENTRY(clSetCommandQueueProperty),
+  DISPATCH_TABLE_ENTRY(clCreateBuffer),
+  DISPATCH_TABLE_ENTRY(clRetainMemObject),
+  DISPATCH_TABLE_ENTRY(clReleaseMemObject),
+  DISPATCH_TABLE_ENTRY(clGetSupportedImageFormats),
+  DISPATCH_TABLE_ENTRY(clGetMemObjectInfo),
+  DISPATCH_TABLE_ENTRY(clCreateSampler),
+  DISPATCH_TABLE_ENTRY(clRetainSampler),
+  DISPATCH_TABLE_ENTRY(clReleaseSampler),
+  DISPATCH_TABLE_ENTRY(clGetSamplerInfo),
+  DISPATCH_TABLE_ENTRY(clCreateProgramWithSource),
+  DISPATCH_TABLE_ENTRY(clCreateProgramWithBinary),
+  DISPATCH_TABLE_ENTRY(clRetainProgram),
+  DISPATCH_TABLE_ENTRY(clReleaseProgram),
+  DISPATCH_TABLE_ENTRY(clBuildProgram),
+  DISPATCH_TABLE_ENTRY(clUnloadCompiler),
+  DISPATCH_TABLE_ENTRY(clGetProgramInfo),
+  DISPATCH_TABLE_ENTRY(clGetProgramBuildInfo),
+  DISPATCH_TABLE_ENTRY(clCreateKernel),
+  DISPATCH_TABLE_ENTRY(clCreateKernelsInProgram),
+  DISPATCH_TABLE_ENTRY(clRetainKernel),
+  DISPATCH_TABLE_ENTRY(clReleaseKernel),
+  DISPATCH_TABLE_ENTRY(clGetKernelWorkGroupInfo),
+  DISPATCH_TABLE_ENTRY(clReleaseEvent),
+  DISPATCH_TABLE_ENTRY(clGetEventProfilingInfo),
+  DISPATCH_TABLE_ENTRY(clEnqueueReadBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueWriteBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueCopyBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueReadImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueWriteImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueCopyImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueCopyImageToBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueCopyBufferToImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueMapBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueMapImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueUnmapMemObject),
+  DISPATCH_TABLE_ENTRY(clEnqueueNDRangeKernel),
+  DISPATCH_TABLE_ENTRY(clEnqueueNativeKernel),
+  DISPATCH_TABLE_ENTRY(clEnqueueMarker),
+  DISPATCH_TABLE_ENTRY(clEnqueueWaitForEvents),
+  DISPATCH_TABLE_ENTRY(clEnqueueBarrier),
+  DISPATCH_TABLE_ENTRY(clGetExtensionFunctionAddress),
+  DISPATCH_TABLE_ENTRY(clCreateFromGLBuffer),
+  DISPATCH_TABLE_ENTRY(clCreateFromGLTexture2D),
+  DISPATCH_TABLE_ENTRY(clCreateFromGLTexture3D),
+  DISPATCH_TABLE_ENTRY(clCreateFromGLRenderbuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueAcquireGLObjects),
+  DISPATCH_TABLE_ENTRY(clEnqueueReleaseGLObjects),
+#if defined(_WIN32)
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D10BufferKHR),
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D10Texture2DKHR),
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D10Texture3DKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueAcquireD3D10ObjectsKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueReleaseD3D10ObjectsKHR),
+  // OpenCL 1.1
+  DISPATCH_TABLE_ENTRY(clSetEventCallback),
+  DISPATCH_TABLE_ENTRY(clCreateSubBuffer),
+  DISPATCH_TABLE_ENTRY(clSetMemObjectDestructorCallback),
+  DISPATCH_TABLE_ENTRY(clCreateUserEvent),
+  DISPATCH_TABLE_ENTRY(clSetUserEventStatus),
+  DISPATCH_TABLE_ENTRY(clEnqueueReadBufferRect),
+  DISPATCH_TABLE_ENTRY(clEnqueueWriteBufferRect),
+  DISPATCH_TABLE_ENTRY(clEnqueueCopyBufferRect),
+  DISPATCH_TABLE_ENTRY(clCreateEventFromGLsyncKHR),
+  // OpenCL 1.2
+  DISPATCH_TABLE_ENTRY(clCreateSubDevices),
+  DISPATCH_TABLE_ENTRY(clRetainDevice),
+  DISPATCH_TABLE_ENTRY(clReleaseDevice),
+  DISPATCH_TABLE_ENTRY(clCreateProgramWithBuiltInKernels),
+  DISPATCH_TABLE_ENTRY(clCompileProgram),
+  DISPATCH_TABLE_ENTRY(clUnloadPlatformCompiler),
+  DISPATCH_TABLE_ENTRY(clGetKernelArgInfo),
+  DISPATCH_TABLE_ENTRY(clEnqueueFillBuffer),
+  DISPATCH_TABLE_ENTRY(clEnqueueFillImage),
+  DISPATCH_TABLE_ENTRY(clEnqueueMigrateMemObjects),
+  DISPATCH_TABLE_ENTRY(clEnqueueMarkerWithWaitList),
+  DISPATCH_TABLE_ENTRY(clEnqueueBarrierWithWaitList),
+  DISPATCH_TABLE_ENTRY(clGetExtensionFunctionAddressForPlatform),
+  DISPATCH_TABLE_ENTRY(clCreateFromGLTexture),
+#if defined(_WIN32)
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D11BufferKHR),
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D11Texture2DKHR),
+  DISPATCH_TABLE_ENTRY(clCreateFromD3D11Texture3DKHR),
+  DISPATCH_TABLE_ENTRY(clCreateFromDX9MediaSurfaceKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueAcquireD3D11ObjectsKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueReleaseD3D11ObjectsKHR),
+  DISPATCH_TABLE_ENTRY(clGetDeviceIDsFromDX9MediaAdapterKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueAcquireDX9MediaSurfacesKHR),
+  DISPATCH_TABLE_ENTRY(clEnqueueReleaseDX9MediaSurfacesKHR),
diff --git a/src/runtime/runtime.def b/src/runtime/runtime.def
new file mode 100644
index 0000000..77992cf
--- /dev/null
+++ b/src/runtime/runtime.def
@@ -0,0 +1,119 @@
+; Make runtime functions visible
diff --git a/tests/apps/CMakeLists.txt b/tests/apps/CMakeLists.txt
new file mode 100644
index 0000000..0dff241
--- /dev/null
+++ b/tests/apps/CMakeLists.txt
@@ -0,0 +1,33 @@
+# CMakeLists.txt (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+# Add app tests
+  vecadd)
+  add_executable(${test} ${test}/${test}.c)
+  target_link_libraries(${test} oclgrind-rt)
+  # Generate test binaries in same dir as Oclgrind libraries on Windows
+  if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+    add_test(app_${test} "${CMAKE_BINARY_DIR}/${test}")
+    set_target_properties(${test} PROPERTIES
+  else()
+    add_test(app_${test} "${test}/${test}")
+    set_target_properties(${test} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${test}")
+    set_target_properties(${test} PROPERTIES LINKER_LANGUAGE CXX)
+  endif()
+  set_tests_properties(app_${test} PROPERTIES DEPENDS ${test})
+  # Set PCH directory
+  set_tests_properties(app_${test} PROPERTIES
diff --git a/tests/apps/vecadd/vecadd.c b/tests/apps/vecadd/vecadd.c
new file mode 100644
index 0000000..22d55ed
--- /dev/null
+++ b/tests/apps/vecadd/vecadd.c
@@ -0,0 +1,190 @@
+#include <CL/cl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+// Absolute tolerance used when comparing each device result with the host sum
+#define TOL 1e-8
+// Maximum number of individual mismatches printed to stderr
+#define MAX_ERRORS 8
+// Capacity of the platform ID array passed to clGetPlatformIDs
+#define MAX_PLATFORMS 8
+// OpenCL C source of the kernel under test: element-wise c[i] = a[i] + b[i],
+// indexed by the work-item's global id in dimension 0.
+const char *KERNEL_SOURCE =
+"kernel void vecadd(global float *a, \n"
+"                   global float *b, \n"
+"                   global float *c) \n"
+"{                                   \n"
+"  int i = get_global_id(0);         \n"
+"  c[i] = a[i] + b[i];               \n"
+"}                                   \n";
+void checkError(cl_int err, const char *operation);
+// Runs the vecadd kernel on the Oclgrind platform and checks c = a + b on
+// the host.
+//
+// Usage: ./vecadd N [GLOBAL_SIZE]
+//   argv[1]  number of elements N (default 1024)
+//   argv[2]  global work size (default N)
+//
+// Returns 0 when every element matches and 1 when mismatches are detected;
+// exits with status 1 on a usage error, allocation failure or OpenCL error.
+int main(int argc, char *argv[])
+{
+  cl_int err;
+  cl_platform_id platform;
+  cl_device_id device;
+  cl_context context;
+  cl_command_queue queue;
+  cl_program program;
+  cl_kernel kernel;
+  cl_mem d_a, d_b, d_c;
+  float *h_a, *h_b, *h_c;
+
+  // atoi() yields an int: reject non-positive values before widening to
+  // size_t so a negative argument cannot wrap around to a huge size.
+  size_t N = 1024;
+  if (argc > 1)
+  {
+    int n = atoi(argv[1]);
+    N = (n > 0) ? (size_t)n : 0;
+  }
+  size_t global = N;
+  if (argc > 2)
+  {
+    int g = atoi(argv[2]);
+    global = (g > 0) ? (size_t)g : 0;
+  }
+  if (!N || !global)
+  {
+    printf("Usage: ./vecadd N [GLOBAL_SIZE]\n");
+    exit(1);
+  }
+
+  // Get list of platforms
+  cl_uint numPlatforms = 0;
+  cl_platform_id platforms[MAX_PLATFORMS];
+  err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms);
+  checkError(err, "getting platforms");
+
+  // numPlatforms reports how many platforms are available, which may be
+  // more than the number of entries actually written to platforms[].
+  if (numPlatforms > MAX_PLATFORMS)
+  {
+    numPlatforms = MAX_PLATFORMS;
+  }
+
+  // Find Oclgrind
+  platform = NULL;
+  for (cl_uint i = 0; i < numPlatforms; i++)
+  {
+    char name[256];
+    err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 256, name, NULL);
+    checkError(err, "getting platform name");
+    if (!strcmp(name, "Oclgrind"))
+    {
+      platform = platforms[i];
+      break;
+    }
+  }
+  if (!platform)
+  {
+    fprintf(stderr, "Unable to find Oclgrind platform\n");
+    exit(1);
+  }
+
+  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
+  checkError(err, "getting device");
+
+  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+  checkError(err, "creating context");
+
+  queue = clCreateCommandQueue(context, device, 0, &err);
+  checkError(err, "creating command queue");
+
+  program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err);
+  checkError(err, "creating program");
+
+  err = clBuildProgram(program, 1, &device, "", NULL, NULL);
+  if (err != CL_SUCCESS)
+  {
+    // Dump the build log before checkError() terminates the program.
+    size_t sz = 0;
+    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &sz);
+    // One extra zeroed byte guarantees the log is NUL-terminated.
+    char *buildLog = calloc(sz + 1, 1);
+    if (buildLog)
+    {
+      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                            sz, buildLog, NULL);
+      fprintf(stderr, "%s\n", buildLog);
+      free(buildLog);
+    }
+  }
+  checkError(err, "building program");
+
+  kernel = clCreateKernel(program, "vecadd", &err);
+  checkError(err, "creating kernel");
+
+  size_t dataSize = N*sizeof(cl_float);
+
+  // Initialise host data
+  srand(0);
+  h_a = malloc(dataSize);
+  h_b = malloc(dataSize);
+  h_c = malloc(dataSize);
+  if (!h_a || !h_b || !h_c)
+  {
+    fprintf(stderr, "Unable to allocate host buffers\n");
+    exit(1);
+  }
+  for (size_t i = 0; i < N; i++)
+  {
+    h_a[i] = rand()/(float)RAND_MAX;
+    h_b[i] = rand()/(float)RAND_MAX;
+    h_c[i] = 0;
+  }
+
+  d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_a buffer");
+  d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_b buffer");
+  d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
+  checkError(err, "creating d_c buffer");
+
+  err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE,
+                             0, dataSize, h_a, 0, NULL, NULL);
+  checkError(err, "writing d_a data");
+  err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE,
+                             0, dataSize, h_b, 0, NULL, NULL);
+  checkError(err, "writing d_b data");
+  err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE,
+                             0, dataSize, h_c, 0, NULL, NULL);
+  checkError(err, "writing d_c data");
+
+  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
+  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
+  checkError(err, "setting kernel args");
+
+  // The global size comes from argv[2] and is not clamped to N.
+  err = clEnqueueNDRangeKernel(queue, kernel,
+                               1, NULL, &global, NULL, 0, NULL, NULL);
+  checkError(err, "enqueuing kernel");
+
+  err = clFinish(queue);
+  checkError(err, "running kernel");
+
+  err = clEnqueueReadBuffer(queue, d_c, CL_TRUE,
+                            0, dataSize, h_c, 0, NULL, NULL);
+  checkError(err, "reading d_c data");
+
+  // Check results
+  int errors = 0;
+  for (size_t i = 0; i < N; i++)
+  {
+    float ref = h_a[i] + h_b[i];
+    if (fabs(ref - h_c[i]) > TOL)
+    {
+      // Only the first MAX_ERRORS mismatches are printed; all are counted.
+      if (errors < MAX_ERRORS)
+      {
+        fprintf(stderr, "%4zu: %.4f != %.4f\n", i, h_c[i], ref);
+      }
+      errors++;
+    }
+  }
+
+  printf("%d errors detected\n", errors);
+
+  free(h_a);
+  free(h_b);
+  free(h_c);
+  clReleaseMemObject(d_a);
+  clReleaseMemObject(d_b);
+  clReleaseMemObject(d_c);
+  clReleaseKernel(kernel);
+  clReleaseProgram(program);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+
+  return (errors != 0);
+}
+// Reports a failed OpenCL call and terminates the program.
+//   err        status code returned by the OpenCL call
+//   operation  short description of the step, included in the message
+// Returns normally only when err == CL_SUCCESS; otherwise prints the
+// operation and the numeric error code to stderr and calls exit(1).
+void checkError(cl_int err, const char *operation)
+{
+  if (err != CL_SUCCESS)
+  {
+    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
+    exit(1);
+  }
+}
diff --git a/tests/kernels/TESTS b/tests/kernels/TESTS
new file mode 100644
index 0000000..2ac8723
--- /dev/null
+++ b/tests/kernels/TESTS
@@ -0,0 +1,56 @@
\ No newline at end of file
diff --git a/tests/kernels/alignment/packed.cl b/tests/kernels/alignment/packed.cl
new file mode 100644
index 0000000..f6d5391
--- /dev/null
+++ b/tests/kernels/alignment/packed.cl
@@ -0,0 +1,10 @@
+// Struct declared with the packed attribute, holding a char followed by an
+// int.
+struct __attribute__((packed)) Foo
+{
+  char a;
+  int b;
+};
+
+// Writes the 'b' member of the by-value struct argument to *out.
+kernel void packed(struct Foo x, global int *out)
+{
+  *out = x.b;
+}
diff --git a/tests/kernels/alignment/packed.ref b/tests/kernels/alignment/packed.ref
new file mode 100644
index 0000000..df23fc9
--- /dev/null
+++ b/tests/kernels/alignment/packed.ref
@@ -0,0 +1,4 @@
+Argument 'out': 4 bytes
+  out[0] = 2
diff --git a/tests/kernels/alignment/packed.sim b/tests/kernels/alignment/packed.sim
new file mode 100644
index 0000000..46e9090
--- /dev/null
+++ b/tests/kernels/alignment/packed.sim
@@ -0,0 +1,10 @@
+1 1 1
+1 1 1
+<size=5 char hex>
+0x02 0x00 0x0 0x00
+<size=4 fill=0 dump>
diff --git a/tests/kernels/alignment/unaligned.cl b/tests/kernels/alignment/unaligned.cl
new file mode 100644
index 0000000..140607e
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.cl
@@ -0,0 +1,6 @@
+// Forms an int pointer two bytes past 'in' (by way of a char pointer) and
+// copies the int read through it to *out.
+kernel void unaligned(global int *in, global int *out)
+{
+  global char *char_ptr = (global char*)in + 2;
+  global int *address   = (global int*)char_ptr;
+  *out = *address;
+}
diff --git a/tests/kernels/alignment/unaligned.ref b/tests/kernels/alignment/unaligned.ref
new file mode 100644
index 0000000..1114e03
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.ref
@@ -0,0 +1,5 @@
+Argument 'out': 4 bytes
+  out[0] = 2752512
diff --git a/tests/kernels/alignment/unaligned.sim b/tests/kernels/alignment/unaligned.sim
new file mode 100644
index 0000000..70211e3
--- /dev/null
+++ b/tests/kernels/alignment/unaligned.sim
@@ -0,0 +1,7 @@
+1 1 1
+1 1 1
+<size=8 fill=42>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/async_copy/async_copy.cl b/tests/kernels/async_copy/async_copy.cl
new file mode 100644
index 0000000..a5280ce
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.cl
@@ -0,0 +1,8 @@
+// Copies get_local_size(0) ints from 'data' into 'scratch' with an async
+// work-group copy, waits on its event, then each work-item writes
+// scratch[i] back to 'data' at the mirrored index (local_size - i - 1).
+kernel void async_copy(global int *data, local int *scratch)
+{
+  event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+  wait_group_events(1, &event);
+  int i = get_local_id(0);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy.ref b/tests/kernels/async_copy/async_copy.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy.sim b/tests/kernels/async_copy/async_copy.sim
new file mode 100644
index 0000000..58ec323
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_divergent.cl b/tests/kernels/async_copy/async_copy_divergent.cl
new file mode 100644
index 0000000..f428a10
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.cl
@@ -0,0 +1,14 @@
+// Same data flow as a plain async copy from 'data' to 'scratch', except that
+// the work-item with the highest local id passes a size of 1 to
+// async_work_group_copy while the others pass the full local size. After
+// waiting, each work-item writes scratch[i] to the mirrored index in 'data'.
+// NOTE(review): the test name suggests the differing size argument is the
+// condition under test - confirm before "fixing" it.
+kernel void async_copy_divergent(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  size_t size = get_local_size(0);
+  if (i == size-1)
+  {
+    size = 1;
+  }
+  event_t event = async_work_group_copy(scratch, data, size, 0);
+  wait_group_events(1, &event);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_divergent.ref b/tests/kernels/async_copy/async_copy_divergent.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_divergent.sim b/tests/kernels/async_copy/async_copy_divergent.sim
new file mode 100644
index 0000000..ef59d1a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_divergent.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_global_race.cl b/tests/kernels/async_copy/async_copy_global_race.cl
new file mode 100644
index 0000000..bf2684a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.cl
@@ -0,0 +1,11 @@
+// Each work-item stores its local id in scratch[i], synchronises on a
+// local-memory barrier, zeroes data[i], and then the group copies 'scratch'
+// over 'data' with an async work-group copy and waits for it.
+// NOTE(review): the test name suggests the plain store to data[i] is meant
+// to conflict with the copy's writes to 'data' - confirm.
+kernel void async_copy_global_race(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  scratch[i] = i;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  data[i] = 0;
+  event_t event = async_work_group_copy(data, scratch, get_local_size(0), 0);
+  wait_group_events(1, &event);
+}
diff --git a/tests/kernels/async_copy/async_copy_global_race.ref b/tests/kernels/async_copy/async_copy_global_race.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 0
+  data[1] = 1
+  data[2] = 2
+  data[3] = 3
diff --git a/tests/kernels/async_copy/async_copy_global_race.sim b/tests/kernels/async_copy/async_copy_global_race.sim
new file mode 100644
index 0000000..9ff8835
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_global_race.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_local_race.cl b/tests/kernels/async_copy/async_copy_local_race.cl
new file mode 100644
index 0000000..02fd84e
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.cl
@@ -0,0 +1,10 @@
+// Each work-item zeroes scratch[i] and the group then copies 'data' into
+// 'scratch' with an async work-group copy; after waiting, each work-item
+// writes scratch[i] to the mirrored index in 'data'.
+// NOTE(review): the test name suggests the plain store to scratch[i] is
+// meant to conflict with the copy's writes to 'scratch' - confirm.
+kernel void async_copy_local_race(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  scratch[i] = 0;
+  event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+  wait_group_events(1, &event);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_local_race.ref b/tests/kernels/async_copy/async_copy_local_race.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_local_race.sim b/tests/kernels/async_copy/async_copy_local_race.sim
new file mode 100644
index 0000000..5506a1a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_local_race.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_loop.cl b/tests/kernels/async_copy/async_copy_loop.cl
new file mode 100644
index 0000000..caff0c3
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.cl
@@ -0,0 +1,14 @@
+// Copies 'data' into 'scratch' one element per loop iteration, passing the
+// event returned by each async copy into the next call, and waits once on
+// the final event. Each work-item then writes scratch[i] to the mirrored
+// index in 'data'.
+kernel void async_copy_loop(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  event_t event = 0;
+  for (int j = 0; j < get_local_size(0); j++)
+  {
+    int offset = j;
+    event = async_work_group_copy(scratch+offset, data+offset, 1, event);
+  }
+  wait_group_events(1, &event);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_loop.ref b/tests/kernels/async_copy/async_copy_loop.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_loop.sim b/tests/kernels/async_copy/async_copy_loop.sim
new file mode 100644
index 0000000..7f4fbd2
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.cl b/tests/kernels/async_copy/async_copy_loop_divergent.cl
new file mode 100644
index 0000000..5d7f399
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.cl
@@ -0,0 +1,19 @@
+// Element-by-element async copy from 'data' to 'scratch' with chained
+// events, in which work-item 2 uses offset 0 instead of 2 on iteration
+// j == 2, so its copy arguments differ from those of the other work-items.
+// After one wait, each work-item writes scratch[i] to the mirrored index in
+// 'data'.
+// NOTE(review): the test name suggests the differing offset is the condition
+// under test - confirm before "fixing" it.
+kernel void async_copy_loop_divergent(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  event_t event = 0;
+  for (int j = 0; j < get_local_size(0); j++)
+  {
+    int offset = j;
+    if (i == 2 && j == 2)
+    {
+      offset = 0;
+    }
+    event = async_work_group_copy(scratch+offset, data+offset, 1, event);
+  }
+  wait_group_events(1, &event);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.ref b/tests/kernels/async_copy/async_copy_loop_divergent.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_loop_divergent.sim b/tests/kernels/async_copy/async_copy_loop_divergent.sim
new file mode 100644
index 0000000..6c2da0e
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_loop_divergent.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_single_wi.cl b/tests/kernels/async_copy/async_copy_single_wi.cl
new file mode 100644
index 0000000..40cac34
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.cl
@@ -0,0 +1,13 @@
+// All work-items start an async copy of the whole work-group's data into
+// 'scratch'; work-item 0 alone then issues a second, one-element copy
+// chained on the same event. After waiting, each work-item writes
+// scratch[i] to the mirrored index in 'data'.
+kernel void async_copy_single_wi(global int *data, local int *scratch)
+{
+  int i = get_local_id(0);
+  event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+  if (i == 0)
+  {
+    // An extra copy that will only be registered by one work-item
+    event = async_work_group_copy(scratch, data, 1, event);
+  }
+  wait_group_events(1, &event);
+  data[get_local_size(0)-i-1] = scratch[i];
+}
diff --git a/tests/kernels/async_copy/async_copy_single_wi.ref b/tests/kernels/async_copy/async_copy_single_wi.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_single_wi.sim b/tests/kernels/async_copy/async_copy_single_wi.sim
new file mode 100644
index 0000000..fc4c50a
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_single_wi.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/async_copy/async_copy_unwaited.cl b/tests/kernels/async_copy/async_copy_unwaited.cl
new file mode 100644
index 0000000..5c64771
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.cl
@@ -0,0 +1,7 @@
+// Starts an async copy from 'data' to 'scratch' whose event is never passed
+// to wait_group_events, then writes each work-item's local id to the
+// mirrored index in 'data'.
+kernel void async_copy_unwaited(global int *data, local int *scratch)
+{
+  event_t event = async_work_group_copy(scratch, data, get_local_size(0), 0);
+  int i = get_local_id(0);
+  data[get_local_size(0)-i-1] = i;
+}
diff --git a/tests/kernels/async_copy/async_copy_unwaited.ref b/tests/kernels/async_copy/async_copy_unwaited.ref
new file mode 100644
index 0000000..8ce4dbb
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/async_copy/async_copy_unwaited.sim b/tests/kernels/async_copy/async_copy_unwaited.sim
new file mode 100644
index 0000000..698f053
--- /dev/null
+++ b/tests/kernels/async_copy/async_copy_unwaited.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.cl b/tests/kernels/atomics/atomic_cmpxchg_false_race.cl
new file mode 100644
index 0000000..dda8dde
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.cl
@@ -0,0 +1,36 @@
+// Work-item 0 zeroes scratch[0]. Then, once per loop iteration, every
+// work-item snapshots scratch[0] between two local barriers and, until it
+// has succeeded once, tries to advance the counter by one with
+// atomic_cmpxchg. On success it records the value read back from scratch[0].
+// Finally work-item 0 copies the counter to data[0] and every work-item
+// writes its recorded value to data[l+1].
+// NOTE(review): 'result' is assigned only after a successful exchange.
+kernel void atomic_cmpxchg_false_race(global int *data, local int *scratch)
+{
+  int l = get_local_id(0);
+  if (l == 0)
+  {
+    scratch[0] = 0;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  bool done = false;
+  int before, old;
+  int result;
+  for (int i = 0; i < get_local_size(0); i++)
+  {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    before = scratch[0];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (!done)
+    {
+      old = atomic_cmpxchg(scratch, before, before+1);
+      if (old == before)
+      {
+        done = true;
+        result = scratch[0];
+      }
+    }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (l == 0)
+  {
+    *data = *scratch;
+  }
+  data[l+1] = result;
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.ref b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
new file mode 100644
index 0000000..fe14281
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 20 bytes
+  data[0] = 4
+  data[1] = 1
+  data[2] = 2
+  data[3] = 3
+  data[4] = 4
diff --git a/tests/kernels/atomics/atomic_cmpxchg_false_race.sim b/tests/kernels/atomics/atomic_cmpxchg_false_race.sim
new file mode 100644
index 0000000..f926a6e
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_false_race.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=20 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.cl b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
new file mode 100644
index 0000000..9be3a88
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.cl
@@ -0,0 +1,12 @@
+// Work-item 0 stores 0 to *data with a plain write; every other work-item
+// runs atomic_cmpxchg on the same location, replacing 0 with its global id.
+kernel void atomic_cmpxchg_read_race(global int *data)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+  {
+    *data = 0;
+  }
+  else
+  {
+    atomic_cmpxchg(data, 0, i);
+  }
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.ref b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
new file mode 100644
index 0000000..b398c6c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 1
diff --git a/tests/kernels/atomics/atomic_cmpxchg_read_race.sim b/tests/kernels/atomics/atomic_cmpxchg_read_race.sim
new file mode 100644
index 0000000..daa580c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_read_race.sim
@@ -0,0 +1,7 @@
+2 1 1
+2 1 1
+<size=4 dump>
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.cl b/tests/kernels/atomics/atomic_cmpxchg_write_race.cl
new file mode 100644
index 0000000..b78cc6d
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.cl
@@ -0,0 +1,9 @@
+// Work-item 0 stores 0 to *data with a plain write; every work-item
+// (including work-item 0) then runs atomic_cmpxchg on *data, replacing its
+// own global id with 42.
+kernel void atomic_cmpxchg_write_race(global int *data)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+  {
+    *data = 0;
+  }
+  atomic_cmpxchg(data, i, 42);
+}
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.ref b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
new file mode 100644
index 0000000..af96d9b
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 42
diff --git a/tests/kernels/atomics/atomic_cmpxchg_write_race.sim b/tests/kernels/atomics/atomic_cmpxchg_write_race.sim
new file mode 100644
index 0000000..74591bf
--- /dev/null
+++ b/tests/kernels/atomics/atomic_cmpxchg_write_race.sim
@@ -0,0 +1,7 @@
+2 1 1
+2 1 1
+<size=4 dump>
diff --git a/tests/kernels/atomics/atomic_global_fence.cl b/tests/kernels/atomics/atomic_global_fence.cl
new file mode 100644
index 0000000..a4edf11
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.cl
@@ -0,0 +1,17 @@
+// The first work-item of each group zeroes scratch[g]; every work-item
+// atomically adds its global id to scratch[g]; the first work-item then
+// copies scratch[g] to data[g].
+// NOTE(review): the hunk header for this file announces 17 lines, two more
+// than are present here once the braces are counted. Statements may be
+// missing from this excerpt - verify against the upstream file.
+kernel void atomic_global_fence(global int *data, global int *scratch)
+{
+  int i = get_global_id(0);
+  int l = get_local_id(0);
+  int g = get_group_id(0);
+  if (l == 0)
+  {
+    scratch[g] = 0;
+  }
+  atomic_add(scratch+g, i);
+  if (l == 0)
+  {
+    data[g] = scratch[g];
+  }
+}
diff --git a/tests/kernels/atomics/atomic_global_fence.ref b/tests/kernels/atomics/atomic_global_fence.ref
new file mode 100644
index 0000000..a7bf48a
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.ref
@@ -0,0 +1,5 @@
+Argument 'data': 8 bytes
+  data[0] = 6
+  data[1] = 22
diff --git a/tests/kernels/atomics/atomic_global_fence.sim b/tests/kernels/atomics/atomic_global_fence.sim
new file mode 100644
index 0000000..76f685c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence.sim
@@ -0,0 +1,7 @@
+8 1 1
+4 1 1
+<size=8 fill=0 dump>
+<size=8 fill=-1>
diff --git a/tests/kernels/atomics/atomic_global_fence_race.cl b/tests/kernels/atomics/atomic_global_fence_race.cl
new file mode 100644
index 0000000..a84cbb7
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.cl
@@ -0,0 +1,12 @@
+// Every work-item atomically adds its global id to the single counter
+// *scratch, which is shared by all work-groups; the first work-item of each
+// group then copies *scratch to data[g].
+// NOTE(review): the hunk header for this file announces 12 lines, one more
+// than is present here once the braces are counted. A statement may be
+// missing from this excerpt - verify against the upstream file.
+kernel void atomic_global_fence_race(global int *data, global int *scratch)
+{
+  int i = get_global_id(0);
+  int l = get_local_id(0);
+  int g = get_group_id(0);
+  atomic_add(scratch, i);
+  if (l == 0)
+  {
+    data[g] = *scratch;
+  }
+}
diff --git a/tests/kernels/atomics/atomic_global_fence_race.ref b/tests/kernels/atomics/atomic_global_fence_race.ref
new file mode 100644
index 0000000..4920bcf
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.ref
@@ -0,0 +1,6 @@
+Argument 'data': 8 bytes
+  data[0] = 6
+  data[1] = 28
diff --git a/tests/kernels/atomics/atomic_global_fence_race.sim b/tests/kernels/atomics/atomic_global_fence_race.sim
new file mode 100644
index 0000000..af77d6c
--- /dev/null
+++ b/tests/kernels/atomics/atomic_global_fence_race.sim
@@ -0,0 +1,7 @@
+8 1 1
+4 1 1
+<size=8 fill=0 dump>
+<size=4 fill=0>
diff --git a/tests/kernels/atomics/atomic_increment.cl b/tests/kernels/atomics/atomic_increment.cl
new file mode 100644
index 0000000..e9a11fa
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.cl
@@ -0,0 +1,4 @@
+// Every work-item atomically increments the single counter *data.
+kernel void atomic_increment(global int *data)
+{
+  atomic_inc(data);
+}
diff --git a/tests/kernels/atomics/atomic_increment.ref b/tests/kernels/atomics/atomic_increment.ref
new file mode 100644
index 0000000..f61189d
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.ref
@@ -0,0 +1,4 @@
+Argument 'data': 4 bytes
+  data[0] = 4
diff --git a/tests/kernels/atomics/atomic_increment.sim b/tests/kernels/atomics/atomic_increment.sim
new file mode 100644
index 0000000..38e2866
--- /dev/null
+++ b/tests/kernels/atomics/atomic_increment.sim
@@ -0,0 +1,6 @@
+4 1 1
+1 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_intergroup_race.cl b/tests/kernels/atomics/atomic_intergroup_race.cl
new file mode 100644
index 0000000..b8d70f6
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.cl
@@ -0,0 +1,10 @@
+// The work-item with global id 0 stores 0 to *data with a plain write;
+// every work-item then atomically increments *data.
+kernel void atomic_intergroup_race(global int *data)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+  {
+    *data = 0;
+  }
+  atomic_inc(data);
+}
diff --git a/tests/kernels/atomics/atomic_intergroup_race.ref b/tests/kernels/atomics/atomic_intergroup_race.ref
new file mode 100644
index 0000000..cab3430
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 8
diff --git a/tests/kernels/atomics/atomic_intergroup_race.sim b/tests/kernels/atomics/atomic_intergroup_race.sim
new file mode 100644
index 0000000..2516334
--- /dev/null
+++ b/tests/kernels/atomics/atomic_intergroup_race.sim
@@ -0,0 +1,6 @@
+8 1 1
+4 1 1
+<size=4 fill=-1 dump>
diff --git a/tests/kernels/atomics/atomic_local_fence.cl b/tests/kernels/atomics/atomic_local_fence.cl
new file mode 100644
index 0000000..e9227a5
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.cl
@@ -0,0 +1,17 @@
+// Per-group sum of global ids accumulated in local memory. The first
+// work-item zeroes *scratch, a local barrier separates that store from the
+// atomic adds, and a second local barrier separates the adds from the final
+// read, which the first work-item stores to data[g].
+kernel void atomic_local_fence(global int *data, local int *scratch)
+{
+  int i = get_global_id(0);
+  int l = get_local_id(0);
+  int g = get_group_id(0);
+  if (l == 0)
+  {
+    *scratch = 0;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  atomic_add(scratch, i);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (l == 0)
+  {
+    data[g] = *scratch;
+  }
+}
diff --git a/tests/kernels/atomics/atomic_local_fence.ref b/tests/kernels/atomics/atomic_local_fence.ref
new file mode 100644
index 0000000..a7bf48a
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.ref
@@ -0,0 +1,5 @@
+Argument 'data': 8 bytes
+  data[0] = 6
+  data[1] = 22
diff --git a/tests/kernels/atomics/atomic_local_fence.sim b/tests/kernels/atomics/atomic_local_fence.sim
new file mode 100644
index 0000000..6abffee
--- /dev/null
+++ b/tests/kernels/atomics/atomic_local_fence.sim
@@ -0,0 +1,7 @@
+8 1 1
+4 1 1
+<size=8 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_race_after.cl b/tests/kernels/atomics/atomic_race_after.cl
new file mode 100644
index 0000000..d168053
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.cl
@@ -0,0 +1,8 @@
+kernel void atomic_race_after(global int *data)
+  atomic_inc(data);
+  if (get_global_id(0) == get_global_size(0)-1)
+  {
+    (*data)++;
+  }
diff --git a/tests/kernels/atomics/atomic_race_after.ref b/tests/kernels/atomics/atomic_race_after.ref
new file mode 100644
index 0000000..bc902a8
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 5
diff --git a/tests/kernels/atomics/atomic_race_after.sim b/tests/kernels/atomics/atomic_race_after.sim
new file mode 100644
index 0000000..d182089
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_after.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_race_before.cl b/tests/kernels/atomics/atomic_race_before.cl
new file mode 100644
index 0000000..53db050
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.cl
@@ -0,0 +1,8 @@
+kernel void atomic_race_before(global int *data)
+  if (get_global_id(0) == 0)
+  {
+    *data = 0;
+  }
+  atomic_inc(data);
diff --git a/tests/kernels/atomics/atomic_race_before.ref b/tests/kernels/atomics/atomic_race_before.ref
new file mode 100644
index 0000000..6ecedc3
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 4
diff --git a/tests/kernels/atomics/atomic_race_before.sim b/tests/kernels/atomics/atomic_race_before.sim
new file mode 100644
index 0000000..109c204
--- /dev/null
+++ b/tests/kernels/atomics/atomic_race_before.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/atomics/atomic_same_workitem.cl b/tests/kernels/atomics/atomic_same_workitem.cl
new file mode 100644
index 0000000..735c5e9
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.cl
@@ -0,0 +1,14 @@
+kernel void atomic_same_workitem(global int *data)
+  int i = get_global_id(0);
+  if ((i % 2) == 0)
+  {
+    data[i] = 0;
+    atomic_inc(data+i);
+  }
+  else
+  {
+    atomic_inc(data+i);
+    data[i] = data[i] + 1;
+  }
diff --git a/tests/kernels/atomics/atomic_same_workitem.ref b/tests/kernels/atomics/atomic_same_workitem.ref
new file mode 100644
index 0000000..3ef3ca7
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 1
+  data[1] = 2
+  data[2] = 1
+  data[3] = 2
diff --git a/tests/kernels/atomics/atomic_same_workitem.sim b/tests/kernels/atomics/atomic_same_workitem.sim
new file mode 100644
index 0000000..2e3f210
--- /dev/null
+++ b/tests/kernels/atomics/atomic_same_workitem.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/barrier/barrier_different_instructions.cl b/tests/kernels/barrier/barrier_different_instructions.cl
new file mode 100644
index 0000000..bd4d00a
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.cl
@@ -0,0 +1,14 @@
+kernel void barrier_different_instructions(global int *data)
+  int i = get_global_id(0);
+  if (i == 0)
+  {
+    data[0] = 42;
+    barrier(CLK_GLOBAL_MEM_FENCE);
+  }
+  else
+  {
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    data[i] = i + data[0];
+  }
diff --git a/tests/kernels/barrier/barrier_different_instructions.ref b/tests/kernels/barrier/barrier_different_instructions.ref
new file mode 100644
index 0000000..3ffaa5a
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 42
+  data[1] = 43
+  data[2] = 44
+  data[3] = 45
diff --git a/tests/kernels/barrier/barrier_different_instructions.sim b/tests/kernels/barrier/barrier_different_instructions.sim
new file mode 100644
index 0000000..96afa08
--- /dev/null
+++ b/tests/kernels/barrier/barrier_different_instructions.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/barrier/barrier_divergence.cl b/tests/kernels/barrier/barrier_divergence.cl
new file mode 100644
index 0000000..c64a7c4
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.cl
@@ -0,0 +1,9 @@
+kernel void barrier_divergence(global int *data)
+  int i = get_global_id(0);
+  if (i != 0)
+  {
+    barrier(CLK_GLOBAL_MEM_FENCE);
+  }
+  data[i] = i;
diff --git a/tests/kernels/barrier/barrier_divergence.ref b/tests/kernels/barrier/barrier_divergence.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 0
+  data[1] = 1
+  data[2] = 2
+  data[3] = 3
diff --git a/tests/kernels/barrier/barrier_divergence.sim b/tests/kernels/barrier/barrier_divergence.sim
new file mode 100644
index 0000000..aa68728
--- /dev/null
+++ b/tests/kernels/barrier/barrier_divergence.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.cl b/tests/kernels/bugs/gvn_arbitrary_integers.cl
new file mode 100644
index 0000000..38c6f52
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.cl
@@ -0,0 +1,8 @@
+__kernel void gvn_arbitrary_integers(__global int *source,
+                                     __global int *dest)
+  size_t i = get_global_id(0);
+  int3 tmp = 0;
+  tmp.S2 = source[i];
+  vstore3(tmp, 0, dest);
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.ref b/tests/kernels/bugs/gvn_arbitrary_integers.ref
new file mode 100644
index 0000000..fafe2ec
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.ref
@@ -0,0 +1,6 @@
+Argument 'dest': 12 bytes
+  dest[0] = 0
+  dest[1] = 0
+  dest[2] = 42
diff --git a/tests/kernels/bugs/gvn_arbitrary_integers.sim b/tests/kernels/bugs/gvn_arbitrary_integers.sim
new file mode 100644
index 0000000..064c5b4
--- /dev/null
+++ b/tests/kernels/bugs/gvn_arbitrary_integers.sim
@@ -0,0 +1,7 @@
+1 1 1
+1 1 1
+<size=4 fill=42>
+<size=12 fill=0 dump>
diff --git a/tests/kernels/bugs/kernel_struct_argument.cl b/tests/kernels/bugs/kernel_struct_argument.cl
new file mode 100644
index 0000000..8b8af8b
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.cl
@@ -0,0 +1,11 @@
+typedef struct
+  float a;
+  float b;
+  float c;
+} Structure;
+kernel void kernel_struct_argument(Structure x, global float *out)
+  *out = x.a * x.b + x.c;
diff --git a/tests/kernels/bugs/kernel_struct_argument.ref b/tests/kernels/bugs/kernel_struct_argument.ref
new file mode 100644
index 0000000..b8c7e51
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.ref
@@ -0,0 +1,4 @@
+Argument 'out': 4 bytes
+  out[0] = 144
diff --git a/tests/kernels/bugs/kernel_struct_argument.sim b/tests/kernels/bugs/kernel_struct_argument.sim
new file mode 100644
index 0000000..4ff650d
--- /dev/null
+++ b/tests/kernels/bugs/kernel_struct_argument.sim
@@ -0,0 +1,11 @@
+1 1 1
+1 1 1
+<size=12 float>
+<size=4 dump fill=0>
diff --git a/tests/kernels/bugs/many_alloca.cl b/tests/kernels/bugs/many_alloca.cl
new file mode 100644
index 0000000..00d9fd1
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.cl
@@ -0,0 +1,21 @@
+void bar(int *x)
+  *x += 1;
+int foo()
+  int x = 0;
+  bar(&x);
+  return x;
+kernel void many_alloca(global int *data, int n)
+  int x = 0;
+  for (int i = 0; i < n; i++)
+  {
+    x += foo();
+  }
+  data[get_global_id(0)] = x;
diff --git a/tests/kernels/bugs/many_alloca.ref b/tests/kernels/bugs/many_alloca.ref
new file mode 100644
index 0000000..201d55d
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.ref
@@ -0,0 +1,4 @@
+Argument 'data': 4 bytes
+  data[0] = 100000
diff --git a/tests/kernels/bugs/many_alloca.sim b/tests/kernels/bugs/many_alloca.sim
new file mode 100644
index 0000000..3df81c9
--- /dev/null
+++ b/tests/kernels/bugs/many_alloca.sim
@@ -0,0 +1,9 @@
+1 1 1
+1 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/bugs/multidim_array_in_struct.cl b/tests/kernels/bugs/multidim_array_in_struct.cl
new file mode 100644
index 0000000..11ecf21
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.cl
@@ -0,0 +1,40 @@
+// Issue #64 on GitHub:
+// https://github.com/jrprice/Oclgrind/issues/64
+// Required alignment for multi-dimensional arrays was incorrect.
+struct S0
+  uchar a;
+  ulong b[2][3][1];
+kernel void multidim_array_in_struct(global ulong *output)
+  struct S0 s =
+  {
+    1UL,
+    {
+      {
+        {1L},
+        {1L},
+        {1L}
+      },
+      {
+        {1L},
+        {1L},
+        {1L}
+      }
+    },
+  };
+  ulong c = 0UL;
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 1; k++)
+        c += s.b[i][j][k];
+  *output = c;
diff --git a/tests/kernels/bugs/multidim_array_in_struct.ref b/tests/kernels/bugs/multidim_array_in_struct.ref
new file mode 100644
index 0000000..f9606f2
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.ref
@@ -0,0 +1,4 @@
+Argument 'output': 8 bytes
+  output[0] = 6
diff --git a/tests/kernels/bugs/multidim_array_in_struct.sim b/tests/kernels/bugs/multidim_array_in_struct.sim
new file mode 100644
index 0000000..07443f7
--- /dev/null
+++ b/tests/kernels/bugs/multidim_array_in_struct.sim
@@ -0,0 +1,13 @@
+# Issue #64 on GitHub:
+# https://github.com/jrprice/Oclgrind/issues/64
+# Required alignment for multi-dimensional arrays was incorrect.
+1 1 1
+1 1 1
+<size=8 fill=0 dump>
diff --git a/tests/kernels/bugs/null_argument.cl b/tests/kernels/bugs/null_argument.cl
new file mode 100644
index 0000000..d987861
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.cl
@@ -0,0 +1,9 @@
+ulong func_1(ulong * p_1)
+  return 1;
+kernel void null_argument(global ulong *output)
+  *output = func_1((void*)0);
diff --git a/tests/kernels/bugs/null_argument.ref b/tests/kernels/bugs/null_argument.ref
new file mode 100644
index 0000000..dcf81cb
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.ref
@@ -0,0 +1,4 @@
+Argument 'output': 8 bytes
+  output[0] = 1
diff --git a/tests/kernels/bugs/null_argument.sim b/tests/kernels/bugs/null_argument.sim
new file mode 100644
index 0000000..eb55985
--- /dev/null
+++ b/tests/kernels/bugs/null_argument.sim
@@ -0,0 +1,6 @@
+1 1 1
+1 1 1
+<size=8 fill=0 dump>
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.cl b/tests/kernels/bugs/sroa_addrspace_cast.cl
new file mode 100644
index 0000000..1eac32c
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.cl
@@ -0,0 +1,12 @@
+typedef struct
+  float x;
+} DataStruct;
+__kernel void sroa_addrspace_cast(__global DataStruct *input,
+                                  __global float *output)
+  size_t i = get_global_id(0);
+  DataStruct s = input[i];
+  output[i] = s.x;
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.ref b/tests/kernels/bugs/sroa_addrspace_cast.ref
new file mode 100644
index 0000000..2fff44c
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.ref
@@ -0,0 +1,4 @@
+Argument 'output': 4 bytes
+  output[0] = 42.24
diff --git a/tests/kernels/bugs/sroa_addrspace_cast.sim b/tests/kernels/bugs/sroa_addrspace_cast.sim
new file mode 100644
index 0000000..5d26265
--- /dev/null
+++ b/tests/kernels/bugs/sroa_addrspace_cast.sim
@@ -0,0 +1,7 @@
+1 1 1
+1 1 1
+<size=4 float fill=42.24>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/data-race/broadcast.cl b/tests/kernels/data-race/broadcast.cl
new file mode 100644
index 0000000..674f9f3
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.cl
@@ -0,0 +1,5 @@
+kernel void broadcast(global int *value, global int *output)
+  int i = get_global_id(0);
+  output[i] = value[0];
diff --git a/tests/kernels/data-race/broadcast.ref b/tests/kernels/data-race/broadcast.ref
new file mode 100644
index 0000000..69790f7
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.ref
@@ -0,0 +1,7 @@
+Argument 'output': 16 bytes
+  output[0] = 42
+  output[1] = 42
+  output[2] = 42
+  output[3] = 42
diff --git a/tests/kernels/data-race/broadcast.sim b/tests/kernels/data-race/broadcast.sim
new file mode 100644
index 0000000..7663c4f
--- /dev/null
+++ b/tests/kernels/data-race/broadcast.sim
@@ -0,0 +1,9 @@
+4 1 1
+1 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_fence.cl b/tests/kernels/data-race/global_fence.cl
new file mode 100644
index 0000000..ed175f7
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.cl
@@ -0,0 +1,16 @@
+kernel void global_fence(global int *scratch, global int *output)
+  int i = get_global_id(0);
+  int g = get_group_id(0);
+  scratch[i] = i;
+  if (get_local_id(0) == 0)
+  {
+    int x = 0;
+    for (int l = 0; l < get_local_size(0); l++)
+    {
+      x += scratch[get_local_size(0)*g + l];
+    }
+    output[g] = x;
+  }
diff --git a/tests/kernels/data-race/global_fence.ref b/tests/kernels/data-race/global_fence.ref
new file mode 100644
index 0000000..342c29a
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.ref
@@ -0,0 +1,7 @@
+Argument 'output': 16 bytes
+  output[0] = 6
+  output[1] = 22
+  output[2] = 38
+  output[3] = 54
diff --git a/tests/kernels/data-race/global_fence.sim b/tests/kernels/data-race/global_fence.sim
new file mode 100644
index 0000000..088170f
--- /dev/null
+++ b/tests/kernels/data-race/global_fence.sim
@@ -0,0 +1,7 @@
+16 1 1
+4 1 1
+<size=64 fill=0>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_only_fence.cl b/tests/kernels/data-race/global_only_fence.cl
new file mode 100644
index 0000000..c1f83cb
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.cl
@@ -0,0 +1,16 @@
+kernel void global_only_fence(local int *scratch, global int *output)
+  int l = get_local_id(0);
+  int g = get_group_id(0);
+  scratch[l] = l;
+  if (get_local_id(0) == 0)
+  {
+    int x = 0;
+    for (int i = 0; i < get_local_size(0); i++)
+    {
+      x += scratch[i];
+    }
+    output[g] = x;
+  }
diff --git a/tests/kernels/data-race/global_only_fence.ref b/tests/kernels/data-race/global_only_fence.ref
new file mode 100644
index 0000000..5b62861
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.ref
@@ -0,0 +1,8 @@
+Argument 'output': 16 bytes
+  output[0] = 6
+  output[1] = 0
+  output[2] = 0
+  output[3] = 0
diff --git a/tests/kernels/data-race/global_only_fence.sim b/tests/kernels/data-race/global_only_fence.sim
new file mode 100644
index 0000000..7bc05c6
--- /dev/null
+++ b/tests/kernels/data-race/global_only_fence.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/global_read_write_race.cl b/tests/kernels/data-race/global_read_write_race.cl
new file mode 100644
index 0000000..7463e22
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.cl
@@ -0,0 +1,8 @@
+kernel void global_read_write_race(global int *data)
+  int i = get_global_id(0);
+  if (i > 0)
+  {
+    data[i] = data[i-1];
+  }
diff --git a/tests/kernels/data-race/global_read_write_race.ref b/tests/kernels/data-race/global_read_write_race.ref
new file mode 100644
index 0000000..7e1c317
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 0
+  data[1] = 0
+  data[2] = 0
+  data[3] = 0
diff --git a/tests/kernels/data-race/global_read_write_race.sim b/tests/kernels/data-race/global_read_write_race.sim
new file mode 100644
index 0000000..11077ab
--- /dev/null
+++ b/tests/kernels/data-race/global_read_write_race.sim
@@ -0,0 +1,6 @@
+4 1 1
+1 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/data-race/global_write_write_race.cl b/tests/kernels/data-race/global_write_write_race.cl
new file mode 100644
index 0000000..53b5d9c
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.cl
@@ -0,0 +1,4 @@
+kernel void global_write_write_race(global int *data)
+  data[0] = get_global_id(0);
diff --git a/tests/kernels/data-race/global_write_write_race.ref b/tests/kernels/data-race/global_write_write_race.ref
new file mode 100644
index 0000000..0b31b65
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 3
diff --git a/tests/kernels/data-race/global_write_write_race.sim b/tests/kernels/data-race/global_write_write_race.sim
new file mode 100644
index 0000000..236990b
--- /dev/null
+++ b/tests/kernels/data-race/global_write_write_race.sim
@@ -0,0 +1,6 @@
+4 1 1
+1 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/data-race/increment.cl b/tests/kernels/data-race/increment.cl
new file mode 100644
index 0000000..d00f274
--- /dev/null
+++ b/tests/kernels/data-race/increment.cl
@@ -0,0 +1,5 @@
+kernel void increment(global int *data)
+  int i = get_global_id(0);
+  data[i]  = data[i] + 1;
diff --git a/tests/kernels/data-race/increment.ref b/tests/kernels/data-race/increment.ref
new file mode 100644
index 0000000..11a20e6
--- /dev/null
+++ b/tests/kernels/data-race/increment.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 1
+  data[1] = 2
+  data[2] = 3
+  data[3] = 4
diff --git a/tests/kernels/data-race/increment.sim b/tests/kernels/data-race/increment.sim
new file mode 100644
index 0000000..fc44402
--- /dev/null
+++ b/tests/kernels/data-race/increment.sim
@@ -0,0 +1,6 @@
+4 1 1
+1 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/data-race/intergroup_hidden_race.cl b/tests/kernels/data-race/intergroup_hidden_race.cl
new file mode 100644
index 0000000..5ac0b99
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.cl
@@ -0,0 +1,9 @@
+kernel void intergroup_hidden_race(global int *data, global int *output)
+  int group = get_group_id(0);
+  output[group] = data[0];
+  if (group == 1)
+  {
+    data[0] = group;
+  }
diff --git a/tests/kernels/data-race/intergroup_hidden_race.ref b/tests/kernels/data-race/intergroup_hidden_race.ref
new file mode 100644
index 0000000..9390c4c
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.ref
@@ -0,0 +1,6 @@
+Argument 'output': 8 bytes
+  output[0] = 0
+  output[1] = 0
diff --git a/tests/kernels/data-race/intergroup_hidden_race.sim b/tests/kernels/data-race/intergroup_hidden_race.sim
new file mode 100644
index 0000000..b31145c
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_hidden_race.sim
@@ -0,0 +1,7 @@
+2 1 1
+1 1 1
+<size=4 fill=0>
+<size=8 fill=0 dump>
diff --git a/tests/kernels/data-race/intergroup_race.cl b/tests/kernels/data-race/intergroup_race.cl
new file mode 100644
index 0000000..0a9784b
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.cl
@@ -0,0 +1,19 @@
+kernel void intergroup_race(global int *data)
+  int g = get_group_id(0);
+  if (get_local_id(0) == 0)
+  {
+    data[g] = g;
+  }
+  if (get_global_id(0) == 0)
+  {
+    int x = 0;
+    for (int i = 0; i < get_num_groups(0); i++)
+    {
+      x += data[i];
+    }
+    data[0] = x;
+  }
diff --git a/tests/kernels/data-race/intergroup_race.ref b/tests/kernels/data-race/intergroup_race.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 0
+  data[1] = 1
+  data[2] = 2
+  data[3] = 3
diff --git a/tests/kernels/data-race/intergroup_race.sim b/tests/kernels/data-race/intergroup_race.sim
new file mode 100644
index 0000000..4e60c87
--- /dev/null
+++ b/tests/kernels/data-race/intergroup_race.sim
@@ -0,0 +1,6 @@
+16 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/intragroup_hidden_race.cl b/tests/kernels/data-race/intragroup_hidden_race.cl
new file mode 100644
index 0000000..b101a41
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.cl
@@ -0,0 +1,10 @@
+kernel void intragroup_hidden_race(global int *data, global int *output)
+  int id = get_local_id(0);
+  output[id] = data[0];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (id == 0)
+  {
+    data[0] = -1;
+  }
diff --git a/tests/kernels/data-race/intragroup_hidden_race.ref b/tests/kernels/data-race/intragroup_hidden_race.ref
new file mode 100644
index 0000000..7ff022b
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.ref
@@ -0,0 +1,6 @@
+Argument 'output': 8 bytes
+  output[0] = 42
+  output[1] = 42
diff --git a/tests/kernels/data-race/intragroup_hidden_race.sim b/tests/kernels/data-race/intragroup_hidden_race.sim
new file mode 100644
index 0000000..16479a5
--- /dev/null
+++ b/tests/kernels/data-race/intragroup_hidden_race.sim
@@ -0,0 +1,7 @@
+2 1 1
+2 1 1
+<size=4 fill=42>
+<size=8 fill=0 dump>
diff --git a/tests/kernels/data-race/local_only_fence.cl b/tests/kernels/data-race/local_only_fence.cl
new file mode 100644
index 0000000..7b28012
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.cl
@@ -0,0 +1,16 @@
+kernel void local_only_fence(global int *scratch, global int *output)
+  int i = get_global_id(0);
+  int g = get_group_id(0);
+  scratch[i] = i;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (get_local_id(0) == 0)
+  {
+    int x = 0;
+    for (int l = 0; l < get_local_size(0); l++)
+    {
+      x += scratch[get_local_size(0)*g + l];
+    }
+    output[g] = x;
+  }
diff --git a/tests/kernels/data-race/local_only_fence.ref b/tests/kernels/data-race/local_only_fence.ref
new file mode 100644
index 0000000..b6b7f00
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.ref
@@ -0,0 +1,8 @@
+Argument 'output': 16 bytes
+  output[0] = 6
+  output[1] = 22
+  output[2] = 38
+  output[3] = 54
diff --git a/tests/kernels/data-race/local_only_fence.sim b/tests/kernels/data-race/local_only_fence.sim
new file mode 100644
index 0000000..1662f3e
--- /dev/null
+++ b/tests/kernels/data-race/local_only_fence.sim
@@ -0,0 +1,7 @@
+16 1 1
+4 1 1
+<size=64 fill=0>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/local_read_write_race.cl b/tests/kernels/data-race/local_read_write_race.cl
new file mode 100644
index 0000000..bcc3ff8
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.cl
@@ -0,0 +1,14 @@
+kernel void local_read_write_race(global int *data, local int *scratch)
+  int l = get_local_id(0);
+  scratch[l] = l;
+  if (l == 0)
+  {
+    int x = 0;
+    for (int i = 0; i < get_local_size(0); i++)
+    {
+      x += scratch[i];
+    }
+    *data = x;
+  }
diff --git a/tests/kernels/data-race/local_read_write_race.ref b/tests/kernels/data-race/local_read_write_race.ref
new file mode 100644
index 0000000..0943b15
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.ref
@@ -0,0 +1,5 @@
+Argument 'data': 4 bytes
+  data[0] = 0
diff --git a/tests/kernels/data-race/local_read_write_race.sim b/tests/kernels/data-race/local_read_write_race.sim
new file mode 100644
index 0000000..b3c4fbb
--- /dev/null
+++ b/tests/kernels/data-race/local_read_write_race.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/data-race/local_write_write_race.cl b/tests/kernels/data-race/local_write_write_race.cl
new file mode 100644
index 0000000..26a96c1
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.cl
@@ -0,0 +1,7 @@
+kernel void local_write_write_race(global int *data, local int *scratch)
+  int i = get_global_id(0);
+  *scratch = i;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  data[i] = *scratch;
diff --git a/tests/kernels/data-race/local_write_write_race.ref b/tests/kernels/data-race/local_write_write_race.ref
new file mode 100644
index 0000000..3fe4e95
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 3
+  data[2] = 3
+  data[3] = 3
diff --git a/tests/kernels/data-race/local_write_write_race.sim b/tests/kernels/data-race/local_write_write_race.sim
new file mode 100644
index 0000000..43106c7
--- /dev/null
+++ b/tests/kernels/data-race/local_write_write_race.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/data-race/uniform_write_race.cl b/tests/kernels/data-race/uniform_write_race.cl
new file mode 100644
index 0000000..ec13b59
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.cl
@@ -0,0 +1,4 @@
+kernel void uniform_write_race(global int *data)
+  *data = 0;
diff --git a/tests/kernels/data-race/uniform_write_race.ref b/tests/kernels/data-race/uniform_write_race.ref
new file mode 100644
index 0000000..b688113
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.ref
@@ -0,0 +1,4 @@
+Argument 'data': 4 bytes
+  data[0] = 0
diff --git a/tests/kernels/data-race/uniform_write_race.sim b/tests/kernels/data-race/uniform_write_race.sim
new file mode 100644
index 0000000..d08df5f
--- /dev/null
+++ b/tests/kernels/data-race/uniform_write_race.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=4 fill=-1 dump>
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.cl b/tests/kernels/memcheck/async_copy_out_of_bounds.cl
new file mode 100644
index 0000000..9c38a91
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.cl
@@ -0,0 +1,8 @@
+kernel void async_copy_out_of_bounds(local int *src, global int *dst)
+  int l  = get_local_id(0);
+  src[l] = l;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  event_t event = async_work_group_copy(dst+1, src, get_local_size(0), 0);
+  wait_group_events(1, &event);
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.ref b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
new file mode 100644
index 0000000..9a8cb35
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.ref
@@ -0,0 +1,8 @@
+Argument 'dst': 16 bytes
+  dst[0] = 0
+  dst[1] = 0
+  dst[2] = 1
+  dst[3] = 2
diff --git a/tests/kernels/memcheck/async_copy_out_of_bounds.sim b/tests/kernels/memcheck/async_copy_out_of_bounds.sim
new file mode 100644
index 0000000..fd6d8de
--- /dev/null
+++ b/tests/kernels/memcheck/async_copy_out_of_bounds.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.cl b/tests/kernels/memcheck/atomic_out_of_bounds.cl
new file mode 100644
index 0000000..bbb58b9
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.cl
@@ -0,0 +1,5 @@
+kernel void atomic_out_of_bounds(global int *counters)
+  int i = get_global_id(0);
+  atomic_inc(counters+i);
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.ref b/tests/kernels/memcheck/atomic_out_of_bounds.ref
new file mode 100644
index 0000000..cfcff7d
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.ref
@@ -0,0 +1,8 @@
+Argument 'counters': 16 bytes
+  counters[0] = 1
+  counters[1] = 1
+  counters[2] = 1
+  counters[3] = 1
diff --git a/tests/kernels/memcheck/atomic_out_of_bounds.sim b/tests/kernels/memcheck/atomic_out_of_bounds.sim
new file mode 100644
index 0000000..3c70419
--- /dev/null
+++ b/tests/kernels/memcheck/atomic_out_of_bounds.sim
@@ -0,0 +1,6 @@
+5 1 1
+1 1 1
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/dereference_null.cl b/tests/kernels/memcheck/dereference_null.cl
new file mode 100644
index 0000000..c5df927
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.cl
@@ -0,0 +1,4 @@
+kernel void dereference_null(global int *input, global int *output)
+  output[0] *= input[0];
diff --git a/tests/kernels/memcheck/dereference_null.ref b/tests/kernels/memcheck/dereference_null.ref
new file mode 100644
index 0000000..5a01471
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.ref
@@ -0,0 +1,5 @@
+Argument 'output': 4 bytes
+  output[0] = 0
diff --git a/tests/kernels/memcheck/dereference_null.sim b/tests/kernels/memcheck/dereference_null.sim
new file mode 100644
index 0000000..84da097
--- /dev/null
+++ b/tests/kernels/memcheck/dereference_null.sim
@@ -0,0 +1,7 @@
+1 1 1
+1 1 1
+<size=4 fill=0 dump>
diff --git a/tests/kernels/memcheck/read_out_of_bounds.cl b/tests/kernels/memcheck/read_out_of_bounds.cl
new file mode 100644
index 0000000..d76d7f1
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.cl
@@ -0,0 +1,12 @@
+kernel void read_out_of_bounds(global int *a, global int *b, global int *c)
+  int i = get_global_id(0);
+  if (i < 4)
+  {
+    c[i] = a[i] + b[i];
+  }
+  else
+  {
+    c[i] = a[0] * (a[i] + b[i]);
+  }
diff --git a/tests/kernels/memcheck/read_out_of_bounds.ref b/tests/kernels/memcheck/read_out_of_bounds.ref
new file mode 100644
index 0000000..539c267
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.ref
@@ -0,0 +1,9 @@
+Argument 'c': 20 bytes
+  c[0] = 0
+  c[1] = 2
+  c[2] = 4
+  c[3] = 6
+  c[4] = 0
diff --git a/tests/kernels/memcheck/read_out_of_bounds.sim b/tests/kernels/memcheck/read_out_of_bounds.sim
new file mode 100644
index 0000000..df72869
--- /dev/null
+++ b/tests/kernels/memcheck/read_out_of_bounds.sim
@@ -0,0 +1,8 @@
+5 1 1
+5 1 1
+<size=16 range=0:1:3>
+<size=16 range=0:1:3>
+<size=20 fill=0 dump>
diff --git a/tests/kernels/memcheck/read_write_only_memory.cl b/tests/kernels/memcheck/read_write_only_memory.cl
new file mode 100644
index 0000000..3f65143
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.cl
@@ -0,0 +1,5 @@
+kernel void read_write_only_memory(global int *input, global int *output)
+  int i = get_global_id(0);
+  output[i] += input[i];
diff --git a/tests/kernels/memcheck/read_write_only_memory.ref b/tests/kernels/memcheck/read_write_only_memory.ref
new file mode 100644
index 0000000..cb933ab
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.ref
@@ -0,0 +1,8 @@
+Argument 'output': 16 bytes
+  output[0] = 0
+  output[1] = 1
+  output[2] = 2
+  output[3] = 3
diff --git a/tests/kernels/memcheck/read_write_only_memory.sim b/tests/kernels/memcheck/read_write_only_memory.sim
new file mode 100644
index 0000000..70981e0
--- /dev/null
+++ b/tests/kernels/memcheck/read_write_only_memory.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 ro>
+<size=16 fill=0 wo dump>
diff --git a/tests/kernels/memcheck/write_out_of_bounds.cl b/tests/kernels/memcheck/write_out_of_bounds.cl
new file mode 100644
index 0000000..fc4c3c6
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.cl
@@ -0,0 +1,5 @@
+kernel void write_out_of_bounds(global int *a, global int *b, global int *c)
+  int i = get_global_id(0);
+  c[i] = a[i] + b[i];
diff --git a/tests/kernels/memcheck/write_out_of_bounds.ref b/tests/kernels/memcheck/write_out_of_bounds.ref
new file mode 100644
index 0000000..6412f26
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.ref
@@ -0,0 +1,8 @@
+Argument 'c': 16 bytes
+  c[0] = 0
+  c[1] = 2
+  c[2] = 4
+  c[3] = 6
diff --git a/tests/kernels/memcheck/write_out_of_bounds.sim b/tests/kernels/memcheck/write_out_of_bounds.sim
new file mode 100644
index 0000000..6fe07f6
--- /dev/null
+++ b/tests/kernels/memcheck/write_out_of_bounds.sim
@@ -0,0 +1,8 @@
+5 1 1
+5 1 1
+<size=20 range=0:1:4>
+<size=20 range=0:1:4>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/memcheck/write_read_only_memory.cl b/tests/kernels/memcheck/write_read_only_memory.cl
new file mode 100644
index 0000000..8666891
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.cl
@@ -0,0 +1,5 @@
+kernel void write_read_only_memory(global int *input, global int *output)
+  int i = get_global_id(0);
+  output[i] = input[i]++;
diff --git a/tests/kernels/memcheck/write_read_only_memory.ref b/tests/kernels/memcheck/write_read_only_memory.ref
new file mode 100644
index 0000000..cb933ab
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.ref
@@ -0,0 +1,8 @@
+Argument 'output': 16 bytes
+  output[0] = 0
+  output[1] = 1
+  output[2] = 2
+  output[3] = 3
diff --git a/tests/kernels/memcheck/write_read_only_memory.sim b/tests/kernels/memcheck/write_read_only_memory.sim
new file mode 100644
index 0000000..89c4b8c
--- /dev/null
+++ b/tests/kernels/memcheck/write_read_only_memory.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 ro>
+<size=16 fill=0 dump>
diff --git a/tests/kernels/misc/array.cl b/tests/kernels/misc/array.cl
new file mode 100644
index 0000000..cd4e43a
--- /dev/null
+++ b/tests/kernels/misc/array.cl
@@ -0,0 +1,10 @@
+kernel void array(global long16 *output)
+  long16 data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  int i = get_global_id(0);
+  long16 *foo = data;
+  output[i] = foo[i];
diff --git a/tests/kernels/misc/array.ref b/tests/kernels/misc/array.ref
new file mode 100644
index 0000000..1a1d2d0
--- /dev/null
+++ b/tests/kernels/misc/array.ref
@@ -0,0 +1,131 @@
+Argument 'output': 1024 bytes
+  output[0] = 0
+  output[1] = 0
+  output[2] = 0
+  output[3] = 0
+  output[4] = 0
+  output[5] = 0
+  output[6] = 0
+  output[7] = 0
+  output[8] = 0
+  output[9] = 0
+  output[10] = 0
+  output[11] = 0
+  output[12] = 0
+  output[13] = 0
+  output[14] = 0
+  output[15] = 0
+  output[16] = 1
+  output[17] = 1
+  output[18] = 1
+  output[19] = 1
+  output[20] = 1
+  output[21] = 1
+  output[22] = 1
+  output[23] = 1
+  output[24] = 1
+  output[25] = 1
+  output[26] = 1
+  output[27] = 1
+  output[28] = 1
+  output[29] = 1
+  output[30] = 1
+  output[31] = 1
+  output[32] = 2
+  output[33] = 2
+  output[34] = 2
+  output[35] = 2
+  output[36] = 2
+  output[37] = 2
+  output[38] = 2
+  output[39] = 2
+  output[40] = 2
+  output[41] = 2
+  output[42] = 2
+  output[43] = 2
+  output[44] = 2
+  output[45] = 2
+  output[46] = 2
+  output[47] = 2
+  output[48] = 3
+  output[49] = 3
+  output[50] = 3
+  output[51] = 3
+  output[52] = 3
+  output[53] = 3
+  output[54] = 3
+  output[55] = 3
+  output[56] = 3
+  output[57] = 3
+  output[58] = 3
+  output[59] = 3
+  output[60] = 3
+  output[61] = 3
+  output[62] = 3
+  output[63] = 3
+  output[64] = 4
+  output[65] = 4
+  output[66] = 4
+  output[67] = 4
+  output[68] = 4
+  output[69] = 4
+  output[70] = 4
+  output[71] = 4
+  output[72] = 4
+  output[73] = 4
+  output[74] = 4
+  output[75] = 4
+  output[76] = 4
+  output[77] = 4
+  output[78] = 4
+  output[79] = 4
+  output[80] = 5
+  output[81] = 5
+  output[82] = 5
+  output[83] = 5
+  output[84] = 5
+  output[85] = 5
+  output[86] = 5
+  output[87] = 5
+  output[88] = 5
+  output[89] = 5
+  output[90] = 5
+  output[91] = 5
+  output[92] = 5
+  output[93] = 5
+  output[94] = 5
+  output[95] = 5
+  output[96] = 6
+  output[97] = 6
+  output[98] = 6
+  output[99] = 6
+  output[100] = 6
+  output[101] = 6
+  output[102] = 6
+  output[103] = 6
+  output[104] = 6
+  output[105] = 6
+  output[106] = 6
+  output[107] = 6
+  output[108] = 6
+  output[109] = 6
+  output[110] = 6
+  output[111] = 6
+  output[112] = 7
+  output[113] = 7
+  output[114] = 7
+  output[115] = 7
+  output[116] = 7
+  output[117] = 7
+  output[118] = 7
+  output[119] = 7
+  output[120] = 7
+  output[121] = 7
+  output[122] = 7
+  output[123] = 7
+  output[124] = 7
+  output[125] = 7
+  output[126] = 7
+  output[127] = 7
diff --git a/tests/kernels/misc/array.sim b/tests/kernels/misc/array.sim
new file mode 100644
index 0000000..e0b46e3
--- /dev/null
+++ b/tests/kernels/misc/array.sim
@@ -0,0 +1,6 @@
+8 1 1
+1 1 1
+<size=1024 fill=0 dump>
diff --git a/tests/kernels/misc/reduce.cl b/tests/kernels/misc/reduce.cl
new file mode 100644
index 0000000..28f53ca
--- /dev/null
+++ b/tests/kernels/misc/reduce.cl
@@ -0,0 +1,28 @@
+kernel void reduce(unsigned int n,
+                   global unsigned int *data,
+                   global unsigned int *result,
+                   local unsigned int *localData)
+  unsigned int lid = get_local_id(0);
+  unsigned int lsz = get_local_size(0);
+  unsigned int sum = 0;
+  for (unsigned int i = lid; i < n; i+=lsz)
+  {
+    sum += data[i];
+  }
+  localData[lid] = sum;
+  for (unsigned int offset = lsz/2; offset > 0; offset/=2)
+  {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid < offset)
+    {
+      localData[lid] += localData[lid + offset];
+    }
+  }
+  if (lid == 0)
+  {
+    *result = localData[lid];
+  }
diff --git a/tests/kernels/misc/reduce.ref b/tests/kernels/misc/reduce.ref
new file mode 100644
index 0000000..fa92b4e
--- /dev/null
+++ b/tests/kernels/misc/reduce.ref
@@ -0,0 +1,4 @@
+Argument 'result': 4 bytes
+  result[0] = 120
diff --git a/tests/kernels/misc/reduce.sim b/tests/kernels/misc/reduce.sim
new file mode 100644
index 0000000..927a2e0
--- /dev/null
+++ b/tests/kernels/misc/reduce.sim
@@ -0,0 +1,11 @@
+4 1 1
+4 1 1
+<size=64 range=0:1:15>
+<size=4 fill=0 dump>
diff --git a/tests/kernels/misc/vecadd.cl b/tests/kernels/misc/vecadd.cl
new file mode 100644
index 0000000..04e2835
--- /dev/null
+++ b/tests/kernels/misc/vecadd.cl
@@ -0,0 +1,5 @@
+kernel void vecadd(global float *a, global float *b, global float *c)
+  size_t i = get_global_id(0);
+  c[i] = a[i] + b[i];
diff --git a/tests/kernels/misc/vecadd.ref b/tests/kernels/misc/vecadd.ref
new file mode 100644
index 0000000..9fa7b4c
--- /dev/null
+++ b/tests/kernels/misc/vecadd.ref
@@ -0,0 +1,1027 @@
+Argument 'c': 4096 bytes
+  c[0] = 0
+  c[1] = 2
+  c[2] = 4
+  c[3] = 6
+  c[4] = 8
+  c[5] = 10
+  c[6] = 12
+  c[7] = 14
+  c[8] = 16
+  c[9] = 18
+  c[10] = 20
+  c[11] = 22
+  c[12] = 24
+  c[13] = 26
+  c[14] = 28
+  c[15] = 30
+  c[16] = 32
+  c[17] = 34
+  c[18] = 36
+  c[19] = 38
+  c[20] = 40
+  c[21] = 42
+  c[22] = 44
+  c[23] = 46
+  c[24] = 48
+  c[25] = 50
+  c[26] = 52
+  c[27] = 54
+  c[28] = 56
+  c[29] = 58
+  c[30] = 60
+  c[31] = 62
+  c[32] = 64
+  c[33] = 66
+  c[34] = 68
+  c[35] = 70
+  c[36] = 72
+  c[37] = 74
+  c[38] = 76
+  c[39] = 78
+  c[40] = 80
+  c[41] = 82
+  c[42] = 84
+  c[43] = 86
+  c[44] = 88
+  c[45] = 90
+  c[46] = 92
+  c[47] = 94
+  c[48] = 96
+  c[49] = 98
+  c[50] = 100
+  c[51] = 102
+  c[52] = 104
+  c[53] = 106
+  c[54] = 108
+  c[55] = 110
+  c[56] = 112
+  c[57] = 114
+  c[58] = 116
+  c[59] = 118
+  c[60] = 120
+  c[61] = 122
+  c[62] = 124
+  c[63] = 126
+  c[64] = 128
+  c[65] = 130
+  c[66] = 132
+  c[67] = 134
+  c[68] = 136
+  c[69] = 138
+  c[70] = 140
+  c[71] = 142
+  c[72] = 144
+  c[73] = 146
+  c[74] = 148
+  c[75] = 150
+  c[76] = 152
+  c[77] = 154
+  c[78] = 156
+  c[79] = 158
+  c[80] = 160
+  c[81] = 162
+  c[82] = 164
+  c[83] = 166
+  c[84] = 168
+  c[85] = 170
+  c[86] = 172
+  c[87] = 174
+  c[88] = 176
+  c[89] = 178
+  c[90] = 180
+  c[91] = 182
+  c[92] = 184
+  c[93] = 186
+  c[94] = 188
+  c[95] = 190
+  c[96] = 192
+  c[97] = 194
+  c[98] = 196
+  c[99] = 198
+  c[100] = 200
+  c[101] = 202
+  c[102] = 204
+  c[103] = 206
+  c[104] = 208
+  c[105] = 210
+  c[106] = 212
+  c[107] = 214
+  c[108] = 216
+  c[109] = 218
+  c[110] = 220
+  c[111] = 222
+  c[112] = 224
+  c[113] = 226
+  c[114] = 228
+  c[115] = 230
+  c[116] = 232
+  c[117] = 234
+  c[118] = 236
+  c[119] = 238
+  c[120] = 240
+  c[121] = 242
+  c[122] = 244
+  c[123] = 246
+  c[124] = 248
+  c[125] = 250
+  c[126] = 252
+  c[127] = 254
+  c[128] = 256
+  c[129] = 258
+  c[130] = 260
+  c[131] = 262
+  c[132] = 264
+  c[133] = 266
+  c[134] = 268
+  c[135] = 270
+  c[136] = 272
+  c[137] = 274
+  c[138] = 276
+  c[139] = 278
+  c[140] = 280
+  c[141] = 282
+  c[142] = 284
+  c[143] = 286
+  c[144] = 288
+  c[145] = 290
+  c[146] = 292
+  c[147] = 294
+  c[148] = 296
+  c[149] = 298
+  c[150] = 300
+  c[151] = 302
+  c[152] = 304
+  c[153] = 306
+  c[154] = 308
+  c[155] = 310
+  c[156] = 312
+  c[157] = 314
+  c[158] = 316
+  c[159] = 318
+  c[160] = 320
+  c[161] = 322
+  c[162] = 324
+  c[163] = 326
+  c[164] = 328
+  c[165] = 330
+  c[166] = 332
+  c[167] = 334
+  c[168] = 336
+  c[169] = 338
+  c[170] = 340
+  c[171] = 342
+  c[172] = 344
+  c[173] = 346
+  c[174] = 348
+  c[175] = 350
+  c[176] = 352
+  c[177] = 354
+  c[178] = 356
+  c[179] = 358
+  c[180] = 360
+  c[181] = 362
+  c[182] = 364
+  c[183] = 366
+  c[184] = 368
+  c[185] = 370
+  c[186] = 372
+  c[187] = 374
+  c[188] = 376
+  c[189] = 378
+  c[190] = 380
+  c[191] = 382
+  c[192] = 384
+  c[193] = 386
+  c[194] = 388
+  c[195] = 390
+  c[196] = 392
+  c[197] = 394
+  c[198] = 396
+  c[199] = 398
+  c[200] = 400
+  c[201] = 402
+  c[202] = 404
+  c[203] = 406
+  c[204] = 408
+  c[205] = 410
+  c[206] = 412
+  c[207] = 414
+  c[208] = 416
+  c[209] = 418
+  c[210] = 420
+  c[211] = 422
+  c[212] = 424
+  c[213] = 426
+  c[214] = 428
+  c[215] = 430
+  c[216] = 432
+  c[217] = 434
+  c[218] = 436
+  c[219] = 438
+  c[220] = 440
+  c[221] = 442
+  c[222] = 444
+  c[223] = 446
+  c[224] = 448
+  c[225] = 450
+  c[226] = 452
+  c[227] = 454
+  c[228] = 456
+  c[229] = 458
+  c[230] = 460
+  c[231] = 462
+  c[232] = 464
+  c[233] = 466
+  c[234] = 468
+  c[235] = 470
+  c[236] = 472
+  c[237] = 474
+  c[238] = 476
+  c[239] = 478
+  c[240] = 480
+  c[241] = 482
+  c[242] = 484
+  c[243] = 486
+  c[244] = 488
+  c[245] = 490
+  c[246] = 492
+  c[247] = 494
+  c[248] = 496
+  c[249] = 498
+  c[250] = 500
+  c[251] = 502
+  c[252] = 504
+  c[253] = 506
+  c[254] = 508
+  c[255] = 510
+  c[256] = 512
+  c[257] = 514
+  c[258] = 516
+  c[259] = 518
+  c[260] = 520
+  c[261] = 522
+  c[262] = 524
+  c[263] = 526
+  c[264] = 528
+  c[265] = 530
+  c[266] = 532
+  c[267] = 534
+  c[268] = 536
+  c[269] = 538
+  c[270] = 540
+  c[271] = 542
+  c[272] = 544
+  c[273] = 546
+  c[274] = 548
+  c[275] = 550
+  c[276] = 552
+  c[277] = 554
+  c[278] = 556
+  c[279] = 558
+  c[280] = 560
+  c[281] = 562
+  c[282] = 564
+  c[283] = 566
+  c[284] = 568
+  c[285] = 570
+  c[286] = 572
+  c[287] = 574
+  c[288] = 576
+  c[289] = 578
+  c[290] = 580
+  c[291] = 582
+  c[292] = 584
+  c[293] = 586
+  c[294] = 588
+  c[295] = 590
+  c[296] = 592
+  c[297] = 594
+  c[298] = 596
+  c[299] = 598
+  c[300] = 600
+  c[301] = 602
+  c[302] = 604
+  c[303] = 606
+  c[304] = 608
+  c[305] = 610
+  c[306] = 612
+  c[307] = 614
+  c[308] = 616
+  c[309] = 618
+  c[310] = 620
+  c[311] = 622
+  c[312] = 624
+  c[313] = 626
+  c[314] = 628
+  c[315] = 630
+  c[316] = 632
+  c[317] = 634
+  c[318] = 636
+  c[319] = 638
+  c[320] = 640
+  c[321] = 642
+  c[322] = 644
+  c[323] = 646
+  c[324] = 648
+  c[325] = 650
+  c[326] = 652
+  c[327] = 654
+  c[328] = 656
+  c[329] = 658
+  c[330] = 660
+  c[331] = 662
+  c[332] = 664
+  c[333] = 666
+  c[334] = 668
+  c[335] = 670
+  c[336] = 672
+  c[337] = 674
+  c[338] = 676
+  c[339] = 678
+  c[340] = 680
+  c[341] = 682
+  c[342] = 684
+  c[343] = 686
+  c[344] = 688
+  c[345] = 690
+  c[346] = 692
+  c[347] = 694
+  c[348] = 696
+  c[349] = 698
+  c[350] = 700
+  c[351] = 702
+  c[352] = 704
+  c[353] = 706
+  c[354] = 708
+  c[355] = 710
+  c[356] = 712
+  c[357] = 714
+  c[358] = 716
+  c[359] = 718
+  c[360] = 720
+  c[361] = 722
+  c[362] = 724
+  c[363] = 726
+  c[364] = 728
+  c[365] = 730
+  c[366] = 732
+  c[367] = 734
+  c[368] = 736
+  c[369] = 738
+  c[370] = 740
+  c[371] = 742
+  c[372] = 744
+  c[373] = 746
+  c[374] = 748
+  c[375] = 750
+  c[376] = 752
+  c[377] = 754
+  c[378] = 756
+  c[379] = 758
+  c[380] = 760
+  c[381] = 762
+  c[382] = 764
+  c[383] = 766
+  c[384] = 768
+  c[385] = 770
+  c[386] = 772
+  c[387] = 774
+  c[388] = 776
+  c[389] = 778
+  c[390] = 780
+  c[391] = 782
+  c[392] = 784
+  c[393] = 786
+  c[394] = 788
+  c[395] = 790
+  c[396] = 792
+  c[397] = 794
+  c[398] = 796
+  c[399] = 798
+  c[400] = 800
+  c[401] = 802
+  c[402] = 804
+  c[403] = 806
+  c[404] = 808
+  c[405] = 810
+  c[406] = 812
+  c[407] = 814
+  c[408] = 816
+  c[409] = 818
+  c[410] = 820
+  c[411] = 822
+  c[412] = 824
+  c[413] = 826
+  c[414] = 828
+  c[415] = 830
+  c[416] = 832
+  c[417] = 834
+  c[418] = 836
+  c[419] = 838
+  c[420] = 840
+  c[421] = 842
+  c[422] = 844
+  c[423] = 846
+  c[424] = 848
+  c[425] = 850
+  c[426] = 852
+  c[427] = 854
+  c[428] = 856
+  c[429] = 858
+  c[430] = 860
+  c[431] = 862
+  c[432] = 864
+  c[433] = 866
+  c[434] = 868
+  c[435] = 870
+  c[436] = 872
+  c[437] = 874
+  c[438] = 876
+  c[439] = 878
+  c[440] = 880
+  c[441] = 882
+  c[442] = 884
+  c[443] = 886
+  c[444] = 888
+  c[445] = 890
+  c[446] = 892
+  c[447] = 894
+  c[448] = 896
+  c[449] = 898
+  c[450] = 900
+  c[451] = 902
+  c[452] = 904
+  c[453] = 906
+  c[454] = 908
+  c[455] = 910
+  c[456] = 912
+  c[457] = 914
+  c[458] = 916
+  c[459] = 918
+  c[460] = 920
+  c[461] = 922
+  c[462] = 924
+  c[463] = 926
+  c[464] = 928
+  c[465] = 930
+  c[466] = 932
+  c[467] = 934
+  c[468] = 936
+  c[469] = 938
+  c[470] = 940
+  c[471] = 942
+  c[472] = 944
+  c[473] = 946
+  c[474] = 948
+  c[475] = 950
+  c[476] = 952
+  c[477] = 954
+  c[478] = 956
+  c[479] = 958
+  c[480] = 960
+  c[481] = 962
+  c[482] = 964
+  c[483] = 966
+  c[484] = 968
+  c[485] = 970
+  c[486] = 972
+  c[487] = 974
+  c[488] = 976
+  c[489] = 978
+  c[490] = 980
+  c[491] = 982
+  c[492] = 984
+  c[493] = 986
+  c[494] = 988
+  c[495] = 990
+  c[496] = 992
+  c[497] = 994
+  c[498] = 996
+  c[499] = 998
+  c[500] = 1000
+  c[501] = 1002
+  c[502] = 1004
+  c[503] = 1006
+  c[504] = 1008
+  c[505] = 1010
+  c[506] = 1012
+  c[507] = 1014
+  c[508] = 1016
+  c[509] = 1018
+  c[510] = 1020
+  c[511] = 1022
+  c[512] = 1024
+  c[513] = 1026
+  c[514] = 1028
+  c[515] = 1030
+  c[516] = 1032
+  c[517] = 1034
+  c[518] = 1036
+  c[519] = 1038
+  c[520] = 1040
+  c[521] = 1042
+  c[522] = 1044
+  c[523] = 1046
+  c[524] = 1048
+  c[525] = 1050
+  c[526] = 1052
+  c[527] = 1054
+  c[528] = 1056
+  c[529] = 1058
+  c[530] = 1060
+  c[531] = 1062
+  c[532] = 1064
+  c[533] = 1066
+  c[534] = 1068
+  c[535] = 1070
+  c[536] = 1072
+  c[537] = 1074
+  c[538] = 1076
+  c[539] = 1078
+  c[540] = 1080
+  c[541] = 1082
+  c[542] = 1084
+  c[543] = 1086
+  c[544] = 1088
+  c[545] = 1090
+  c[546] = 1092
+  c[547] = 1094
+  c[548] = 1096
+  c[549] = 1098
+  c[550] = 1100
+  c[551] = 1102
+  c[552] = 1104
+  c[553] = 1106
+  c[554] = 1108
+  c[555] = 1110
+  c[556] = 1112
+  c[557] = 1114
+  c[558] = 1116
+  c[559] = 1118
+  c[560] = 1120
+  c[561] = 1122
+  c[562] = 1124
+  c[563] = 1126
+  c[564] = 1128
+  c[565] = 1130
+  c[566] = 1132
+  c[567] = 1134
+  c[568] = 1136
+  c[569] = 1138
+  c[570] = 1140
+  c[571] = 1142
+  c[572] = 1144
+  c[573] = 1146
+  c[574] = 1148
+  c[575] = 1150
+  c[576] = 1152
+  c[577] = 1154
+  c[578] = 1156
+  c[579] = 1158
+  c[580] = 1160
+  c[581] = 1162
+  c[582] = 1164
+  c[583] = 1166
+  c[584] = 1168
+  c[585] = 1170
+  c[586] = 1172
+  c[587] = 1174
+  c[588] = 1176
+  c[589] = 1178
+  c[590] = 1180
+  c[591] = 1182
+  c[592] = 1184
+  c[593] = 1186
+  c[594] = 1188
+  c[595] = 1190
+  c[596] = 1192
+  c[597] = 1194
+  c[598] = 1196
+  c[599] = 1198
+  c[600] = 1200
+  c[601] = 1202
+  c[602] = 1204
+  c[603] = 1206
+  c[604] = 1208
+  c[605] = 1210
+  c[606] = 1212
+  c[607] = 1214
+  c[608] = 1216
+  c[609] = 1218
+  c[610] = 1220
+  c[611] = 1222
+  c[612] = 1224
+  c[613] = 1226
+  c[614] = 1228
+  c[615] = 1230
+  c[616] = 1232
+  c[617] = 1234
+  c[618] = 1236
+  c[619] = 1238
+  c[620] = 1240
+  c[621] = 1242
+  c[622] = 1244
+  c[623] = 1246
+  c[624] = 1248
+  c[625] = 1250
+  c[626] = 1252
+  c[627] = 1254
+  c[628] = 1256
+  c[629] = 1258
+  c[630] = 1260
+  c[631] = 1262
+  c[632] = 1264
+  c[633] = 1266
+  c[634] = 1268
+  c[635] = 1270
+  c[636] = 1272
+  c[637] = 1274
+  c[638] = 1276
+  c[639] = 1278
+  c[640] = 1280
+  c[641] = 1282
+  c[642] = 1284
+  c[643] = 1286
+  c[644] = 1288
+  c[645] = 1290
+  c[646] = 1292
+  c[647] = 1294
+  c[648] = 1296
+  c[649] = 1298
+  c[650] = 1300
+  c[651] = 1302
+  c[652] = 1304
+  c[653] = 1306
+  c[654] = 1308
+  c[655] = 1310
+  c[656] = 1312
+  c[657] = 1314
+  c[658] = 1316
+  c[659] = 1318
+  c[660] = 1320
+  c[661] = 1322
+  c[662] = 1324
+  c[663] = 1326
+  c[664] = 1328
+  c[665] = 1330
+  c[666] = 1332
+  c[667] = 1334
+  c[668] = 1336
+  c[669] = 1338
+  c[670] = 1340
+  c[671] = 1342
+  c[672] = 1344
+  c[673] = 1346
+  c[674] = 1348
+  c[675] = 1350
+  c[676] = 1352
+  c[677] = 1354
+  c[678] = 1356
+  c[679] = 1358
+  c[680] = 1360
+  c[681] = 1362
+  c[682] = 1364
+  c[683] = 1366
+  c[684] = 1368
+  c[685] = 1370
+  c[686] = 1372
+  c[687] = 1374
+  c[688] = 1376
+  c[689] = 1378
+  c[690] = 1380
+  c[691] = 1382
+  c[692] = 1384
+  c[693] = 1386
+  c[694] = 1388
+  c[695] = 1390
+  c[696] = 1392
+  c[697] = 1394
+  c[698] = 1396
+  c[699] = 1398
+  c[700] = 1400
+  c[701] = 1402
+  c[702] = 1404
+  c[703] = 1406
+  c[704] = 1408
+  c[705] = 1410
+  c[706] = 1412
+  c[707] = 1414
+  c[708] = 1416
+  c[709] = 1418
+  c[710] = 1420
+  c[711] = 1422
+  c[712] = 1424
+  c[713] = 1426
+  c[714] = 1428
+  c[715] = 1430
+  c[716] = 1432
+  c[717] = 1434
+  c[718] = 1436
+  c[719] = 1438
+  c[720] = 1440
+  c[721] = 1442
+  c[722] = 1444
+  c[723] = 1446
+  c[724] = 1448
+  c[725] = 1450
+  c[726] = 1452
+  c[727] = 1454
+  c[728] = 1456
+  c[729] = 1458
+  c[730] = 1460
+  c[731] = 1462
+  c[732] = 1464
+  c[733] = 1466
+  c[734] = 1468
+  c[735] = 1470
+  c[736] = 1472
+  c[737] = 1474
+  c[738] = 1476
+  c[739] = 1478
+  c[740] = 1480
+  c[741] = 1482
+  c[742] = 1484
+  c[743] = 1486
+  c[744] = 1488
+  c[745] = 1490
+  c[746] = 1492
+  c[747] = 1494
+  c[748] = 1496
+  c[749] = 1498
+  c[750] = 1500
+  c[751] = 1502
+  c[752] = 1504
+  c[753] = 1506
+  c[754] = 1508
+  c[755] = 1510
+  c[756] = 1512
+  c[757] = 1514
+  c[758] = 1516
+  c[759] = 1518
+  c[760] = 1520
+  c[761] = 1522
+  c[762] = 1524
+  c[763] = 1526
+  c[764] = 1528
+  c[765] = 1530
+  c[766] = 1532
+  c[767] = 1534
+  c[768] = 1536
+  c[769] = 1538
+  c[770] = 1540
+  c[771] = 1542
+  c[772] = 1544
+  c[773] = 1546
+  c[774] = 1548
+  c[775] = 1550
+  c[776] = 1552
+  c[777] = 1554
+  c[778] = 1556
+  c[779] = 1558
+  c[780] = 1560
+  c[781] = 1562
+  c[782] = 1564
+  c[783] = 1566
+  c[784] = 1568
+  c[785] = 1570
+  c[786] = 1572
+  c[787] = 1574
+  c[788] = 1576
+  c[789] = 1578
+  c[790] = 1580
+  c[791] = 1582
+  c[792] = 1584
+  c[793] = 1586
+  c[794] = 1588
+  c[795] = 1590
+  c[796] = 1592
+  c[797] = 1594
+  c[798] = 1596
+  c[799] = 1598
+  c[800] = 1600
+  c[801] = 1602
+  c[802] = 1604
+  c[803] = 1606
+  c[804] = 1608
+  c[805] = 1610
+  c[806] = 1612
+  c[807] = 1614
+  c[808] = 1616
+  c[809] = 1618
+  c[810] = 1620
+  c[811] = 1622
+  c[812] = 1624
+  c[813] = 1626
+  c[814] = 1628
+  c[815] = 1630
+  c[816] = 1632
+  c[817] = 1634
+  c[818] = 1636
+  c[819] = 1638
+  c[820] = 1640
+  c[821] = 1642
+  c[822] = 1644
+  c[823] = 1646
+  c[824] = 1648
+  c[825] = 1650
+  c[826] = 1652
+  c[827] = 1654
+  c[828] = 1656
+  c[829] = 1658
+  c[830] = 1660
+  c[831] = 1662
+  c[832] = 1664
+  c[833] = 1666
+  c[834] = 1668
+  c[835] = 1670
+  c[836] = 1672
+  c[837] = 1674
+  c[838] = 1676
+  c[839] = 1678
+  c[840] = 1680
+  c[841] = 1682
+  c[842] = 1684
+  c[843] = 1686
+  c[844] = 1688
+  c[845] = 1690
+  c[846] = 1692
+  c[847] = 1694
+  c[848] = 1696
+  c[849] = 1698
+  c[850] = 1700
+  c[851] = 1702
+  c[852] = 1704
+  c[853] = 1706
+  c[854] = 1708
+  c[855] = 1710
+  c[856] = 1712
+  c[857] = 1714
+  c[858] = 1716
+  c[859] = 1718
+  c[860] = 1720
+  c[861] = 1722
+  c[862] = 1724
+  c[863] = 1726
+  c[864] = 1728
+  c[865] = 1730
+  c[866] = 1732
+  c[867] = 1734
+  c[868] = 1736
+  c[869] = 1738
+  c[870] = 1740
+  c[871] = 1742
+  c[872] = 1744
+  c[873] = 1746
+  c[874] = 1748
+  c[875] = 1750
+  c[876] = 1752
+  c[877] = 1754
+  c[878] = 1756
+  c[879] = 1758
+  c[880] = 1760
+  c[881] = 1762
+  c[882] = 1764
+  c[883] = 1766
+  c[884] = 1768
+  c[885] = 1770
+  c[886] = 1772
+  c[887] = 1774
+  c[888] = 1776
+  c[889] = 1778
+  c[890] = 1780
+  c[891] = 1782
+  c[892] = 1784
+  c[893] = 1786
+  c[894] = 1788
+  c[895] = 1790
+  c[896] = 1792
+  c[897] = 1794
+  c[898] = 1796
+  c[899] = 1798
+  c[900] = 1800
+  c[901] = 1802
+  c[902] = 1804
+  c[903] = 1806
+  c[904] = 1808
+  c[905] = 1810
+  c[906] = 1812
+  c[907] = 1814
+  c[908] = 1816
+  c[909] = 1818
+  c[910] = 1820
+  c[911] = 1822
+  c[912] = 1824
+  c[913] = 1826
+  c[914] = 1828
+  c[915] = 1830
+  c[916] = 1832
+  c[917] = 1834
+  c[918] = 1836
+  c[919] = 1838
+  c[920] = 1840
+  c[921] = 1842
+  c[922] = 1844
+  c[923] = 1846
+  c[924] = 1848
+  c[925] = 1850
+  c[926] = 1852
+  c[927] = 1854
+  c[928] = 1856
+  c[929] = 1858
+  c[930] = 1860
+  c[931] = 1862
+  c[932] = 1864
+  c[933] = 1866
+  c[934] = 1868
+  c[935] = 1870
+  c[936] = 1872
+  c[937] = 1874
+  c[938] = 1876
+  c[939] = 1878
+  c[940] = 1880
+  c[941] = 1882
+  c[942] = 1884
+  c[943] = 1886
+  c[944] = 1888
+  c[945] = 1890
+  c[946] = 1892
+  c[947] = 1894
+  c[948] = 1896
+  c[949] = 1898
+  c[950] = 1900
+  c[951] = 1902
+  c[952] = 1904
+  c[953] = 1906
+  c[954] = 1908
+  c[955] = 1910
+  c[956] = 1912
+  c[957] = 1914
+  c[958] = 1916
+  c[959] = 1918
+  c[960] = 1920
+  c[961] = 1922
+  c[962] = 1924
+  c[963] = 1926
+  c[964] = 1928
+  c[965] = 1930
+  c[966] = 1932
+  c[967] = 1934
+  c[968] = 1936
+  c[969] = 1938
+  c[970] = 1940
+  c[971] = 1942
+  c[972] = 1944
+  c[973] = 1946
+  c[974] = 1948
+  c[975] = 1950
+  c[976] = 1952
+  c[977] = 1954
+  c[978] = 1956
+  c[979] = 1958
+  c[980] = 1960
+  c[981] = 1962
+  c[982] = 1964
+  c[983] = 1966
+  c[984] = 1968
+  c[985] = 1970
+  c[986] = 1972
+  c[987] = 1974
+  c[988] = 1976
+  c[989] = 1978
+  c[990] = 1980
+  c[991] = 1982
+  c[992] = 1984
+  c[993] = 1986
+  c[994] = 1988
+  c[995] = 1990
+  c[996] = 1992
+  c[997] = 1994
+  c[998] = 1996
+  c[999] = 1998
+  c[1000] = 2000
+  c[1001] = 2002
+  c[1002] = 2004
+  c[1003] = 2006
+  c[1004] = 2008
+  c[1005] = 2010
+  c[1006] = 2012
+  c[1007] = 2014
+  c[1008] = 2016
+  c[1009] = 2018
+  c[1010] = 2020
+  c[1011] = 2022
+  c[1012] = 2024
+  c[1013] = 2026
+  c[1014] = 2028
+  c[1015] = 2030
+  c[1016] = 2032
+  c[1017] = 2034
+  c[1018] = 2036
+  c[1019] = 2038
+  c[1020] = 2040
+  c[1021] = 2042
+  c[1022] = 2044
+  c[1023] = 2046
diff --git a/tests/kernels/misc/vecadd.sim b/tests/kernels/misc/vecadd.sim
new file mode 100644
index 0000000..23e1a9a
--- /dev/null
+++ b/tests/kernels/misc/vecadd.sim
@@ -0,0 +1,8 @@
+1024 1 1
+16 1 1
+<size=4096 range=0:1:1023>
+<size=4096 range=0:1:1023>
+<size=4096 fill=0 dump>
diff --git a/tests/kernels/run_kernel_test.py b/tests/kernels/run_kernel_test.py
new file mode 100644
index 0000000..9387b9c
--- /dev/null
+++ b/tests/kernels/run_kernel_test.py
@@ -0,0 +1,93 @@
+# run_kernel_test.py (Oclgrind)
+# Copyright (c) 2013-2015, James Price and Simon McIntosh-Smith,
+# University of Bristol. All rights reserved.
+# This program is provided under a three-clause BSD license. For full
+# license terms please see the LICENSE file distributed with this
+# source code.
+import os
+import re
+import subprocess
+import sys
+# Check arguments
+if len(sys.argv) != 3:
+  print 'Usage: python run_kernel_test.py EXE SIMFILE'
+  sys.exit(1)
+if not os.path.isfile(sys.argv[2]):
+  print 'Test file not found'
+  sys.exit(1)
+# Construct paths to test inputs/outputs
+test_exe    = sys.argv[1]
+test_file   = sys.argv[2]
+test_dir    = os.path.dirname(os.path.realpath(test_file))
+test_file   = os.path.basename(test_file)
+test_name   = os.path.splitext(test_file)[0]
+test_out    = test_name + '.out'
+test_ref    = test_dir + os.path.sep + test_name + '.ref'
+current_dir = os.getcwd()
+if os.environ.get('AM_TESTS') == '1':
+  # If running via automake, use build directory for output file
+  test_out = 'tests' + os.path.sep + 'kernels' + os.path.sep + \
+             test_dir.split(os.path.sep)[-1] + os.path.sep + test_out
+  # Otherwise, use test directory for output file
+  test_out = test_dir + os.path.sep + test_out
+# Run oclgrind-kernel
+out = open(test_out, 'w')
+retval = subprocess.call([test_exe, '--data-races', test_file],
+                         stdout=out, stderr=out)
+if retval != 0:
+  print 'oclgrind-kernel returned non-zero value (' + str(retval) + ')'
+  sys.exit(retval)
+# Open output and reference files
+out = open(test_out).read().splitlines()
+ref = open(test_ref).read().splitlines()
+# Scan through file to reach argument data
+oi = 0
+ri = 0
+  while re.match('Argument \'.*\': [0-9]+ *bytes', out[oi]) == None:
+    oi += 1
+  while re.match('Argument \'.*\': [0-9]+ *bytes', ref[ri]) == None:
+    ri += 1
+  print 'Error searching for argument data'
+  sys.exit(1)
+# Check that an error was produced iff an error was expected
+# An error occured if global memory dump isn't at start of file
+# TODO: Improve this so that more details about the error are checked
+should_error = ri > 1
+if should_error and oi < 2:
+  print 'Error expected, but no error reported'
+  sys.exit(1)
+if not should_error and oi > 1:
+  print 'Error reported, but no error expected'
+  sys.exit(1)
+# Check that the global memory dump matches the reference
+# TODO: 32-bit machines will fail this due to memory address comparisons
+match = 1
+while oi < len(out):
+  if out[oi] != ref[ri]:
+    print '[%d:%d] "%s" vs "%s"' % (oi, ri, out[oi], ref[ri])
+    match = 0
+  oi += 1
+  ri += 1
+if not match:
+  print
+  print 'Output didn\'t match reference'
+  sys.exit(1)
+# Test passed
diff --git a/tests/kernels/wait_event/wait_event_chained.cl b/tests/kernels/wait_event/wait_event_chained.cl
new file mode 100644
index 0000000..1b86f8f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.cl
@@ -0,0 +1,13 @@
+kernel void wait_event_chained(global int *data, local int *scratch)
+  event_t event;
+  event = async_work_group_copy(scratch, data, 1, 0);
+  for (int i = 1; i < 4; i++)
+  {
+    async_work_group_copy(scratch+i, data+i, 1, event);
+  }
+  wait_group_events(1, &event);
+  int i = get_local_id(0);
+  data[get_local_size(0)-i-1] = scratch[i];
diff --git a/tests/kernels/wait_event/wait_event_chained.ref b/tests/kernels/wait_event/wait_event_chained.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/wait_event/wait_event_chained.sim b/tests/kernels/wait_event/wait_event_chained.sim
new file mode 100644
index 0000000..c865d3c
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_chained.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/wait_event/wait_event_divergent.cl b/tests/kernels/wait_event/wait_event_divergent.cl
new file mode 100644
index 0000000..d88f3f3
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.cl
@@ -0,0 +1,11 @@
+kernel void wait_event_divergent(global int *data, local int *scratch)
+  int i = get_local_id(0);
+  event_t events[2];
+  events[0] = async_work_group_copy(scratch, data, 1, 0);
+  events[1] = async_work_group_copy(scratch+1, data+1, 1, 0);
+  wait_group_events(1, events+i);
+  data[get_local_size(0)-i-1] = scratch[i];
diff --git a/tests/kernels/wait_event/wait_event_divergent.ref b/tests/kernels/wait_event/wait_event_divergent.ref
new file mode 100644
index 0000000..56f64ac
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.ref
@@ -0,0 +1,6 @@
+Argument 'data': 8 bytes
+  data[0] = 0
+  data[1] = 0
diff --git a/tests/kernels/wait_event/wait_event_divergent.sim b/tests/kernels/wait_event/wait_event_divergent.sim
new file mode 100644
index 0000000..da1eb99
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_divergent.sim
@@ -0,0 +1,7 @@
+2 1 1
+2 1 1
+<size=8 range=0:1:1 dump>
diff --git a/tests/kernels/wait_event/wait_event_duplicates.cl b/tests/kernels/wait_event/wait_event_duplicates.cl
new file mode 100644
index 0000000..a625cc5
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.cl
@@ -0,0 +1,13 @@
+kernel void wait_event_duplicates(global int *data, local int *scratch)
+  event_t events[4];
+  events[0] = async_work_group_copy(scratch, data, 1, 0);
+  events[1] = events[0];
+  events[2] = async_work_group_copy(scratch+1, data+1, 3, 0);
+  events[3] = events[0];
+  wait_group_events(4, events);
+  int i = get_local_id(0);
+  data[get_local_size(0)-i-1] = scratch[i];
diff --git a/tests/kernels/wait_event/wait_event_duplicates.ref b/tests/kernels/wait_event/wait_event_duplicates.ref
new file mode 100644
index 0000000..cf0b04f
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.ref
@@ -0,0 +1,7 @@
+Argument 'data': 16 bytes
+  data[0] = 3
+  data[1] = 2
+  data[2] = 1
+  data[3] = 0
diff --git a/tests/kernels/wait_event/wait_event_duplicates.sim b/tests/kernels/wait_event/wait_event_duplicates.sim
new file mode 100644
index 0000000..39ea9b9
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_duplicates.sim
@@ -0,0 +1,7 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>
diff --git a/tests/kernels/wait_event/wait_event_invalid.cl b/tests/kernels/wait_event/wait_event_invalid.cl
new file mode 100644
index 0000000..239530e
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.cl
@@ -0,0 +1,5 @@
+kernel void wait_event_invalid(global int *data)
+  event_t event = 42;
+  wait_group_events(1, &event);
diff --git a/tests/kernels/wait_event/wait_event_invalid.ref b/tests/kernels/wait_event/wait_event_invalid.ref
new file mode 100644
index 0000000..4da13c4
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.ref
@@ -0,0 +1,8 @@
+Argument 'data': 16 bytes
+  data[0] = 0
+  data[1] = 1
+  data[2] = 2
+  data[3] = 3
diff --git a/tests/kernels/wait_event/wait_event_invalid.sim b/tests/kernels/wait_event/wait_event_invalid.sim
new file mode 100644
index 0000000..fdfff51
--- /dev/null
+++ b/tests/kernels/wait_event/wait_event_invalid.sim
@@ -0,0 +1,6 @@
+4 1 1
+4 1 1
+<size=16 range=0:1:3 dump>

Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/oclgrind.git

More information about the Pkg-opencl-devel mailing list