[Pkg-opencl-devel] [beignet] 28/66: Imported Upstream version 0.1+git20130614+89b5e40

Fri Oct 31 07:27:04 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit dd82e26e135a9826da498792fb57927b45f7ef41
Author: Simon Richter <sjr at debian.org>
Date:   Fri Jun 14 15:22:18 2013 +0200

    Imported Upstream version 0.1+git20130614+89b5e40
---
 CMake/FindLLVM.cmake                               |  31 +-
 CMakeLists.txt                                     |   2 +-
 README.html                                        |  31 +-
 README.md                                          |  33 +-
 backend/doc/TODO.html                              |   7 +-
 backend/doc/TODO.md                                |  12 +-
 backend/src/CMakeLists.txt                         |   1 +
 backend/src/backend/context.cpp                    |   8 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |  44 ++-
 backend/src/backend/gen_context.cpp                |   4 +
 backend/src/backend/gen_encoder.cpp                |  39 +-
 backend/src/backend/gen_insn_selection.cpp         |  35 +-
 backend/src/backend/gen_insn_selection.hpp         |   8 +-
 backend/src/backend/gen_reg_allocation.cpp         |   7 +-
 backend/src/backend/program.cpp                    | 137 ++++++-
 backend/src/backend/program.h                      |   1 +
 backend/src/ir/image.cpp                           |  14 +-
 backend/src/ir/instruction.hpp                     |  10 +-
 backend/src/ir/profile.cpp                         |   1 +
 backend/src/ir/profile.hpp                         |   3 +-
 backend/src/ir/unit.cpp                            |  15 +-
 backend/src/ir/unit.hpp                            |   7 +
 backend/src/llvm/llvm_gen_backend.cpp              | 106 +++--
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 backend/src/llvm/llvm_passes.cpp                   |  20 +-
 backend/src/llvm/llvm_scalarize.cpp                |  40 +-
 backend/src/llvm/llvm_to_gen.cpp                   |  13 +-
 backend/src/ocl_stdlib.h                           | 433 ++++++++++++---------
 kernels/buildin_work_dim.cl                        |   3 +
 kernels/compiler_box_blur.cl                       |  51 +--
 kernels/compiler_box_blur_image.cl                 |  18 +
 kernels/compiler_box_blur_ref.bmp                  | Bin 49206 -> 49206 bytes
 kernels/compiler_clod.cl                           |   4 +-
 kernels/compiler_displacement_map_element.cl       |  11 +
 kernels/compiler_global_memory_barrier.cl          |   7 +
 kernels/compiler_group_size.cl                     |  12 +
 kernels/compiler_julia.cl                          |   4 +-
 kernels/compiler_julia_no_break.cl                 |   4 +-
 kernels/compiler_local_memory.cl                   |   5 -
 kernels/compiler_local_memory_two_ptr.cl           |   1 +
 kernels/compiler_mandelbrot.cl                     |   6 +-
 kernels/compiler_mandelbrot_alternate.cl           |   6 +-
 kernels/compiler_menger_sponge_no_shadow.cl        |  14 +-
 kernels/compiler_ribbon.cl                         | 177 +++++----
 kernels/compiler_vector_load_store.cl              |  52 ++-
 kernels/compiler_write_only_bytes.cl               |  13 +-
 kernels/compiler_write_only_shorts.cl              |  13 +-
 kernels/null_kernel_arg.cl                         |   9 +
 kernels/test_cl_finish.cl                          |  12 +
 kernels/test_copy_buffer.cl                        |  13 +-
 kernels/test_copy_buffer_row.cl                    |  17 +-
 kernels/test_get_image_info.cl                     |  13 +
 kernels/test_get_image_size.cl                     |   9 -
 kernels/test_write_only.cl                         |  13 +-
 src/cl_api.c                                       | 244 +++++++++---
 src/cl_command_queue.c                             |  22 +-
 src/cl_command_queue.h                             |   2 +-
 src/cl_command_queue_gen7.c                        |  15 +-
 src/cl_context.c                                   |  27 +-
 src/cl_device_id.c                                 |  34 +-
 src/cl_driver.h                                    |   4 +
 src/cl_driver_defs.c                               |   1 +
 src/cl_kernel.c                                    |  63 ++-
 src/cl_khr_icd.c                                   |   2 +-
 src/cl_mem.c                                       | 129 ++----
 src/cl_platform_id.c                               |  19 +-
 src/cl_program.c                                   |   7 -
 src/intel/intel_batchbuffer.c                      |   3 +
 src/intel/intel_batchbuffer.h                      |   2 +
 src/intel/intel_gpgpu.c                            |  31 +-
 utests/CMakeLists.txt                              |  11 +-
 utests/buildin_work_dim.cpp                        |  37 ++
 utests/compiler_box_blur_image.cpp                 |  52 +++
 utests/compiler_cl_finish.cpp                      |  50 +++
 utests/compiler_displacement_map_element.cpp       |  64 +++
 ..._image_size.cpp => compiler_get_image_info.cpp} |  25 +-
 utests/compiler_global_memory_barrier.cpp          |  28 ++
 utests/compiler_group_size.cpp                     |  86 ++++
 utests/compiler_local_memory.cpp                   |  47 ---
 utests/compiler_vector_load_store.cpp              |  57 ++-
 utests/runtime_null_kernel_arg.cpp                 |  27 ++
 utests/utest_helper.hpp                            |  11 +
 82 files changed, 1772 insertions(+), 881 deletions(-)

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index c06b8a4..e76ab42 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -7,9 +7,9 @@
 # LLVM_MODULE_LIBS - list of llvm libs for working with modules.
 # LLVM_FOUND       - True if llvm found.
 if (LLVM_INSTALL_DIR)
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
-else (LLVM_INSTALL_DIR)                                                                
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config DOC "llvm-config executable")
+  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+else (LLVM_INSTALL_DIR)
+  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config DOC "llvm-config executable")
 endif (LLVM_INSTALL_DIR)
 
 if (LLVM_CONFIG_EXECUTABLE)
@@ -66,3 +66,28 @@ execute_process(
   OUTPUT_VARIABLE LLVM_MODULE_LIBS
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
+
+macro(add_one_lib name)
+  FIND_LIBRARY(CLANG_LIB
+    NAMES ${name}
+    PATHS ${LLVM_LIBRARY_DIR} )
+  set(CLANG_LIBRARIES ${CLANG_LIBRARIES} ${CLANG_LIB})
+	unset(CLANG_LIB CACHE)
+endmacro()
+
+#Assume clang lib path same as llvm lib path
+add_one_lib("clangFrontend")
+add_one_lib("clangSerialization")
+add_one_lib("clangDriver")
+add_one_lib("clangCodeGen")
+add_one_lib("clangSema")
+add_one_lib("clangStaticAnalyzerFrontend")
+add_one_lib("clangStaticAnalyzerCheckers")
+add_one_lib("clangStaticAnalyzerCore")
+add_one_lib("clangAnalysis")
+add_one_lib("clangEdit")
+add_one_lib("clangAST")
+add_one_lib("clangParse")
+add_one_lib("clangSema")
+add_one_lib("clangLex")
+add_one_lib("clangBasic")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d0d291..c6a5d49 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3
 
 # Front end stuff we need
 #INCLUDE(CMake/FindLLVM.cmake)
-Find_Package(LLVM 3.2)
+Find_Package(LLVM 3.1)
 
 # XLib
 Find_Package(X11)
diff --git a/README.html b/README.html
index c5fcba8..4136d82 100644
--- a/README.html
+++ b/README.html
@@ -10,12 +10,7 @@ about the compiler, please refer to <code>backend/README.md</code></p>
 
 <h2>How to build</h2>
 
-<p>There are two ways to build Beignet.</p>
-
-<p>The first one uses a simple Makefile. Just type <code>make</code> and the project will
-build if everything is properly installed.</p>
-
-<p>The project also uses CMake with three profiles:</p>
+<p>The project uses CMake with three profiles:</p>
 
 <ol>
 <li>Debug (-g)</li>
@@ -91,12 +86,8 @@ are with the run-time in <code>./kernels</code>.</p>
 
 <h2>Supported Hardware</h2>
 
-<p>As an important remark, the code was only tested on IVB GT2 with a rather
-minimal Linux distribution (ArchLinux) and a very small desktop (dwm). If you
-use something more sophisticated using compiz or similar stuffs, you may expect
-serious problems and GPU hangs.</p>
-
-<p>Only IVB is supported right now. Actually, the code was only run on IVB GT2. You
+<p>The code was tested on IVB GT2 with ubuntu and fedora core distribution.
+Currently Only IVB is supported right now. Actually, the code was only run on IVB GT2. You
 may expect some issues with IVB GT1.</p>
 
 <h2>TODO</h2>
@@ -106,8 +97,6 @@ together to test and develop the OpenCL compiler. A partial list of things to
 do:</p>
 
 <ul>
-<li><p>Support dynamic sampler assignment in kernel. We now only support user to use
-clCreateSampler at host side and then pass the sampler to kernel.</p></li>
 <li><p>Complete cl_khr_gl_sharing support. We lack of some APIs implementation such
 as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
 the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D.</p></li>
@@ -121,14 +110,13 @@ implement those Enqueue*Buffer functions. </p></li>
 expensive pipe controls are issued for each batch buffer</p></li>
 <li><p>Valgrind reports some leaks in libdrm. It sounds like a false positive but it
 has to be checked. Idem for LLVM. There is one leak here to check.</p></li>
-<li><p>Support image/samplers in C++ simulator.</p></li>
 </ul>
 
 <p>More generally, everything in the run-time that triggers the "FATAL" macro means
 that something that must be supported is not implemented properly (either it
 does not comply with the standard or it is just missing)</p>
 
-<p>Project repository</p>
+<h2>Project repository</h2>
 
 <p>Right now, we host our project on fdo at: git://anongit.freedesktop.org/beignet.</p>
 
@@ -136,6 +124,11 @@ does not comply with the standard or it is just missing)</p>
 
 <p>This project was created by Ben Segovia when he was working for Intel. Now we
 have a team in China OTC graphics department continue to work on this project.
-We haven't set up a public mail list for this project, but we will do so in
-the near furture. Before that, the contact is as below:
-Zou Nanhai (<a href="&#x6D;&#x61;&#x69;l&#x74;o:&#x6E;a&#x6E;h&#x61;i&#x2E;z&#x6F;&#x75;@&#x69;&#x6E;&#x74;e&#x6C;&#x2E;&#x63;om">&#x6E;a&#x6E;h&#x61;i&#x2E;z&#x6F;&#x75;@&#x69;&#x6E;&#x74;e&#x6C;&#x2E;&#x63;om</a>).</p>
+The official contact for this project is: Zou Nanhai (<a href="mail&#x74;o:n&#x61;n&#x68;a&#x69;.&#x7A;&#x6F;&#x75;@i&#x6E;&#x74;e&#x6C;&#x2E;c&#x6F;m">n&#x61;n&#x68;a&#x69;.&#x7A;&#x6F;&#x75;@i&#x6E;&#x74;e&#x6C;&#x2E;c&#x6F;m</a>).</p>
+
+<h2>How to contribute</h2>
+
+<p>You are always welcome to contribute to this project, just need to subscribe
+to the beignet mail list and send patches to it for review.
+The official mail list is as below:
+http://lists.freedesktop.org/mailman/listinfo/beignet</p>
diff --git a/README.md b/README.md
index e76aba1..6e74112 100644
--- a/README.md
+++ b/README.md
@@ -12,12 +12,7 @@ about the compiler, please refer to `backend/README.md`
 How to build
 ------------
 
-There are two ways to build Beignet.
-
-The first one uses a simple Makefile. Just type `make` and the project will
-build if everything is properly installed.
-
-The project also uses CMake with three profiles:
+The project uses CMake with three profiles:
 
 1. Debug (-g)
 2. RelWithDebInfo (-g with optimizations)
@@ -91,12 +86,8 @@ will only run `some_unit_test0` and `some_unit_test1` tests
 Supported Hardware
 ------------------
 
-As an important remark, the code was only tested on IVB GT2 with a rather
-minimal Linux distribution (ArchLinux) and a very small desktop (dwm). If you
-use something more sophisticated using compiz or similar stuffs, you may expect
-serious problems and GPU hangs.
-
-Only IVB is supported right now. Actually, the code was only run on IVB GT2. You
+The code was tested on IVB GT2 with ubuntu and fedora core distribution.
+Currently Only IVB is supported right now. Actually, the code was only run on IVB GT2. You
 may expect some issues with IVB GT1.
 
 TODO
@@ -106,9 +97,6 @@ The run-time is far from being complete. Most of the pieces have been put
 together to test and develop the OpenCL compiler. A partial list of things to
 do:
 
-- Support dynamic sampler assignment in kernel. We now only support user to use
-  clCreateSampler at host side and then pass the sampler to kernel.
-
 - Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
   as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
   the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D.
@@ -128,20 +116,23 @@ do:
 - Valgrind reports some leaks in libdrm. It sounds like a false positive but it
   has to be checked. Idem for LLVM. There is one leak here to check.
 
-- Support image/samplers in C++ simulator.
-
 More generally, everything in the run-time that triggers the "FATAL" macro means
 that something that must be supported is not implemented properly (either it
 does not comply with the standard or it is just missing)
 
 Project repository
-
+------------------
 Right now, we host our project on fdo at: git://anongit.freedesktop.org/beignet.
 
 The team
 --------
 This project was created by Ben Segovia when he was working for Intel. Now we
 have a team in China OTC graphics department continue to work on this project.
-We haven't set up a public mail list for this project, but we will do so in
-the near furture. Before that, the contact is as below:
-Zou Nanhai (<nanhai.zou at intel.com>).
+The official contact for this project is: Zou Nanhai (<nanhai.zou at intel.com>).
+
+How to contribute
+-----------------
+You are always welcome to contribute to this project, just need to subscribe
+to the beignet mail list and send patches to it for review.
+The official mail list is as below:
+http://lists.freedesktop.org/mailman/listinfo/beignet
diff --git a/backend/doc/TODO.html b/backend/doc/TODO.html
index 27d1d81..36c2951 100644
--- a/backend/doc/TODO.html
+++ b/backend/doc/TODO.html
@@ -26,10 +26,10 @@ many things must be implemented:</p>
 <code>mad</code>, atomic operations, barriers...).</p></li>
 <li><p>Lowering down of int16 / int8 / float16 / char16 / char8 / char4 loads and
 stores into the supported loads and stores</p></li>
-<li><p>Support for constant buffers declared in the OpenCL source file</p></li>
 <li><p>Support for local declaration of local array (the OpenCL profile will properly
 declare them as global arrays)</p></li>
 <li><p>Support for doubles</p></li>
+<li><p>Support atomic extensions.</p></li>
 <li><p>Better resolving of the PHI functions. Today, we always generate MOV
 instructions at the end of each basic block . They can be easily optimized.</p></li>
 </ul>
@@ -40,9 +40,9 @@ instructions at the end of each basic block . They can be easily optimized.</p><
 
 <ul>
 <li><p>Bringing support for doubles</p></li>
+<li><p>Adding support for atomic extensions.</p></li>
 <li><p>Finishing the handling of function arguments (see the <a href="gen_ir.html">IR
 description</a> for more details)</p></li>
-<li><p>Adding support for constant data per unit</p></li>
 <li><p>Adding support for linking IR units together. OpenCL indeed allows to create
 programs from several sources</p></li>
 <li><p>Uniform analysys. This is a major performance improvement. A "uniform" value
@@ -63,11 +63,12 @@ allocation.</p></li>
 
 <ul>
 <li><p>Implementing support for doubles</p></li>
+<li><p>Implementing atomic extensions.</p></li>
 <li><p>Implementing register spilling (see the <a href="./compiler_backend.html">compiler backend
 description</a> for more details)</p></li>
 <li><p>Implementing proper instruction selection. A "simple" tree matching algorithm
 should provide good results for Gen</p></li>
-<li><p>Implementing the instruction scheduling pass</p></li>
+<li><p>Improving the instruction scheduling pass</p></li>
 </ul>
 
 <h2>General plumbing</h2>
diff --git a/backend/doc/TODO.md b/backend/doc/TODO.md
index 60d6c09..584dec5 100644
--- a/backend/doc/TODO.md
+++ b/backend/doc/TODO.md
@@ -31,13 +31,13 @@ many things must be implemented:
 - Lowering down of int16 / int8 / float16 / char16 / char8 / char4 loads and
   stores into the supported loads and stores
 
-- Support for constant buffers declared in the OpenCL source file
-
 - Support for local declaration of local array (the OpenCL profile will properly
   declare them as global arrays)
 
 - Support for doubles
 
+- Support atomic extensions.
+
 - Better resolving of the PHI functions. Today, we always generate MOV
   instructions at the end of each basic block . They can be easily optimized.
 
@@ -48,11 +48,11 @@ The code is defined in `src/ir`. Main things to do are:
 
 - Bringing support for doubles
 
+- Adding support for atomic extensions.
+
 - Finishing the handling of function arguments (see the [IR
   description](gen_ir.html) for more details)
 
-- Adding support for constant data per unit
-
 - Adding support for linking IR units together. OpenCL indeed allows to create
   programs from several sources
 
@@ -75,13 +75,15 @@ The code is defined in `src/backend`. Main things to do are:
 
 - Implementing support for doubles
 
+- Implementing atomic extensions.
+
 - Implementing register spilling (see the [compiler backend
   description](./compiler_backend.html) for more details)
 
 - Implementing proper instruction selection. A "simple" tree matching algorithm
   should provide good results for Gen
 
-- Implementing the instruction scheduling pass
+- Improving the instruction scheduling pass
 
 General plumbing
 ----------------
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 183517a..a0fe198 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -116,6 +116,7 @@ target_link_libraries(
                       ${DRM_INTEL_LIBRARY}
                       ${DRM_LIBRARY}
                       ${OPENGL_LIBRARIES}
+                      ${CLANG_LIBRARIES}
                       ${LLVM_MODULE_LIBS}
                       ${CMAKE_THREAD_LIBS_INIT}
                       ${CMAKE_DL_LIBS})
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 474c36a..48160de 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -271,7 +271,7 @@ namespace gbe
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
-  IVAR(OCL_SIMD_WIDTH, 8, 16, 32);
+  IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
 
   Context::Context(const ir::Unit &unit, const std::string &name) :
     unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL)
@@ -280,12 +280,12 @@ namespace gbe
     this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
     this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
     this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
-    if (fn.getSimdWidth() == 0)
+    if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
-
   }
+
   Context::~Context(void) {
     GBE_SAFE_DELETE(this->partitioner);
     GBE_SAFE_DELETE(this->dag);
@@ -430,6 +430,7 @@ namespace gbe
         INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
         INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
         INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
+        INSERT_REG(workdim, WORK_DIM, 1)
         INSERT_REG(numgroup0, GROUP_NUM_X, 1)
         INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
         INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
@@ -621,6 +622,7 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
+        reg == ir::ocl::workdim   ||
         reg == ir::ocl::constoffst)
       return true;
     return false;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 420cd62..17fc845 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -253,6 +253,7 @@ static const char *reg_encoding[8] = {
   [3] = "W",
   [4] = "UB",
   [5] = "B",
+  [6] = "DF",
   [7] = "F"
 };
 
@@ -263,6 +264,7 @@ int reg_type_size[8] = {
   [3] = 2,
   [4] = 1,
   [5] = 1,
+  [6] = 8,
   [7] = 4
 };
 
@@ -356,6 +358,33 @@ static const char *math_precision[2] = {
   [1] = "partial_precision"
 };
 
+static const char *data_port_data_cache_simd_mode[] = {
+  "SIMD4x2",
+  "SIMD16",
+  "SIMD8",
+};
+
+static const char *data_port_data_cache_category[] = {
+  "legacy",
+  "scratch",
+};
+
+static const char *data_port_data_cache_msg_type[] = {
+  [0] = "OWord Block Read",
+  [1] = "Unaligned OWord Block Read",
+  [2] = "OWord Dual Block Read",
+  [3] = "DWord Scattered Read",
+  [4] = "Byte Scattered Read",
+  [5] = "Untyped Surface Read",
+  [6] = "Untyped Atomic Operation",
+  [7] = "Memory Fence",
+  [8] = "OWord Block Write",
+  [10] = "OWord Dual Block Write",
+  [11] = "DWord Scattered Write",
+  [12] = "Byte Scattered Write",
+  [13] = "Untyped Surface Write",
+};
+
 static int column;
 
 static int string (FILE *file, const char *string)
@@ -816,25 +845,25 @@ static int src2_3src (FILE *file, const struct GenInstruction *inst)
 static int imm (FILE *file, uint32_t type, const struct GenInstruction *inst) {
   switch (type) {
     case GEN_TYPE_UD:
-      format (file, "0x%08xUD", inst->bits3.ud);
+      format (file, "0x%xUD", inst->bits3.ud);
       break;
     case GEN_TYPE_D:
       format (file, "%dD", inst->bits3.d);
       break;
     case GEN_TYPE_UW:
-      format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+      format (file, "0x%xUW", (uint16_t) inst->bits3.ud);
       break;
     case GEN_TYPE_W:
       format (file, "%dW", (int16_t) inst->bits3.d);
       break;
     case GEN_TYPE_UB:
-      format (file, "0x%02xUB", (int8_t) inst->bits3.ud);
+      format (file, "0x%xUB", (int8_t) inst->bits3.ud);
       break;
     case GEN_TYPE_VF:
       format (file, "Vector Float");
       break;
     case GEN_TYPE_V:
-      format (file, "0x%08xV", inst->bits3.ud);
+      format (file, "0x%xV", inst->bits3.ud);
       break;
     case GEN_TYPE_F:
       format (file, "%-gF", inst->bits3.f);
@@ -1122,11 +1151,12 @@ int gen_disasm (FILE *file, const void *opaque_insn)
                 inst->bits3.sampler_gen7.simd_mode);
         break;
       case GEN_SFID_DATAPORT_DATA_CACHE:
-        format (file, " (%d, %d, %d, %d)",
+        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
                 inst->bits3.gen7_untyped_rw.bti,
                 inst->bits3.gen7_untyped_rw.rgba,
-                inst->bits3.gen7_untyped_rw.simd_mode,
-                inst->bits3.gen7_untyped_rw.msg_type);
+                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
         break;
       case GEN_SFID_MESSAGE_GATEWAY:
         format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 18f6c11..055c8fc 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -72,6 +72,10 @@ namespace gbe
       }
       p->pop();
     }
+    /* per spec, pad the instruction stream with 8 nop to avoid
+	instruction prefetcher prefetch into an invalide page */
+    for(int i = 0; i < 8; i++)
+	p->NOP();
   }
 
   void GenContext::patchBranches(void) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index d6c34fb..b65cc94 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -21,7 +21,7 @@
  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
  develop this 3D driver.
- 
+
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
@@ -29,11 +29,11 @@
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:
- 
+
  The above copyright notice and this permission notice (including the
  next paragraph) shall be included in all copies or substantial
  portions of the Software.
- 
+
  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -41,7 +41,7 @@
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
+
  **********************************************************************/
  /*
   * Authors:
@@ -334,7 +334,7 @@ namespace gbe
          insn->bits3.da16.src1_reg_nr = reg.nr;
        }
 
-       if (reg.width == GEN_WIDTH_1 && 
+       if (reg.width == GEN_WIDTH_1 &&
            insn->header.execution_size == GEN_WIDTH_1) {
          insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
          insn->bits3.da1.src1_width = GEN_WIDTH_1;
@@ -802,20 +802,21 @@ namespace gbe
 
      if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) {
-        assert(insn->header.execution_size == GEN_WIDTH_16);
-        insn->header.execution_size = GEN_WIDTH_8;
+        if(insn->header.execution_size == GEN_WIDTH_16) {
+          GenInstruction *insn2 = this->next(GEN_OPCODE_MATH);
+          GenRegister new_dest, new_src0, new_src1;
+          new_dest = GenRegister::QnPhysical(dst, 1);
+          new_src0 = GenRegister::QnPhysical(src0, 1);
+          new_src1 = GenRegister::QnPhysical(src1, 1);
+          insn2->header.destreg_or_condmod = function;
+          this->setHeader(insn2);
+          insn2->header.execution_size = GEN_WIDTH_8;
+          this->setDst(insn2, new_dest);
+          this->setSrc0(insn2, new_src0);
+          this->setSrc1(insn2, new_src1);
+        }
 
-        GenInstruction *insn2 = this->next(GEN_OPCODE_MATH);
-        GenRegister new_dest, new_src0, new_src1;
-        new_dest = GenRegister::QnPhysical(dst, 1);
-        new_src0 = GenRegister::QnPhysical(src0, 1);
-        new_src1 = GenRegister::QnPhysical(src1, 1);
-        insn2->header.destreg_or_condmod = function;
-        this->setHeader(insn2);
-        insn2->header.execution_size = GEN_WIDTH_8;
-        this->setDst(insn2, new_dest);
-        this->setSrc0(insn2, new_src0);
-        this->setSrc1(insn2, new_src1);
+        insn->header.execution_size = GEN_WIDTH_8;
      }
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 08bc6af..88f9e94 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1429,7 +1429,10 @@ namespace gbe
 
         // Right part of the 16-wide register now
         if (simdWidth == 16) {
+          int predicate = sel.curr.predicate;
+          int noMask = sel.curr.noMask;
           sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
           const GenRegister nextSrc0 = sel.selRegQn(insn.getSrc(0), 1, TYPE_S32);
           const GenRegister nextSrc1 = sel.selRegQn(insn.getSrc(1), 1, TYPE_S32);
           sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
@@ -1437,11 +1440,15 @@ namespace gbe
           sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
           sel.curr.accWrEnable = 0;
           sel.curr.quarterControl = GEN_COMPRESSION_Q2;
-          const ir::Register reg = sel.reg(FAMILY_DWORD);
-          sel.MOV(GenRegister::f8grf(reg), GenRegister::acc());
-          sel.curr.noMask = 0;
-          sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F),
-                  GenRegister::f8grf(reg));
+          if (predicate != GEN_PREDICATE_NONE || noMask != 1) {
+            const ir::Register reg = sel.reg(FAMILY_DWORD);
+            sel.MOV(GenRegister::f8grf(reg), GenRegister::acc());
+            sel.curr.noMask = noMask;;
+            sel.curr.predicate = predicate;
+            sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F),
+                    GenRegister::f8grf(reg));
+          } else
+            sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc());
         }
 
         sel.pop();
@@ -1665,14 +1672,13 @@ namespace gbe
 
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
+                        const uint32_t elemSize,
                         GenRegister address,
                         GenRegister value,
                         uint32_t bti) const
     {
       using namespace ir;
       GBE_ASSERT(insn.getValueNum() == 1);
-      const Type type = insn.getValueType();
-      const uint32_t elemSize = getByteScatterGatherSize(type);
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
 
       // We need a temporary register if we read bytes or words
@@ -1711,13 +1717,15 @@ namespace gbe
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL);
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
+      const Type type = insn.getValueType();
+      const uint32_t elemSize = getByteScatterGatherSize(type);
       if (insn.getAddressSpace() == MEM_CONSTANT)
         this->emitIndirectMove(sel, insn, address);
-      else if (insn.isAligned() == true)
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
         const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteGather(sel, insn, address, value, space == MEM_LOCAL ? 0xfe : 0x01);
+        this->emitByteGather(sel, insn, elemSize, address, value, space == MEM_LOCAL ? 0xfe : 0x01);
       }
       return true;
     }
@@ -1745,13 +1753,12 @@ namespace gbe
 
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
+                         const uint32_t elemSize,
                          GenRegister addr,
                          GenRegister value,
                          uint32_t bti) const
     {
       using namespace ir;
-      const Type type = insn.getValueType();
-      const uint32_t elemSize = getByteScatterGatherSize(type);
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const GenRegister dst = value;
 
@@ -1771,12 +1778,14 @@ namespace gbe
       using namespace ir;
       const AddressSpace space = insn.getAddressSpace();
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
-      if (insn.isAligned() == true)
+      const Type type = insn.getValueType();
+      const uint32_t elemSize = getByteScatterGatherSize(type);
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, bti);
       else {
         const GenRegister address = sel.selReg(insn.getAddress());
         const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteScatter(sel, insn, address, value, bti);
+        this->emitByteScatter(sel, insn, elemSize, address, value, bti);
       }
       return true;
     }
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index f6735c2..778eb1f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -84,8 +84,8 @@ namespace gbe
     const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
     /*! Damn C++ */
     const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
-    /*! No more than 6 sources (used by typed writes) */
-    enum { MAX_SRC_NUM = 16 };
+    /*! No more than 17 sources (used by typed writes on simd8 mode.) */
+    enum { MAX_SRC_NUM = 17 };
     /*! No more than 4 destinations (used by samples and untyped reads) */
     enum { MAX_DST_NUM = 4 };
     /*! State of the instruction (extra fields neeed for the encoding) */
@@ -111,9 +111,9 @@ namespace gbe
     /*! Gen opcode */
     uint8_t opcode;
     /*! Number of destinations */
-    uint8_t dstNum:4;
+    uint8_t dstNum:3;
     /*! Number of sources */
-    uint8_t srcNum:4;
+    uint8_t srcNum:5;
     /*! To store various indices */
     uint16_t index;
     /*! Variable sized. Destinations and sources go here */
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 8c9f358..9765b02 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -270,7 +270,11 @@ namespace gbe
         return false;
       auto it = RA.find(reg);
       GBE_ASSERT(it != RA.end());
-
+      // offset less than 32 means it is not managed by our reg allocator.
+      if (it->second < 32) {
+        this->expiringID++;
+        continue;
+      }
       // Case 1 - it does not belong to a vector. Just remove it
       if (vectorMap.contains(reg) == false) {
         ctx.deallocate(it->second);
@@ -518,6 +522,7 @@ namespace gbe
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
+    allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index c46c681..e41e5b6 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -46,6 +46,27 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/CompilerInvocation.h>
+#if LLVM_VERSION_MINOR <= 1
+#include <clang/Frontend/DiagnosticOptions.h>
+#else
+#include <clang/Basic/DiagnosticOptions.h>
+#endif  /* LLVM_VERSION_MINOR <= 1 */
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/Basic/TargetOptions.h>
+#include <llvm/ADT/IntrusiveRefCntPtr.h>
+#include <llvm/ADT/OwningPtr.h>
+#if LLVM_VERSION_MINOR <= 2
+#include <llvm/Module.h>
+#else
+#include <llvm/IR/Module.h>
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Support/raw_ostream.h>
+
 namespace gbe {
 
   Kernel::Kernel(const std::string &name) :
@@ -104,6 +125,101 @@ namespace gbe {
     GBE_SAFE_DELETE(program);
   }
 
+  static void buildModuleFromSource(const char* input, const char* output, std::string options) {
+    // Arguments to pass to the clang frontend
+    vector<const char *> args;
+    bool bOpt = true;
+
+    vector<std::string> useless; //hold substrings to avoid c_str free
+    size_t start = 0, end = 0;
+    /* clang unsupport options:
+       -cl-denorms-are-zero, -cl-strict-aliasing
+       -cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
+       all support options, refer to clang/include/clang/Driver/Options.inc
+       Maybe can filter these options to avoid warning
+    */
+    while (end != std::string::npos) {
+      end = options.find(' ', start);
+      std::string str = options.substr(start, end - start);
+      if(str.size() == 0)
+        continue;
+      if(str == "-cl-opt-disable") bOpt = false;
+      useless.push_back(str);
+      args.push_back(str.c_str());
+      start = end + 1;
+    }
+
+    args.push_back("-emit-llvm");
+    if(bOpt)  args.push_back("-O3");
+#if LLVM_VERSION_MINOR <= 2
+    args.push_back("-triple");
+    args.push_back("nvptx");
+#else
+    args.push_back("-x");
+    args.push_back("cl");
+    args.push_back("-triple");
+    args.push_back("spir");
+#endif /* LLVM_VERSION_MINOR <= 2 */
+    args.push_back(input);
+
+    // The compiler invocation needs a DiagnosticsEngine so it can report problems
+#if LLVM_VERSION_MINOR <= 1
+    args.push_back("-triple");
+    args.push_back("ptx32");
+
+    clang::TextDiagnosticPrinter *DiagClient =
+                             new clang::TextDiagnosticPrinter(llvm::errs(), clang::DiagnosticOptions());
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+    clang::DiagnosticsEngine Diags(DiagID, DiagClient);
+#else
+    args.push_back("-ffp-contract=off");
+
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
+    clang::TextDiagnosticPrinter *DiagClient =
+                             new clang::TextDiagnosticPrinter(llvm::errs(), &*DiagOpts);
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+    clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+    // Create the compiler invocation
+    llvm::OwningPtr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
+    clang::CompilerInvocation::CreateFromArgs(*CI,
+                                              &args[0],
+                                              &args[0] + args.size(),
+                                              Diags);
+
+    // Create the compiler instance
+    clang::CompilerInstance Clang;
+    Clang.setInvocation(CI.take());
+    // Get ready to report problems
+#if LLVM_VERSION_MINOR <= 2
+    Clang.createDiagnostics(args.size(), &args[0]);
+#else
+    Clang.createDiagnostics();
+#endif /* LLVM_VERSION_MINOR <= 2 */
+    if (!Clang.hasDiagnostics())
+      return;
+
+    // Set Language
+    clang::LangOptions & lang_opts = Clang.getLangOpts();
+    lang_opts.OpenCL = 1;
+
+    // Create an action and make the compiler instance carry it out
+    llvm::OwningPtr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction());
+    if (!Clang.ExecuteAction(*Act))
+      return;
+
+    llvm::Module *module = Act->takeModule();
+
+    std::string ErrorInfo;
+    llvm::raw_fd_ostream OS(output, ErrorInfo,llvm::raw_fd_ostream::F_Binary);
+    //still write to temp file for code simply, otherwise need add another function.
+    //because gbe_program_new_from_llvm also be used by cl_program_create_from_llvm, can't be removed
+    //TODO: Pass module to llvmToGen, if use module, should return Act and use OwningPtr out of this funciton
+    llvm::WriteBitcodeToFile(module, OS);
+    OS.close();
+  }
+
   extern std::string ocl_stdlib_str;
   extern std::string ocl_common_defines_str;
   static gbe_program programNewFromSource(const char *source,
@@ -124,26 +240,7 @@ namespace gbe {
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
 
-    // Now compile the code to llvm using clang
-#if LLVM_VERSION_MINOR <= 1
-    std::string compileCmd = "clang -x cl -fno-color-diagnostics -emit-llvm -O3 -ccc-host-triple ptx32 -c ";
-#else
-    std::string compileCmd = "clang -ffp-contract=off -emit-llvm -O3 -target nvptx -x cl -c ";
-#endif /* LLVM_VERSION_MINOR <= 1 */
-    compileCmd += clName;
-    compileCmd += " ";
-    if(options)
-      compileCmd += options;
-    compileCmd += " -o ";
-    compileCmd += llName;
-
-    // Open a pipe and compile from here. Using Clang API instead is better
-    FILE *pipe = popen(compileCmd.c_str(), "r");
-    FATAL_IF (pipe == NULL, "Unable to run extern compilation command");
-    char msg[256];
-    while (fgets(msg, sizeof(msg), pipe))
-      std::cout << msg;
-    pclose(pipe);
+    buildModuleFromSource(clName.c_str(), llName.c_str(), options ? options : "");
     remove(clName.c_str());
 
     // Now build the program from llvm
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f178f8b..f36bfbf 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -69,6 +69,7 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_X,
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
+  GBE_CURBE_WORK_DIM,
   GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
   GBE_CURBE_GLOBAL_CONSTANT_DATA,
   GBE_CURBE_IMAGE_INFO,
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 9398e22..486fde1 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -31,8 +31,11 @@ namespace ir {
   static uint32_t getInfoOffset4Type(struct ImageInfo *imageInfo, int type)
   {
     switch (type) {
-      case GetImageInfoInstruction::WIDTH: return imageInfo->wSlot;
-      case GetImageInfoInstruction::HEIGHT: return imageInfo->hSlot;
+      case GetImageInfoInstruction::WIDTH:              return imageInfo->wSlot;
+      case GetImageInfoInstruction::HEIGHT:             return imageInfo->hSlot;
+      case GetImageInfoInstruction::DEPTH:              return imageInfo->depthSlot;
+      case GetImageInfoInstruction::CHANNEL_DATA_TYPE:  return imageInfo->dataTypeSlot;
+      case GetImageInfoInstruction::CHANNEL_ORDER:      return imageInfo->channelOrderSlot;
       default:
         NOT_IMPLEMENTED;
     }
@@ -42,8 +45,11 @@ namespace ir {
   static uint32_t setInfoOffset4Type(struct ImageInfo *imageInfo, int type, uint32_t offset)
   {
     switch (type) {
-      case GetImageInfoInstruction::WIDTH: imageInfo->wSlot = offset; break;
-      case GetImageInfoInstruction::HEIGHT: imageInfo->hSlot = offset; break;
+      case GetImageInfoInstruction::WIDTH:              imageInfo->wSlot = offset; break;
+      case GetImageInfoInstruction::HEIGHT:             imageInfo->hSlot = offset; break;
+      case GetImageInfoInstruction::DEPTH:              imageInfo->depthSlot = offset; break;
+      case GetImageInfoInstruction::CHANNEL_DATA_TYPE:  imageInfo->dataTypeSlot = offset; break;
+      case GetImageInfoInstruction::CHANNEL_ORDER:      imageInfo->channelOrderSlot = offset; break;
       default:
         NOT_IMPLEMENTED;
     }
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index c948d2c..7662b6a 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -335,12 +335,18 @@ namespace ir {
     enum {
      WIDTH = 0,
      HEIGHT = 1,
+     DEPTH = 2,
+     CHANNEL_DATA_TYPE = 3,
+     CHANNEL_ORDER = 4,
     };
 
     static INLINE uint32_t getDstNum4Type(int infoType) {
       switch (infoType) {
-        case GetImageInfoInstruction::WIDTH:
-        case GetImageInfoInstruction::HEIGHT:
+        case WIDTH:
+        case HEIGHT:
+        case DEPTH:
+        case CHANNEL_DATA_TYPE:
+        case CHANNEL_ORDER:
           return 1;
         break;
         default:
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index c1dc650..99cd06c 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -76,6 +76,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, barrierid);
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, constoffst);
+      DECL_NEW_REG(FAMILY_DWORD, workdim);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 32dd149..4b0ef5e 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -64,7 +64,8 @@ namespace ir {
     static const Register barrierid = Register(20);// barrierid
     static const Register threadn = Register(21);  // number of threads
     static const Register constoffst = Register(22); // offset of global constant array's curbe
-    static const uint32_t regNum = 23;             // number of special registers
+    static const Register workdim = Register(23);  // work dimention.
+    static const uint32_t regNum = 24;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 1e98afa..01e1eb1 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -21,6 +21,12 @@
  * \file unit.cpp
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "ir/unit.hpp"
 #include "ir/function.hpp"
 
@@ -53,10 +59,17 @@ namespace ir {
     constantSet.append(data, name, size, alignment);
   }
 
+  void Unit::removeDeadValues()
+  {
+    for(auto &it : valueMap) {
+      llvm::Instruction* I = llvm::dyn_cast<llvm::Instruction>(it.first.first);  //fake value
+      if((I == NULL) || (I->getParent() == NULL))
+        valueMap.erase(it.first);
+    }
+  }
   std::ostream &operator<< (std::ostream &out, const Unit &unit) {
     unit.apply([&out] (const Function &fn) { out << fn << std::endl; });
     return out;
   }
 } /* namespace ir */
 } /* namespace gbe */
-
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 3b293f5..1017f5f 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -24,7 +24,12 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Value.h"
+#else
+#include "llvm/IR/Value.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
@@ -88,6 +93,8 @@ namespace ir {
       GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
       valueMap[key] = value;
     }
+    /* remove fake values that removed by other pass */
+    void removeDeadValues(void);
     /*! Return the value map */
     const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
   private:
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index deda687..3a59da3 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -71,16 +71,31 @@
  *   is intercepted, we just abort
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
@@ -101,9 +116,10 @@
 #include "llvm/MC/MCSymbol.h"
 #if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
 #include "llvm/Target/TargetData.h"
-#endif
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 2)
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
 #endif
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
@@ -138,9 +154,9 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 2)
-#error "Only LLVM 3.0 / 3.1 is supported"
-#endif /* (LLVM_VERSION_MAJOR != 3) && (LLVM_VERSION_MINOR >= 2) */
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3)
+#error "Only LLVM 3.0 - 3.3 is supported"
+#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3) */
 
 using namespace llvm;
 
@@ -368,6 +384,13 @@ namespace gbe
       const auto key = std::make_pair(value, index);
       return scalarMap.find(key) != scalarMap.end();
     }
+    /*! if it's a undef const value, return true. Otherwise, return false. */
+    bool isUndefConst(Value *value, uint32_t index) {
+      getRealValue(value, index);
+
+      Constant *CPV = dyn_cast<Constant>(value);
+      return (CPV && (isa<UndefValue>(CPV)));
+    }
   private:
     /*! This creates a scalar register for a Value (index is the vector index when
      *  the value is a vector of scalars)
@@ -670,7 +693,10 @@ namespace gbe
           return doIt(uint64_t(0));
         }
       }
-
+      // NULL pointers
+      if(isa<ConstantPointerNull>(CPV)) {
+        return doIt(uint32_t(0));
+      }
       // Floats and doubles
       const Type::TypeID typeID = CPV->getType()->getTypeID();
       switch (typeID) {
@@ -853,11 +879,8 @@ namespace gbe
       // Insert a new register for each function argument
 #if LLVM_VERSION_MINOR <= 1
       const AttrListPtr &PAL = F.getAttributes();
-      uint32_t argID = 1; // Start at one actually
-      for (; I != E; ++I, ++argID) {
-#else
-      for (; I != E; ++I, ++argID) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
+      for (; I != E; ++I, ++argID) {
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
 
@@ -892,7 +915,7 @@ namespace gbe
           PointerType *pointerType = dyn_cast<PointerType>(type);
           // By value structure
 #if LLVM_VERSION_MINOR <= 1
-          if (PAL.paramHasAttr(argID, Attribute::ByVal)) {
+          if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
 #else
           if (I->hasByValAttr()) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
@@ -1135,14 +1158,19 @@ namespace gbe
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
       case CallingConv::PTX_Device: // we do not emit device function
         return;
       case CallingConv::PTX_Kernel:
+#else
+      case CallingConv::C:
+#endif
         break;
       default: GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
     ctx.startFunction(F.getName());
+    unit.removeDeadValues();
     this->regTranslator.clear();
     this->regTranslator.initValueMap(unit.getValueMap());
     this->labelMap.clear();
@@ -1592,14 +1620,14 @@ namespace gbe
           break;
           case Intrinsic::stackrestore:
           break;
-#if LLVM_VERSION_MINOR == 2
+#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
           case Intrinsic::fmuladd:
             this->newRegister(&I);
           break;
-#endif /* LLVM_VERSION_MINOR == 2 */
+#endif /* LLVM_VERSION_MINOR >= 2 */
           default:
           GBE_ASSERTM(false, "Unsupported intrinsics");
         }
@@ -1648,6 +1676,8 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
       case GEN_OCL_GET_GLOBAL_OFFSET2:
         regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+      case GEN_OCL_GET_WORK_DIM:
+        regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
       case GEN_OCL_COS:
       case GEN_OCL_SIN:
       case GEN_OCL_SQR:
@@ -1662,6 +1692,9 @@ namespace gbe
       case GEN_OCL_RNDD:
       case GEN_OCL_GET_IMAGE_WIDTH:
       case GEN_OCL_GET_IMAGE_HEIGHT:
+      case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+      case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+      case GEN_OCL_GET_IMAGE_DEPTH:
         // No structure can be returned
         this->newRegister(&I);
         break;
@@ -1767,7 +1800,7 @@ namespace gbe
             ctx.MOV(ir::getType(family), dst, src);
           }
           break;
-#if LLVM_VERSION_MINOR == 2
+#if LLVM_VERSION_MINOR >= 2
           case Intrinsic::fmuladd:
           {
             const ir::Register tmp  = ctx.reg(ir::FAMILY_DWORD);
@@ -1783,7 +1816,7 @@ namespace gbe
           case Intrinsic::lifetime_start:
           case Intrinsic::lifetime_end:
           break;
-#endif /* LLVM_VERSION_MINOR == 2 */
+#endif /* LLVM_VERSION_MINOR >= 2 */
           default: NOT_IMPLEMENTED;
         }
       } else {
@@ -1827,6 +1860,9 @@ namespace gbe
           case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
+          case GEN_OCL_GET_IMAGE_DEPTH:
+          case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+          case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
           {
             GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
             uint32_t elemNum;
@@ -1904,14 +1940,15 @@ namespace gbe
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
             const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5);
 
-            ir::Type srcType = ir::TYPE_U32, dstType = ir::TYPE_U32;
+            ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
 
             switch(it->second) {
               case GEN_OCL_READ_IMAGE0:
               case GEN_OCL_READ_IMAGE2:
               case GEN_OCL_READ_IMAGE10:
               case GEN_OCL_READ_IMAGE12:
-                srcType = dstType = ir::TYPE_U32;
+                dstType = ir::TYPE_U32;
+                srcType = ir::TYPE_S32;
                 break;
               case GEN_OCL_READ_IMAGE1:
               case GEN_OCL_READ_IMAGE3:
@@ -1923,7 +1960,7 @@ namespace gbe
               case GEN_OCL_READ_IMAGE4:
               case GEN_OCL_READ_IMAGE14:
                 dstType = ir::TYPE_FLOAT;
-                srcType = ir::TYPE_U32;
+                srcType = ir::TYPE_S32;
                 break;
               case GEN_OCL_READ_IMAGE5:
               case GEN_OCL_READ_IMAGE15:
@@ -2153,10 +2190,16 @@ namespace gbe
       Type *elemType = vectorType->getElementType();
 
       // We follow OCL spec and support 2,3,4,8,16 elements only
-      const uint32_t elemNum = vectorType->getNumElements();
+      uint32_t elemNum = vectorType->getNumElements();
       GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 || elemNum == 16,
                   "Only vectors of 2,3,4,8 or 16 elements are supported");
-
+      // Per OPenCL 1.2 spec 6.1.5:
+      //   For 3-component vector data types, the size of the data type is 4 * sizeof(component).
+      // And the llvm does cast a type3 data to type4 for load/store instruction,
+      // so a 4 elements vector may only have 3 valid elements. We need to fix it to correct element
+      // count here.
+      if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
+          elemNum = 3;
       // The code is going to be fairly different from types to types (based on
       // size of each vector element)
       const ir::Type type = getType(ctx, elemType);
@@ -2221,8 +2264,27 @@ namespace gbe
               ctx.STORE(type, tuple, addr, addrSpace, 4, true);
           }
         }
-      } else
-        GBE_ASSERTM(false, "loads / stores of vectors of short / chars is not supported yet");
+      } else {
+        for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+          const ir::Register reg = this->getRegister(llvmValues, elemID);
+          ir::Register addr;
+          if (elemID == 0)
+            addr = ptr;
+          else {
+              const ir::Register offset = ctx.reg(pointerFamily);
+              ir::ImmediateIndex immIndex;
+              int elemSize = getTypeByteSize(unit, elemType);
+              immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
+              addr = ctx.reg(pointerFamily);
+              ctx.LOADI(ir::TYPE_S32, offset, immIndex);
+              ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
+          }
+          if (isLoad)
+           ctx.LOAD(type, addr, addrSpace, dwAligned, reg);
+          else
+           ctx.STORE(type, addr, addrSpace, dwAligned, reg);
+        }
+      }
     }
   }
 
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 2fb33c0..6cd7298 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
+DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
 DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
@@ -73,6 +74,9 @@ DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE15, _Z22__gen_ocl_write_imagefjfffDv4_f)
 // To get image info function
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_HEIGHT, __gen_ocl_get_image_height)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH,  __gen_ocl_get_image_depth)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE,  __gen_ocl_get_image_channel_data_type)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER,  __gen_ocl_get_image_channel_order)
 
 // saturation related functions.
 DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 40c0e62..4bafc0d 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -30,16 +30,31 @@
  * Segovia) the right to use another license for it (MIT here)
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/STLExtras.h"
@@ -60,9 +75,10 @@
 #include "llvm/MC/MCSymbol.h"
 #if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
 #include "llvm/Target/TargetData.h"
-#endif
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR == 2)
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
 #endif
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index f71401f..3c0d6a4 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -63,15 +63,30 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Config/config.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Function.h"
 #include "llvm/InstrTypes.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
 #include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/raw_ostream.h"
@@ -178,7 +193,7 @@ namespace gbe {
     bool IsPerComponentOp(const Value* value);
 
     //these function used to add extract and insert instructions when load/store etc.
-    void extractFromeVector(Value* insn);
+    void extractFromVector(Value* insn);
     Value* InsertToVector(Value* insn, Value* vecValue);
 
     Type* GetBasicType(Value* value) {
@@ -577,7 +592,7 @@ namespace gbe {
     return true;
   }
 
-  void Scalarize::extractFromeVector(Value* insn) {
+  void Scalarize::extractFromVector(Value* insn) {
     VectorValues& vVals = vectorVals[insn];
 
     for (int i = 0; i < GetComponentCount(insn); ++i) {
@@ -641,7 +656,7 @@ namespace gbe {
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
           {
-            extractFromeVector(call);
+            extractFromVector(call);
             break;
           }
           case GEN_OCL_WRITE_IMAGE10:
@@ -669,7 +684,7 @@ namespace gbe {
 
   bool Scalarize::scalarizeLoad(LoadInst* ld)
   {
-    extractFromeVector(ld);
+    extractFromVector(ld);
     return false;
   }
 
@@ -730,17 +745,11 @@ namespace gbe {
 
     Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
-#if LLVM_VERSION_MINOR <= 1
-    const AttrListPtr &PAL = F.getAttributes();
-    uint32_t argID = 1; // Start at one actually
-    for (; I != E; ++I, ++argID) {
-#else
     for (; I != E; ++I) {
-#endif /* LLVM_VERSION_MINOR <= 1 */
       Type *type = I->getType();
 
       if(type->isVectorTy())
-        extractFromeVector(I);
+        extractFromVector(I);
     }
     return;
   }
@@ -748,9 +757,13 @@ namespace gbe {
   bool Scalarize::runOnFunction(Function& F)
   {
     switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
     case CallingConv::PTX_Device:
       return false;
     case CallingConv::PTX_Kernel:
+#else
+    case CallingConv::C:
+#endif
       break;
     default: GBE_ASSERTM(false, "Unsupported calling convention");
     }
@@ -760,9 +773,9 @@ namespace gbe {
     intTy = IntegerType::get(module->getContext(), 32);
     floatTy = Type::getFloatTy(module->getContext());
     builder = new IRBuilder<>(module->getContext());
+    unit.removeDeadValues();
 
     scalarizeArgs(F);
-
     typedef ReversePostOrderTraversal<Function*> RPOTType;
     RPOTType rpot(&F);
     for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
@@ -798,6 +811,9 @@ namespace gbe {
 
     dce();
 
+    incompletePhis.clear();
+    vectorVals.clear();
+
     delete builder;
     builder = 0;
 
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 559cde0..788a3dd 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -22,11 +22,22 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/PassManager.h"
 #include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 2
 #include "llvm/Support/IRReader.h"
+#else
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Assembly/PrintModulePass.h"
@@ -58,7 +69,7 @@ namespace gbe
       o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
 
     // Get the module from its file
-    SMDiagnostic Err;
+    llvm::SMDiagnostic Err;
     std::auto_ptr<Module> M;
     M.reset(ParseIRFile(fileName, Err, c));
     if (M.get() == 0) return false;
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 92f9ba9..46b81e1 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -20,11 +20,11 @@
 #ifndef __GEN_OCL_STDLIB_H__
 #define __GEN_OCL_STDLIB_H__
 
-#define INLINE __attribute__((always_inline)) inline
+#define INLINE inline __attribute__((always_inline))
 #define OVERLOADABLE __attribute__((overloadable))
 #define PURE __attribute__((pure))
 #define CONST __attribute__((const))
-#define INLINE_OVERLOADABLE __attribute__((overloadable,always_inline))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in scalar data types
@@ -41,15 +41,17 @@ typedef unsigned int uintptr_t;
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL address space
 /////////////////////////////////////////////////////////////////////////////
+// These are built-ins in LLVM 3.3.
+#if 100*__clang_major__ + __clang_minor__ <= 302
 #define __private __attribute__((address_space(0)))
 #define __global __attribute__((address_space(1)))
 #define __constant __attribute__((address_space(2)))
 #define __local __attribute__((address_space(3)))
-#define __texture __attribute__((address_space(4)))
 #define global __global
 //#define local __local
 #define constant __constant
 #define private __private
+#endif
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in vector data types
@@ -72,12 +74,20 @@ DEF(float);
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL other built-in data types
 /////////////////////////////////////////////////////////////////////////////
+// FIXME:
+// This is a transitional hack to bypass the LLVM 3.3 built-in types.
+// See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
 struct _image2d_t;
-typedef __texture struct _image2d_t* image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
 struct _image3d_t;
-typedef __texture struct _image3d_t* image3d_t;
-typedef uint sampler_t;
-typedef size_t event_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef uint __sampler_t;
+typedef size_t __event_t;
+#define image2d_t __image2d_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+#define event_t __event_t
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL conversions & type casting
 /////////////////////////////////////////////////////////////////////////////
@@ -202,8 +212,8 @@ DEF;
 #undef DEF
 
 #define SDEF(TYPE)                                                              \
-INLINE_OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
 INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
 INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
 SDEF(char);
@@ -212,8 +222,8 @@ SDEF(int);
 SDEF(long);
 #undef SDEF
 #define UDEF(TYPE)                                                              \
-INLINE_OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
 INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
 INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
 UDEF(uchar);
@@ -346,8 +356,11 @@ DEC(16);
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-// TODO get_global_offset
-// TODO get_work_dim
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+  return __gen_ocl_get_work_dim();
+}
 
 #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
 PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
@@ -357,11 +370,12 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
 DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
 #define DECL_PUBLIC_WORK_ITEM_FN(NAME) \
-inline unsigned NAME(unsigned int dim) { \
+INLINE unsigned NAME(unsigned int dim) { \
   if (dim == 0) return __gen_ocl_##NAME##0(); \
   else if (dim == 1) return __gen_ocl_##NAME##1(); \
   else if (dim == 2) return __gen_ocl_##NAME##2(); \
@@ -371,6 +385,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
@@ -393,84 +408,84 @@ PURE CONST float __gen_ocl_rndz(float x);
 PURE CONST float __gen_ocl_rnde(float x);
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
-INLINE OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
-INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
+INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
   return __gen_ocl_cos(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
   return __gen_ocl_sin(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-INLINE OVERLOADABLE float native_log(float x) {
+INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+INLINE_OVERLOADABLE float native_log(float x) {
   return native_log2(x) * 0.6931472002f;
 }
-INLINE OVERLOADABLE float native_log10(float x) {
+INLINE_OVERLOADABLE float native_log10(float x) {
   return native_log2(x) * 0.3010299956f;
 }
-INLINE OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
-INLINE OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE OVERLOADABLE int2 ilogb(float2 x) {
+INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
+INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE int2 ilogb(float2 x) {
   return (int2)(ilogb(x.s0), ilogb(x.s1));
 }
-INLINE OVERLOADABLE int4 ilogb(float4 x) {
+INLINE_OVERLOADABLE int4 ilogb(float4 x) {
   return (int4)(ilogb(x.s01), ilogb(x.s23));
 }
-INLINE OVERLOADABLE int8 ilogb(float8 x) {
+INLINE_OVERLOADABLE int8 ilogb(float8 x) {
   return (int8)(ilogb(x.s0123), ilogb(x.s4567));
 }
-INLINE OVERLOADABLE int16 ilogb(float16 x) {
+INLINE_OVERLOADABLE int16 ilogb(float16 x) {
   return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
 }
-INLINE OVERLOADABLE float nan(uint code) {
+INLINE_OVERLOADABLE float nan(uint code) {
   return NAN;
 }
-INLINE OVERLOADABLE float2 nan(uint2 code) {
+INLINE_OVERLOADABLE float2 nan(uint2 code) {
   return (float2)(nan(code.s0), nan(code.s1));
 }
-INLINE OVERLOADABLE float4 nan(uint4 code) {
+INLINE_OVERLOADABLE float4 nan(uint4 code) {
   return (float4)(nan(code.s01), nan(code.s23));
 }
-INLINE OVERLOADABLE float8 nan(uint8 code) {
+INLINE_OVERLOADABLE float8 nan(uint8 code) {
   return (float8)(nan(code.s0123), nan(code.s4567));
 }
-INLINE OVERLOADABLE float16 nan(uint16 code) {
+INLINE_OVERLOADABLE float16 nan(uint16 code) {
   return (float16)(nan(code.s01234567), nan(code.s89abcdef));
 }
-INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-INLINE OVERLOADABLE float native_tan(float x) {
+INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+INLINE_OVERLOADABLE float native_tan(float x) {
   return native_sin(x) / native_cos(x);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
   return native_tan(x * M_PI_F);
 }
-INLINE OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
-INLINE OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
-INLINE OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
-INLINE OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   return __gen_ocl_pow(x, 0.3333333333f);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
   *cosval = native_cos(x);
   return native_sin(x);
 }
-INLINE OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
+INLINE_OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
   return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
 }
-INLINE OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
+INLINE_OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
   return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
 }
-INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
+INLINE_OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
   return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
@@ -480,7 +495,7 @@ INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
                   __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
                   __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
 }
-INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
+INLINE_OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
   return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
                    __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
                    __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
@@ -498,29 +513,29 @@ INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval
                    __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
                    __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
   return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
   return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
   float y = native_exp(-2 * x);
   return (1 - y) / (1 + y);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
   return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
   return __gen_ocl_internal_asin(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
   return M_PI_2_F - __gen_ocl_internal_asin(x);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
   return __gen_ocl_internal_acos(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
   float a = 0, c = 1;
   if (x <= -1) {
     a = - M_PI_2_F;
@@ -534,44 +549,44 @@ INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
   }
   return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
   return native_log(x + native_sqrt(x * x + 1));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
   return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
   return 0.5f * native_sqrt((1 + x) / (1 - x));
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
   return x * y < 0 ? -x : x;
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
   return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
   return 1 - __gen_ocl_internal_erf(x);
 }
 
 // XXX work-around PTX profile
 #define sqrt native_sqrt
-INLINE OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
-INLINE OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
-INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-INLINE OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
-INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
+INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
+INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return 2 * __gen_ocl_internal_round(x / 2);
 }
 // TODO use llvm intrinsics definitions
@@ -601,32 +616,32 @@ INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 #define erf __gen_ocl_internal_erf
 #define erfc __gen_ocl_internal_erfc
 
-INLINE OVERLOADABLE float mad(float a, float b, float c) {
+INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return a*b+c;
 }
 
-INLINE OVERLOADABLE uint select(uint src0, uint src1, int cond) {
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE int select(int src0, int src1, int cond) {
+INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE int select(int src0, int src1, uint cond) {
+INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE float select(float src0, float src1, int cond) {
+INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
   return cond ? src1 : src0;
 }
-INLINE OVERLOADABLE float select(float src0, float src1, uint cond) {
+INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
   return cond ? src1 : src0;
 }
 
 // This will be optimized out by LLVM and will output LLVM select instructions
 #define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-INLINE OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
   TYPE4 dst; \
   const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
   const TYPE x1 = src1.x; \
@@ -652,13 +667,13 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
 #define DECL_MIN_MAX_CLAMP(TYPE) \
-INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
   return a > b ? a : b; \
 } \
-INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
   return a < b ? a : b; \
 } \
-INLINE OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
   return max(min(v, u), l); \
 }
 DECL_MIN_MAX_CLAMP(float)
@@ -670,35 +685,35 @@ DECL_MIN_MAX_CLAMP(unsigned short)
 DECL_MIN_MAX_CLAMP(unsigned char)
 #undef DECL_MIN_MAX_CLAMP
 
-INLINE OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
-INLINE OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
-INLINE OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
   float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
   return a > b ? x : b > a ? y : max(x, y);
 }
-INLINE OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
   float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
   return a < b ? x : b < a ? y : min(x, y);
 }
-INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
-INLINE OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
   return __gen_ocl_internal_fmax(x, y) - y;
 }
-INLINE OVERLOADABLE float fract(float x, float *p) {
+INLINE_OVERLOADABLE float fract(float x, float *p) {
   *p = __gen_ocl_internal_floor(x);
   return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
 }
-INLINE OVERLOADABLE float2 fract(float2 x, float2 *p) {
+INLINE_OVERLOADABLE float2 fract(float2 x, float2 *p) {
   return (float2)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p));
 }
-INLINE OVERLOADABLE float4 fract(float4 x, float4 *p) {
+INLINE_OVERLOADABLE float4 fract(float4 x, float4 *p) {
   return (float4)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p),
                   fract(x.s2, 2 + (float *)p),
                   fract(x.s3, 3 + (float *)p));
 }
-INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
+INLINE_OVERLOADABLE float8 fract(float8 x, float8 *p) {
   return (float8)(fract(x.s0, (float *)p),
                   fract(x.s1, 1 + (float *)p),
                   fract(x.s2, 2 + (float *)p),
@@ -708,7 +723,7 @@ INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
                   fract(x.s6, 6 + (float *)p),
                   fract(x.s7, 7 + (float *)p));
 }
-INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
+INLINE_OVERLOADABLE float16 fract(float16 x, float16 *p) {
   return (float16)(fract(x.s0, (float *)p),
                    fract(x.s1, 1 + (float *)p),
                    fract(x.s2, 2 + (float *)p),
@@ -726,85 +741,85 @@ INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
                    fract(x.se, 14 + (float *)p),
                    fract(x.sf, 15 + (float *)p));
 }
-INLINE OVERLOADABLE float native_divide(float x, float y) { return x/y; }
-INLINE OVERLOADABLE float ldexp(float x, int n) {
+INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE_OVERLOADABLE float ldexp(float x, int n) {
   return __gen_ocl_pow(2, n) * x;
 }
-INLINE OVERLOADABLE float pown(float x, int n) {
+INLINE_OVERLOADABLE float pown(float x, int n) {
   if (x == 0 && n == 0)
     return 1;
   return powr(x, n);
 }
-INLINE OVERLOADABLE float rootn(float x, int n) {
+INLINE_OVERLOADABLE float rootn(float x, int n) {
   return powr(x, 1.f / n);
 }
 
 /////////////////////////////////////////////////////////////////////////////
 // Geometric functions (see 6.11.5 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-INLINE OVERLOADABLE float dot(float2 p0, float2 p1) {
+INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
   return mad(p0.x,p1.x,p0.y*p1.y);
 }
-INLINE OVERLOADABLE float dot(float3 p0, float3 p1) {
+INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
   return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
 }
-INLINE OVERLOADABLE float dot(float4 p0, float4 p1) {
+INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
   return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
 }
 
-INLINE OVERLOADABLE float dot(float8 p0, float8 p1) {
+INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
   return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
          mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
 }
-INLINE OVERLOADABLE float dot(float16 p0, float16 p1) {
+INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
   return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
          mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
          mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
          mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
 }
 
-INLINE OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float distance(float x, float y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
-INLINE OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
-INLINE OVERLOADABLE float normalize(float x) { return 1.f; }
-INLINE OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
-INLINE OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
-INLINE OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
-INLINE OVERLOADABLE float fast_normalize(float x) { return 1.f; }
-INLINE OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
+INLINE_OVERLOADABLE float normalize(float x) { return 1.f; }
+INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_normalize(float x) { return 1.f; }
+INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
    return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
 }
-INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
    return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
 }
 
@@ -816,10 +831,10 @@ INLINE OVERLOADABLE float4 cross(float4 v0, float4 v1) {
 // cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
 // Well we do not care, we do not activate TBAA in the compiler
 #define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
-INLINE OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
   return *(SPACE TYPE##DIM *) (p + DIM * offset); \
 } \
-INLINE OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
   *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
 }
 
@@ -854,22 +869,22 @@ DECL_UNTYPED_RW_ALL(float)
 // Declare functions for vector types which are derived from scalar ones
 /////////////////////////////////////////////////////////////////////////////
 #define DECL_VECTOR_1OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
     return (TYPE##2)(NAME(v.x), NAME(v.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
     return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
     return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v.s0123);\
     dst.s4567 = NAME(v.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v.s01234567);\
     dst.s89abcdef = NAME(v.s89abcdef);\
@@ -920,22 +935,22 @@ DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
 /////////////////////////////////////////////////////////////////////////////
 
 #define DECL_VECTOR_2OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
-    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
+    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v0.y, v1.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
     return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
     return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
@@ -954,25 +969,39 @@ DECL_VECTOR_2OP(native_divide, float);
 DECL_VECTOR_2OP(copysign, float);
 DECL_VECTOR_2OP(__gen_ocl_internal_maxmag, float);
 DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
+
+#define DECL_VECTOR_NOP_ALL_INT_TYPES(NOP, NAME) \
+NOP(NAME, char)   \
+NOP(NAME, uchar)  \
+NOP(NAME, short)  \
+NOP(NAME, ushort) \
+NOP(NAME, int)    \
+NOP(NAME, uint)   \
+NOP(NAME, long)   \
+NOP(NAME, ulong)
+
+DECL_VECTOR_NOP_ALL_INT_TYPES(DECL_VECTOR_2OP, add_sat)
+DECL_VECTOR_NOP_ALL_INT_TYPES(DECL_VECTOR_2OP, sub_sat)
+#undef DECL_VECTOR_NOP_ALL_INT_TYPES
 #undef DECL_VECTOR_2OP
 
 #define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
-    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
+    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v0.y, v1.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
     return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
     return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
@@ -984,22 +1013,22 @@ DECL_VECTOR_2OP(rootn, float, int);
 #undef DECL_VECTOR_2OP
 
 #define DECL_VECTOR_3OP(NAME, TYPE) \
-  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
-    return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v1.y, v1.y, v2.y)); \
+  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
+    return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y)); \
   }\
-  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
+  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
     return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \
   }\
-  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
+  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
     return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \
   }\
-  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
+  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
     TYPE##8 dst;\
     dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\
     dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\
     return dst;\
   }\
-  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
+  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
     TYPE##16 dst;\
     dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\
@@ -1010,11 +1039,11 @@ DECL_VECTOR_3OP(mix, float);
 #undef DECL_VECTOR_3OP
 
 // mix requires more variants
-INLINE OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
-INLINE OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
-INLINE OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
-INLINE OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
-INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
+INLINE_OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
+INLINE_OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
+INLINE_OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
+INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
+INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
 
 // XXX workaround ptx profile
 #define fabs __gen_ocl_internal_fabs
@@ -1056,6 +1085,13 @@ INLINE void barrier(cl_mem_fence_flags flags) {
     __gen_ocl_barrier_global();
 }
 
+INLINE void mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void read_mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void write_mem_fence(cl_mem_fence_flags flags) {
+}
+
 /////////////////////////////////////////////////////////////////////////////
 // Force the compilation to SIMD8 or SIMD16
 /////////////////////////////////////////////////////////////////////////////
@@ -1098,7 +1134,9 @@ OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, f
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
 int __gen_ocl_get_image_width(uint surface_id);
 int __gen_ocl_get_image_height(uint surface_id);
-//OVERLOADABLE int __gen_ocl_get_image_depth(image3d_t image);
+int __gen_ocl_get_image_channel_data_type(uint surface_id);
+int __gen_ocl_get_image_channel_order(uint surface_id);
+int __gen_ocl_get_image_depth(uint surface_id);
 
 #define GET_IMAGE(cl_image, surface_id) \
     uint surface_id = (uint)cl_image
@@ -1150,17 +1188,32 @@ DECL_IMAGE(float4, f)
   { \
     GET_IMAGE(image, surface_id);\
     return __gen_ocl_get_image_height(surface_id); \
-  }
-#if 0
+  } \
   INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
-  { NOT_IMPLEMENTED; }\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_data_type(surface_id); \
+  }\
   INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
-  { NOT_IMPLEMENTED; }
-#endif
-
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_order(surface_id); \
+  }
 
 DECL_IMAGE_INFO(image2d_t)
 DECL_IMAGE_INFO(image3d_t)
+
+INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
+  {
+   GET_IMAGE(image, surface_id);
+   return __gen_ocl_get_image_depth(surface_id);
+  }
+
+INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
+  { return (int2){get_image_width(image), get_image_height(image)}; }
+
+INLINE_OVERLOADABLE  int4 get_image_dim(image3d_t image)
+  { return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0}; }
 #if 0
 /* The following functions are not implemented yet. */
 DECL_IMAGE_INFO(image1d_t)
@@ -1168,12 +1221,6 @@ DECL_IMAGE_INFO(image1d_buffer_t)
 DECL_IMAGE_INFO(image1d_array_t)
 DECL_IMAGE_INFO(image2d_array_t)
 
-INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
-  { return __gen_ocl_get_image_depth(image); }
-
-INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
-  { return __gen_ocl_get_image_dim(image); }
-
 INLINE_OVERLOADABLE  int2 get_image_dim(image2d_array_t image)
   { return __gen_ocl_get_image_dim(image); }
 
diff --git a/kernels/buildin_work_dim.cl b/kernels/buildin_work_dim.cl
new file mode 100644
index 0000000..27c0e18
--- /dev/null
+++ b/kernels/buildin_work_dim.cl
@@ -0,0 +1,3 @@
+kernel void buildin_work_dim( __global int *ret ) {
+  *ret = get_work_dim();
+}
diff --git a/kernels/compiler_box_blur.cl b/kernels/compiler_box_blur.cl
index 0c6b657..26936e0 100644
--- a/kernels/compiler_box_blur.cl
+++ b/kernels/compiler_box_blur.cl
@@ -27,7 +27,7 @@ inline uint pack_fp3(float3 u3) {
     C2 = (from1+from2+from3);\
     C3 = (from2+from3+r);\
   } while(0)
-#if 1
+
 __kernel void compiler_box_blur(__global const uint *src,
                                 __global uint *dst,
                                 int w,
@@ -39,27 +39,27 @@ __kernel void compiler_box_blur(__global const uint *src,
   const int yend = min(y + chunk, h); /* we process a tile in the image */
 
   /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
-  const int left = max(4*x-1 + y*w, y*w);
-  const int right = min(4*x+4 + y*w, y*w+w-1);
+  const int left = max(4*x-1, 0) + y*w;
+  const int right = min(4*x+4, w-1) + y*w;
   int curr = x + y*(w>>2);
   HFILTER3(curr0, curr1, curr2, curr3, curr, left, right);
 
   /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
   const int ytop = max(y-1,0);
-  const int topLeft = max(4*x-1 + ytop*w, ytop*w);
-  const int topRight = min(4*x+4 + ytop*w, ytop*w+w-1);
+  const int topLeft = max(4*x-1, 0) + ytop*w;
+  const int topRight = min(4*x+4, w-1) + ytop*w;
   const int top = x + ytop*(w>>2);
   HFILTER3(top0, top1, top2, top3, top, topLeft, topRight);
 
   /* To guard bottom line */
   const int maxBottom = x + (h-1)*(w>>2);
-  const int maxBottomLeft = max(x-1,0) + (h-1)*w;
-  const int maxBottomRight = min(x+1,w-1) + (h-1)*w;
+  const int maxBottomLeft = max(4*x-1,0) + (h-1)*w;
+  const int maxBottomRight = min(4*x+4,w-1) + (h-1)*w;
 
   /* We use a short 3 pixel sliding window */
   const int ybottom = min(y+1,h-1);
-  int bottomLeft = max(4*x-1 + ybottom*w, ybottom*w);
-  int bottomRight = min(4*x+4 + ybottom*w, ybottom*w+w-1);
+  int bottomLeft = max(4*x-1, 0) + ybottom*w;
+  int bottomRight = min(4*x+4, w-1) + ybottom*w;
   int bottom = x + ybottom*(w>>2);
 
   /* Top down sliding window */
@@ -78,36 +78,3 @@ __kernel void compiler_box_blur(__global const uint *src,
     curr0 = bottom0; curr1 = bottom1; curr2 = bottom2; curr3 = bottom3;
   }
 }
-#else
-
-__kernel void compiler_box_blur(__global const uint *src,
-                                __global uint *dst,
-                                int w,
-                                int h,
-                                int chunk)
-{
-  const int x = get_global_id(0);
-  int y = 0;
-  const int yend = min(y + 64, h); /* we process a tile in the image */
-
-  /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
-  int curr = x + y*32;
-
-  /* Top down sliding window */
-  for (; y < yend; ++y, curr += (w>>2)) {
-    float3 d = (float3)(255.f,255.f,255.f);
-    const uint4 to = (uint4)(pack_fp3(d),pack_fp3(d),pack_fp3(d),pack_fp3(d));
-#if 0
-    dst[4*curr+0] = (int)dst;
-    dst[4*curr+1] = (int)dst;
-    dst[4*curr+2] = (int)dst;
-    dst[4*curr+3] = (int)dst;
-#endif
-    dst[4*curr+0] = to.x;
-    dst[4*curr+1] = to.y;
-    dst[4*curr+2] = to.z;
-    dst[4*curr+3] = to.w;
-  }
-}
-#endif
-
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
new file mode 100644
index 0000000..7bcbdeb
--- /dev/null
+++ b/kernels/compiler_box_blur_image.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_box_blur_image(__read_only image2d_t src,
+                                      __write_only image2d_t dst)
+{
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP_TO_EDGE |
+                            CLK_FILTER_NEAREST;
+  const int2 coord = (int2)(get_global_id(0), get_global_id(1));
+  int2 offset;
+  float4 sum = 0;
+
+  for (offset.y = -1; offset.y <= 1; offset.y++) {
+    for (offset.x = -1; offset.x <= 1; offset.x++) {
+      sum += read_imagef(src, sampler, coord + offset);
+    }
+  }
+
+  write_imagef(dst, coord, (1.0f/9.0f)*sum);
+}
diff --git a/kernels/compiler_box_blur_ref.bmp b/kernels/compiler_box_blur_ref.bmp
index fd91008..149cbba 100644
Binary files a/kernels/compiler_box_blur_ref.bmp and b/kernels/compiler_box_blur_ref.bmp differ
diff --git a/kernels/compiler_clod.cl b/kernels/compiler_clod.cl
index e21d9f5..dba7d6f 100644
--- a/kernels/compiler_clod.cl
+++ b/kernels/compiler_clod.cl
@@ -28,7 +28,7 @@ inline uint pack_fp4(float4 u4) {
 
 #define time 1.f
 
-float f(vec3 o)
+inline float f(vec3 o)
 {
     float a=(sin(o.x)+o.y*.25f)*.35f;
     o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
@@ -36,7 +36,7 @@ float f(vec3 o)
 }
 
 // XXX front end does not inline this function
-__attribute((always_inline)) vec3 s(vec3 o,vec3 d)
+inline __attribute((always_inline)) vec3 s(vec3 o,vec3 d)
 {
     float t=0.0f;
     float dt = 0.2f;
diff --git a/kernels/compiler_displacement_map_element.cl b/kernels/compiler_displacement_map_element.cl
new file mode 100644
index 0000000..ee40ad5
--- /dev/null
+++ b/kernels/compiler_displacement_map_element.cl
@@ -0,0 +1,11 @@
+kernel void compiler_displacement_map_element(const global uint *in, const global uint *offset, int w, int h, global uint *out) {
+    const int cx = get_global_id(0);
+    const int cy = get_global_id(1);
+    uint c = offset[cy * w + cx];
+    int x_pos = cx + c;
+    int y_pos = cy + c;
+    if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+        out[cy * w + cx] = in[y_pos * w + x_pos];
+    else
+        out[cy * w + cx] = 0;
+}
diff --git a/kernels/compiler_global_memory_barrier.cl b/kernels/compiler_global_memory_barrier.cl
new file mode 100644
index 0000000..99bb940
--- /dev/null
+++ b/kernels/compiler_global_memory_barrier.cl
@@ -0,0 +1,7 @@
+__kernel void compiler_global_memory_barrier(__global int *dst, __global int *src) {
+  src[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = get_local_id(0);
+  src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = get_local_id(0);
+  barrier(CLK_GLOBAL_MEM_FENCE);
+  dst[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = src[get_local_size(0) * 2 * get_group_id(0) + get_local_size(0) - (get_local_id(0) + 1)];
+  dst[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_size(0) - (get_local_id(0) + 1)];
+}
diff --git a/kernels/compiler_group_size.cl b/kernels/compiler_group_size.cl
new file mode 100644
index 0000000..9dba236
--- /dev/null
+++ b/kernels/compiler_group_size.cl
@@ -0,0 +1,12 @@
+__kernel void
+compiler_group_size(__global unsigned int *dst)
+{
+  uint idx = (uint)get_global_id(0);
+  uint idy = (uint)get_global_id(1);
+  uint idz = (uint)get_global_id(2);
+  uint size_x = (uint)get_global_size(0);
+  uint size_y = (uint)get_global_size(1);
+
+  dst[idz*size_x*size_y + idy*size_x + idx] = idz*size_x*size_y + idy*size_x +idx;
+}
+
diff --git a/kernels/compiler_julia.cl b/kernels/compiler_julia.cl
index 98c5799..21672f6 100644
--- a/kernels/compiler_julia.cl
+++ b/kernels/compiler_julia.cl
@@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 {
     float mz2,md2,dist,t;
@@ -74,7 +74,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 }
 
 #if 1
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec3 calcNormal(vec3 p, vec4 c)
 {
     vec4 nz,ndz,dz[4];
diff --git a/kernels/compiler_julia_no_break.cl b/kernels/compiler_julia_no_break.cl
index 1a9be64..5c357b1 100644
--- a/kernels/compiler_julia_no_break.cl
+++ b/kernels/compiler_julia_no_break.cl
@@ -27,7 +27,7 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 {
     float mz2,md2,dist,t;
@@ -75,7 +75,7 @@ float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
 }
 
 #if 1
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec3 calcNormal(vec3 p, vec4 c)
 {
     vec4 nz,ndz,dz[4];
diff --git a/kernels/compiler_local_memory.cl b/kernels/compiler_local_memory.cl
deleted file mode 100644
index daadd66..0000000
--- a/kernels/compiler_local_memory.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-__kernel void compiler_local_memory(__global int *dst, __local int *src) {
-  src[get_local_id(0)] = get_local_id(0);
-  dst[get_global_id(0)] = src[15 - get_local_id(0)];
-}
-
diff --git a/kernels/compiler_local_memory_two_ptr.cl b/kernels/compiler_local_memory_two_ptr.cl
index f410406..46589ba 100644
--- a/kernels/compiler_local_memory_two_ptr.cl
+++ b/kernels/compiler_local_memory_two_ptr.cl
@@ -4,6 +4,7 @@ __kernel void compiler_local_memory_two_ptr(__global int *dst,
 {
   src0[get_local_id(0)] = get_local_id(0);
   src1[get_local_id(0)] = get_global_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
   dst[get_global_id(0)] = src0[15 - get_local_id(0)] + src1[15 - get_local_id(0)];
 }
 
diff --git a/kernels/compiler_mandelbrot.cl b/kernels/compiler_mandelbrot.cl
index 42295ab..d15ccd0 100644
--- a/kernels/compiler_mandelbrot.cl
+++ b/kernels/compiler_mandelbrot.cl
@@ -1,8 +1,8 @@
 // Used to ID into the 1D array, so that we can use
 // it effectively as a 2D array
-int ID(int x, int y, int width) { return 4*width*y + x*4; }
-float mapX(float x) { return x*3.25f - 2.f; }
-float mapY(float y) { return y*2.5f - 1.25f; }
+inline int ID(int x, int y, int width) { return 4*width*y + x*4; }
+inline float mapX(float x) { return x*3.25f - 2.f; }
+inline float mapY(float y) { return y*2.5f - 1.25f; }
 
 __kernel void compiler_mandelbrot(__global char *out) {
   int x_dim = get_global_id(0);
diff --git a/kernels/compiler_mandelbrot_alternate.cl b/kernels/compiler_mandelbrot_alternate.cl
index fc99326..ab6fb07 100644
--- a/kernels/compiler_mandelbrot_alternate.cl
+++ b/kernels/compiler_mandelbrot_alternate.cl
@@ -1,6 +1,6 @@
-int offset(int x, int y, int width) { return width*y + x; }
-float mapX(float x) {return x*3.25f - 2.f;}
-float mapY(float y) {return y*2.5f - 1.25f;}
+inline int offset(int x, int y, int width) { return width*y + x; }
+inline float mapX(float x) {return x*3.25f - 2.f;}
+inline float mapY(float y) {return y*2.5f - 1.25f;}
 
 __kernel void compiler_mandelbrot_alternate(__global uint *out,
                                             float rcpWidth,
diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
index 95469c5..4de6c10 100644
--- a/kernels/compiler_menger_sponge_no_shadow.cl
+++ b/kernels/compiler_menger_sponge_no_shadow.cl
@@ -14,11 +14,11 @@ typedef float4 vec4;
 #define time 1.f
 
 // fmod is not like glsl mod!
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float glsl_mod(float x,float y) { return x-y*floor(x/y); }
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); }
-__attribute__((always_inline, overloadable))
+inline __attribute__((always_inline, overloadable))
 float3 glsl_mod(float3 a,float3 b) { return (float3)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y), glsl_mod(a.z,b.z)); }
 
 inline vec3 reflect(vec3 I, vec3 N) {
@@ -38,10 +38,10 @@ inline uint pack_fp4(float4 u4) {
   dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
 } while (0)
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float maxcomp(vec3 p) { return max(p.x,max(p.y,p.z));}
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 float sdBox(vec3 p, vec3 b)
 {
   vec3  di = fabs(p) - b;
@@ -49,7 +49,7 @@ float sdBox(vec3 p, vec3 b)
   return min(mc,length(max(di,0.0f)));
 }
 
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec4 map(vec3 p)
 {
    float d = sdBox(p,(vec3)(1.0f));
@@ -78,7 +78,7 @@ vec4 map(vec3 p)
 }
 
 // GLSL ES doesn't seem to like loops with conditional break/return...
-__attribute__((always_inline))
+inline __attribute__((always_inline))
 vec4 intersect( vec3 ro, vec3 rd )
 {
     float t = 0.0f;
diff --git a/kernels/compiler_ribbon.cl b/kernels/compiler_ribbon.cl
index 92375e7..157cc66 100644
--- a/kernels/compiler_ribbon.cl
+++ b/kernels/compiler_ribbon.cl
@@ -1,89 +1,88 @@
-typedef float2 vec2;
-typedef float3 vec3;
-typedef float4 vec4;
-#define sin native_sin
-#define cos native_cos
-#define tan native_tan
-#define normalize fast_normalize
-#define length fast_length
-
-inline vec3 reflect(vec3 I, vec3 N) {
-  return I - 2.0f * dot(N, I) * N;
-}
-
-#define time 1.f
-
-// Object A (tunnel)
-inline float oa(vec3 q) {
- return cos(q.x)+cos(q.y*1.5f)+cos(q.z)+cos(q.y*20.f)*.05f;
-}
-
-// Object B (ribbon)
-inline float ob(vec3 q) {
-  return length(max(fabs(q-(vec3)(cos(q.z*1.5f)*.3f,-.5f+cos(q.z)*.2f,.0f))-(vec3)(.125f,.02f,time+3.f),(vec3)(.0f)));
-}
-
-// Scene
-inline float o(vec3 q) { return min(oa(q),ob(q)); }
-
-// Get Normal XXX Not inline by LLVM
-__attribute__((always_inline)) vec3 gn(vec3 q) {
- const vec3 fxyy = (vec3)(.01f, 0.f, 0.f);
- const vec3 fyxy = (vec3)(0.f, .01f, 0.f);
- const vec3 fyyx = (vec3)(0.f, 0.f, .01f);
- return normalize((vec3)(o(q+fxyy),
-                         o(q+fyxy),
-                         o(q+fyyx)));
-}
-
-inline uint pack_fp4(float4 u4) {
-  uint u;
-  u = (((uint) u4.x)) |
-      (((uint) u4.y) << 8) |
-      (((uint) u4.z) << 16);
-  return u;
-}
-
-// XXX vector not supported in function argument yet
-__kernel void compiler_ribbon(__global uint *dst, float resx, float resy, int w)
-{
-  vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
-  vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / (vec2)(resx, resy);
-  p.x *= resx/resy;
-
-  vec4 c = (vec4)(1.0f);
-  const vec3 org = (vec3)(sin(time)*.5f,
-                          cos(time*.5f)*.25f+.25f,
-                          time);
-  vec3 dir=normalize((vec3)(p.x*1.6f,p.y,1.0f));
-  vec3 q = org, pp;
-  float d=.0f;
-
-  // First raymarching
-  for(int i=0;i<64;i++) {
-    d=o(q);
-    q+=d*dir;
-  }
-  pp=q;
-  const float f = length(q-org)*0.02f;
-
-  // Second raymarching (reflection)
-  dir=reflect(dir,gn(q));
-  q+=dir;
-  for(int i=0;i<64;i++) {
-    d=o(q);
-    q+=d*dir;
-  }
-  c = max(dot(gn(q), (vec3)(0.1f,0.1f,0.0f)), 0.0f)
-    + (vec4)(0.3f, cos(time*.5f)*.5f+.5f, sin(time*.5f)*.5f+.5f, 1.f) * min(length(q-org)*.04f,1.f);
-
-  // Ribbon Color
-  if(oa(pp)>ob(pp))
-    c = mix(c, (vec4)(cos(time*.3f)*0.5f + 0.5f,cos(time*.2f)*.5f+.5f,sin(time*.3f)*.5f+.5f,1.f),.3f);
-
-  // Final Color
-  const vec4 color = ((c+(vec4)(f))+(1.f-min(pp.y+1.9f,1.f))*(vec4)(1.f,.8f,.7f,1.f))*min(time*.5f,1.f);
-  const vec4 final = 255.f * max(min(color, (vec4)(1.f)), (vec4)(0.f));
-  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final);
-}
-
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define normalize fast_normalize
+#define length fast_length
+
+inline vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+#define time 1.f
+
+// Object A (tunnel)
+inline float oa(vec3 q) {
+ return cos(q.x)+cos(q.y*1.5f)+cos(q.z)+cos(q.y*20.f)*.05f;
+}
+
+// Object B (ribbon)
+inline float ob(vec3 q) {
+  return length(max(fabs(q-(vec3)(cos(q.z*1.5f)*.3f,-.5f+cos(q.z)*.2f,.0f))-(vec3)(.125f,.02f,time+3.f),(vec3)(.0f)));
+}
+
+// Scene
+inline float o(vec3 q) { return min(oa(q),ob(q)); }
+
+// Get Normal XXX Not inline by LLVM
+inline __attribute__((always_inline)) vec3 gn(vec3 q) {
+ const vec3 fxyy = (vec3)(.01f, 0.f, 0.f);
+ const vec3 fyxy = (vec3)(0.f, .01f, 0.f);
+ const vec3 fyyx = (vec3)(0.f, 0.f, .01f);
+ return normalize((vec3)(o(q+fxyy),
+                         o(q+fyxy),
+                         o(q+fyyx)));
+}
+
+inline uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+// XXX vector not supported in function argument yet
+__kernel void compiler_ribbon(__global uint *dst, float resx, float resy, int w)
+{
+  vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+  vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / (vec2)(resx, resy);
+  p.x *= resx/resy;
+
+  vec4 c = (vec4)(1.0f);
+  const vec3 org = (vec3)(sin(time)*.5f,
+                          cos(time*.5f)*.25f+.25f,
+                          time);
+  vec3 dir=normalize((vec3)(p.x*1.6f,p.y,1.0f));
+  vec3 q = org, pp;
+  float d=.0f;
+
+  // First raymarching
+  for(int i=0;i<64;i++) {
+    d=o(q);
+    q+=d*dir;
+  }
+  pp=q;
+  const float f = length(q-org)*0.02f;
+
+  // Second raymarching (reflection)
+  dir=reflect(dir,gn(q));
+  q+=dir;
+  for(int i=0;i<64;i++) {
+    d=o(q);
+    q+=d*dir;
+  }
+  c = max(dot(gn(q), (vec3)(0.1f,0.1f,0.0f)), 0.0f)
+    + (vec4)(0.3f, cos(time*.5f)*.5f+.5f, sin(time*.5f)*.5f+.5f, 1.f) * min(length(q-org)*.04f,1.f);
+
+  // Ribbon Color
+  if(oa(pp)>ob(pp))
+    c = mix(c, (vec4)(cos(time*.3f)*0.5f + 0.5f,cos(time*.2f)*.5f+.5f,sin(time*.3f)*.5f+.5f,1.f),.3f);
+
+  // Final Color
+  const vec4 color = ((c+(vec4)(f))+(1.f-min(pp.y+1.9f,1.f))*(vec4)(1.f,.8f,.7f,1.f))*min(time*.5f,1.f);
+  const vec4 final = 255.f * max(min(color, (vec4)(1.f)), (vec4)(0.f));
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final);
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 28fd93a..30f0e1e 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -1,18 +1,40 @@
 /* test OpenCL 1.1 Vector Data Load/Store Functions (section 6.11.7) */
-kernel void compiler_vector_load_store() {
-  float p[16], f;
-  float4 f4;
-  f4 = vload4(0, p);
-  vstore4(f4, 0, p);
-  
-  long x[16], l;
-  long16 l16;
-  l = vload16(0, x);
-  vstore16(l16, 0, x);
 
-  half h[16];
-  half4 h4;
-  f = vload_half(0, h);
-  f4 = vload_half4(0, h);
-  vstore_half(f, 0, h);
+#define OFFSET2(type)  (type ##2) {(type)1, (type)2}
+#define OFFSET3(type)  (type ##3) {(type)1, (type)2, (type)3}
+#define OFFSET4(type)  (type ##4) {(type)1, (type)2, (type)3, (type)4}
+#define OFFSET8(type)  (type ##8) {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8}
+#define OFFSET16(type)  (type ##16)  {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8, (type)9, (type)10, (type)11, (type)12, (type)13, (type)14, (type)15, (type)16}
+
+#define  TEST_TYPE(type, n) \
+__kernel void test_##type ##n(__global type *pin, \
+                            __global type *pout)  \
+{\
+  int x = get_global_id(0); \
+  type ##n value; \
+  value = vload ##n(x, pin); \
+  value += OFFSET ##n(type); \
+  vstore ##n(value, x, pout); \
 }
+
+#define TEST_ALL_TYPE(n) \
+  TEST_TYPE(char,n)  \
+  TEST_TYPE(uchar,n) \
+  TEST_TYPE(short,n) \
+  TEST_TYPE(ushort,n)\
+  TEST_TYPE(int,n)   \
+  TEST_TYPE(uint,n)  \
+  TEST_TYPE(float,n)
+
+#if 0
+  TEST_TYPE(double,n)
+  TEST_TYPE(long,n)
+  TEST_TYPE(ulong,n)
+  TEST_TYPE(half,n)
+#endif
+
+TEST_ALL_TYPE(2)
+TEST_ALL_TYPE(3)
+TEST_ALL_TYPE(4)
+TEST_ALL_TYPE(8)
+TEST_ALL_TYPE(16)
diff --git a/kernels/compiler_write_only_bytes.cl b/kernels/compiler_write_only_bytes.cl
index 0bc0cd8..555a9dc 100644
--- a/kernels/compiler_write_only_bytes.cl
+++ b/kernels/compiler_write_only_bytes.cl
@@ -1,7 +1,6 @@
-__kernel void
-compiler_write_only_bytes(__global char *dst)
-{
-    int id = (int)get_global_id(0);
-    dst[id] = 2;
-}
-
+__kernel void
+compiler_write_only_bytes(__global char *dst)
+{
+    int id = (int)get_global_id(0);
+    dst[id] = 2;
+}
diff --git a/kernels/compiler_write_only_shorts.cl b/kernels/compiler_write_only_shorts.cl
index bfd23cc..205634d 100644
--- a/kernels/compiler_write_only_shorts.cl
+++ b/kernels/compiler_write_only_shorts.cl
@@ -1,7 +1,6 @@
-__kernel void
-compiler_write_only_shorts(__global short *dst)
-{
-    int id = (int)get_global_id(0);
-    dst[id] = 2;
-}
-
+__kernel void
+compiler_write_only_shorts(__global short *dst)
+{
+    int id = (int)get_global_id(0);
+    dst[id] = 2;
+}
diff --git a/kernels/null_kernel_arg.cl b/kernels/null_kernel_arg.cl
new file mode 100644
index 0000000..68a4280
--- /dev/null
+++ b/kernels/null_kernel_arg.cl
@@ -0,0 +1,9 @@
+__kernel void
+null_kernel_arg(__global unsigned int *dst, __global unsigned int * mask_global, __constant unsigned int* mask_const)
+{
+  if(dst && mask_global==0 && mask_const == NULL)
+  {
+    uint idx = (uint)get_global_id(0);
+    dst[idx] = idx;
+  }
+}
diff --git a/kernels/test_cl_finish.cl b/kernels/test_cl_finish.cl
new file mode 100644
index 0000000..723949c
--- /dev/null
+++ b/kernels/test_cl_finish.cl
@@ -0,0 +1,12 @@
+
+
+__kernel void
+test_cl_finish(__global int *src, __global int *dst, int n, int num_threads)
+{
+	int tid, pos;
+
+	tid = get_global_id(0);
+	for (pos=tid; pos < n; pos+=num_threads) {
+		dst[pos] = src[pos];
+	}
+}
diff --git a/kernels/test_copy_buffer.cl b/kernels/test_copy_buffer.cl
index 2aec892..6f2fd22 100644
--- a/kernels/test_copy_buffer.cl
+++ b/kernels/test_copy_buffer.cl
@@ -1,7 +1,6 @@
-__kernel void
-test_copy_buffer(__global float* src, __global float* dst)
-{
-  int id = (int)get_global_id(0);
-  dst[id] = src[id];
-}
-
+__kernel void
+test_copy_buffer(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
diff --git a/kernels/test_copy_buffer_row.cl b/kernels/test_copy_buffer_row.cl
index a55d99e..e33380f 100644
--- a/kernels/test_copy_buffer_row.cl
+++ b/kernels/test_copy_buffer_row.cl
@@ -1,9 +1,8 @@
-__kernel void
-test_copy_buffer_row(__global int *src, __global int *dst, __global int *data)
-{
-  int row = data[0];
-  int size = data[1];
-  int id = (int) get_global_id(0);
-  for (; id < size; id += row) dst[id] = src[id];
-}
-
+__kernel void
+test_copy_buffer_row(__global int *src, __global int *dst, __global int *data)
+{
+  int row = data[0];
+  int size = data[1];
+  int id = (int) get_global_id(0);
+  for (; id < size; id += row) dst[id] = src[id];
+}
diff --git a/kernels/test_get_image_info.cl b/kernels/test_get_image_info.cl
new file mode 100644
index 0000000..8f69b75
--- /dev/null
+++ b/kernels/test_get_image_info.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_get_image_info(__write_only image3d_t src, __global int *size, __global int *fmt)
+{
+  int id = (int)get_global_id(0);
+  int w, h, depth;
+  w = get_image_width(src);
+  h = get_image_height(src);
+  depth = get_image_depth(src);
+  int channel_data_type = get_image_channel_data_type(src);
+  int channel_order = get_image_channel_order(src);
+  size[id] = (w << 20 | h << 8  | depth);
+  fmt[id] = (channel_data_type << 16 | channel_order);
+}
diff --git a/kernels/test_get_image_size.cl b/kernels/test_get_image_size.cl
deleted file mode 100644
index aeb7d66..0000000
--- a/kernels/test_get_image_size.cl
+++ /dev/null
@@ -1,9 +0,0 @@
-__kernel void
-test_get_image_size(__write_only image2d_t src, __global int *info)
-{
-  int id = (int)get_global_id(0);
-  int w, h;
-  w = get_image_width(src);
-  h = get_image_height(src);
-  info[id] = (w << 16 | h);
-}
diff --git a/kernels/test_write_only.cl b/kernels/test_write_only.cl
index bb7e972..27c7acb 100644
--- a/kernels/test_write_only.cl
+++ b/kernels/test_write_only.cl
@@ -1,7 +1,6 @@
-__kernel void
-test_write_only(__global int *dst)
-{
-  int id = (int)get_global_id(0);
-  dst[id] = id;
-}
-
+__kernel void
+test_write_only(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = id;
+}
diff --git a/src/cl_api.c b/src/cl_api.c
index a4e534a..f14bee4 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -37,11 +37,40 @@
 #include <string.h>
 #include <assert.h>
 
+static cl_int
+cl_check_device_type(cl_device_type device_type)
+{
+  const cl_device_type valid =  CL_DEVICE_TYPE_GPU
+                              | CL_DEVICE_TYPE_CPU
+                              | CL_DEVICE_TYPE_ACCELERATOR
+                              | CL_DEVICE_TYPE_DEFAULT
+                              | CL_DEVICE_TYPE_CUSTOM;
+
+  if( (device_type & valid) == 0) {
+    return CL_INVALID_DEVICE_TYPE;
+  }
+  if(UNLIKELY(!(device_type & CL_DEVICE_TYPE_DEFAULT) && !(device_type & CL_DEVICE_TYPE_GPU)))
+    return CL_DEVICE_NOT_FOUND;
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+cl_device_id_is_ok(const cl_device_id device)
+{
+  return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
+}
+
 cl_int
 clGetPlatformIDs(cl_uint          num_entries,
                  cl_platform_id * platforms,
                  cl_uint *        num_platforms)
 {
+  if(UNLIKELY(platforms == NULL && num_platforms == NULL))
+    return CL_INVALID_VALUE;
+  if(UNLIKELY(num_entries == 0 && platforms != NULL))
+    return CL_INVALID_VALUE;
+
   return cl_get_platform_ids(num_entries, platforms, num_platforms);
 }
 
@@ -52,6 +81,10 @@ clGetPlatformInfo(cl_platform_id    platform,
                   void *            param_value,
                   size_t *          param_value_size_ret)
 {
+  /* Only one platform. This is easy */
+  if (UNLIKELY(platform != NULL && platform != intel_platform))
+    return CL_INVALID_PLATFORM;
+
   return cl_get_platform_info(platform,
                               param_name,
                               param_value_size,
@@ -66,6 +99,20 @@ clGetDeviceIDs(cl_platform_id platform,
                cl_device_id * devices,
                cl_uint *      num_devices)
 {
+  cl_int err = CL_SUCCESS;
+
+  /* Check parameter consistency */
+  if (UNLIKELY(devices == NULL && num_devices == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(platform && platform != intel_platform))
+    return CL_INVALID_PLATFORM;
+  if (UNLIKELY(devices && num_entries == 0))
+    return CL_INVALID_VALUE;
+
+  err = cl_check_device_type(device_type);
+  if(err != CL_SUCCESS)
+    return err;
+
   return cl_get_device_ids(platform,
                            device_type,
                            num_entries,
@@ -120,12 +167,27 @@ clCreateContext(const cl_context_properties *  properties,
                 void *                         user_data,
                 cl_int *                       errcode_ret)
 {
-  return cl_create_context(properties,
+  cl_int err = CL_SUCCESS;
+  cl_context context = NULL;
+
+  /* Assert parameters correctness */
+  INVALID_VALUE_IF (devices == NULL);
+  INVALID_VALUE_IF (num_devices == 0);
+  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+  /* Now check if the user is asking for the right device */
+  INVALID_DEVICE_IF (cl_device_id_is_ok(*devices) == CL_FALSE);
+
+  context = cl_create_context(properties,
                            num_devices,
                            devices,
                            pfn_notify,
                            user_data,
-                           errcode_ret);
+                           &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
 }
 
 cl_context
@@ -135,9 +197,17 @@ clCreateContextFromType(const cl_context_properties *  properties,
                         void *                         user_data,
                         cl_int *                       errcode_ret)
 {
+  cl_context context = NULL;
+  cl_int err = CL_SUCCESS;
   cl_device_id devices[1];
   cl_uint num_devices = 1;
-  cl_int err;
+
+  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+  err = cl_check_device_type(device_type);
+  if(err != CL_SUCCESS) {
+    goto error;
+  }
 
   err = cl_get_device_ids(NULL,
                           device_type,
@@ -145,16 +215,19 @@ clCreateContextFromType(const cl_context_properties *  properties,
                           &devices[0],
                           &num_devices);
   if (err != CL_SUCCESS) {
-    *errcode_ret = err;
-    return NULL;
+    goto error;
   }
 
-  return cl_create_context(properties,
-                           num_devices,
-                           devices,
-                           pfn_notify,
-                           user_data,
-                           errcode_ret);
+  context = cl_create_context(properties,
+                              num_devices,
+                              devices,
+                              pfn_notify,
+                              user_data,
+                              &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
 }
 
 cl_int
@@ -214,9 +287,20 @@ clCreateCommandQueue(cl_context                   context,
   cl_command_queue queue = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
-  queue = cl_context_create_queue(context, device, properties, errcode_ret);
+
+  INVALID_DEVICE_IF (device != context->device);
+  INVALID_VALUE_IF (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE));
+
+  if(properties) {
+    err = CL_INVALID_QUEUE_PROPERTIES;
+    goto error;
+  }
+
+  queue = cl_context_create_queue(context, device, properties, &err);
 error:
-  return err == CL_SUCCESS ? queue : NULL;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return queue;
 }
 
 cl_int
@@ -349,7 +433,7 @@ clCreateImage2D(cl_context              context,
                          image_format,
                          &image_desc,
                          host_ptr,
-                         errcode_ret);
+                         &err);
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -385,7 +469,7 @@ clCreateImage3D(cl_context              context,
                          image_format,
                          &image_desc,
                          host_ptr,
-                         errcode_ret);
+                         &err);
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -532,8 +616,17 @@ clCreateProgramWithSource(cl_context     context,
 {
   cl_program program = NULL;
   cl_int err = CL_SUCCESS;
+  cl_uint i;
 
   CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (count == 0);
+  INVALID_VALUE_IF (strings == NULL);
+  for(i = 0; i < count; i++) {
+    if(UNLIKELY(strings[i] == NULL)) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
   program = cl_program_create_from_source(context,
                                           count,
                                           strings,
@@ -607,10 +700,7 @@ clBuildProgram(cl_program            program,
   /* Everything is easy. We only support one device anyway */
   if (num_devices != 0) {
     assert(program->ctx);
-    if (UNLIKELY(device_list[0] != program->ctx->device)) {
-      err = CL_INVALID_DEVICE;
-      goto error;
-    }
+    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
   }
 
   /* TODO support create program from binary */
@@ -668,14 +758,13 @@ clCreateKernel(cl_program   program,
     err = CL_INVALID_PROGRAM_EXECUTABLE;
     goto error;
   }
-  kernel = cl_program_create_kernel(program, kernel_name, errcode_ret);
+  INVALID_VALUE_IF (kernel_name == NULL);
+  kernel = cl_program_create_kernel(program, kernel_name, &err);
 
-exit:
-  return kernel;
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  goto exit;
+  return kernel;
 }
 
 cl_int
@@ -835,10 +924,8 @@ clFinish(cl_command_queue command_queue)
   CHECK_QUEUE (command_queue);
   err = cl_command_queue_finish(command_queue);
 
-exit:
-  return err;
 error:
-  goto exit;
+  return err;
 }
 
 cl_int
@@ -846,19 +933,46 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
                     cl_mem           buffer,
                     cl_bool          blocking_read,
                     size_t           offset,
-                    size_t           cb,
+                    size_t           size,
                     void *           ptr,
                     cl_uint          num_events_in_wait_list,
                     const cl_event * event_wait_list,
                     cl_event *       event)
 {
-	cl_int err = CL_SUCCESS;
-	assert(ptr != NULL);
-	void* temp_ptr = NULL;
-	temp_ptr = clMapBufferIntel(buffer, &err);
-	assert(err == CL_SUCCESS);
-	memcpy(ptr, temp_ptr, cb);
-	return err;
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+     err = CL_INVALID_CONTEXT;
+     goto error;
+  }
+
+  if (blocking_read != CL_TRUE)
+     NOT_IMPLEMENTED;
+
+  if (!ptr || !size || offset + size > buffer->size) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
+  if (!(src_ptr = cl_mem_map_auto(buffer))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy(ptr, (char*)src_ptr + offset, size);
+
+  err = cl_mem_unmap_auto(buffer);
+
+error:
+  return err;
 }
 
 cl_int
@@ -886,20 +1000,45 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
                      cl_mem              buffer,
                      cl_bool             blocking_write,
                      size_t              offset,
-                     size_t              cb,
+                     size_t              size,
                      const void *        ptr,
                      cl_uint             num_events_in_wait_list,
                      const cl_event *    event_wait_list,
                      cl_event *          event)
 {
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
   if (blocking_write != CL_TRUE)
     NOT_IMPLEMENTED;
-  cl_int err;
-  void *p = clMapBufferIntel(buffer, &err);
-  if (err != CL_SUCCESS)
-    return err;
-  memcpy(p + offset, ptr, cb);
-  err = clUnmapBufferIntel(buffer);
+
+  if (!ptr || !size || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!(dst_ptr = cl_mem_map_auto(buffer))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy((char*)dst_ptr + offset, ptr, size);
+
+  err = cl_mem_unmap_auto(buffer);
+
+error:
   return err;
 }
 
@@ -1198,7 +1337,7 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
                    cl_bool           blocking_map,
                    cl_map_flags      map_flags,
                    size_t            offset,
-                   size_t            cb,
+                   size_t            size,
                    cl_uint           num_events_in_wait_list,
                    const cl_event *  event_wait_list,
                    cl_event *        event,
@@ -1215,15 +1354,29 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
   }
 
   if (blocking_map != CL_TRUE)
-     NOT_IMPLEMENTED;
-  if (offset != 0)
-     NOT_IMPLEMENTED;
+    NOT_IMPLEMENTED;
+
+  if (!size || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((map_flags & CL_MAP_READ &&
+       buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+       buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+  {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
 
   if (!(ptr = cl_mem_map_auto(buffer))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
+  ptr = (char*)ptr + offset;
+
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -1385,6 +1538,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   /* Do device specific checks are enqueue the kernel */
   err = cl_command_queue_ND_range(command_queue,
                                   kernel,
+                                  work_dim,
                                   fixed_global_off,
                                   fixed_global_sz,
                                   fixed_local_sz);
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6ce9016..b296dd7 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -84,7 +84,6 @@ cl_command_queue_delete(cl_command_queue queue)
     cl_mem_delete(queue->fulsim_out);
     queue->fulsim_out = NULL;
   }
-  cl_buffer_unreference(queue->last_batch);
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
   cl_gpgpu_delete(queue->gpgpu);
@@ -105,6 +104,12 @@ set_image_info(char *curbe, struct ImageInfo * image_info, cl_mem image)
     *(uint32_t*)(curbe + image_info->wSlot) = image->w;
   if (image_info->hSlot >= 0)
     *(uint32_t*)(curbe + image_info->hSlot) = image->h;
+  if (image_info->depthSlot >= 0)
+    *(uint32_t*)(curbe + image_info->depthSlot) = image->depth;
+  if (image_info->channelOrderSlot >= 0)
+    *(uint32_t*)(curbe + image_info->channelOrderSlot) = image->fmt.image_channel_order;
+  if (image_info->dataTypeSlot >= 0)
+    *(uint32_t*)(curbe + image_info->dataTypeSlot) = image->fmt.image_channel_data_type;
 }
 
 LOCAL cl_int
@@ -132,7 +137,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   for (i = 0; i < k->arg_n; ++i) {
     uint32_t offset; // location of the address in the curbe
     arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-    if (arg_type != GBE_ARG_GLOBAL_PTR)
+    if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
       continue;
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
@@ -148,7 +153,7 @@ LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
   for(i = 0; i < k->arg_n; i++) {
     enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
 
-    if(arg_type == GBE_ARG_CONSTANT_PTR) {
+    if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
       uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
       cl_mem mem = k->args[i].mem;
       cl_buffer_map(mem->bo, 1);
@@ -350,7 +355,7 @@ error:
 }
 #endif
 
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
 
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -365,6 +370,7 @@ cl_kernel_check_args(cl_kernel k)
 LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
+                          const uint32_t work_dim,
                           const size_t *global_wk_off,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
@@ -388,7 +394,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
 #endif /* USE_FULSIM */
 
   if (ver == 7 || ver == 75)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
@@ -409,11 +415,7 @@ error:
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  if (queue->last_batch == NULL)
-    return CL_SUCCESS;
-  cl_buffer_wait_rendering(queue->last_batch);
-  cl_buffer_unreference(queue->last_batch);
-  queue->last_batch = NULL;
+  cl_gpgpu_sync(queue->gpgpu);
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index f0c00f4..0e04ff3 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -37,7 +37,6 @@ struct _cl_command_queue {
   cl_gpgpu gpgpu;              /* Setup all GEN commands */
   cl_mem perf;                 /* Where to put the perf counters */
   cl_mem fulsim_out;           /* Fulsim will output this buffer */
-  cl_buffer last_batch;        /* To synchronize using clFinish */
 };
 
 /* Allocate and initialize a new command queue. Also insert it in the list of
@@ -54,6 +53,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
 /* Map ND range kernel from OCL API */
 extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
                                         cl_kernel ker,
+                                        const uint32_t work_dim,
                                         const size_t *global_work_offset,
                                         const size_t *global_work_size,
                                         const size_t *local_work_size);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 770af4a..048595c 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -98,6 +98,7 @@ error:
 /* Will return the total amount of slm used */
 static int32_t
 cl_curbe_fill(cl_kernel ker,
+              const uint32_t work_dim,
               const size_t *global_wk_off,
               const size_t *global_wk_sz,
               const size_t *local_wk_sz,
@@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
   UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
+  UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
   UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
 #undef UPLOAD
 
@@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
+                               const uint32_t work_dim,
                                const size_t *global_wk_off,
                                const size_t *global_wk_sz,
                                const size_t *local_wk_sz)
@@ -209,18 +212,18 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Compute the number of HW threads we need */
   TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
-  kernel.thread_n = thread_n = local_sz / simd_sz;
+  kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
   kernel.cst_sz = cst_sz;
 
   /* Curbe step 1: fill the constant buffer data shared by all threads */
-  if (ker->curbe)
-    kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+  if (ker->curbe) {
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
+      return CL_OUT_OF_RESOURCES;
+  }
 
   /* Setup the kernel */
   cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
-  if (queue->last_batch != NULL)
-    cl_buffer_unreference(queue->last_batch);
-  queue->last_batch = NULL;
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
diff --git a/src/cl_context.c b/src/cl_context.c
index 4a1925c..fa4c7e0 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -87,11 +87,7 @@ error:
   return err;
 }
 
-static cl_int
-cl_device_id_is_ok(const cl_device_id device)
-{
-  return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
-}
+
 
 LOCAL cl_context
 cl_create_context(const cl_context_properties *  properties,
@@ -106,28 +102,13 @@ cl_create_context(const cl_context_properties *  properties,
   cl_context ctx = NULL;
   cl_int err = CL_SUCCESS;
 
-  /* Assert parameters correctness */
-  INVALID_VALUE_IF (devices == NULL);
-  INVALID_VALUE_IF (num_devices == 0);
-  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
-
   /* XXX */
   FATAL_IF (pfn_notify != NULL || user_data != NULL, "Unsupported call back");
   FATAL_IF (num_devices != 1, "Only one device is supported");
 
   /* Check that we are getting the right platform */
-//  if (UNLIKELY((err = cl_context_properties_is_ok(properties)) != CL_SUCCESS))
-//    goto error;
-
   if (UNLIKELY(((err = cl_context_properties_process(properties, &props)) != CL_SUCCESS)))
     goto error;
-  /* platform = intel_platform; */
-
-  /* Now check if the user is asking for the right device */
-  if (UNLIKELY(cl_device_id_is_ok(*devices) == CL_FALSE)) {
-    err = CL_INVALID_DEVICE;
-    goto error;
-  }
 
   /* We are good */
   if (UNLIKELY((ctx = cl_context_new(&props)) == NULL)) {
@@ -211,10 +192,7 @@ cl_context_create_queue(cl_context ctx,
   cl_command_queue queue = NULL;
   cl_int err = CL_SUCCESS;
 
-  if (UNLIKELY(device != ctx->device)) {
-    err = CL_INVALID_DEVICE;
-    goto error;
-  }
+
 
   /* We create the command queue and store it in the context list of queues */
   TRY_ALLOC (queue, cl_command_queue_new(ctx));
@@ -225,6 +203,7 @@ exit:
   return queue;
 error:
   cl_command_queue_delete(queue);
+  queue = NULL;
   goto exit;
 }
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 136f3b1..7669602 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -105,30 +105,24 @@ cl_get_device_ids(cl_platform_id    platform,
                   cl_device_id *    devices,
                   cl_uint *         num_devices)
 {
-  /* Check parameter consistency */
-  if (UNLIKELY(num_entries == 0 && devices == NULL && num_devices == NULL))
-    return CL_SUCCESS;
-  if (UNLIKELY(devices == NULL && num_devices == NULL))
-    return CL_INVALID_VALUE;
-  if (UNLIKELY(platform != NULL && platform != intel_platform))
-    return CL_INVALID_PLATFORM;
-  if (num_devices && (device_type == CL_DEVICE_TYPE_CPU)) {
-    *num_devices = 0;
-    return CL_SUCCESS;	
-  }
+  cl_device_id device;
 
-  /* Detect our device (reject a non intel one or gen<6) */
-  if (devices && UNLIKELY((*devices = cl_get_gt_device()) != NULL)) {
+  /* Do we have a usable device? */
+  device = cl_get_gt_device();
+  if (!device) {
     if (num_devices)
-      *num_devices = 1;
-
-    (*devices)->extensions = intel_platform->extensions;
-    (*devices)->extensions_sz = intel_platform->extensions_sz;
-    return CL_SUCCESS;
-  }
-  else {
+      *num_devices = 0;
+    if (devices)
+      *devices = 0;
+    return CL_DEVICE_NOT_FOUND;
+  } else {
     if (num_devices)
       *num_devices = 1;
+    if (devices) {
+      *devices = device;
+      (*devices)->extensions = intel_platform->extensions;
+      (*devices)->extensions_sz = intel_platform->extensions_sz;
+    }
     return CL_SUCCESS;
   }
 }
diff --git a/src/cl_driver.h b/src/cl_driver.h
index e8ebad1..212beb3 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -106,6 +106,10 @@ extern cl_gpgpu_new_cb *cl_gpgpu_new;
 typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
 extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 
+/* Synchonize GPU with CPU */
+typedef cl_gpgpu (cl_gpgpu_sync_cb)(cl_gpgpu);
+extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
+
 /* Bind a regular unformatted buffer */
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 2c77a22..4952288 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -47,6 +47,7 @@ LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 /* GPGPU */
 LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
 LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
+LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
 LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index d8671c6..41e6a8a 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -105,14 +105,42 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     return CL_INVALID_ARG_INDEX;
   arg_type = gbe_kernel_get_arg_type(k->opaque, index);
   arg_sz = gbe_kernel_get_arg_size(k->opaque, index);
+
   if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz))
     return CL_INVALID_ARG_SIZE;
 
-  /* Copy the structure or the value directly into the curbe */
-  if (arg_type == GBE_ARG_VALUE) {
+  if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
+    return CL_INVALID_ARG_SIZE;
+  if(arg_type == GBE_ARG_VALUE) {
+    if(UNLIKELY(value == NULL))
+      return CL_INVALID_ARG_VALUE;
+  } else if(arg_type == GBE_ARG_LOCAL_PTR) {
+    if(UNLIKELY(value != NULL))
+      return CL_INVALID_ARG_VALUE;
+  } else if(arg_type == GBE_ARG_SAMPLER) {
     if (UNLIKELY(value == NULL))
-      return CL_INVALID_KERNEL_ARGS;
+      return CL_INVALID_ARG_VALUE;
+
+    cl_sampler s = *(cl_sampler*)value;
+    if(s->magic != CL_MAGIC_SAMPLER_HEADER)
+      return CL_INVALID_SAMPLER;
+  } else {
+    // should be image, GLOBAL_PTR, CONSTANT_PTR
+    if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
+      return CL_INVALID_ARG_VALUE;
+    if(value != NULL) {
+      mem = *(cl_mem*)value;
+      if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+        return CL_INVALID_MEM_OBJECT;
+
+      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
+         || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
+          return CL_INVALID_ARG_VALUE;
+    }
+  }
 
+  /* Copy the structure or the value directly into the curbe */
+  if (arg_type == GBE_ARG_VALUE) {
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     assert(offset + sz <= k->curbe_sz);
     memcpy(k->curbe + offset, value, sz);
@@ -124,8 +152,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 
   /* For a local pointer just save the size */
   if (arg_type == GBE_ARG_LOCAL_PTR) {
-    if (UNLIKELY(value != NULL))
-      return CL_INVALID_KERNEL_ARGS;
     k->args[index].local_sz = sz;
     k->args[index].is_set = 1;
     k->args[index].mem = NULL;
@@ -136,8 +162,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (arg_type == GBE_ARG_SAMPLER) {
     cl_sampler sampler;
     memcpy(&sampler, value, sz);
-    if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
-      return CL_INVALID_KERNEL_ARGS;
     k->args[index].local_sz = 0;
     k->args[index].is_set = 1;
     k->args[index].mem = NULL;
@@ -146,15 +170,21 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     return CL_SUCCESS;
   }
 
-  /* Otherwise, we just need to check that this is a buffer */
-  if (UNLIKELY(value == NULL))
-    return CL_INVALID_KERNEL_ARGS;
+  if(value == NULL) {
+    /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
+    int32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    *((uint32_t *)(k->curbe + offset)) = 0;
+    assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
+
+    if (k->args[index].mem)
+      cl_mem_delete(k->args[index].mem);
+    k->args[index].mem = NULL;
+    k->args[index].is_set = 1;
+    k->args[index].local_sz = 0;
+    return CL_SUCCESS;
+  }
+
   mem = *(cl_mem*) value;
-  if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
-    return CL_INVALID_ARG_VALUE;
-  if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
-     || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
-      return CL_INVALID_ARG_VALUE;
 
   if(arg_type == GBE_ARG_CONSTANT_PTR) {
     int32_t cbOffset;
@@ -232,6 +262,7 @@ cl_kernel_dup(cl_kernel from)
   if (UNLIKELY(from == NULL))
     return NULL;
   TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
+  SET_ICD(to->dispatch)
   to->bo = from->bo;
   to->const_bo = from->const_bo;
   to->opaque = from->opaque;
@@ -291,7 +322,7 @@ cl_kernel_work_group_sz(cl_kernel ker,
   sz = local_wk_sz[0];
   for (i = 1; i < wk_dim; ++i)
     sz *= local_wk_sz[i];
-  FATAL_IF (sz % 16, "Work group size must be a multiple of 16");
+
   if (sz > ker->program->ctx->device->max_work_group_size) {
     err = CL_INVALID_WORK_ITEM_SIZE;
     goto error;
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 5f0180a..d601134 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -43,7 +43,7 @@ clIcdGetPlatformIDsKHR(cl_uint          num_entries,
                  cl_platform_id * platforms,
                  cl_uint *        num_platforms)
 {
-  return cl_get_platform_ids(num_entries, platforms, num_platforms);
+  return clGetPlatformIDs(num_entries, platforms, num_platforms);
 }
 
 struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 354fe34..4afc207 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -30,6 +30,7 @@
 #include "CL/cl_intel.h"
 #include <assert.h>
 #include <stdio.h>
+#include <string.h>
 
 #define FIELD_SIZE(CASE,TYPE)               \
   case JOIN(CL_,CASE):                      \
@@ -200,27 +201,34 @@ error:
 }
 
 static void
-cl_mem_copy_data_linear(cl_mem mem,
-                        size_t w,
-                        size_t h,
-                        size_t pitch,
-                        uint32_t bpp,
-                        void *data)
+cl_mem_copy_image(cl_mem image,
+		  size_t row_pitch,
+		  size_t slice_pitch,
+		  void* host_ptr)
 {
-  size_t x, y, p;
-  char *dst;
-  cl_buffer_map(mem->bo, 1);
-  dst = cl_buffer_get_virtual(mem->bo);
-  for (y = 0; y < h; ++y) {
-    char *src = (char*) data + pitch * y;
-    for (x = 0; x < w; ++x) {
-      for (p = 0; p < bpp; ++p)
-        dst[p] = src[p];
-      dst += bpp;
-      src += bpp;
+  char* dst_ptr = cl_mem_map_auto(image);
+
+  if (row_pitch == image->row_pitch &&
+      (image->depth == 1 || slice_pitch == image->slice_pitch))
+  {
+    memcpy(dst_ptr, host_ptr, image->depth == 1 ? row_pitch*image->h : slice_pitch*image->depth);
+  }
+  else {
+    size_t y, z;
+    for (z = 0; z < image->depth; z++) {
+      const char* src = host_ptr;
+      char* dst = dst_ptr;
+      for (y = 0; y < image->h; y++) {
+	memcpy(dst, src, image->bpp*image->w);
+	src += row_pitch;
+	dst += image->row_pitch;
+      }
+      host_ptr = (char*)host_ptr + slice_pitch;
+      dst_ptr = (char*)dst_ptr + image->slice_pitch;
     }
   }
-  cl_buffer_unmap(mem->bo);
+
+  cl_mem_unmap_auto(image);
 }
 
 static const uint32_t tile_sz = 4096; /* 4KB per tile */
@@ -229,77 +237,6 @@ static const uint32_t tilex_h = 8;    /* tileX height in number of rows */
 static const uint32_t tiley_w = 128;  /* tileY width in bytes */
 static const uint32_t tiley_h = 32;   /* tileY height in number of rows */
 
-static void
-cl_mem_copy_data_tilex(cl_mem mem,
-                       size_t w,
-                       size_t h,
-                       size_t pitch,
-                       uint32_t bpp,
-                       void *data)
-{
-  const size_t tile_w = tilex_w;
-  const size_t tile_h = tilex_h;
-  const size_t aligned_pitch  = ALIGN(w * bpp, tile_w);
-  const size_t aligned_height = ALIGN(h, tile_h);
-  const size_t tilex_n = aligned_pitch  / tile_w;
-  const size_t tiley_n = aligned_height / tile_h;
-  size_t x, y, tilex, tiley;
-  char *img = NULL;
-  char *end = (char*) data + pitch * h;
-
-  cl_buffer_map(mem->bo, 1);
-  img = cl_buffer_get_virtual(mem->bo);
-  for (tiley = 0; tiley < tiley_n; ++tiley)
-  for (tilex = 0; tilex < tilex_n; ++tilex) {
-    char *tile = img + (tilex + tiley * tilex_n) * tile_sz;
-    for (y = 0; y < tile_h; ++y) {
-      char *src = (char*) data + (tiley*tile_h+y)*pitch + tilex*tile_w;
-      char *dst = tile + y*tile_w;
-      for (x = 0; x < tile_w; ++x, ++dst, ++src) {
-        if ((uintptr_t) src < (uintptr_t) end)
-          *dst = *src;
-      }
-    }
-  }
-  cl_buffer_unmap(mem->bo);
-}
-
-static void
-cl_mem_copy_data_tiley(cl_mem mem,
-                       size_t w,
-                       size_t h,
-                       size_t pitch,
-                       uint32_t bpp,
-                       void *data)
-{
-  const size_t tile_w = tiley_w;
-  const size_t tile_h = tiley_h;
-  const size_t aligned_pitch  = ALIGN(w * bpp, tile_w);
-  const size_t aligned_height = ALIGN(h, tile_h);
-  const size_t tilex_n = aligned_pitch  / tile_w;
-  const size_t tiley_n = aligned_height / tile_h;
-  size_t x, y, tilex, tiley, byte;
-  char *img = NULL;
-  char *end = (char*) data + pitch * h;
-
-  cl_buffer_map(mem->bo, 1);
-  img = cl_buffer_get_virtual(mem->bo);
-  for (tiley = 0; tiley < tiley_n; ++tiley)
-  for (tilex = 0; tilex < tilex_n; ++tilex) {
-    char *tile = img + (tiley * tilex_n + tilex) * tile_sz;
-    for (x = 0; x < tile_w; x += 16) {
-      char *src = (char*) data + tiley*tile_h*pitch + tilex*tile_w+x;
-      char *dst = tile + x*tile_h;
-      for (y = 0; y < tile_h; ++y, dst += 16, src += pitch) {
-        for (byte = 0; byte < 16; ++byte)
-          if ((uintptr_t) src  + byte < (uintptr_t) end)
-            dst[byte] = src[byte];
-      }
-    }
-  }
-  cl_buffer_unmap(mem->bo);
-}
-
 static cl_mem
 _cl_mem_new_image(cl_context ctx,
                   cl_mem_flags flags,
@@ -398,16 +335,6 @@ _cl_mem_new_image(cl_context ctx,
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
-  /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR) {
-    if (tiling == CL_NO_TILE)
-      cl_mem_copy_data_linear(mem, w, h, pitch, bpp, data);
-    else if (tiling == CL_TILE_X)
-      cl_mem_copy_data_tilex(mem, w, h, pitch, bpp, data);
-    else if (tiling == CL_TILE_Y)
-      cl_mem_copy_data_tiley(mem, w, h, pitch, bpp, data);
-  }
-
   mem->w = w;
   mem->h = h;
   mem->depth = depth;
@@ -422,6 +349,10 @@ _cl_mem_new_image(cl_context ctx,
 
   cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
 
+  /* Copy the data if required */
+  if (flags & CL_MEM_COPY_HOST_PTR)
+    cl_mem_copy_image(mem, pitch, slice_pitch, data);
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index 2f66064..2e0a86a 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -51,22 +51,11 @@ cl_get_platform_ids(cl_uint          num_entries,
 {
   if (num_platforms != NULL)
     *num_platforms = 1;
-  if (UNLIKELY(platforms == NULL))
-    return CL_SUCCESS;
-  if (UNLIKELY(num_entries == 0))
-    return CL_INVALID_VALUE;
-  if (UNLIKELY(num_platforms == NULL && platforms == NULL))
-    return CL_SUCCESS;
-#if 0
-  if (UNLIKELY(num_platforms == NULL && platforms != NULL))
-    return CL_INVALID_VALUE;
-#endif
-  if (UNLIKELY(num_platforms != NULL && platforms == NULL))
-    return CL_INVALID_VALUE;
 
   cl_intel_platform_extension_init(intel_platform);
   /* Easy right now, only one platform is supported */
-  *platforms = intel_platform;
+  if(platforms)
+    *platforms = intel_platform;
   intel_platform->extensions_sz = strlen(intel_platform->extensions) + 1;
   return CL_SUCCESS;
 }
@@ -95,10 +84,6 @@ cl_get_platform_info(cl_platform_id    platform,
                      void *            param_value,
                      size_t *          param_value_size_ret)
 {
-  /* Only one platform. This is easy */
-  if (UNLIKELY(platform != NULL && platform != intel_platform))
-    return CL_INVALID_PLATFORM;
-
   if (param_value == NULL) {
     switch (param_name) {
       GET_FIELD_SZ (PLATFORM_PROFILE,    profile);
diff --git a/src/cl_program.c b/src/cl_program.c
index 0c48ef3..6acf31f 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -236,8 +236,6 @@ cl_program_create_from_source(cl_context ctx,
   cl_uint i;
 
   assert(ctx);
-  INVALID_VALUE_IF (count == 0);
-  INVALID_VALUE_IF (strings == NULL);
 
   // the real compilation step will be done at build time since we do not have
   // yet the compilation options
@@ -297,11 +295,6 @@ cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
   cl_int err = CL_SUCCESS;
   uint32_t i = 0;
 
-  if (UNLIKELY(name == NULL)) {
-    err = CL_INVALID_KERNEL_NAME;
-    goto error;
-  }
-
   /* Find the program first */
   for (i = 0; i < p->ker_n; ++i) {
     assert(p->ker[i]);
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
index 89f8676..62eedd0 100644
--- a/src/intel/intel_batchbuffer.c
+++ b/src/intel/intel_batchbuffer.c
@@ -59,6 +59,7 @@ intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
   if (batch->buffer != NULL) {
     dri_bo_unreference(batch->buffer);
     batch->buffer = NULL;
+    batch->last_bo = NULL;
   }
 
   batch->buffer = dri_bo_alloc(batch->intel->bufmgr,
@@ -72,6 +73,7 @@ intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
   batch->size = sz;
   batch->ptr = batch->map;
   batch->atomic = 0;
+  batch->last_bo = batch->buffer;
 }
 
 LOCAL void
@@ -172,6 +174,7 @@ intel_batchbuffer_delete(intel_batchbuffer_t *batch)
     return;
   if(batch->buffer)
     intel_batchbuffer_terminate(batch);
+
   cl_free(batch);
 }
 
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
index ece8307..74f1790 100644
--- a/src/intel/intel_batchbuffer.h
+++ b/src/intel/intel_batchbuffer.h
@@ -78,6 +78,8 @@ typedef struct intel_batchbuffer
 {
   struct intel_driver *intel;
   drm_intel_bo *buffer;
+  /** Last bo submitted to the hardware.  used for clFinish. */
+  drm_intel_bo *last_bo;
   uint32_t size;
   uint8_t *map;
   uint8_t *ptr;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b0f556d..19567dc 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -93,6 +93,14 @@ struct intel_gpgpu
 
 typedef struct intel_gpgpu intel_gpgpu_t;
 
+
+static void
+intel_gpgpu_sync(intel_gpgpu_t *gpgpu)
+{
+    if (gpgpu->batch->last_bo)
+	drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
+}
+
 static void
 intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
 {
@@ -333,11 +341,19 @@ intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
 {
   intel_batchbuffer_reset(gpgpu->batch, sz);
 }
-
+/* check we do not get a 0 starting address for binded buf */
+static void
+intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
+{
+  uint32_t i;
+  for (i = 0; i < gpgpu->binded_n; ++i)
+    assert(gpgpu->binded_buf[i]->offset != 0);
+}
 static void
 intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
   intel_batchbuffer_flush(gpgpu->batch);
+  intel_gpgpu_check_binded_buf_address(gpgpu);
 }
 
 static void
@@ -748,7 +764,15 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
     global_wk_sz[1] / local_wk_sz[1],
     global_wk_sz[2] / local_wk_sz[2]
   };
+  uint32_t right_mask = ~0x0;
+  size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+
   assert(simd_sz == 8 || simd_sz == 16);
+
+  uint32_t shift = (group_sz & (simd_sz - 1));
+  shift = (shift == 0) ? simd_sz : shift;
+  right_mask = (1 << shift) - 1;
+
   BEGIN_BATCH(gpgpu->batch, 11);
   OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
   OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
@@ -762,8 +786,8 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
   OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
   OUT_BATCH(gpgpu->batch, global_wk_off[2]);
   OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
-  OUT_BATCH(gpgpu->batch, ~0x0);
-  OUT_BATCH(gpgpu->batch, ~0x0);
+  OUT_BATCH(gpgpu->batch, right_mask);
+  OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set height as 1, so set bottom mask as all 1*/
   ADVANCE_BATCH(gpgpu->batch);
 
   BEGIN_BATCH(gpgpu->batch, 2);
@@ -777,6 +801,7 @@ intel_set_gpgpu_callbacks(void)
 {
   cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
   cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
+  cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync;
   cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image;
   cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
   cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 2ba01c4..e5c03ee 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -5,10 +5,12 @@ link_directories (${LLVM_LIBRARY_DIR})
 set (utests_sources
   cl_create_kernel.cpp
   utest_error.c
+  compiler_displacement_map_element.cpp
   compiler_shader_toy.cpp
   compiler_mandelbrot.cpp
   compiler_mandelbrot_alternate.cpp
   compiler_box_blur_float.cpp
+  compiler_box_blur_image.cpp
   compiler_box_blur.cpp
   compiler_insert_to_constant.cpp
   compiler_argument_structure.cpp
@@ -38,6 +40,7 @@ set (utests_sources
   compiler_function_constant.cpp
   compiler_global_constant.cpp
   compiler_global_constant_2.cpp
+  compiler_group_size.cpp
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
@@ -68,15 +71,19 @@ set (utests_sources
   compiler_insn_selection_min.cpp
   compiler_insn_selection_max.cpp
   compiler_insn_selection_masked_min_max.cpp
-  compiler_local_memory.cpp
+#  compiler_global_memory_barrier.cpp
   compiler_local_memory_two_ptr.cpp
   compiler_local_memory_barrier.cpp
   compiler_local_memory_barrier_wg64.cpp
   compiler_movforphi_undef.cpp
   compiler_volatile.cpp
   compiler_copy_image1.cpp
-  compiler_get_image_size.cpp
+  compiler_get_image_info.cpp
+  compiler_vector_load_store.cpp
+  compiler_cl_finish.cpp
+  buildin_work_dim.cpp
   runtime_createcontext.cpp
+  runtime_null_kernel_arg.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/buildin_work_dim.cpp b/utests/buildin_work_dim.cpp
new file mode 100644
index 0000000..d678c0f
--- /dev/null
+++ b/utests/buildin_work_dim.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+static void buildin_work_dim(void)
+{
+  // Setup kernel and buffers
+
+  int result, err;
+  OCL_CREATE_KERNEL("buildin_work_dim");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  globals[0] = 1;
+  globals[1] = 1;
+  globals[2] = 1;
+  locals[0] = 1;
+  locals[1] = 1;
+  locals[2] = 1;
+
+  for( int i=1; i <= 3; i++ )
+  {
+
+    // Run the kernel
+    OCL_NDRANGE(i);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &result, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+       printf("Error: Failed to read output array! %d\n", err);
+       exit(1);
+    }
+
+    OCL_ASSERT( result == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(buildin_work_dim);
diff --git a/utests/compiler_box_blur_image.cpp b/utests/compiler_box_blur_image.cpp
new file mode 100644
index 0000000..d94a97c
--- /dev/null
+++ b/utests/compiler_box_blur_image.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void compiler_box_blur_image()
+{
+  int w, h;
+  cl_image_format format = { };
+  cl_image_desc desc = { };
+  size_t origin[3] = { };
+  size_t region[3];
+  int *src, *dst;
+
+  OCL_CREATE_KERNEL("compiler_box_blur_image");
+
+  /* Load the picture */
+  src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = 1;
+  desc.image_row_pitch = w*sizeof(uint32_t);
+
+  /* Run the kernel */
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, src);
+  free(src);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+  dst = (int*)malloc(w*h*sizeof(uint32_t));
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  OCL_READ_IMAGE(buf[1], origin, region, dst);
+
+  /* Save the image (for debug purpose) */
+  cl_write_bmp(dst, w, h, "compiler_box_blur_image.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, "compiler_box_blur_ref.bmp");
+
+  free(dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur_image);
diff --git a/utests/compiler_cl_finish.cpp b/utests/compiler_cl_finish.cpp
new file mode 100644
index 0000000..7c7dee3
--- /dev/null
+++ b/utests/compiler_cl_finish.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define T_GET(t)        gettimeofday(&t, NULL);
+#define T_LAPSE(t1, t2) \
+  ((t2.tv_sec+t2.tv_usec*0.000001) - (t1.tv_sec+t1.tv_usec*0.000001))
+
+static void compiler_cl_finish(void)
+{
+  const size_t n = 16*1024*1024;
+  struct timeval t1, t2;
+  float t_fin, t_map_w_fin,t_map_wo_fin;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_cl_finish");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+  // Run the kernel
+  locals[0]  = 64;
+  globals[0] = 32 * locals[0];
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &n);
+  OCL_SET_ARG(3, sizeof(int), &globals[0]);
+
+  // 1st time map after clFinish
+  OCL_NDRANGE(1);
+  T_GET(t1);
+  OCL_FINISH();
+  T_GET(t2);
+  t_fin = T_LAPSE(t1, t2);
+
+  T_GET(t1);
+  OCL_MAP_BUFFER(0);
+  T_GET(t2);
+  t_map_w_fin = T_LAPSE(t1, t2);
+
+  // 2nd time map without clFinish
+  OCL_NDRANGE(1);
+  T_GET(t1);
+  OCL_MAP_BUFFER(0);
+  T_GET(t2);
+  t_map_wo_fin = T_LAPSE(t1, t2);
+
+  OCL_ASSERT(t_fin > t_map_w_fin && t_map_wo_fin > t_map_w_fin);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_cl_finish);
diff --git a/utests/compiler_displacement_map_element.cpp b/utests/compiler_displacement_map_element.cpp
new file mode 100644
index 0000000..98041ec
--- /dev/null
+++ b/utests/compiler_displacement_map_element.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+
+typedef unsigned int uint;
+constexpr int W = 16, H = 16;
+constexpr int SIZE = W * H;
+uint in_1[SIZE];
+uint disp_map[SIZE];
+uint out_1[SIZE];
+
+uint cpu(const int cx, const int cy, const uint *in, const uint *disp_map, int w, int h) {
+  uint c = disp_map[cy * w + cx];
+  int x_pos = cx + c;
+  int y_pos = cy + c;
+  if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+    return in[y_pos * w + x_pos];
+  else
+    return 0;
+}
+
+void test() {
+  OCL_MAP_BUFFER(2);
+  for(int y=0; y<H; y++)
+    for(int x=0; x<W; x++) {
+      uint out = ((uint*)buf_data[2]) [y * W + x];
+      uint wish = cpu(x, y, in_1, disp_map, W, H);
+      if(out != wish)
+        printf("XXX %d %d %x %x\n", x, y, out, wish);
+      OCL_ASSERT(out == wish);
+    }
+  OCL_UNMAP_BUFFER(2);
+}
+
+void displacement_map_element(void) {
+  int i, pass;
+
+  OCL_CREATE_KERNEL("compiler_displacement_map_element");
+  OCL_CREATE_BUFFER(buf[0], 0, SIZE * sizeof(uint), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, SIZE * sizeof(uint), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, SIZE * sizeof(uint), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(W), &W);
+  OCL_SET_ARG(3, sizeof(H), &H);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[2]);
+  globals[0] = W;
+  globals[1] = H;
+  locals[0] = 16;
+  locals[1] = 16;
+
+  for (pass = 0; pass < 8; pass ++) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (i = 0; i < SIZE; i ++) {
+      in_1[i] = ((uint*)buf_data[0])[i] = ((rand() & 0xFFFF) << 16) | (rand() & 0xFFFF);
+      disp_map[i] = ((uint*)buf_data[1])[i] = rand() & 3;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+    OCL_NDRANGE(2);
+    test();
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(displacement_map_element);
diff --git a/utests/compiler_get_image_size.cpp b/utests/compiler_get_image_info.cpp
similarity index 50%
rename from utests/compiler_get_image_size.cpp
rename to utests/compiler_get_image_info.cpp
index 49c08ad..3b9d132 100644
--- a/utests/compiler_get_image_size.cpp
+++ b/utests/compiler_get_image_info.cpp
@@ -1,37 +1,50 @@
 #include "utest_helper.hpp"
 
-static void compiler_get_image_size(void)
+static void compiler_get_image_info(void)
 {
   const size_t w = 256;
   const size_t h = 512;
+  const size_t depth = 3;
   cl_image_format format;
   cl_image_desc desc;
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
-  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
   desc.image_width = w;
   desc.image_height = h;
+  desc.image_depth = depth;
   desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = NULL;
 
   // Setup kernel and images
-  OCL_CREATE_KERNEL("test_get_image_size");
+  OCL_CREATE_KERNEL("test_get_image_info");
 
   OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
   OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
   globals[0] = 32;
   locals[0] = 16;
   OCL_NDRANGE(1);
 
   // Check result
   OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
   for (uint32_t i = 0; i < 32; i++)
-    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 16) | (h)));
-  OCL_UNMAP_BUFFER(0);
+  {
+    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 20) | (h << 8) | depth));
+    OCL_ASSERT(((uint32_t*)buf_data[2])[i] == ((CL_UNSIGNED_INT8 << 16) | CL_RGBA));
+  }
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_get_image_size);
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info);
diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
new file mode 100644
index 0000000..a6496a7
--- /dev/null
+++ b/utests/compiler_global_memory_barrier.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+static void compiler_global_memory_barrier(void)
+{
+  const size_t n = 16*1024;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_global_memory_barrier");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  // Run the kernel
+  globals[0] = n/2;
+  locals[0] = 32;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  uint32_t *dst = (uint32_t*)buf_data[0];
+  for (uint32_t i = 0; i < n; i+=locals[0])
+    for (uint32_t j = 0; j < locals[0]; ++j)
+        OCL_ASSERT(dst[i+j] == locals[0] - 1 -j);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_global_memory_barrier);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
new file mode 100644
index 0000000..6d59aed
--- /dev/null
+++ b/utests/compiler_group_size.cpp
@@ -0,0 +1,86 @@
+#include "utest_helper.hpp"
+
+void compiler_group_size1(void)
+{
+  const size_t n = 7*32*17;
+
+  int group_size[] = {7, 17, 32};
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = n;
+    locals[0] = group_size[i];
+    OCL_NDRANGE(1);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+void compiler_group_size2(void)
+{
+  const uint32_t n = 4*17*8;
+  int size_x[] = {2, 4, 17};
+  int size_y[] = {2, 4, 4};
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = 4*17;
+    globals[1] = 8;
+    locals[0] = size_x[i];
+    locals[1] = size_y[i];
+    OCL_NDRANGE(2);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+void compiler_group_size3(void)
+{
+  const uint32_t n = 4*17*8*4;
+  int size_x[] = {2, 4, 17};
+  int size_y[] = {2, 4, 4};
+  int size_z[] = {2, 1, 2};
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = 4*17;
+    globals[1] = 8;
+    globals[2] = 4;
+    locals[0] = size_x[i];
+    locals[1] = size_y[i];
+    locals[2] = size_z[i];
+    OCL_NDRANGE(3);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size1);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size2);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size3);
+
diff --git a/utests/compiler_local_memory.cpp b/utests/compiler_local_memory.cpp
deleted file mode 100644
index 49fa28c..0000000
--- a/utests/compiler_local_memory.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* 
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-#include "utest_helper.hpp"
-
-static void compiler_local_memory(void)
-{
-  const size_t n = 1024;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_local_memory");
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, 64, NULL); // 16 x int
-
-  // Run the kernel
-  globals[0] = n;
-  locals[0] = 16;
-  OCL_NDRANGE(1);
-  OCL_MAP_BUFFER(0);
-
-  // Check results
-  uint32_t *dst = (uint32_t*)buf_data[0];
-  for (uint32_t i = 0; i < n; i+=16)
-  for (uint32_t j = 0; j < 16; ++j)
-    OCL_ASSERT(dst[i+j] == 15-j);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_local_memory);
-
-
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 96fcfa9..79f284f 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -1,10 +1,59 @@
 #include "utest_helper.hpp"
-
-void compiler_vector_load_store(void)
+template<typename T>
+static void compiler_vector_load_store(int elemNum, const char *kernelName)
 {
-  OCL_CREATE_KERNEL("compiler_vector_load_store");
+  const size_t n = elemNum * 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_vector_load_store", kernelName);
+  buf_data[0] = (T*) malloc(sizeof(T) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    ((T*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n / elemNum;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+  {
+    int shift = ((i % elemNum) + 1);
+    OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_vector_load_store);
+#define compiler_vector_load_store(type, n, kernel_type) \
+static void compiler_vector_ ##kernel_type ##n ##_load_store(void)\
+{\
+  compiler_vector_load_store<type>(n, "test_" #kernel_type #n);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_vector_ ## kernel_type ##n ##_load_store);
 
+#define test_all_vector(type, kernel_type) \
+  compiler_vector_load_store(type, 2, kernel_type) \
+  compiler_vector_load_store(type, 3, kernel_type) \
+  compiler_vector_load_store(type, 4, kernel_type) \
+  compiler_vector_load_store(type, 8, kernel_type) \
+  compiler_vector_load_store(type, 16, kernel_type)
 
+test_all_vector(int8_t, char)
+test_all_vector(uint8_t, uchar)
+test_all_vector(int16_t, short)
+test_all_vector(uint16_t, ushort)
+test_all_vector(int32_t, int)
+test_all_vector(uint32_t, uint)
+test_all_vector(float, float)
+//test_all_vector(double, double)
+//test_all_vector(int64_t, long)
+//test_all_vector(uint64_t, ulong)
diff --git a/utests/runtime_null_kernel_arg.cpp b/utests/runtime_null_kernel_arg.cpp
new file mode 100644
index 0000000..447e345
--- /dev/null
+++ b/utests/runtime_null_kernel_arg.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void runtime_null_kernel_arg(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("null_kernel_arg");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), NULL);
+
+    // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+  OCL_UNMAP_BUFFER(0);
+}
+
+
+MAKE_UTEST_FROM_FUNCTION(runtime_null_kernel_arg);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index d882fc7..dd98a1f 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -75,6 +75,11 @@ extern EGLSurface  eglSurface;
     OCL_CALL(clFlush, queue); \
   } while(0)
 
+#define OCL_FINISH() \
+  do { \
+    OCL_CALL(clFinish, queue); \
+  } while(0)
+
 #define OCL_CALL2(FN, RET, ...) \
   do { \
     cl_int status; \
@@ -88,6 +93,12 @@ extern EGLSurface  eglSurface;
 #define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
     OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)
 
+#define OCL_READ_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+    OCL_CALL(clEnqueueReadImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
+#define OCL_WRITE_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+    OCL_CALL(clEnqueueWriteImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
 #define OCL_CREATE_GL_IMAGE(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
     OCL_CALL2(clCreateFromGLTexture, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git