[Pkg-opencl-devel] [beignet] 59/66: Imported Upstream version 0.9.3

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:10 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 3ec9c00d750f76c9f99a5a48a900cf237789b489
Author: Andreas Beckmann <anbe at debian.org>
Date:   Wed Oct 29 18:06:04 2014 +0100

    Imported Upstream version 0.9.3
---
 CMake/FindDRM.cmake                                |    42 -
 CMake/FindDRMIntel.cmake                           |    43 -
 CMake/FindLLVM.cmake                               |     2 +-
 CMake/{FindEGL.cmake => FindMesaSrc.cmake}         |    37 +-
 CMake/FindXext.cmake                               |    35 -
 CMake/FindXfixes.cmake                             |    35 -
 CMakeLists.txt                                     |    65 +-
 NEWS.mdwn                                          |     1 +
 backend/CMakeLists.txt                             |     7 +-
 backend/src/CMakeLists.txt                         |    82 +-
 backend/src/GBEConfig.h.in                         |     2 +
 backend/src/backend/context.cpp                    |   296 +-
 backend/src/backend/context.hpp                    |    26 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |   142 +-
 backend/src/backend/gen/gen_mesa_disasm.h          |     2 +-
 backend/src/backend/gen75_context.cpp              |   112 +
 backend/src/backend/gen75_context.hpp              |    62 +
 backend/src/backend/gen75_encoder.cpp              |   269 +
 backend/src/backend/gen75_encoder.hpp              |    60 +
 backend/src/backend/gen_context.cpp                |   823 +-
 backend/src/backend/gen_context.hpp                |    73 +-
 backend/src/backend/gen_defs.hpp                   |  1013 +-
 backend/src/backend/gen_encoder.cpp                |   506 +-
 backend/src/backend/gen_encoder.hpp                |    93 +-
 backend/src/backend/gen_insn_compact.cpp           |   523 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    46 +-
 backend/src/backend/gen_insn_scheduling.cpp        |   271 +-
 backend/src/backend/gen_insn_selection.cpp         |  2288 ++--
 backend/src/backend/gen_insn_selection.hpp         |    80 +-
 backend/src/backend/gen_insn_selection.hxx         |     6 +
 backend/src/backend/gen_program.cpp                |   326 +-
 backend/src/backend/gen_program.hpp                |    21 +-
 backend/src/backend/gen_reg_allocation.cpp         |   662 +-
 backend/src/backend/gen_reg_allocation.hpp         |     2 +
 backend/src/backend/gen_register.hpp               |   101 +-
 backend/src/backend/program.cpp                    |   450 +-
 backend/src/backend/program.h                      |   138 +-
 backend/src/backend/program.hpp                    |    72 +-
 backend/src/builtin_vector_proto.def               |    39 +
 backend/src/gbe_bin_generater.cpp                  |   138 +-
 backend/src/gbe_bin_interpreter.cpp                |    80 +
 backend/src/gen_builtin_vector.py                  |     4 +-
 backend/src/gen_convert.sh                         |     8 +-
 backend/src/ir/context.cpp                         |    20 +-
 backend/src/ir/context.hpp                         |    46 +-
 backend/src/ir/function.cpp                        |    66 +-
 backend/src/ir/function.hpp                        |    58 +-
 backend/src/ir/image.cpp                           |    66 +-
 backend/src/ir/image.hpp                           |     7 +-
 backend/src/ir/immediate.cpp                       |   263 +
 backend/src/ir/immediate.hpp                       |   220 +-
 backend/src/ir/instruction.cpp                     |   109 +-
 backend/src/ir/instruction.hpp                     |    48 +-
 backend/src/ir/instruction.hxx                     |     3 +-
 backend/src/ir/liveness.cpp                        |   163 +-
 backend/src/ir/liveness.hpp                        |    18 +-
 backend/src/ir/lowering.cpp                        |    17 +-
 backend/src/ir/printf.cpp                          |   222 +
 backend/src/ir/printf.hpp                          |   244 +
 backend/src/ir/profile.cpp                         |    77 +-
 backend/src/ir/profile.hpp                         |    23 +-
 backend/src/ir/register.hpp                        |    21 +-
 backend/src/ir/sampler.cpp                         |     3 +
 backend/src/ir/sampler.hpp                         |     2 +
 backend/src/ir/type.cpp                            |     2 +
 backend/src/ir/type.hpp                            |     6 +-
 backend/src/ir/unit.cpp                            |     2 +-
 backend/src/ir/unit.hpp                            |     4 +-
 backend/src/ir/value.cpp                           |    13 +
 backend/src/llvm/llvm_barrier_nodup.cpp            |   115 +
 backend/src/llvm/llvm_gen_backend.cpp              |  1460 ++-
 backend/src/llvm/llvm_gen_backend.hpp              |     9 +
 backend/src/llvm/llvm_gen_ocl_function.hxx         |    72 +-
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |     4 +-
 backend/src/llvm/llvm_loadstore_optimization.cpp   |   272 +
 backend/src/llvm/llvm_passes.cpp                   |    21 +-
 backend/src/llvm/llvm_printf_parser.cpp            |   851 ++
 backend/src/llvm/llvm_scalarize.cpp                |   128 +-
 backend/src/llvm/llvm_to_gen.cpp                   |    79 +-
 backend/src/llvm/llvm_to_gen.hpp                   |     2 +-
 backend/src/ocl_barrier.ll                         |     6 +-
 backend/src/ocl_common_defines.h                   |     5 +-
 backend/src/ocl_convert.h                          |     8 +-
 backend/src/ocl_stdlib.tmpl.h                      |  1149 +-
 backend/src/update_blob_ocl_header.py              |     2 +-
 benchmark/CMakeLists.txt                           |    21 +
 benchmark/benchmark_run.cpp                        |   117 +
 benchmark/enqueue_copy_buf.cpp                     |    69 +
 docs/Beignet.mdwn                                  |   136 +-
 docs/Beignet/Backend.mdwn                          |    44 +-
 docs/Beignet/Backend/TODO.mdwn                     |    34 +-
 docs/Beignet/Backend/flat_address_space.mdwn       |    98 -
 docs/Beignet/Backend/gen_ir.mdwn                   |    10 +-
 docs/Beignet/Backend/mixed_buffer_pointer.mdwn     |    46 +
 docs/NEWS.mdwn                                     |    16 +
 docs/howto/cross-compiler-howto.mdwn               |    60 +
 docs/optimization-guide.mdwn                       |    28 +
 include/CL/cl.h                                    |   364 +-
 include/CL/cl.hpp                                  | 12217 ++++++++++++++++---
 include/CL/cl_d3d10.h                              |    20 +-
 include/CL/{cl_d3d10.h => cl_d3d11.h}              |    96 +-
 include/CL/cl_dx9_media_sharing.h                  |   127 +
 include/CL/cl_egl.h                                |   133 +
 include/CL/cl_ext.h                                |   119 +-
 include/CL/cl_gl.h                                 |    91 +-
 include/CL/cl_gl_ext.h                             |     4 +-
 include/CL/cl_intel.h                              |    11 +
 include/CL/cl_platform.h                           |   426 +-
 include/CL/opencl.h                                |     2 +-
 kernels/compare_image_2d_and_1d_array.cl           |    13 +
 kernels/compiler_async_copy.cl                     |     2 +-
 kernels/compiler_constant_expr.cl                  |    23 +
 kernels/compiler_getelementptr_bitcast.cl          |    18 +
 kernels/compiler_mixed_pointer.cl                  |    23 +
 kernels/compiler_simd_all.cl                       |    12 +
 kernels/compiler_simd_any.cl                       |    15 +
 kernels/compiler_vector_load_store.cl              |     2 +-
 kernels/double_precision_check.cl                  |    11 +
 kernels/image_1D_buffer.cl                         |    13 +
 kernels/include/runtime_compile_link_inc.h         |     4 +
 kernels/runtime_compile_link.h                     |     1 +
 kernels/runtime_compile_link_a.cl                  |    13 +
 kernels/runtime_compile_link_b.cl                  |     9 +
 kernels/test_copy_image_1d.cl                      |     9 +
 kernels/test_fill_image_1d.cl                      |     8 +
 kernels/test_get_arg_info.cl                       |     8 +
 kernels/test_get_image_info_array.cl               |    25 +
 kernels/test_printf.cl                             |    38 +
 src/CMakeLists.txt                                 |    81 +-
 src/cl_alloc.c                                     |     1 +
 src/cl_api.c                                       |   691 +-
 src/cl_command_queue.c                             |   156 +-
 src/cl_command_queue.h                             |    13 +-
 src/cl_command_queue_gen7.c                        |   150 +-
 src/cl_context.c                                   |    90 +-
 src/cl_context.h                                   |    41 +-
 src/cl_device_data.h                               |    57 +-
 src/cl_device_id.c                                 |   343 +-
 src/cl_device_id.h                                 |    29 +-
 src/cl_driver.h                                    |    82 +-
 src/cl_driver_defs.c                               |    12 +-
 src/cl_enqueue.c                                   |   100 +-
 src/cl_enqueue.h                                   |     5 +
 src/cl_event.c                                     |   278 +-
 src/cl_event.h                                     |    13 +-
 src/cl_gbe_loader.cpp                              |   328 +
 src/cl_gbe_loader.h                                |    80 +
 src/cl_gen75_device.h                              |     1 +
 src/cl_gen7_device.h                               |     1 +
 src/cl_gt_device.h                                 |    74 +-
 src/cl_image.c                                     |     3 +
 src/cl_kernel.c                                    |   142 +-
 src/cl_kernel.h                                    |    14 +-
 src/cl_khr_icd.c                                   |    38 +-
 src/cl_mem.c                                       |  1002 +-
 src/cl_mem.h                                       |    21 +-
 src/cl_platform_id.c                               |     2 +-
 src/cl_program.c                                   |   457 +-
 src/cl_program.h                                   |    32 +-
 src/cl_thread.c                                    |   266 +-
 src/cl_thread.h                                    |     9 +-
 src/cl_utils.h                                     |    29 +
 src/intel/intel_batchbuffer.c                      |    23 +-
 src/intel/intel_batchbuffer.h                      |     5 +-
 src/intel/intel_defines.h                          |     8 +
 src/intel/intel_dri_resource_sharing.c             |     6 +-
 src/intel/intel_driver.c                           |   142 +-
 src/intel/intel_driver.h                           |     6 +-
 src/intel/intel_gpgpu.c                            |   904 +-
 src/intel/intel_gpgpu.h                            |     2 +-
 src/intel/intel_structs.h                          |    19 +-
 src/kernels/cl_internal_copy_buf_align1.cl         |     8 -
 src/kernels/cl_internal_copy_buf_align16.cl        |     2 +-
 src/kernels/cl_internal_copy_buf_align4.cl         |     2 +-
 src/kernels/cl_internal_copy_buf_rect.cl           |    15 +
 .../cl_internal_copy_buf_unalign_dst_offset.cl     |    28 +
 .../cl_internal_copy_buf_unalign_same_offset.cl    |    19 +
 .../cl_internal_copy_buf_unalign_src_offset.cl     |    29 +
 src/kernels/cl_internal_copy_buffer_to_image_2d.cl |    18 +
 src/kernels/cl_internal_copy_buffer_to_image_3d.cl |    19 +
 src/kernels/cl_internal_copy_image_1d_to_1d.cl     |    19 +
 src/kernels/cl_internal_copy_image_2d_to_2d.cl     |    21 +
 src/kernels/cl_internal_copy_image_2d_to_3d.cl     |    22 +
 src/kernels/cl_internal_copy_image_2d_to_buffer.cl |    19 +
 src/kernels/cl_internal_copy_image_3d_to_2d.cl     |    22 +
 src/kernels/cl_internal_copy_image_3d_to_3d.cl     |    23 +
 src/kernels/cl_internal_copy_image_3d_to_buffer.cl |    22 +
 src/kernels/cl_internal_fill_buf_align128.cl       |     9 +
 src/kernels/cl_internal_fill_buf_align2.cl         |     8 +
 src/kernels/cl_internal_fill_buf_align4.cl         |     8 +
 src/kernels/cl_internal_fill_buf_align8.cl         |    14 +
 src/kernels/cl_internal_fill_buf_unalign.cl        |     8 +
 src/kernels/cl_internal_fill_image_1d.cl           |    14 +
 src/kernels/cl_internal_fill_image_1d_array.cl     |    15 +
 src/kernels/cl_internal_fill_image_2d.cl           |    15 +
 src/kernels/cl_internal_fill_image_2d_array.cl     |    16 +
 src/kernels/cl_internal_fill_image_3d.cl           |    16 +
 src/performance.c                                  |   324 +
 src/performance.h                                  |    12 +
 utests/CMakeLists.txt                              |    63 +-
 utests/builtin_kernel_max_global_size.cpp          |    30 +
 utests/compare_image_2d_and_1d_array.cpp           |    79 +
 utests/compiler_async_copy.cpp                     |     2 +-
 utests/compiler_async_stride_copy.cpp              |    10 +-
 utests/compiler_basic_arithmetic.cpp               |     1 -
 utests/compiler_box_blur_image.cpp                 |    11 +-
 utests/compiler_constant_expr.cpp                  |    35 +
 utests/compiler_copy_image.cpp                     |    16 +-
 utests/compiler_copy_image1.cpp                    |    24 +-
 utests/compiler_copy_image_1d.cpp                  |    52 +
 utests/compiler_copy_image_3d.cpp                  |    25 +-
 utests/compiler_double_precision.cpp               |    43 +
 utests/compiler_fill_gl_image.cpp                  |     2 +-
 utests/compiler_fill_image.cpp                     |    11 +-
 utests/compiler_fill_image0.cpp                    |    11 +-
 utests/compiler_fill_image_1d.cpp                  |    50 +
 utests/compiler_fill_image_3d.cpp                  |    13 +-
 utests/compiler_fill_image_3d_2.cpp                |    13 +-
 utests/compiler_function_qualifiers.cpp            |    10 +
 utests/compiler_get_image_info.cpp                 |    12 +-
 utests/compiler_get_image_info_array.cpp           |    64 +
 utests/compiler_getelementptr_bitcast.cpp          |    45 +
 utests/compiler_local_slm.cpp                      |     3 +-
 utests/compiler_mixed_pointer.cpp                  |   119 +
 utests/compiler_movforphi_undef.cpp                |    14 +-
 utests/compiler_saturate_sub.cpp                   |     2 +-
 utests/compiler_simd_all.cpp                       |    43 +
 utests/compiler_simd_any.cpp                       |    43 +
 utests/compiler_vector_load_store.cpp              |     2 +-
 utests/enqueue_built_in_kernels.cpp                |    19 +
 utests/enqueue_copy_buf.cpp                        |     4 +-
 utests/enqueue_copy_buf_unaligned.cpp              |   118 +
 utests/enqueue_fill_buf.cpp                        |    90 +
 utests/get_arg_info.cpp                            |    85 +
 utests/get_cl_info.cpp                             |    18 +-
 utests/image_1D_buffer.cpp                         |    80 +
 ...from_bin.cpp => load_program_from_bin_file.cpp} |     4 +-
 ..._from_bin.cpp => load_program_from_gen_bin.cpp} |    32 +-
 utests/profiling_exec.cpp                          |   102 +
 ...{runtime_event.cpp => runtime_barrier_list.cpp} |    23 +-
 utests/runtime_compile_link.cpp                    |   162 +
 utests/runtime_event.cpp                           |     2 +-
 .../{runtime_event.cpp => runtime_marker_list.cpp} |    23 +-
 utests/setenv.sh.in                                |     2 +
 utests/sub_buffer.cpp                              |    10 +-
 utests/test_printf.cpp                             |    18 +
 utests/utest.cpp                                   |   116 +-
 utests/utest.hpp                                   |    40 +-
 utests/utest_generator.py                          |    19 +-
 utests/utest_helper.cpp                            |    45 +-
 utests/utest_helper.hpp                            |    27 +-
 utests/utest_math_gen.py                           |   142 +-
 252 files changed, 30245 insertions(+), 7812 deletions(-)

diff --git a/CMake/FindDRM.cmake b/CMake/FindDRM.cmake
deleted file mode 100644
index a5a4ebc..0000000
--- a/CMake/FindDRM.cmake
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# Try to find X library and include path.
-# Once done this will define
-#
-# DRM_FOUND
-# DRM_INCLUDE_PATH
-# DRM_LIBRARY
-# 
-
-FIND_PATH(DRM_INCLUDE_PATH
-  NAMES
-  drm.h
-  PATHS
-  ${CMAKE_INCLUDE_PATH}/include/libdrm/
-  ~/include/libdrm/
-  /usr/include/libdrm/
-  /usr/local/include/libdrm/
-  /sw/include/libdrm/
-  /opt/local/include/libdrm/
-  DOC "The directory where drm.h resides")
-FIND_LIBRARY(DRM_LIBRARY
-  NAMES DRM drm
-  PATHS
-  ${CMAKE_LIBRARY_PATH}/lib/
-  ~/lib/
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The DRM library")
-
-IF(DRM_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${DRM_INCLUDE_PATH})
-  SET(DRM_FOUND 1 CACHE STRING "Set to 1 if DRM is found, 0 otherwise")
-ELSE(DRM_INCLUDE_PATH)
-  SET(DRM_FOUND 0 CACHE STRING "Set to 1 if DRM is found, 0 otherwise")
-ENDIF(DRM_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(DRM_FOUND)
-
diff --git a/CMake/FindDRMIntel.cmake b/CMake/FindDRMIntel.cmake
deleted file mode 100644
index 0aab1c7..0000000
--- a/CMake/FindDRMIntel.cmake
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# Try to find X library and include path.
-# Once done this will define
-#
-# DRM_INTEL_FOUND
-# DRM_INTEL_INCLUDE_PATH
-# 
-
-FIND_PATH(DRM_INTEL_INCLUDE_PATH
-  NAMES
-  intel_bufmgr.h
-  PATHS
-  ${CMAKE_INCLUDE_PATH}/include/libdrm/
-  ~/include/libdrm/
-  /usr/include/libdrm/
-  /usr/local/include/libdrm/
-  /sw/include/libdrm/
-  /opt/local/include/libdrm/
-  DOC "The directory where intel_bufmgr.h resides")
-
-FIND_LIBRARY(DRM_INTEL_LIBRARY
-  NAMES DRM_INTEL drm_intel
-  PATHS
-  ${CMAKE_LIBRARY_PATH}/lib/
-  ~/lib/
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  /usr/lib/i386-linux-gnu/
-  DOC "The DRM_INTEL library")
-
-IF(DRM_INTEL_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_PATH})
-  SET(DRM_INTEL_FOUND 1 CACHE STRING "Set to 1 if DRM_INTEL is found, 0 otherwise")
-ELSE(DRM_INTEL_INCLUDE_PATH)
-  SET(DRM_INTEL_FOUND 0 CACHE STRING "Set to 1 if DRM_INTEL is found, 0 otherwise")
-ENDIF(DRM_INTEL_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(DRM_INTEL_FOUND)
-
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index 97ee7db..556b3a9 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -84,7 +84,7 @@ endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
 macro(add_one_lib name)
   FIND_LIBRARY(CLANG_LIB
     NAMES ${name}
-    PATHS ${LLVM_LIBRARY_DIR} )
+    PATHS ${LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
   set(CLANG_LIBRARIES ${CLANG_LIBRARIES} ${CLANG_LIB})
 	unset(CLANG_LIB CACHE)
 endmacro()
diff --git a/CMake/FindEGL.cmake b/CMake/FindMesaSrc.cmake
similarity index 55%
rename from CMake/FindEGL.cmake
rename to CMake/FindMesaSrc.cmake
index 597b4a5..978cb4e 100644
--- a/CMake/FindEGL.cmake
+++ b/CMake/FindMesaSrc.cmake
@@ -1,38 +1,11 @@
 #
-# Try to find EGL library and include path.
+# Try to find mesa source code
 # Once done this will define
 #
-# EGL_FOUND
-# EGL_INCLUDE_PATH
-# EGL_LIBRARY
+# MESA_SOURCE_FOUND
+# MESA_SOURCE_INCLUDES
 #
 
-FIND_PATH(EGL_INCLUDE_PATH EGL/egl.h
-  ~/include/
-  /usr/include/
-  /usr/local/include/
-  /sw/include/
-  /opt/local/include/
-  DOC "The directory where gen/program.h resides")
-FIND_LIBRARY(EGL_LIBRARY
-  NAMES EGL egl
-  PATHS
-  ~/lib/
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The EGL library")
-
-IF(EGL_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${EGL_INCLUDE_PATH})
-  SET(EGL_FOUND 1 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
-ELSE(EGL_INCLUDE_PATH)
-  SET(EGL_FOUND 0 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
-ENDIF(EGL_INCLUDE_PATH)
-
 # Find mesa source code.
 FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
   $ENV{MESA_SOURCE_DIR}
@@ -45,11 +18,9 @@ SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
                          ${MESA_SOURCE_PREFIX}/include
                          ${MESA_SOURCE_PREFIX}/src/mapi
                          ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
-                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/intel/
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i915/
                          ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
 SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
 ELSE(MESA_SOURCE_PREFIX)
 SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
 ENDIF(MESA_SOURCE_PREFIX)
-
-MARK_AS_ADVANCED(EGL_FOUND)
diff --git a/CMake/FindXext.cmake b/CMake/FindXext.cmake
deleted file mode 100644
index 5bbd719..0000000
--- a/CMake/FindXext.cmake
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Try to find Xext library path.
-# Once done this will define
-#
-# XEXT_FOUND
-# XEXT_LIBRARY
-# 
-
-FIND_PATH(XEXT_INCLUDE_PATH X11/extensions/Xext.h
-  /usr/include
-  /usr/local/include
-  /sw/include
-  /opt/local/include
-  DOC "The directory where Xext.h resides")
-
-FIND_LIBRARY(XEXT_LIBRARY
-  NAMES XEXT Xext
-  PATHS
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The XEXT library")
-
-IF(XEXT_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${XEXT_INCLUDE_PATH})
-  SET(XEXT_FOUND 1 CACHE STRING "Set to 1 if XEXT is found, 0 otherwise")
-ELSE(XEXT_INCLUDE_PATH)
-  SET(XEXT_FOUND 0 CACHE STRING "Set to 1 if XEXT is found, 0 otherwise")
-ENDIF(XEXT_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(XEXT_FOUND)
-
diff --git a/CMake/FindXfixes.cmake b/CMake/FindXfixes.cmake
deleted file mode 100644
index 47259e1..0000000
--- a/CMake/FindXfixes.cmake
+++ /dev/null
@@ -1,35 +0,0 @@
-#
-# Try to find Xfixes library path.
-# Once done this will define
-#
-# XFIXES_FOUND
-# XFIXES_LIBRARY
-# 
-
-FIND_PATH(XFIXES_INCLUDE_PATH X11/extensions/Xfixes.h
-  /usr/include
-  /usr/local/include
-  /sw/include
-  /opt/local/include
-  DOC "The directory where Xfixes.h resides")
-
-FIND_LIBRARY(XFIXES_LIBRARY
-  NAMES XFIXES Xfixes
-  PATHS
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The XFIXES library")
-
-IF(XFIXES_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${XFIXES_INCLUDE_PATH})
-  SET(XFIXES_FOUND 1 CACHE STRING "Set to 1 if XFIXES is found, 0 otherwise")
-ELSE(XFIXES_INCLUDE_PATH)
-  SET(XFIXES_FOUND 0 CACHE STRING "Set to 1 if XFIXES is found, 0 otherwise")
-ENDIF(XFIXES_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(XFIXES_FOUND)
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ed27b5..ac59859 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,18 +1,11 @@
-#############################################################################
-#                  INTEL CORPORATION PROPRIETARY INFORMATION                #
-#     This software is supplied under the terms of a license agreement or   #
-#     nondisclosure agreement with Intel Corporation and may not be copied  #
-#     or disclosed except in accordance with the terms of that agreement.   #
-#          Copyright (C) 2009 Intel Corporation. All Rights Reserved.       #
-#############################################################################
 
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 0)
-set (LIBCL_DRIVER_VERSION_MINOR 8)
-set (LIBCL_DRIVER_VERSION_PATCH 0)
+set (LIBCL_DRIVER_VERSION_MINOR 9)
+set (LIBCL_DRIVER_VERSION_PATCH 3)
 set (LIBCL_C_VERSION_MAJOR 1)
-set (LIBCL_C_VERSION_MINOR 1)
+set (LIBCL_C_VERSION_MINOR 2)
 
 configure_file (
   "src/OCLConfig.h.in"
@@ -21,11 +14,16 @@ configure_file (
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 
+INCLUDE (FindPkgConfig)
+
 SET(CMAKE_VERBOSE_MAKEFILE "false")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMake/")
 if (NOT LIB_INSTALL_DIR)
   set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 endif (NOT LIB_INSTALL_DIR)
+if (NOT BEIGNET_INSTALL_DIR)
+  set (BEIGNET_INSTALL_DIR "${LIB_INSTALL_DIR}/beignet/")
+endif (NOT BEIGNET_INSTALL_DIR)
 SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
 SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
 SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
@@ -80,50 +78,66 @@ ELSE(X11_FOUND)
 ENDIF(X11_FOUND)
 
 # DRM
-Find_Package(DRM)
+pkg_check_modules(DRM REQUIRED libdrm)
 IF(DRM_FOUND)
-  MESSAGE(STATUS "Looking for DRM - found")
+  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX}")
+  INCLUDE_DIRECTORIES(${DRM_INCLUDE_DIRS})
 ELSE(DRM_FOUND)
   MESSAGE(STATUS "Looking for DRM - not found")
 ENDIF(DRM_FOUND)
 
-# OpenGL
-Find_Package(OpenGL)
-# Threads
-Find_Package(Threads)
 # DRM Intel
-Find_Package(DRMIntel)
+pkg_check_modules(DRM_INTEL REQUIRED libdrm_intel)
 IF(DRM_INTEL_FOUND)
-  MESSAGE(STATUS "Looking for DRM Intel - found")
+  INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX}")
 ELSE(DRM_INTEL_FOUND)
   MESSAGE(STATUS "Looking for DRM Intel - not found")
 ENDIF(DRM_INTEL_FOUND)
 
+# Threads
+Find_Package(Threads)
+
+IF(X11_FOUND)
+# OpenGL (not use cmake helper)
+pkg_check_modules(OPENGL gl)
+IF(OPENGL_FOUND)
+  INCLUDE_DIRECTORIES(${OPENGL_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
+ELSE(OPENGL_FOUND)
+  MESSAGE(STATUS "Looking for OpenGL - not found")
+ENDIF(OPENGL_FOUND)
+
 # Xext
-Find_Package(Xext)
+pkg_check_modules(XEXT REQUIRED xext)
 IF(XEXT_FOUND)
-  MESSAGE(STATUS "Looking for Xext - found")
+  INCLUDE_DIRECTORIES(${XEXT_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for Xext - found at ${XEXT_PREFIX}")
 ELSE(XEXT_FOUND)
   MESSAGE(STATUS "Looking for Xext - not found")
 ENDIF(XEXT_FOUND)
 
 # Xfixes
-Find_Package(Xfixes)
+pkg_check_modules(XFIXES REQUIRED xfixes)
 IF(XFIXES_FOUND)
-  MESSAGE(STATUS "Looking for Xfixes - found")
+  INCLUDE_DIRECTORIES(${XFIXES_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for Xfixes - found at ${XFIXES_PREFIX}")
 ELSE(XFIXES_FOUND)
   MESSAGE(STATUS "Looking for Xfixes - not found")
 ENDIF(XFIXES_FOUND)
+ENDIF(X11_FOUND)
 
-Find_Package(EGL)
+pkg_check_modules(EGL egl)
 IF(EGL_FOUND)
-  MESSAGE(STATUS "Looking for EGL - found")
+  MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
 ELSE(EGL_FOUND)
   MESSAGE(STATUS "Looking for EGL - not found")
 ENDIF(EGL_FOUND)
 
+# cl_khr_gl_sharing requires to build with mesa source
+Find_Package(MesaSrc)
 IF(MESA_SOURCE_FOUND)
-  MESSAGE(STATUS "Looking for mesa source code - found")
+  MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
 ELSE(MESA_SOURCE_FOUND)
   MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
 ENDIF(MESA_SOURCE_FOUND)
@@ -146,6 +160,7 @@ ADD_SUBDIRECTORY(include)
 ADD_SUBDIRECTORY(backend)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(utests)
+ADD_SUBDIRECTORY(benchmark)
 
 SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
 SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
diff --git a/NEWS.mdwn b/NEWS.mdwn
new file mode 120000
index 0000000..dc4cb4b
--- /dev/null
+++ b/NEWS.mdwn
@@ -0,0 +1 @@
+docs/NEWS.mdwn
\ No newline at end of file
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index dd55a4a..6a31c68 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -34,7 +34,7 @@ else (GBE_DEBUG_MEMORY)
 endif (GBE_DEBUG_MEMORY)
 
 # Hide all symbols and allows the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden -DGBE_COMPILER_AVAILABLE=1 ${CMAKE_C_CXX_FLAGS}")
 
 if (COMPILER STREQUAL "GCC")
   set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
@@ -99,7 +99,10 @@ include_directories (${CMAKE_CURRENT_BINARY_DIR})
 add_subdirectory (src)
 set(LOCAL_PCH_OBJECT_DIR ${LOCAL_PCH_OBJECT_DIR} PARENT_SCOPE)
 set(LOCAL_PCM_OBJECT_DIR ${LOCAL_PCM_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
+
 set (GBE_BIN_GENERATER
-     OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+     OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
      PARENT_SCOPE)
 
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 33494a0..a3818ab 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,13 +1,10 @@
-set (beignet_install_path ${LIB_INSTALL_DIR}/beignet/)
-
-
 set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
 set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
 set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
 set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
 set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
 set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
-set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${beignet_install_path}ocl_stdlib.h)
+set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${BEIGNET_INSTALL_DIR}ocl_stdlib.h)
 set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
 set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
 set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
@@ -36,7 +33,7 @@ add_custom_command(
 
 add_custom_command(
   OUTPUT ${ocl_blob_file}
-  COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${beignet_install_path}
+  COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${BEIGNET_INSTALL_DIR}
   COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
   DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}
   )
@@ -45,7 +42,7 @@ set (pch_object ${ocl_blob_file}.pch)
 set (local_pch_object ${ocl_blob_file}.local.pch)
 # generate pch object
 if (LLVM_VERSION_NODOT VERSION_GREATER 32)
-    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off)
+    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -cl-kernel-arg-info)
 else (LLVM_VERSION_NODOT VERSION_GREATER 32)
     if (LLVM_VERSION_NODOT VERSION_GREATER 31)
         set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off)
@@ -53,13 +50,13 @@ else (LLVM_VERSION_NODOT VERSION_GREATER 32)
         set (clang_cmd -cc1 -x cl -triple ptx32)
     endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+set (clang_cmd ${clang_cmd} -cl-std=CL1.2 -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 
 add_custom_command(
      OUTPUT ${pch_object}
      COMMAND rm -f ${pch_object}
-     COMMAND clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
-     COMMAND clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
+     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
+     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
      DEPENDS ${ocl_blob_file}
      )
 
@@ -71,14 +68,14 @@ macro(ll_add_library ll_lib ll_sources)
   add_custom_command(
        OUTPUT  ${ll}.bc
        COMMAND rm -f ${ll}.bc
-       COMMAND llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
+       COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
        DEPENDS ${ll}
        )
   set (ll_objects ${ll_objects} ${ll}.bc)
   endforeach (ll ${ll_sources})
   add_custom_command(
        OUTPUT ${ll_lib}
-       COMMAND llvm-link -o ${ll_lib} ${ll_objects}
+       COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${ll_lib} ${ll_objects}
        DEPENDS ${ll_objects}
        )
   add_custom_target(${ll_lib}
@@ -135,6 +132,10 @@ else (GBE_USE_BLOB)
     ir/value.hpp
     ir/lowering.cpp
     ir/lowering.hpp
+    ir/printf.cpp
+    ir/printf.hpp
+    ir/immediate.hpp
+    ir/immediate.cpp
     backend/context.cpp
     backend/context.hpp
     backend/program.cpp
@@ -144,7 +145,10 @@ else (GBE_USE_BLOB)
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
     llvm/llvm_intrinsic_lowering.cpp
+    llvm/llvm_barrier_nodup.cpp
+    llvm/llvm_printf_parser.cpp
     llvm/llvm_to_gen.cpp
+    llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
     llvm/llvm_to_gen.hpp
@@ -156,20 +160,26 @@ else (GBE_USE_BLOB)
     backend/gen_reg_allocation.cpp
     backend/gen_reg_allocation.hpp
     backend/gen_context.cpp
-    backend/gen_context.hpp
+    backend/gen_context.cpp
+    backend/gen75_context.hpp
+    backend/gen75_context.cpp
     backend/gen_program.cpp
     backend/gen_program.hpp
     backend/gen_program.h
     backend/gen_defs.hpp
+    backend/gen_insn_compact.cpp
     backend/gen_encoder.hpp
-    backend/gen_encoder.cpp)
+    backend/gen_encoder.cpp
+    backend/gen75_encoder.hpp
+    backend/gen75_encoder.cpp
+    )
 
 endif (GBE_USE_BLOB)
 
 include_directories (.)
-link_directories (${LLVM_LIBRARY_DIRS})
+link_directories (${LLVM_LIBRARY_DIRS} ${DRM_LIBDIR})
 include_directories(${LLVM_INCLUDE_DIRS})
-add_library (gbe STATIC ${GBE_SRC})
+add_library (gbe SHARED ${GBE_SRC})
 
 # for pre compiled module library.
 set (pcm_lib "beignet.bc")
@@ -179,31 +189,47 @@ ll_add_library (${pcm_lib} pcm_sources)
 ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
 target_link_libraries(
                       gbe
-                      ${DRM_INTEL_LIBRARY}
-                      ${DRM_LIBRARY}
-                      ${OPENGL_LIBRARIES}
+                      ${DRM_INTEL_LIBRARIES}
+                      ${DRM_LIBRARIES}
                       ${CLANG_LIBRARIES}
                       ${LLVM_MODULE_LIBS}
                       ${LLVM_SYSTEM_LIBS}
                       ${CMAKE_THREAD_LIBS_INIT}
                       ${CMAKE_DL_LIBS})
 
-link_directories (${LLVM_LIBRARY_DIR})
+add_library(gbeinterp SHARED gbe_bin_interpreter.cpp)
+
+if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
+  find_library(TERMINFO NAMES tinfo ncurses)
+  if (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+    message(FATAL_ERROR "no libtinfo or libncurses is found in system")
+  else (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+    target_link_libraries(gbe ${TERMINFO})
+    message(STATUS "use ${TERMINFO} as terminal control library")
+  endif (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+endif(LLVM_VERSION_NODOT VERSION_EQUAL 34)
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
 ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
 TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
 
-#install (TARGETS gbe LIBRARY DESTINATION lib)
+install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
+install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
 #install (FILES backend/program.h DESTINATION include/gen)
-install (FILES ${ocl_blob_file} DESTINATION ${LIB_INSTALL_DIR}/beignet)
-install (FILES ${pch_object} DESTINATION ${LIB_INSTALL_DIR}/beignet)
-install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${LIB_INSTALL_DIR}/beignet)
+install (FILES ${ocl_blob_file} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${pch_object} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${BEIGNET_INSTALL_DIR})
 # When build beignet itself, we need to export the local precompiled header file and precompiled module
 # file to libcl and utests.
-set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${beignet_install_path}/ocl_stdlib.h.pch" PARENT_SCOPE)
-set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${beignet_install_path}/${pcm_lib}" PARENT_SCOPE)
-
-set (PCH_OBJECT_DIR "${beignet_install_path}/ocl_stdlib.h.pch")
-set (PCM_OBJECT_DIR "${beignet_install_path}/${pcm_lib}")
+set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch" PARENT_SCOPE)
+set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${BEIGNET_INSTALL_DIR}/${pcm_lib}" PARENT_SCOPE)
+set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
+set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
+
+set (PCH_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch")
+set (PCM_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/${pcm_lib}")
+set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
+set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
 configure_file (
   "GBEConfig.h.in"
   "GBEConfig.h"
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index 5bc09b8..f5c69c6 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -3,3 +3,5 @@
 #define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
 #define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
 #define PCM_OBJECT_DIR "@PCM_OBJECT_DIR@"
+#define GBE_OBJECT_DIR "@GBE_OBJECT_DIR@"
+#define INTERP_OBJECT_DIR "@INTERP_OBJECT_DIR@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 2125bd1..e09a309 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -30,30 +30,18 @@
 #include "ir/liveness.hpp"
 #include "ir/value.hpp"
 #include "ir/image.hpp"
-#include "ir/sampler.hpp"
 #include "sys/cvar.hpp"
 #include <algorithm>
 
 namespace gbe
 {
-  /*! Structure that keeps track of allocation in the register file. This is
-   *  actually needed by Context (and not only by GenContext) because both
-   *  simulator and hardware have to deal with constant pushing which uses the
-   *  register file
-   *
-   *  Since Gen is pretty flexible, we just maintain a free list for the
-   *  register file (as a classical allocator) and coalesce blocks when required
-   */
-  class RegisterFilePartitioner
+  class SimpleAllocator
   {
   public:
-    RegisterFilePartitioner(void);
-    ~RegisterFilePartitioner(void);
+    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+    ~SimpleAllocator(void);
 
-    /*! Allocate some memory in the register file. Return 0 if out-of-memory. By
-     *  the way, zero is not a valid offset since r0 is always preallocated by
-     *  the hardware. Note that we always use the left most block when
-     *  allocating, so it makes sense for constant pushing
+    /*! Allocate some memory from the pool.
      */
     int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
 
@@ -63,10 +51,7 @@ namespace gbe
     /*! Spilt a block into 2 blocks */
     void splitBlock(int16_t offset, int16_t subOffset);
 
-  private:
-    /*! May need to make that run-time in the future */
-    static const int16_t RegisterFileSize = 4*KB;
-
+  protected:
     /*! Double chained list of free spaces */
     struct Block {
       Block(int16_t offset, int16_t size) :
@@ -80,6 +65,10 @@ namespace gbe
      *  If the colascing was done, the left block is deleted
      */
     void coalesce(Block *left, Block *right);
+    /*! the maximum offset */
+    int16_t maxOffset;
+    /*! whether to trigger an assertion on allocation failure */
+    bool assertFail;
     /*! Head and tail of the free list */
     Block *head;
     Block *tail;
@@ -88,17 +77,46 @@ namespace gbe
     /*! Track allocated memory blocks <offset, size> */
     map<int16_t, int16_t> allocatedBlocks;
     /*! Use custom allocators */
-    GBE_CLASS(RegisterFilePartitioner);
+    GBE_CLASS(SimpleAllocator);
+  };
+
+  /*! Structure that keeps track of allocation in the register file. This is
+   *  actually needed by Context (and not only by GenContext) because both
+   *  simulator and hardware have to deal with constant pushing which uses the
+   *  register file
+   *
+   *  Since Gen is pretty flexible, we just reuse the SimpleAllocator
+   */
+
+  class RegisterAllocator: public SimpleAllocator {
+  public:
+    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+    GBE_CLASS(RegisterAllocator);
+  };
+
+  /*!
+   * An allocator for scratch memory allocation. Scratch memory is used for register spilling.
+   * You can query how much scratch memory is needed through getMaxScatchMemUsed().
+   */
+
+  class ScratchAllocator: public SimpleAllocator {
+  public:
+    ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
+    int16_t getMaxScatchMemUsed() { return maxOffset; }
+
+    GBE_CLASS(ScratchAllocator);
   };
 
-  RegisterFilePartitioner::RegisterFilePartitioner(void) {
-    // r0 is always set by the HW and used at the end by EOT
-    const int16_t offset = GEN_REG_SIZE;
-    const int16_t size = RegisterFileSize  - offset;
-    tail = head = this->newBlock(offset, size);
+  SimpleAllocator::SimpleAllocator(int16_t startOffset,
+                                   int16_t size,
+                                   bool _assertFail)
+                                  : maxOffset(0),
+                                  assertFail(_assertFail){
+    tail = head = this->newBlock(startOffset, size);
   }
 
-  RegisterFilePartitioner::~RegisterFilePartitioner(void) {
+  SimpleAllocator::~SimpleAllocator(void) {
     while (this->head) {
       Block *next = this->head->next;
       this->deleteBlock(this->head);
@@ -106,7 +124,7 @@ namespace gbe
     }
   }
 
-  int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
+  int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
   {
     // Make it simple and just use the first block we find
     Block *list = bFwd ? head : tail;
@@ -206,13 +224,16 @@ namespace gbe
 
       // Track the allocation to retrieve the size later
       allocatedBlocks.insert(std::make_pair(aligned, size));
+      // update max offset
+      if(aligned + size > maxOffset) maxOffset = aligned + size;
       // We have a valid offset now
       return aligned;
     }
+    GBE_ASSERT( !assertFail );
     return 0;
   }
 
-  void RegisterFilePartitioner::deallocate(int16_t offset)
+  void SimpleAllocator::deallocate(int16_t offset)
   {
     // Retrieve the size in the allocation map
     auto it = allocatedBlocks.find(offset);
@@ -255,7 +276,7 @@ namespace gbe
     allocatedBlocks.erase(it);
   }
 
-  void RegisterFilePartitioner::coalesce(Block *left, Block *right) {
+  void SimpleAllocator::coalesce(Block *left, Block *right) {
     if (left == NULL || right == NULL) return;
     GBE_ASSERT(left->offset < right->offset);
     GBE_ASSERT(left->next == right);
@@ -271,7 +292,7 @@ namespace gbe
     }
   }
 
-  void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
+  void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
     // Retrieve the size in the allocation map
     auto it = allocatedBlocks.find(offset);
     GBE_ASSERT(it != allocatedBlocks.end());
@@ -292,15 +313,6 @@ namespace gbe
     allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
   }
 
-  static int
-  alignScratchSize(int size){
-    int i = 0;
-
-    for(; i < size; i+=1024)
-      ;
-
-    return i;
-  }
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -312,27 +324,40 @@ namespace gbe
     GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
     this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
     this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
-    this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
-    if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
-      this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
-    else
-      this->simdWidth = fn.getSimdWidth();
-    this->scratchOffset = 0;
+    // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
+    this->registerAllocator = NULL; //GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+    this->scratchAllocator = NULL; //GBE_NEW(ScratchAllocator, 12*KB);
   }
 
   Context::~Context(void) {
-    GBE_SAFE_DELETE(this->partitioner);
+    GBE_SAFE_DELETE(this->registerAllocator);
+    GBE_SAFE_DELETE(this->scratchAllocator);
     GBE_SAFE_DELETE(this->dag);
     GBE_SAFE_DELETE(this->liveness);
   }
 
+  void Context::startNewCG(uint32_t simdWidth) {
+    if (simdWidth == 0 || OCL_SIMD_WIDTH != 15)
+      this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
+    else
+      this->simdWidth = simdWidth;
+    GBE_SAFE_DELETE(this->registerAllocator);
+    GBE_SAFE_DELETE(this->scratchAllocator);
+    GBE_ASSERT(dag != NULL && liveness != NULL);
+    this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+    this->scratchAllocator = GBE_NEW(ScratchAllocator, this->getScratchSize());
+    this->curbeRegs.clear();
+    this->JIPs.clear();
+  }
+
   Kernel *Context::compileKernel(void) {
     this->kernel = this->allocateKernel();
     this->kernel->simdWidth = this->simdWidth;
-    this->buildPatchList();
     this->buildArgList();
-    this->buildUsedLabels();
-    this->buildJIPs();
+    if (usedLabels.size() == 0)
+      this->buildUsedLabels();
+    if (JIPs.size() == 0)
+      this->buildJIPs();
     this->buildStack();
     this->handleSLM();
     if (this->emitCode() == false) {
@@ -340,47 +365,31 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+      this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
       this->kernel->ctx = this;
     }
     return this->kernel;
   }
 
   int16_t Context::allocate(int16_t size, int16_t alignment) {
-    return partitioner->allocate(size, alignment);
+    return registerAllocator->allocate(size, alignment);
   }
 
-  void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
+  void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
 
   void Context::splitBlock(int16_t offset, int16_t subOffset) {
-    partitioner->splitBlock(offset, subOffset);
+    registerAllocator->splitBlock(offset, subOffset);
   }
 
-  int32_t Context::allocConstBuf(uint32_t argID) {
-     GBE_ASSERT(kernel->args[argID].type == GBE_ARG_CONSTANT_PTR);
-
-    //free previous
-    int32_t offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
-    if(offset >= 0)
-        deallocate(offset+GEN_REG_SIZE);
-
-    if(kernel->args[argID].bufSize > 0) {
-      //use 32 alignment here as GEN_REG_SIZE, need dynamic by type?
-      newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_CONSTANT_BUFFER+argID, kernel->args[argID].bufSize, 32);
-    }
-
-    std::sort(kernel->patches.begin(), kernel->patches.end());
-    offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
-    GBE_ASSERT(offset>=0);
+  // FIXME/TODO: as we optimize scratch memory usage using register intervals,
+  // we need to add dependencies in the post_reg_alloc scheduler so that scratch
+  // memory slots that are reused still keep their original order.
 
-    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
-    return offset + GEN_REG_SIZE;
+  int32_t Context::allocateScratchMem(uint32_t size) {
+    return scratchAllocator->allocate(size, 32, true);
   }
-
-  uint32_t Context::allocateScratchMem(uint32_t size) {
-    uint32_t offset = scratchOffset;
-    scratchOffset += size;
-    return offset;
+  void Context::deallocateScratchMem(int32_t offset) {
+    scratchAllocator->deallocate(offset);
   }
 
   void Context::buildStack(void) {
@@ -388,7 +397,7 @@ namespace gbe
     if (stackUse.size() == 0)  // no stack is used if stackptr is unused
       return;
     // Be sure that the stack pointer is set
-    GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+    // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
     uint32_t stackSize = 1*KB;
     while (stackSize < fn.getStackSize()) {
       stackSize <<= 1;
@@ -403,7 +412,7 @@ namespace gbe
                               uint32_t alignment)
   {
     alignment = alignment == 0 ? size : alignment;
-    const uint32_t offset = partitioner->allocate(size, alignment, 1);
+    const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
     GBE_ASSERT(offset >= GEN_REG_SIZE);
     kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
@@ -424,104 +433,11 @@ namespace gbe
     return offset + GEN_REG_SIZE;
   }
 
-
   void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
     curbeRegs.insert(std::make_pair(reg, offset));
   }
-
-  void Context::buildPatchList(void) {
-    const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
-    kernel->curbeSize = 0u;
-
-    // We insert the block IP mask first
-    this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
-    this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0,  sizeof(uint32_t)));
-    this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
-    this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
-
-    // Go over the arguments and find the related patch locations
-    const uint32_t argNum = fn.argNum();
-    for (uint32_t argID = 0u; argID < argNum; ++argID) {
-      const ir::FunctionArgument &arg = fn.getArg(argID);
-      // For pointers and values, we have nothing to do. We just push the values
-      if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
-          arg.type == ir::FunctionArgument::LOCAL_POINTER ||
-          arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
-          arg.type == ir::FunctionArgument::VALUE ||
-          arg.type == ir::FunctionArgument::STRUCTURE ||
-          arg.type == ir::FunctionArgument::IMAGE ||
-          arg.type == ir::FunctionArgument::SAMPLER)
-        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
-    }
-
-    // Already inserted registers go here
-    const size_t localIDSize = sizeof(uint32_t) * this->simdWidth;
-    insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
-    insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
-    insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
-
-    // Go over all the instructions and find the special register we need
-    // to push
-#define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \
-  if (reg == ir::ocl::SPECIAL_REG) { \
-    if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
-    insertCurbeReg(reg, this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \
-  } else
-
-    bool useStackPtr = false;
-    fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const ir::Register reg = insn.getSrc(srcID);
-        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
-          }
-          continue;
-        } else if (insn.getOpcode() == ir::OP_GET_SAMPLER_INFO) {
-          /* change the src to sampler information register. */
-          GBE_ASSERT(insn.getSrc(0) == ir::ocl::samplerinfo);
-          if (curbeRegs.find(insn.getSrc(0)) == curbeRegs.end())
-            insertCurbeReg(insn.getSrc(0), this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
-          continue;
-        }
-        if (fn.isSpecialReg(reg) == false) continue;
-        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
-        if (reg == ir::ocl::stackptr) useStackPtr = true;
-        INSERT_REG(lsize0, LOCAL_SIZE_X, 1)
-        INSERT_REG(lsize1, LOCAL_SIZE_Y, 1)
-        INSERT_REG(lsize2, LOCAL_SIZE_Z, 1)
-        INSERT_REG(gsize0, GLOBAL_SIZE_X, 1)
-        INSERT_REG(gsize1, GLOBAL_SIZE_Y, 1)
-        INSERT_REG(gsize2, GLOBAL_SIZE_Z, 1)
-        INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
-        INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
-        INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
-        INSERT_REG(workdim, WORK_DIM, 1)
-        INSERT_REG(numgroup0, GROUP_NUM_X, 1)
-        INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
-        INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
-        INSERT_REG(stackptr, STACK_POINTER, this->simdWidth)
-        do {} while(0);
-      }
-    });
-#undef INSERT_REG
-
-    // Insert the stack buffer if used
-    if (useStackPtr)
-      insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
-
-    // After this point the vector is immutable. Sorting it will make
-    // research faster
-    std::sort(kernel->patches.begin(), kernel->patches.end());
-
-    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+  ir::Register Context::getSurfaceBaseReg(unsigned char bti) {
+    return fn.getSurfaceBaseReg(bti);
   }
 
   void Context::buildArgList(void) {
@@ -534,6 +450,7 @@ namespace gbe
       const auto &arg = fn.getArg(argID);
 
       kernel->args[argID].align = arg.align;
+      kernel->args[argID].info = arg.info;
       switch (arg.type) {
         case ir::FunctionArgument::VALUE:
         case ir::FunctionArgument::STRUCTURE:
@@ -543,6 +460,7 @@ namespace gbe
         case ir::FunctionArgument::GLOBAL_POINTER:
           kernel->args[argID].type = GBE_ARG_GLOBAL_PTR;
           kernel->args[argID].size = sizeof(void*);
+          kernel->args[argID].bti = arg.bti;
           break;
         case ir::FunctionArgument::CONSTANT_POINTER:
           kernel->args[argID].type = GBE_ARG_CONSTANT_PTR;
@@ -663,35 +581,5 @@ namespace gbe
     kernel->slmSize = fn.getSLMSize();
   }
 
-  bool Context::isScalarReg(const ir::Register &reg) const {
-    GBE_ASSERT(fn.getProfile() == ir::Profile::PROFILE_OCL);
-    if (fn.getArg(reg) != NULL) return true;
-    if (fn.getPushLocation(reg) != NULL) return true;
-    if (reg == ir::ocl::groupid0  ||
-        reg == ir::ocl::groupid1  ||
-        reg == ir::ocl::groupid2  ||
-        reg == ir::ocl::barrierid ||
-        reg == ir::ocl::threadn   ||
-        reg == ir::ocl::numgroup0 ||
-        reg == ir::ocl::numgroup1 ||
-        reg == ir::ocl::numgroup2 ||
-        reg == ir::ocl::lsize0    ||
-        reg == ir::ocl::lsize1    ||
-        reg == ir::ocl::lsize2    ||
-        reg == ir::ocl::gsize0    ||
-        reg == ir::ocl::gsize1    ||
-        reg == ir::ocl::gsize2    ||
-        reg == ir::ocl::goffset0  ||
-        reg == ir::ocl::goffset1  ||
-        reg == ir::ocl::goffset2  ||
-        reg == ir::ocl::workdim   ||
-        reg == ir::ocl::emask     ||
-        reg == ir::ocl::notemask  ||
-        reg == ir::ocl::barriermask
-      )
-      return true;
-    return false;
-  }
-
 } /* namespace gbe */
 
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..3faead2 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -41,7 +41,8 @@ namespace ir {
 namespace gbe
 {
   class Kernel;                 // context creates Kernel
-  class RegisterFilePartitioner; // Partition register file for reg allocation
+  class RegisterAllocator;      // allocator for physical register allocation
+  class ScratchAllocator;       // allocator for scratch memory allocation
 
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
@@ -55,6 +56,8 @@ namespace gbe
     Context(const ir::Unit &unit, const std::string &name);
     /*! Release everything needed */
     virtual ~Context(void);
+    /*! start new code generation with specific simd width. */
+    void startNewCG(uint32_t simdWidth);
     /*! Compile the code */
     Kernel *compileKernel(void);
     /*! Tells if the labels is used */
@@ -67,8 +70,6 @@ namespace gbe
     INLINE const ir::Liveness &getLiveness(void) const { return *liveness; }
     /*! Tells if the register is used */
     bool isRegUsed(const ir::Register &reg) const;
-    /*! Indicate if a register is scalar or not */
-    bool isScalarReg(const ir::Register &reg) const;
     /*! Get the kernel we are currently compiling */
     INLINE Kernel *getKernel(void) const { return this->kernel; }
     /*! Get the function we are currently compiling */
@@ -88,24 +89,27 @@ namespace gbe
     void deallocate(int16_t offset);
     /*! Spilt a block into 2 blocks, for some registers allocate together but  deallocate seperate */
     void splitBlock(int16_t offset, int16_t subOffset);
-    /* allocate curbe for constant ptr argument */
-    int32_t allocConstBuf(uint32_t argID);
     /* allocate a new entry for a specific image's information */
     /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
-    uint32_t allocateScratchMem(uint32_t size);
+    int32_t allocateScratchMem(uint32_t size);
+    /*! deallocate scratch memory at offset */
+    void deallocateScratchMem(int32_t offset);
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
+    ir::Register getSurfaceBaseReg(unsigned char bti);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t) = 0;
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) = 0;
     /*! Allocate a new empty kernel (to be implemented) */
     virtual Kernel *allocateKernel(void) = 0;
     /*! Look if a stack is needed and allocate it */
     void buildStack(void);
-    /*! Build the curbe patch list for the given kernel */
-    void buildPatchList(void);
     /*! Build the list of arguments to set to launch the kernel */
     void buildArgList(void);
     /*! Build the sets of used labels */
@@ -120,6 +124,7 @@ namespace gbe
      *  of the entry
      */
     void insertCurbeReg(ir::Register, uint32_t grfOffset);
+    /*! Allocate a new curbe entry. */
     uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
     /*! Provide for each branch and label the label index target */
     typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
@@ -129,11 +134,12 @@ namespace gbe
     Kernel *kernel;                       //!< Kernel we are building
     ir::Liveness *liveness;               //!< Liveness info for the variables
     ir::FunctionDAG *dag;                 //!< Graph of values on the function
-    RegisterFilePartitioner *partitioner; //!< Handle register file partionning
+    RegisterAllocator *registerAllocator; //!< physical register allocation
+    ScratchAllocator *scratchAllocator;   //!< scratch memory allocator
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
-    uint32_t scratchOffset;               //!< scratch slot for next scratch memory request
+    map<unsigned char, ir::Register> btiRegMap;
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 1f5adc9..c120b60 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -49,6 +49,7 @@
 #include <assert.h>
 
 #include "backend/gen_defs.hpp"
+#include "src/cl_device_data.h"
 
 static const struct {
   const char    *name;
@@ -100,12 +101,13 @@ static const struct {
   [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
-  [GEN_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
-  [GEN_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
-  [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
-  [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
-  [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
-  [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+  [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
   [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
   [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
@@ -113,7 +115,7 @@ static const struct {
   [GEN_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
   [GEN_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
   [GEN_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
-  [GEN_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+  [GEN_OPCODE_ENDIF] = { .name = "endif", .nsrc = 1, .ndst = 0 },
 };
 
 static const char *conditional_modifier[16] = {
@@ -253,14 +255,14 @@ static const char *access_mode[2] = {
 };
 
 static const char *reg_encoding[8] = {
-  [0] = "UD",
-  [1] = "D",
-  [2] = "UW",
-  [3] = "W",
-  [4] = "UB",
-  [5] = "B",
-  [6] = "DF",
-  [7] = "F"
+  [0] = ":UD",
+  [1] = ":D",
+  [2] = ":UW",
+  [3] = ":W",
+  [4] = ":UB",
+  [5] = ":B",
+  [6] = ":DF",
+  [7] = ":F"
 };
 
 int reg_type_size[8] = {
@@ -318,6 +320,20 @@ static const char *target_function_gen6[16] = {
   [GEN_SFID_DATAPORT_DATA_CACHE] = "data"
 };
 
+static const char *target_function_gen75[16] = {
+  [GEN_SFID_NULL] = "null",
+  [GEN_SFID_MATH] = "math",
+  [GEN_SFID_SAMPLER] = "sampler",
+  [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+  [GEN_SFID_URB] = "urb",
+  [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
+  [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+  [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+  [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+  [GEN_SFID_DATAPORT_DATA_CACHE] = "data (0)",
+  [GEN_SFID_DATAPORT1_DATA_CACHE] = "data (1)"
+};
+
 static const char *gateway_sub_function[8] = {
   [0] = "open gateway",
   [1] = "close gateway",
@@ -413,6 +429,21 @@ static const char *data_port_data_cache_msg_type[] = {
   [13] = "Untyped Surface Write",
 };
 
+static const char *data_port1_data_cache_msg_type[] = {
+  [1] = "Untyped Surface Read",
+  [2] = "Untyped Atomic Operation",
+  [3] = "Untyped Atomic Operation SIMD4x2",
+  [4] = "Media Block Read",
+  [5] = "Typed Surface Read",
+  [6] = "Typed Atomic Operation",
+  [7] = "Typed Atomic Operation SIMD4x2",
+  [9] = "Untyped Surface Write",
+  [10] = "Media Block Write",
+  [11] = "Atomic Counter Operation",
+  [12] = "Atomic Counter Operation 4X2",
+  [13] = "Typed Surface Write",
+};
+
 static int column;
 
 static int string (FILE *file, const char *string)
@@ -532,7 +563,7 @@ static int reg (FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
   return err;
 }
 
-static int dest (FILE *file, const struct GenInstruction *inst)
+static int dest (FILE *file, const union GenNativeInstruction *inst)
 {
   int	err = 0;
 
@@ -541,8 +572,10 @@ static int dest (FILE *file, const struct GenInstruction *inst)
     if (inst->bits1.da1.dest_address_mode == GEN_ADDRESS_DIRECT)
     {
       err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
-      if (err == -1)
+      if (err == -1) {
+        control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
         return 0;
+      }
       if (inst->bits1.da1.dest_subreg_nr)
         format (file, ".%d", inst->bits1.da1.dest_subreg_nr /
             reg_type_size[inst->bits1.da1.dest_reg_type]);
@@ -586,7 +619,7 @@ static int dest (FILE *file, const struct GenInstruction *inst)
   return 0;
 }
 
-static int dest_3src (FILE *file, const struct GenInstruction *inst)
+static int dest_3src (FILE *file, const union GenNativeInstruction *inst)
 {
   int	err = 0;
   const uint32_t reg_file = GEN_GENERAL_REGISTER_FILE;
@@ -719,7 +752,7 @@ static int src_da16 (FILE *file,
   return err;
 }
 
-static int src0_3src (FILE *file, const struct GenInstruction *inst)
+static int src0_3src (FILE *file, const union GenNativeInstruction *inst)
 {
   int err = 0;
   uint32_t swz_x = (inst->bits2.da3src.src0_swizzle >> 0) & 0x3;
@@ -767,7 +800,7 @@ static int src0_3src (FILE *file, const struct GenInstruction *inst)
   return err;
 }
 
-static int src1_3src (FILE *file, const struct GenInstruction *inst)
+static int src1_3src (FILE *file, const union GenNativeInstruction *inst)
 {
   int err = 0;
   uint32_t swz_x = (inst->bits2.da3src.src1_swizzle >> 0) & 0x3;
@@ -820,7 +853,7 @@ static int src1_3src (FILE *file, const struct GenInstruction *inst)
 }
 
 
-static int src2_3src (FILE *file, const struct GenInstruction *inst)
+static int src2_3src (FILE *file, const union GenNativeInstruction *inst)
 {
   int err = 0;
   uint32_t swz_x = (inst->bits3.da3src.src2_swizzle >> 0) & 0x3;
@@ -870,7 +903,7 @@ static int src2_3src (FILE *file, const struct GenInstruction *inst)
   return err;
 }
 
-static int imm (FILE *file, uint32_t type, const struct GenInstruction *inst) {
+static int imm (FILE *file, uint32_t type, const union GenNativeInstruction *inst) {
   switch (type) {
     case GEN_TYPE_UD:
       format (file, "0x%xUD", inst->bits3.ud);
@@ -899,7 +932,7 @@ static int imm (FILE *file, uint32_t type, const struct GenInstruction *inst) {
   return 0;
 }
 
-static int src0 (FILE *file, const struct GenInstruction *inst)
+static int src0 (FILE *file, const union GenNativeInstruction *inst)
 {
   if (inst->bits1.da1.src0_reg_file == GEN_IMMEDIATE_VALUE)
     return imm (file, inst->bits1.da1.src0_reg_type,
@@ -959,7 +992,7 @@ static int src0 (FILE *file, const struct GenInstruction *inst)
   }
 }
 
-static int src1 (FILE *file, const struct GenInstruction *inst)
+static int src1 (FILE *file, const union GenNativeInstruction *inst)
 {
   if (inst->bits1.da1.src1_reg_file == GEN_IMMEDIATE_VALUE)
     return imm (file, inst->bits1.da1.src1_reg_type,
@@ -1028,7 +1061,7 @@ static const int esize[6] = {
   [5] = 32,
 };
 
-static int qtr_ctrl(FILE *file, const struct GenInstruction *inst)
+static int qtr_ctrl(FILE *file, const union GenNativeInstruction *inst)
 {
   int qtr_ctl = inst->header.quarter_control;
   int exec_size = esize[inst->header.execution_size];
@@ -1057,12 +1090,17 @@ static int qtr_ctrl(FILE *file, const struct GenInstruction *inst)
   return 0;
 }
 
-int gen_disasm (FILE *file, const void *opaque_insn)
+int gen_disasm (FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted)
 {
-  const struct GenInstruction *inst = (const struct GenInstruction *) opaque_insn;
+  const union GenNativeInstruction *inst = (const union GenNativeInstruction *) opaque_insn;
   int	err = 0;
   int space = 0;
-  int gen = 7;
+  int gen = 70;
+  if (IS_IVYBRIDGE(deviceID)) {
+    gen = 70;
+  } else if (IS_HASWELL(deviceID)) {
+    gen = 75;
+  }
 
   if (inst->header.predicate_control) {
     string (file, "(");
@@ -1103,7 +1141,7 @@ int gen_disasm (FILE *file, const void *opaque_insn)
     string (file, ")");
   }
 
-  if (inst->header.opcode == GEN_OPCODE_SEND && gen < 6)
+  if (inst->header.opcode == GEN_OPCODE_SEND && gen < 60)
     format (file, " %d", inst->header.destreg_or_condmod);
 
   if (opcode[inst->header.opcode].nsrc == 3) {
@@ -1122,20 +1160,21 @@ int gen_disasm (FILE *file, const void *opaque_insn)
     if (opcode[inst->header.opcode].ndst > 0) {
       pad (file, 16);
       err |= dest (file, inst);
-    } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_IF ||
+    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_IF ||
           inst->header.opcode == GEN_OPCODE_ELSE ||
           inst->header.opcode == GEN_OPCODE_ENDIF ||
-          inst->header.opcode == GEN_OPCODE_WHILE)) {
-      // XXX format (file, " %d", inst->bits1.branch_gen6.jump_count);
-      assert(0);
-    } else if (gen >= 6 && (inst->header.opcode == GEN_OPCODE_BREAK ||
+          inst->header.opcode == GEN_OPCODE_WHILE ||
+          inst->header.opcode == GEN_OPCODE_BRD ||
+          inst->header.opcode == GEN_OPCODE_JMPI)) {
+      format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
+    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_BREAK ||
           inst->header.opcode == GEN_OPCODE_CONTINUE ||
-          inst->header.opcode == GEN_OPCODE_HALT)) {
-      // XXX format (file, " %d %d", inst->bits3.break_cont.uip, inst->bits3.break_cont.jip);
-      assert(0);
-    } else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+          inst->header.opcode == GEN_OPCODE_HALT ||
+          inst->header.opcode == GEN_OPCODE_BRC)) {
+      format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+    }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
       format (file, " %d", inst->bits3.d);
-    }
+    }*/
 
     if (opcode[inst->header.opcode].nsrc > 0) {
       pad (file, 32);
@@ -1155,8 +1194,13 @@ int gen_disasm (FILE *file, const void *opaque_insn)
     pad (file, 16);
     space = 0;
 
-    err |= control (file, "target function", target_function_gen6,
-           target, &space);
+    if(gen == 75) {
+      err |= control (file, "target function", target_function_gen75,
+             target, &space);
+    } else {
+      err |= control (file, "target function", target_function_gen6,
+             target, &space);
+    }
 
     switch (target) {
       case GEN_SFID_MATH:
@@ -1195,6 +1239,14 @@ int gen_disasm (FILE *file, const void *opaque_insn)
                   data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
         }
         break;
+      case GEN_SFID_DATAPORT1_DATA_CACHE:
+        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                inst->bits3.gen7_untyped_rw.bti,
+                inst->bits3.gen7_untyped_rw.rgba,
+                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                data_port1_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        break;
       case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
         format (file, " (bti: %d, %s)",
                 inst->bits3.gen7_dword_rw.bti,
@@ -1221,7 +1273,7 @@ int gen_disasm (FILE *file, const void *opaque_insn)
     string (file, "{");
     space = 1;
     err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
-    if (gen >= 6)
+    if (gen >= 60)
       err |= control (file, "write enable control", wectrl, inst->header.mask_control, &space);
     else
       err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
@@ -1229,12 +1281,16 @@ int gen_disasm (FILE *file, const void *opaque_insn)
 
     err |= qtr_ctrl (file, inst);
     err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
-    if (gen >= 6)
+    if (gen >= 60)
       err |= control (file, "acc write control", accwr, inst->header.acc_wr_control, &space);
     if (inst->header.opcode == GEN_OPCODE_SEND ||
         inst->header.opcode == GEN_OPCODE_SENDC)
       err |= control (file, "end of thread", end_of_thread,
           inst->bits3.generic_gen5.end_of_thread, &space);
+
+    if(compacted) {
+      string(file, " Compacted");
+    }
     if (space)
       string (file, " ");
     string (file, "}");
diff --git a/backend/src/backend/gen/gen_mesa_disasm.h b/backend/src/backend/gen/gen_mesa_disasm.h
index 6185061..ae007a4 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.h
+++ b/backend/src/backend/gen/gen_mesa_disasm.h
@@ -34,7 +34,7 @@
 extern "C" {
 #endif /* __cplusplus */
 
-extern int gen_disasm(FILE *file, const void *opaque_insn);
+extern int gen_disasm(FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted);
 
 #ifdef __cplusplus
 }
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
new file mode 100644
index 0000000..da0db85
--- /dev/null
+++ b/backend/src/backend/gen75_context.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.cpp
+ */
+
+#include "backend/gen75_context.hpp"
+#include "backend/gen75_encoder.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include <cstring>
+
+namespace gbe
+{
+  void Gen75Context::emitSLMOffset(void) {
+    if(kernel->getUseSLM() == false)
+      return;
+
+    const GenRegister slm_offset = ra->genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
+    const GenRegister slm_index = GenRegister::ud1grf(0, 0);
+    // The SLM index is held in r0.0 bits 24-27, in 4KB units; shift left by 12 to convert to bytes.
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->SHR(slm_offset, slm_index, GenRegister::immud(12));
+    p->pop();
+  }
+
+  void Gen75Context::allocSLMOffsetCurbe(void) {
+    if(fn.getUseSLM())
+      allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
+  }
+
+  uint32_t Gen75Context::alignScratchSize(uint32_t size){
+    if(size == 0)
+      return 0;
+    uint32_t i = 2048;
+    while(i < size) i *= 2;
+    return i;
+  }
+
+  void Gen75Context::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+    GBE_ASSERT(perLaneSize > 0);
+    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+    // Use shifts rather than muls which are limited to 32x16 bit sources
+    const uint32_t perLaneShift = logi2(perLaneSize);
+    const uint32_t perThreadShift = logi2(perThreadSize);
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+    // We compute the per-lane stack pointer here
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+      p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+      p->curr.execWidth = this->simdWidth;
+      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->curr.execWidth = 1;
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
+      p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, stackptr, bufferptr);
+      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+    p->pop();
+  }
+
+  void Gen75Context::newSelection(void) {
+    this->sel = GBE_NEW(Selection75, *this);
+  }
+}
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
new file mode 100644
index 0000000..6f62b02
--- /dev/null
+++ b/backend/src/backend/gen75_context.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.hpp
+ */
+#ifndef __GBE_GEN75_CONTEXT_HPP__
+#define __GBE_GEN75_CONTEXT_HPP__
+
+#include "backend/gen_context.hpp"
+#include "backend/gen75_encoder.hpp"
+
+namespace gbe
+{
+  /* This class is used to implement the HSW
+     specific logic for context. */
+  class Gen75Context : public GenContext
+  {
+  public:
+    virtual ~Gen75Context(void) { }
+    Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : GenContext(unit, name, deviceID, relaxMath) {
+    };
+    /*! device's max scratch buffer size */
+    #define GEN75_SCRATCH_SIZE  (2 * KB * KB)
+    /*! Emit the per-lane stack pointer computation */
+    virtual void emitStackPointer(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) {
+      // Because the allocator uses uint16_t, clamp the size here; this needs refinement.
+      return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+    }
+
+  protected:
+    virtual GenEncoder* generateEncoder(void) {
+      return GBE_NEW(Gen75Encoder, this->simdWidth, 75, deviceID);
+    }
+
+  private:
+    virtual void emitSLMOffset(void);
+    virtual void allocSLMOffsetCurbe(void);
+    virtual void newSelection(void);
+  };
+}
+#endif /* __GBE_GEN75_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
new file mode 100644
index 0000000..69d2de0
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -0,0 +1,269 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen75_encoder.hpp"
+
+static const uint32_t untypedRWMask[] = {
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+  GEN_UNTYPED_ALPHA,
+  0
+};
+
+namespace gbe
+{
+  void Gen75Encoder::setHeader(GenNativeInstruction *insn) {
+    if (this->curr.execWidth == 8)
+      insn->header.execution_size = GEN_WIDTH_8;
+    else if (this->curr.execWidth == 16)
+      insn->header.execution_size = GEN_WIDTH_16;
+    else if (this->curr.execWidth == 1)
+      insn->header.execution_size = GEN_WIDTH_1;
+    else if (this->curr.execWidth == 4)
+      insn->header.execution_size = GEN_WIDTH_4;
+    else
+      NOT_IMPLEMENTED;
+    insn->header.acc_wr_control = this->curr.accWrEnable;
+    insn->header.quarter_control = this->curr.quarterControl;
+    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+    insn->header.mask_control = this->curr.noMask;
+    insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+    insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (this->curr.predicate != GEN_PREDICATE_NONE) {
+      insn->header.predicate_control = this->curr.predicate;
+      insn->header.predicate_inverse = this->curr.inversePredicate;
+    }
+    insn->header.saturate = this->curr.saturate;
+  }
+
+  void Gen75Encoder::setDPUntypedRW(GenNativeInstruction *insn,
+                                    uint32_t bti,
+                                    uint32_t rgba,
+                                    uint32_t msg_type,
+                                    uint32_t msg_length,
+                                    uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+    insn->bits3.gen7_untyped_rw.bti = bti;
+    insn->bits3.gen7_untyped_rw.rgba = rgba;
+    if (curr.execWidth == 8)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+    else if (curr.execWidth == 16)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+
+  void Gen75Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                          unsigned char msg_type, uint32_t msg_length, bool header_present)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+    insn->bits3.gen7_typed_rw.bti = bti;
+    insn->bits3.gen7_typed_rw.msg_type = msg_type;
+
+    /* Always using the low 8 slots here. */
+    insn->bits3.gen7_typed_rw.slot = 1;
+  }
+
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+
+    if (this->curr.execWidth == 8) {
+      msg_length = srcNum;
+      response_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2 * srcNum;
+      response_length = 2;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
+    insn->bits3.gen7_atomic_op.bti = bti;
+    insn->bits3.gen7_atomic_op.return_data = 1;
+    insn->bits3.gen7_atomic_op.aop_type = function;
+
+    if (this->curr.execWidth == 8)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+    else if (this->curr.execWidth == 16)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+
+  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = elemNum;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2 * elemNum;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_READ,
+                   msg_length,
+                   response_length);
+  }
+
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 || elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 1 + elemNum;
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      msg_length = 2 * (1 + elemNum);
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_SURFACE_WRITE,
+                   msg_length,
+                   response_length);
+  }
+
+  void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+    union { double d; unsigned u[2]; } u;
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push();
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[0]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    uint32_t width = curr.execWidth;
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
+    MOV(dest, r);
+    if (width == 16) {
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r);
+    }
+    pop();
+  }
+
+  void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+    int w = curr.execWidth;
+    GenRegister r0;
+    r0 = GenRegister::h2(r);
+    push();
+    curr.execWidth = 4;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    MOV(r0, src0);
+    MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
+    curr.noMask = 0;
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dest, r0);
+    curr.nibControl = 1;
+    MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
+    pop();
+    if (w == 16) {
+      push();
+      curr.execWidth = 4;
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = 1;
+      MOV(r0, GenRegister::suboffset(src0, 8));
+      MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
+      curr.noMask = 0;
+      curr.quarterControl = 1;
+      curr.nibControl = 0;
+      MOV(GenRegister::suboffset(dest, 8), r0);
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
+      pop();
+    }
+  }
+
+  void Gen75Encoder::JMPI(GenRegister src, bool longjmp) {
+    alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+  }
+
+  void Gen75Encoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+    GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+    GBE_ASSERT(insnID < this->store.size());
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+               insn.header.opcode == GEN_OPCODE_BRD  ||
+               insn.header.opcode == GEN_OPCODE_ENDIF ||
+               insn.header.opcode == GEN_OPCODE_IF ||
+               insn.header.opcode == GEN_OPCODE_BRC);
+
+    if (insn.header.opcode == GEN_OPCODE_IF) {
+      this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+      return;
+    }
+    else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+      // jumpDistance's unit is Qwords, while HSW's jmpi offset is in bytes, so multiply by 8.
+      jumpDistance = (jumpDistance - 2) * 8;
+    }
+
+    this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+  }
+} /* End of the name space. */
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
new file mode 100644
index 0000000..c10dac9
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_encoder.hpp
+ */
+#ifndef __GBE_GEN75_ENCODER_HPP__
+#define __GBE_GEN75_ENCODER_HPP__
+
+#include "backend/gen_encoder.hpp"
+
+namespace gbe
+{
+  /* This class is used to implement the HSW
+     specific logic for encoder. */
+  class Gen75Encoder : public GenEncoder
+  {
+  public:
+    /*! exec width of the double data type */    
+    #define GEN75_DOUBLE_EXEC_WIDTH  4
+    virtual ~Gen75Encoder(void) { }
+
+    Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+         : GenEncoder(simdWidth, gen, deviceID) { }
+
+    /*! Jump indexed instruction */
+    virtual void JMPI(GenRegister src, bool longjmp = false);
+    /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    /*! Get double/long exec width */
+    virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
+    virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+    virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void setHeader(GenNativeInstruction *insn);
+    virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+                   uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+    virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                      unsigned char msg_type, uint32_t msg_length,
+                                      bool header_present);
+  };
+}
+#endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index d72b19b..4f697ef 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -31,6 +31,7 @@
 #include "backend/gen_reg_allocation.hpp"
 #include "backend/gen/gen_mesa_disasm.h"
 #include "ir/function.hpp"
+#include "ir/value.hpp"
 #include "sys/cvar.hpp"
 #include <cstring>
 #include <iostream>
@@ -41,14 +42,14 @@ namespace gbe
   ///////////////////////////////////////////////////////////////////////////
   // GenContext implementation
   ///////////////////////////////////////////////////////////////////////////
-  GenContext::GenContext(const ir::Unit &unit,
-                         const std::string &name,
-                         bool limitRegisterPressure) :
-    Context(unit, name), limitRegisterPressure(limitRegisterPressure)
+  GenContext::GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+	     bool relaxMath) :
+    Context(unit, name), deviceID(deviceID), relaxMath(relaxMath)
   {
-    this->p = GBE_NEW(GenEncoder, simdWidth, 7); // XXX handle more than Gen7
-    this->sel = GBE_NEW(Selection, *this);
-    this->ra = GBE_NEW(GenRegAllocator, *this);
+    this->p = NULL;
+    this->sel = NULL;
+    this->ra = NULL;
+    this->ifEndifFix = false;
   }
 
   GenContext::~GenContext(void) {
@@ -57,6 +58,32 @@ namespace gbe
     GBE_DELETE(this->p);
   }
 
+  void GenContext::startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure) {
+    this->limitRegisterPressure = limitRegisterPressure;
+    this->reservedSpillRegs = reservedSpillRegs;
+    Context::startNewCG(simdWidth);
+    GBE_SAFE_DELETE(ra);
+    GBE_SAFE_DELETE(sel);
+    GBE_SAFE_DELETE(p);
+    this->p = generateEncoder();
+    this->newSelection();
+    this->ra = GBE_NEW(GenRegAllocator, *this);
+    this->branchPos2.clear();
+    this->branchPos3.clear();
+    this->labelPos.clear();
+    this->errCode = NO_ERROR;
+  }
+
+  void GenContext::newSelection(void) {
+    this->sel = GBE_NEW(Selection, *this);
+  }
+
+  uint32_t GenContext::alignScratchSize(uint32_t size){
+    uint32_t i = 0;
+    while(i < size) i+=1024;
+    return i;
+  }
+
   void GenContext::emitInstructionStream(void) {
     // Emit Gen ISA
     for (auto &block : *sel->blockList)
@@ -80,63 +107,50 @@ namespace gbe
 	p->NOP();
   }
 
-  void GenContext::patchBranches(void) {
+  bool GenContext::patchBranches(void) {
     using namespace ir;
     for (auto pair : branchPos2) {
       const LabelIndex label = pair.first;
       const int32_t insnID = pair.second;
       const int32_t targetID = labelPos.find(label)->second;
-      p->patchJMPI(insnID, (targetID-insnID-1) * 2);
+      p->patchJMPI(insnID, (targetID - insnID));
     }
+    for (auto pair : branchPos3) {
+      const LabelPair labelPair = pair.first;
+      const int32_t insnID = pair.second;
+      const int32_t jip = labelPos.find(labelPair.l0)->second;
+      const int32_t uip = labelPos.find(labelPair.l1)->second;
+      if (((jip - insnID) > 32767 || (jip - insnID) < -32768) ||
+          ((uip - insnID) > 32768 || (uip - insnID) < -32768)) {
+        // The only possible error instruction is if/endif here.
+        errCode = OUT_OF_RANGE_IF_ENDIF; 
+        return false;
+      }
+      p->patchJMPI(insnID, (((uip - insnID)) << 16) | ((jip - insnID)));
+    }
+    return true;
   }
 
   void GenContext::clearFlagRegister(void) {
     // when group size not aligned to simdWidth, flag register need clear to
     // make prediction(any8/16h) work correctly
-    const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
-    const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
-    uint32_t execWidth = p->curr.execWidth;
+    const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+    const GenRegister zero = ra->genReg(GenRegister::uw1grf(ir::ocl::zero));
+    const GenRegister one = ra->genReg(GenRegister::uw1grf(ir::ocl::one));
     p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    /* clear all the bit in f0.0. */
-    p->curr.execWidth = 1;
-    p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
-    /* clear the barrier mask bits to all zero0*/
-    p->curr.noMask = 0;
-    p->curr.useFlag(0, 0);
-    p->curr.execWidth = execWidth;
-    /* set all the active lane to 1. Inactive lane remains 0. */
-    p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
-    p->curr.noMask = 1;
-    p->curr.execWidth = 1;
-    p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
-    p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF));
-    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+      p->curr.noMask = 0;
+      p->MOV(blockip, GenRegister::immuw(0));
+      p->curr.execWidth = 1;
+      // FIXME, need to get the final use set of zero/one, if there is no user,
+      // no need to generate the following two instructions.
+      p->MOV(zero, GenRegister::immuw(0));
+      p->MOV(one, GenRegister::immw(-1));
     p->pop();
   }
 
-  //Each emit function should only using one flag reg, otherwise, should handle the case both use f0.1
-  GenRegister GenContext::checkFlagRegister(GenRegister flagReg) {
-    uint32_t nr=0, subnr=0;
-    if(flagReg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
-      assert(flagReg.nr >= GEN_ARF_FLAG && flagReg.nr < GEN_ARF_MASK);
-      return flagReg;
-    }
-
-    //flagReg is grf register, use f0.1, so f0.1 shouldn't be in use.
-    //Only check curr in the GenInstructionState stack, it seems enough now.
-    //Should check other GenInstructionState in the stack if needed in future.
-    if(p->curr.predicate == GEN_PREDICATE_NORMAL) {
-      nr = p->curr.flag;
-      subnr = p->curr.subFlag;
-      //TODO: Add mov to save/restore if f0.1 is in use
-      assert(!(nr == 0 && subnr == 2));
-    }
-
-    return GenRegister::flag(0, 1);
-  }
-
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
@@ -147,7 +161,6 @@ namespace gbe
     // Check that everything is consistent in the kernel code
     const uint32_t perLaneSize = kernel->getStackSize();
     const uint32_t perThreadSize = perLaneSize * this->simdWidth;
-    const int32_t offset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
     GBE_ASSERT(perLaneSize > 0);
     GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
     GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
@@ -159,9 +172,8 @@ namespace gbe
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
-    const uint32_t nr = offset / GEN_REG_SIZE;
-    const uint32_t subnr = (offset % GEN_REG_SIZE) / sizeof(uint32_t);
-    const GenRegister bufferptr = GenRegister::ud1grf(nr, subnr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
 
     // We compute the per-lane stack pointer here
     p->push();
@@ -187,7 +199,7 @@ namespace gbe
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
     switch (insn.opcode) {
-      case SEL_OP_MOV: p->MOV(dst, src); break;
+      case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break;
       case SEL_OP_FBH: p->FBH(dst, src); break;
       case SEL_OP_FBL: p->FBL(dst, src); break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
@@ -200,19 +212,35 @@ namespace gbe
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
       case SEL_OP_CONVI64_TO_I:
        {
-        int execWidth = p->curr.execWidth;
-        GenRegister xsrc = src.bottom_half(), xdst = dst;
-        p->push();
-        p->curr.execWidth = 8;
-        for(int i = 0; i < execWidth/4; i ++) {
-          p->curr.chooseNib(i);
-          p->MOV(xdst, xsrc);
-          xdst = GenRegister::suboffset(xdst, 4);
-          xsrc = GenRegister::suboffset(xsrc, 4);
-        }
-        p->pop();
+        p->MOV(dst, src.bottom_half());
         break;
        }
+      case SEL_OP_BRC:
+        {
+          const ir::LabelIndex label0(insn.index), label1(insn.index1);
+          const LabelPair labelPair(label0, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+          p->BRC(src);
+        }
+        break;
+      case SEL_OP_BRD:
+        insertJumpPos(insn);
+        p->BRD(src);
+        break;
+      case SEL_OP_ENDIF:
+        insertJumpPos(insn);
+        p->ENDIF(src);
+        break;
+      case SEL_OP_IF:
+        {
+          const ir::LabelIndex label0(insn.index), label1(insn.index1);
+          const LabelPair labelPair(label0, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+          p->IF(src);
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -229,28 +257,18 @@ namespace gbe
         p->MOV_DF(dst, src, tmp);
         break;
       case SEL_OP_CONVI_TO_I64: {
-        GenRegister middle;
-        if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
+        GenRegister middle = src;
+        if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
           middle = tmp;
-          middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
+          middle.type = GEN_TYPE_D;
           p->MOV(middle, src);
-        } else {
-          middle = src;
-        }
-        int execWidth = p->curr.execWidth;
-        p->push();
-        p->curr.execWidth = 8;
-        for (int nib = 0; nib < execWidth / 4; nib ++) {
-          p->curr.chooseNib(nib);
-          p->MOV(dst.bottom_half(), middle);
-          if(middle.is_signed_int())
-            p->ASR(dst.top_half(), middle, GenRegister::immud(31));
-          else
-            p->MOV(dst.top_half(), GenRegister::immd(0));
-          dst = GenRegister::suboffset(dst, 4);
-          middle = GenRegister::suboffset(middle, 4);
         }
-        p->pop();
+
+        p->MOV(dst.bottom_half(), middle);
+        if(src.is_signed_int())
+          p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
+        else
+          p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
       default:
@@ -265,8 +283,10 @@ namespace gbe
     GenRegister tmp = ra->genReg(insn.dst(1));
     switch (insn.opcode) {
       case SEL_OP_I64ADD: {
-        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
-                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
         loadBottomHalf(x, src0);
         loadBottomHalf(y, src1);
         addWithCarry(x, x, y);
@@ -279,8 +299,10 @@ namespace gbe
         break;
       }
       case SEL_OP_I64SUB: {
-        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
-                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
         loadBottomHalf(x, src0);
         loadBottomHalf(y, src1);
         subWithBorrow(x, x, y);
@@ -299,6 +321,7 @@ namespace gbe
         for (int i = 0; i < w / 8; i ++) {
           p->push();
           p->curr.predicate = GEN_PREDICATE_NONE;
+          p->curr.noMask = 1;
           p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
           p->curr.accWrEnable = 1;
           p->MACH(tmp, src0, src1);
@@ -360,81 +383,29 @@ namespace gbe
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_SEL_INT64:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4);
-            xsrc0 = GenRegister::suboffset(xsrc0, 4);
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
-      case SEL_OP_AND:  p->AND(dst, src0, src1); break;
-      case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
-      case SEL_OP_XOR:  p->XOR(dst, src0, src1); break;
+      case SEL_OP_AND:  p->AND(dst, src0, src1, insn.extra.function); break;
+      case SEL_OP_OR:   p->OR (dst, src0, src1, insn.extra.function);  break;
+      case SEL_OP_XOR:  p->XOR(dst, src0, src1, insn.extra.function); break;
       case SEL_OP_I64AND:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_I64OR:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_I64XOR:
         {
-          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
-                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
-                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
-            p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
-            xdst = GenRegister::suboffset(xdst, 4),
-            xsrc0 = GenRegister::suboffset(xsrc0, 4),
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
         }
         break;
       case SEL_OP_SHR:  p->SHR(dst, src0, src1); break;
@@ -452,18 +423,8 @@ namespace gbe
           GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
                       xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
                       xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
-          int execWidth = p->curr.execWidth;
-          p->push();
-          p->curr.execWidth = 8;
-          for (int nib = 0; nib < execWidth / 4; nib ++) {
-            p->curr.chooseNib(nib);
-            p->MOV(xdst.top_half(), xsrc0.bottom_half());
-            p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
-            xdst = GenRegister::suboffset(xdst, 4);
-            xsrc0 = GenRegister::suboffset(xsrc0, 4);
-            xsrc1 = GenRegister::suboffset(xsrc1, 4);
-          }
-          p->pop();
+          p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
+          p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
         }
         break;
       default: NOT_IMPLEMENTED;
@@ -471,15 +432,10 @@ namespace gbe
   }
 
   void GenContext::collectShifter(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
     p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.execWidth = 8;
-    for (int nib = 0; nib < execWidth / 4; nib ++) {
-      p->AND(dest, src.bottom_half(), GenRegister::immud(63));
-      dest = GenRegister::suboffset(dest, 4);
-      src = GenRegister::suboffset(src, 4);
-    }
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+    p->AND(dest, src.bottom_half(), GenRegister::immud(63));
     p->pop();
   }
 
@@ -513,6 +469,7 @@ namespace gbe
   void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
     p->SHR(sign, high, GenRegister::immud(31));
     p->push();
+    p->curr.noMask = 1;
     p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
     p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
@@ -534,7 +491,8 @@ namespace gbe
     GenRegister g = ra->genReg(insn.dst(7));
     GenRegister h = ra->genReg(insn.dst(8));
     GenRegister i = ra->genReg(insn.dst(9));
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -548,6 +506,7 @@ namespace gbe
       I64FullMult(e, f, g, h, a, b, c, d);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -580,7 +539,8 @@ namespace gbe
     GenRegister g = ra->genReg(insn.dst(7));
     GenRegister h = ra->genReg(insn.dst(8));
     GenRegister i = ra->genReg(insn.dst(9));
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
@@ -600,6 +560,7 @@ namespace gbe
       p->OR(a, e, f);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_NZ, a, zero);
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -613,6 +574,7 @@ namespace gbe
       I64FullMult(e, f, g, h, a, b, c, d);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_NZ, i, zero);
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -644,6 +606,7 @@ namespace gbe
       p->push();
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->CMP(GEN_CONDITIONAL_NZ, e, zero);
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->MOV(b, one);
@@ -761,12 +724,14 @@ namespace gbe
     GenRegister e = ra->genReg(insn.dst(5));
     GenRegister f = ra->genReg(insn.dst(6));
     a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     GenRegister zero = GenRegister::immud(0);
     switch(insn.opcode) {
       case SEL_OP_I64SHL:
         p->push();
         p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
         collectShifter(a, y);
         loadBottomHalf(e, x);
         loadTopHalf(f, x);
@@ -794,6 +759,7 @@ namespace gbe
       case SEL_OP_I64SHR:
         p->push();
         p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
         collectShifter(a, y);
         loadBottomHalf(e, x);
         loadTopHalf(f, x);
@@ -822,6 +788,7 @@ namespace gbe
         f.type = GEN_TYPE_D;
         p->push();
         p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
         collectShifter(a, y);
         loadBottomHalf(e, x);
         loadTopHalf(f, x);
@@ -863,13 +830,19 @@ namespace gbe
                                             GenRegister mantissa, GenRegister tmp, GenRegister flag) {
     uint32_t jip0, jip1;
     GenRegister dst_ud = GenRegister::retype(dst, GEN_TYPE_UD);
+    p->push();
+      p->curr.noMask = 1;
+      p->MOV(exp, GenRegister::immud(32)); // make sure the inactive lane is 1 when check ALL8H/ALL16H condition latter.
+    p->pop();
     p->FBH(exp, high);
     p->ADD(exp, GenRegister::negate(exp), GenRegister::immud(31));  //exp = 32 when high == 0
     p->push();
       p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32));   //high == 0
       p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.noMask = 0;
       p->MOV(dst, low);
       p->push();
         if (simdWidth == 8)
@@ -883,8 +856,8 @@ namespace gbe
         jip0 = p->n_instruction();
         p->JMPI(GenRegister::immud(0));
       p->pop();
-
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32));  //exp>23 && high!=0
@@ -910,6 +883,7 @@ namespace gbe
       p->pop();
 
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->MOV(dst_ud, GenRegister::immud(0));   //exp==9, SHR == 0
@@ -930,7 +904,7 @@ namespace gbe
       p->SHL(high, low, tmp);
       p->MOV(low, GenRegister::immud(0));
 
-      p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
+      p->patchJMPI(jip1, (p->n_instruction() - jip1) );
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));  //update dst where high != 0
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -944,7 +918,7 @@ namespace gbe
       p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
       p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
       p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
-      p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
+      p->patchJMPI(jip0, (p->n_instruction() - jip0));
 
     p->pop();
 
@@ -959,16 +933,18 @@ namespace gbe
     GenRegister mantissa = ra->genReg(insn.dst(4));
     GenRegister tmp = ra->genReg(insn.dst(5));
     GenRegister tmp_high = ra->genReg(insn.dst(6));
-    GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     loadTopHalf(high, src);
     loadBottomHalf(low, src);
     if(!src.is_signed_int()) {
-      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
     } else {
       p->MOV(tmp_high, high);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->NOT(high, high);
@@ -977,10 +953,11 @@ namespace gbe
       addWithCarry(low, low, tmp);
       p->ADD(high, high, tmp);
       p->pop();
-      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       dest.type = GEN_TYPE_UD;
@@ -995,7 +972,8 @@ namespace gbe
     GenRegister dst = ra->genReg(insn.dst(0));
     GenRegister high = ra->genReg(insn.dst(1));
     GenRegister tmp = ra->genReg(insn.dst(2));
-    GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
 
     if(dst.is_signed_int())
       high = GenRegister::retype(high, GEN_TYPE_D);
@@ -1013,7 +991,8 @@ namespace gbe
     if(dst.is_signed_int()) {
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->CMP(GEN_CONDITIONAL_NEQ, low, GenRegister::immud(0x0));
@@ -1039,11 +1018,10 @@ namespace gbe
                 f1.width = GEN_WIDTH_1;
     GenRegister f2 = GenRegister::suboffset(f1, 1);
     GenRegister f3 = GenRegister::suboffset(f1, 2);
-    GenRegister f4 = GenRegister::suboffset(f1, 3);
 
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
-    saveFlag(f4, flag, subFlag);
+    p->curr.noMask = 1;
     loadTopHalf(tmp0, src0);
     loadTopHalf(tmp1, src1);
     switch(insn.extra.function) {
@@ -1103,7 +1081,6 @@ namespace gbe
         NOT_IMPLEMENTED;
     }
     p->curr.execWidth = 1;
-    p->AND(f1, f1, f4);
     p->MOV(GenRegister::flag(flag, subFlag), f1);
     p->pop();
   }
@@ -1117,7 +1094,8 @@ namespace gbe
     GenRegister c = ra->genReg(insn.dst(3));
     GenRegister d = ra->genReg(insn.dst(4));
     GenRegister e = ra->genReg(insn.dst(5));
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -1130,6 +1108,7 @@ namespace gbe
     p->ADD(c, c, d);
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
     p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
     if(! dst.is_signed_int()) {
       p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
@@ -1143,6 +1122,7 @@ namespace gbe
       p->MOV(a, GenRegister::immud(0x80000000u));
       p->MOV(b, GenRegister::immud(0));
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
@@ -1163,7 +1143,8 @@ namespace gbe
     GenRegister c = ra->genReg(insn.dst(3));
     GenRegister d = ra->genReg(insn.dst(4));
     GenRegister e = ra->genReg(insn.dst(5));
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -1176,6 +1157,7 @@ namespace gbe
     p->ADD(c, c, d);
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
     p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
     if(! dst.is_signed_int()) {
       p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
@@ -1201,69 +1183,19 @@ namespace gbe
   }
 
   void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    src = src.top_half();
-    p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest, src.top_half(this->simdWidth));
   }
 
   void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    dest = dest.top_half();
-    p->push();
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest.top_half(this->simdWidth), src);
   }
 
   void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    src = src.bottom_half();
-    p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest, src.bottom_half());
   }
 
   void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
-    int execWidth = p->curr.execWidth;
-    dest = dest.bottom_half();
-    p->push();
-    p->curr.execWidth = 8;
-    p->MOV(dest, src);
-    p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
-      p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
-    }
-    p->pop();
+    p->MOV(dest.bottom_half(), src);
   }
 
   void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
@@ -1336,6 +1268,7 @@ namespace gbe
     loadBottomHalf(d, y);
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
     I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
     I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
     p->ADD(e, e, f);
@@ -1363,10 +1296,12 @@ namespace gbe
     GenRegister k = ra->genReg(insn.dst(11));
     GenRegister l = ra->genReg(insn.dst(12));
     GenRegister m = ra->genReg(insn.dst(13));
-    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14)));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
     GenRegister zero = GenRegister::immud(0),
                 one = GenRegister::immud(1),
                 imm31 = GenRegister::immud(31);
+    uint32_t jip0;
     // (a,b) <- x
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
@@ -1409,6 +1344,7 @@ namespace gbe
       // condition <- (c,d)==0 && (a,b)>=(e,f)
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->MOV(l, zero);
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_EQ, a, e);
@@ -1441,11 +1377,21 @@ namespace gbe
       p->SHR(g, g, one);
       // condition: m < 64
       p->ADD(m, m, one);
+
+      p->push();
+      p->curr.noMask = 1;
+      p->curr.execWidth = 1;
+      p->MOV(flagReg, zero);
+      p->pop();
+
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 0;
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
-      p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+      p->curr.execWidth = 1;
+      p->curr.noMask = 1;
       // under condition, jump back to start point
       if (simdWidth == 8)
         p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
@@ -1453,11 +1399,11 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
       else
         NOT_IMPLEMENTED;
-      p->curr.execWidth = 1;
+      int distance = -(int)(p->n_instruction() - loop_start );
       p->curr.noMask = 1;
-      int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
+      jip0 = p->n_instruction();
       p->JMPI(zero);
-      p->patchJMPI(p->n_instruction()-2, jip);
+      p->patchJMPI(jip0, distance);
       p->pop();
       // end of loop
     }
@@ -1465,6 +1411,7 @@ namespace gbe
     if(x.is_signed_int()) {
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
       p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
       p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -1497,7 +1444,7 @@ namespace gbe
   }
 
   void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
-    NOT_IMPLEMENTED;
+   p->NOP();
   }
 
   void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
@@ -1509,59 +1456,24 @@ namespace gbe
     const GenRegister fenceDst = ra->genReg(insn.dst(0));
     uint32_t barrierType = insn.extra.barrierType;
     const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
-    GenRegister blockIP;
-    uint32_t exeWidth = p->curr.execWidth;
-    ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex();
-
-    if (exeWidth == 16)
-      blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
-    else if (exeWidth == 8)
-      blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
-
-    p->push();
-    /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
-       after the barrier, If there is any lane still remains zero. */
-    p->MOV(blockIP, GenRegister::immuw(0xFFFF));
-    p->curr.noMask = 1;
-    p->curr.execWidth = 1;
-    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
-    if (exeWidth == 16)
-      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
-    else if (exeWidth == 8)
-      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
-    else
-      NOT_IMPLEMENTED;
-    p->curr.inversePredicate = 1;
-    // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
-    // jump to next basic block.
-    p->JMPI(GenRegister::immud(0));
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
-    p->pop();
 
-    p->push();
-    p->curr.useFlag(0, 0);
-    /* Restore the blockIP to current label. */
-    p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
     if (barrierType == ir::syncGlobalBarrier) {
       p->FENCE(fenceDst);
       p->MOV(fenceDst, fenceDst);
     }
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    // As only the payload.2 is used and all the other regions are ignored
-    // SIMD8 mode here is safe.
-    p->curr.execWidth = 8;
-    p->curr.physicalFlag = 0;
-    p->curr.noMask = 1;
-    // Copy barrier id from r0.
-    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
-    // A barrier is OK to start the thread synchronization *and* SLM fence
-    p->BARRIER(src);
-    // Now we wait for the other threads
-    p->curr.execWidth = 1;
-    p->WAIT();
-    // we executed the barrier then restore the barrier soft mask to initial value.
-    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+    p->push();
+      // As only the payload.2 is used and all the other regions are ignored
+      // SIMD8 mode here is safe.
+      p->curr.execWidth = 8;
+      p->curr.physicalFlag = 0;
+      p->curr.noMask = 1;
+      // Copy barrier id from r0.
+      p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      p->BARRIER(src);
+      p->curr.execWidth = 1;
+      // Now we wait for the other threads
+      p->WAIT();
     p->pop();
   }
 
@@ -1585,8 +1497,9 @@ namespace gbe
   void GenContext::emitCompareInstruction(const SelectionInstruction &insn) {
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
+    const GenRegister dst = ra->genReg(insn.dst(0));
     if (insn.opcode == SEL_OP_CMP)
-      p->CMP(insn.extra.function, src0, src1);
+      p->CMP(insn.extra.function, src0, src1, dst);
     else {
       GBE_ASSERT(insn.opcode == SEL_OP_SEL_CMP);
       const GenRegister dst = ra->genReg(insn.dst(0));
@@ -1598,14 +1511,14 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     const GenRegister dst = ra->genReg(insn.dst(0));
     const uint32_t function = insn.extra.function;
-    const uint32_t bti = insn.extra.elem;
+    const uint32_t bti = insn.getbti();
 
     p->ATOMIC(dst, function, src, bti, insn.srcNum);
   }
 
   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
     GenRegister src = ra->genReg(insn.src(0));
-    if(isScalarReg(src.reg()))
+    if(sel->isScalarReg(src.reg()))
       src = GenRegister::retype(src, GEN_TYPE_UW);
     else
       src = GenRegister::unpacked_uw(src.nr, src.subnr / typeSize(GEN_TYPE_UW));
@@ -1634,11 +1547,15 @@ namespace gbe
     }
   }
 
-  void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
+ void GenContext::insertJumpPos(const SelectionInstruction &insn) {
     const ir::LabelIndex label(insn.index);
-    const GenRegister src = ra->genReg(insn.src(0));
     this->branchPos2.push_back(std::make_pair(label, p->store.size()));
-    p->JMPI(src);
+ }
+
+  void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
+    insertJumpPos(insn);
+    const GenRegister src = ra->genReg(insn.src(0));
+    p->JMPI(src, insn.extra.longjmp);
   }
 
   void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
@@ -1662,14 +1579,28 @@ namespace gbe
     GenRegister payload = src;
     payload.nr = header + 1;
     payload.subnr = 0;
+
     GBE_ASSERT(src.subnr == 0);
-    if (payload.nr != src.nr)
-      p->MOV(payload, src);
     uint32_t regType = insn.src(0).type;
     uint32_t size = typeSize(regType);
-    assert(size <= 4);
-    uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1;
-    this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    uint32_t regSize = stride(src.hstride)*size;
+
+    GBE_ASSERT(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      if (payload.nr != src.nr)
+        p->MOV(payload, src);
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
+    else { //size == 8
+      payload.type = GEN_TYPE_UD;
+      GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
+      loadBottomHalf(payload, src);
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      loadTopHalf(payload, src);
+      this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
     p->pop();
   }
 
@@ -1680,50 +1611,53 @@ namespace gbe
     uint32_t simdWidth = p->curr.execWidth;
     const uint32_t header = insn.extra.scratchMsgHeader;
     uint32_t size = typeSize(regType);
-    assert(size <= 4);
-    uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1;
+    uint32_t regSize = stride(dst.hstride)*size;
+
     const GenRegister msg = GenRegister::ud8grf(header, 0);
-    this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    GenRegister payload = msg;
+    payload.nr = header + 1;
+
+    p->push();
+    assert(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(GenRegister::ud8grf(dst.nr, dst.subnr), msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    } else {
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeBottomHalf(dst, payload);
+      this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeTopHalf(dst, payload);
+    }
+    p->pop();
   }
 
-  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
-  //  then follow the real destination registers.
-  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
   void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
     const uint32_t elemNum = insn.extra.elem;
-    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
-    const GenRegister tempAddr = ra->genReg(insn.dst(0));
-    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize + 1));
-    const GenRegister tmp = ra->genReg(insn.dst(1));
+    const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
-    p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
+    const uint32_t bti = insn.getbti();
+    p->UNTYPED_READ(dst, src, bti, elemNum*2);
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
-  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
-  //  then follow the real destination registers.
-  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
   void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.dst(0));
     const uint32_t elemNum = insn.extra.elem;
-    const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
-    const GenRegister data = ra->genReg(insn.src(1));
-    const uint32_t bti = insn.extra.function;
-    p->MOV(src, addr);
-    p->WRITE64(src, data, bti, elemNum, isScalarReg(data.reg()));
+    const uint32_t bti = insn.getbti();
+    p->UNTYPED_WRITE(src, bti, elemNum*2);
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    const uint32_t bti = insn.getbti();
     const uint32_t elemNum = insn.extra.elem;
     p->UNTYPED_WRITE(src, bti, elemNum);
   }
@@ -1731,47 +1665,62 @@ namespace gbe
   void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    const uint32_t bti = insn.getbti();
     const uint32_t elemSize = insn.extra.elem;
     p->BYTE_GATHER(dst, src, bti, elemSize);
   }
 
   void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    const uint32_t bti = insn.getbti();
     const uint32_t elemSize = insn.extra.elem;
     p->BYTE_SCATTER(src, bti, elemSize);
   }
 
+  void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    for(uint32_t i = 0; i < insn.dstNum; i++) {
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+    }
+  }
+
+  void GenContext::emitPackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    p->push();
+    if(simdWidth == 8) {
+      for(uint32_t i = 0; i < insn.srcNum; i++)
+        p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+    } else {
+      // when destination expands two registers, the source must span two registers.
+      p->curr.execWidth = 8;
+      for(uint32_t i = 0; i < insn.srcNum; i++) {
+        GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+        GenRegister src = ra->genReg(insn.src(i));
+
+        p->curr.quarterControl = 0;
+        p->MOV(dsti, src);
+        p->curr.quarterControl = 1;
+        p->MOV(GenRegister::Qn(dsti,1), GenRegister::Qn(src, 1));
+      }
+    }
+    p->pop();
+  }
+
   void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    const uint32_t bti = insn.getbti();
     p->DWORD_GATHER(dst, src, bti);
   }
 
   void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
-    const unsigned char bti = insn.extra.rdbti;
+    const unsigned char bti = insn.getbti();
     const unsigned char sampler = insn.extra.sampler;
-    const GenRegister ucoord = ra->genReg(insn.src(4));
-    const GenRegister vcoord = ra->genReg(insn.src(5));
+    const unsigned int msgLen = insn.extra.rdmsglen;
     uint32_t simdWidth = p->curr.execWidth;
-    uint32_t coord_cnt = 2;
-    p->push();
-    const uint32_t nr = msgPayload.nr;
-    // prepare mesg desc and move to a0.0.
-    // desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
-    /* Prepare message payload. */
-    p->MOV(GenRegister::f8grf(nr , 0), ucoord);
-    p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
-    if (insn.extra.is3DRead) {
-      p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), ra->genReg(insn.src(6)));
-      coord_cnt++;
-    }
-    p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
-    p->pop();
+    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
   }
 
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
@@ -1807,66 +1756,109 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister ucoord = ra->genReg(insn.src(insn.extra.msglen));
-    const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.msglen));
-    const GenRegister R = ra->genReg(insn.src(3 + insn.extra.msglen));
-    const GenRegister G = ra->genReg(insn.src(4 + insn.extra.msglen));
-    const GenRegister B = ra->genReg(insn.src(5 + insn.extra.msglen));
-    const GenRegister A = ra->genReg(insn.src(6 + insn.extra.msglen));
-    const unsigned char bti = insn.extra.bti;
-
-    p->push();
-    uint32_t simdWidth = p->curr.execWidth;
-    const uint32_t nr = header.nr;
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    p->MOV(header, GenRegister::immud(0x0));
-    p->curr.execWidth = 1;
+    const uint32_t bti = insn.getbti();
+    p->TYPED_WRITE(header, true, bti);
+  }
 
-    // prepare mesg desc and move to a0.0.
-    // desc = bti | (msg_type << 14) | (header_present << 19))
-    // prepare header, we need to enable all the 8 planes.
-    p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
-    p->curr.execWidth = 8;
-    // Typed write only support SIMD8.
-    // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
-    // Currently, we don't support non-zero lod, so we clear all lod to
-    // zero for both quarters thus save one instruction here.
-    // Thus we must put this instruction in noMask and no predication state.
-    p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
-    p->pop();
-    p->push();
-    p->curr.execWidth = 8;
-    // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
-    // need to call it twice.
-    uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;
+  BVAR(OCL_OUTPUT_REG_ALLOC, false);
+  BVAR(OCL_OUTPUT_ASM, false);
 
-    for( uint32_t quarter = 0; quarter < quarterNum; quarter++)
-    {
-#define QUARTER_MOV0(dst_nr, src) p->MOV(GenRegister::ud8grf(dst_nr, 0), \
-                                        GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
-#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
-                                        GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
-      if (quarter == 1)
-        p->curr.quarterControl = GEN_COMPRESSION_Q2;
-      QUARTER_MOV0(nr + 1, ucoord);
-      QUARTER_MOV0(nr + 2, vcoord);
-      if (insn.extra.is3DWrite)
-        QUARTER_MOV0(nr + 3, ra->genReg(insn.src(2 + insn.extra.msglen)));
-      QUARTER_MOV1(nr + 5, R);
-      QUARTER_MOV1(nr + 6, G);
-      QUARTER_MOV1(nr + 7, B);
-      QUARTER_MOV1(nr + 8, A);
-#undef QUARTER_MOV
-      p->TYPED_WRITE(header, true, bti);
+  void GenContext::allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue) {
+    uint32_t regSize;
+    regSize = this->ra->getRegSize(reg);
+    insertCurbeReg(reg, newCurbeEntry(value, subValue, regSize));
+  }
+
+  void GenContext::buildPatchList(void) {
+    const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
+    kernel->curbeSize = 0u;
+    auto &stackUse = dag->getUse(ir::ocl::stackptr);
+
+    // We insert the block IP mask first
+    using namespace ir::ocl;
+    allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
+    allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
+    allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
+    allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
+    allocCurbeReg(zero, GBE_CURBE_ZERO);
+    allocCurbeReg(one, GBE_CURBE_ONE);
+    if (stackUse.size() != 0)
+      allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+    allocSLMOffsetCurbe();
+    // Go over the arguments and find the related patch locations
+    const uint32_t argNum = fn.argNum();
+    for (uint32_t argID = 0u; argID < argNum; ++argID) {
+      const ir::FunctionArgument &arg = fn.getArg(argID);
+      // For pointers and values, we have nothing to do. We just push the values
+      if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
+          arg.type == ir::FunctionArgument::LOCAL_POINTER ||
+          arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
+          arg.type == ir::FunctionArgument::VALUE ||
+          arg.type == ir::FunctionArgument::STRUCTURE ||
+          arg.type == ir::FunctionArgument::IMAGE ||
+          arg.type == ir::FunctionArgument::SAMPLER)
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
     }
-    p->pop();
+
+    // Go over all the instructions and find the special register we need
+    // to push
+    #define INSERT_REG(SPECIAL_REG, PATCH) \
+    if (reg == ir::ocl::SPECIAL_REG) { \
+      if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+      allocCurbeReg(reg, GBE_CURBE_##PATCH); \
+    } else
+  
+    fn.foreachInstruction([&](ir::Instruction &insn) {
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const ir::Register reg = insn.getSrc(srcID);
+        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+          if (srcID != 0) continue;
+          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+          ir::ImageInfoKey key(bti, type);
+          const ir::Register imageInfo = insn.getSrc(0);
+          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+            insertCurbeReg(imageInfo, offset);
+          }
+          continue;
+        }
+        if (fn.isSpecialReg(reg) == false) continue;
+        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+        INSERT_REG(lsize0, LOCAL_SIZE_X)
+        INSERT_REG(lsize1, LOCAL_SIZE_Y)
+        INSERT_REG(lsize2, LOCAL_SIZE_Z)
+        INSERT_REG(gsize0, GLOBAL_SIZE_X)
+        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+        INSERT_REG(workdim, WORK_DIM)
+        INSERT_REG(numgroup0, GROUP_NUM_X)
+        INSERT_REG(numgroup1, GROUP_NUM_Y)
+        INSERT_REG(numgroup2, GROUP_NUM_Z)
+        INSERT_REG(stackptr, STACK_POINTER)
+        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+        do {} while(0);
+      }
+    });
+#undef INSERT_REG
+
+
+    // After this point the vector is immutable. Sorting it will make
+    // research faster
+    std::sort(kernel->patches.begin(), kernel->patches.end());
+
+    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
   }
 
-  BVAR(OCL_OUTPUT_REG_ALLOC, false);
-  BVAR(OCL_OUTPUT_ASM, false);
   bool GenContext::emitCode(void) {
     GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
+    buildPatchList();
     sel->select();
     schedulePreRegAllocation(*this, *this->sel);
     if (UNLIKELY(ra->allocate(*this->sel) == false))
@@ -1876,22 +1868,35 @@ namespace gbe
       ra->outputAllocation();
     this->clearFlagRegister();
     this->emitStackPointer();
+    this->emitSLMOffset();
     this->emitInstructionStream();
-    this->patchBranches();
+    if (this->patchBranches() == false)
+      return false;
     genKernel->insnNum = p->store.size();
     genKernel->insns = GBE_NEW_ARRAY_NO_ARG(GenInstruction, genKernel->insnNum);
     std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
     if (OCL_OUTPUT_ASM) {
       std::cout << genKernel->getName() << "'s disassemble begin:" << std::endl;
       ir::LabelIndex curLabel = (ir::LabelIndex)0;
+      GenCompactInstruction * pCom = NULL;
+      GenNativeInstruction insn;
       std::cout << "  L0:" << std::endl;
-      for (uint32_t insnID = 0; insnID < genKernel->insnNum; ++insnID) {
-        if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID) {
+      for (uint32_t insnID = 0; insnID < genKernel->insnNum; ) {
+        if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID &&
+            curLabel < this->getFunction().labelNum()) {
           std::cout << "  L" << curLabel + 1 << ":" << std::endl;
           curLabel = (ir::LabelIndex)(curLabel + 1);
         }
-        std::cout << "    (" << std::setw(8) << insnID * 2 << ")  ";
-        gen_disasm(stdout, &p->store[insnID]);
+        std::cout << "    (" << std::setw(8) << insnID << ")  ";
+        pCom = (GenCompactInstruction*)&p->store[insnID];
+        if(pCom->bits1.cmpt_control == 1) {
+          decompactInstruction(pCom, &insn);
+          gen_disasm(stdout, &insn, deviceID, 1);
+          insnID++;
+        } else {
+          gen_disasm(stdout, &p->store[insnID], deviceID, 0);
+          insnID = insnID + 2;
+        }
       }
       std::cout << genKernel->getName() << "'s disassemble end." << std::endl;
     }
@@ -1899,7 +1904,7 @@ namespace gbe
   }
 
   Kernel *GenContext::allocateKernel(void) {
-    return GBE_NEW(GenKernel, name);
+    return GBE_NEW(GenKernel, name, deviceID);
   }
 
 } /* namespace gbe */
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6cfc295..02c83d0 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -26,6 +26,7 @@
 #define __GBE_GEN_CONTEXT_HPP__
 
 #include "backend/context.hpp"
+#include "backend/gen_encoder.hpp"
 #include "backend/program.h"
 #include "backend/gen_register.hpp"
 #include "ir/function.hpp"
@@ -42,6 +43,13 @@ namespace gbe
   class SelectionInstruction; // Pre-RA Gen instruction
   class SelectionReg;         // Pre-RA Gen register
   class GenRegister;
+  typedef enum {
+    NO_ERROR,
+    REGISTER_ALLOCATION_FAIL,
+    REGISTER_SPILL_EXCEED_THRESHOLD,
+    REGISTER_SPILL_FAIL,
+    OUT_OF_RANGE_IF_ENDIF,
+  } CompileErrorCode;
 
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
@@ -52,11 +60,22 @@ namespace gbe
     /*! Create a new context. name is the name of the function we want to
      *  compile
      */
-    GenContext(const ir::Unit &unit, const std::string &name, bool limitRegisterPressure = false);
+    GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+               bool relaxMath = false);
     /*! Release everything needed */
-    ~GenContext(void);
+    virtual ~GenContext(void);
+    /*! device's max srcatch buffer size */
+    #define GEN7_SCRATCH_SIZE  (12 * KB)
+    /*! Start new code generation with specific parameters */
+    void startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure);
+    /*! Target device ID*/
+    uint32_t deviceID;
     /*! Implements base class */
     virtual bool emitCode(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max srcatch size */
+    virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
     /*! Function we emit code for */
     INLINE const ir::Function &getFunction(void) const { return fn; }
     /*! Simd width chosen for the current function */
@@ -65,11 +84,11 @@ namespace gbe
     /*! check the flag reg, if is grf, use f0.1 instead */
     GenRegister checkFlagRegister(GenRegister flagReg);
     /*! Emit the per-lane stack pointer computation */
-    void emitStackPointer(void);
+    virtual void emitStackPointer(void);
     /*! Emit the instructions */
     void emitInstructionStream(void);
     /*! Set the correct target values for the branches */
-    void patchBranches(void);
+    bool patchBranches(void);
     /*! Forward ir::Function isSpecialReg method */
     INLINE bool isSpecialReg(ir::Register reg) const {
       return fn.isSpecialReg(reg);
@@ -83,15 +102,6 @@ namespace gbe
       return this->liveness->getLiveIn(bb);
     }
 
-    /*! Get the extra liveOut information for the given block */
-    INLINE const ir::Liveness::LiveOut &getExtraLiveOut(const ir::BasicBlock *bb) const {
-      return this->liveness->getExtraLiveOut(bb);
-    }
-    /*! Get the extra LiveIn information for the given block */
-    INLINE const ir::Liveness::UEVar &getExtraLiveIn(const ir::BasicBlock *bb) const {
-      return this->liveness->getExtraLiveIn(bb);
-    }
-
     void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
     void storeTopHalf(GenRegister dest, GenRegister src);
@@ -142,6 +152,8 @@ namespace gbe
     void emitAtomicInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
     void emitByteScatterInstruction(const SelectionInstruction &insn);
+    void emitPackByteInstruction(const SelectionInstruction &insn);
+    void emitUnpackByteInstruction(const SelectionInstruction &insn);
     void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
@@ -157,8 +169,17 @@ namespace gbe
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
     map<ir::LabelIndex, uint32_t> labelPos;
+    typedef struct LabelPair {
+      LabelPair(ir::LabelIndex l0, ir::LabelIndex l1) :
+                l0(l0), l1(l1){};
+      ir::LabelIndex l0;
+      ir::LabelIndex l1;
+    } LabelPair;
     /*! Store the Gen instructions to patch */
+    vector<std::pair<LabelPair, uint32_t>> branchPos3;
     vector<std::pair<ir::LabelIndex, uint32_t>> branchPos2;
+
+    void insertJumpPos(const SelectionInstruction &insn);
     /*! Encode Gen ISA */
     GenEncoder *p;
     /*! Instruction selection on Gen ISA (pre-register allocation) */
@@ -168,7 +189,33 @@ namespace gbe
     /*! Indicate if we need to tackle a register pressure issue when
      * regenerating the code
      */
+    uint32_t reservedSpillRegs;
     bool limitRegisterPressure;
+    bool relaxMath;
+    const bool getIFENDIFFix(void) const { return ifEndifFix; }
+    void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
+    const CompileErrorCode getErrCode() { return errCode; }
+
+  protected:
+    virtual GenEncoder* generateEncoder(void) {
+      return GBE_NEW(GenEncoder, this->simdWidth, 7, deviceID);
+    }
+    /*! allocate a new curbe register and insert to curbe pool. */
+    void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
+
+  private:
+    CompileErrorCode errCode;
+    bool ifEndifFix;
+    /*! Build the curbe patch list for the given kernel */
+    void buildPatchList(void);
+    /*! Calc the group's slm offset from R0.0, to work around HSW SLM bug*/
+    virtual void emitSLMOffset(void) { };
+    /*! allocate group's slm offset in curbe, only for HSW */
+    virtual void allocSLMOffsetCurbe(void) { };
+    /*! new selection of device */
+    virtual void newSelection(void);
+    friend class GenRegAllocator;               //!< need to access errCode directly.
+
   };
 
 } /* namespace gbe */
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index ffa38c0..f0da50a 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -128,8 +128,9 @@ enum opcode {
   GEN_OPCODE_F32TO16 = 19,
   GEN_OPCODE_F16TO32 = 20,
   GEN_OPCODE_JMPI = 32,
+  GEN_OPCODE_BRD = 33,
   GEN_OPCODE_IF = 34,
-  GEN_OPCODE_IFF = 35,
+  GEN_OPCODE_BRC = 35,
   GEN_OPCODE_ELSE = 36,
   GEN_OPCODE_ENDIF = 37,
   GEN_OPCODE_DO = 38,
@@ -208,7 +209,8 @@ enum GenMessageTarget {
   GEN6_SFID_DATAPORT_SAMPLER_CACHE  = 4,
   GEN6_SFID_DATAPORT_RENDER_CACHE   = 5,
   GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
-  GEN_SFID_DATAPORT_DATA_CACHE     = 10,
+  GEN_SFID_DATAPORT_DATA_CACHE      = 10,
+  GEN_SFID_DATAPORT1_DATA_CACHE     = 12,
 };
 
 #define GEN_PREDICATE_NONE                    0
@@ -306,20 +308,46 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER_SIMD8    0
 #define GEN_BYTE_SCATTER_SIMD16   1
 
-/* Data port message type*/
-#define GEN_OBLOCK_READ           0 //0000: OWord Block Read
-#define GEN_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
-#define GEN_ODBLOCK_READ          2 //0010: OWord Dual Block Read
-#define GEN_DWORD_GATHER          3 //0011: DWord Scattered Read
-#define GEN_BYTE_GATHER           4 //0100: Byte Scattered Read
-#define GEN_UNTYPED_READ          5 //0101: Untyped Surface Read
-#define GEN_UNTYPED_ATOMIC_READ   6 //0110: Untyped Atomic Operation
-#define GEN_MEMORY_FENCE          7 //0111: Memory Fence
-#define GEN_OBLOCK_WRITE          8 //1000: OWord Block Write
-#define GEN_ODBLOCK_WRITE         10//1010: OWord Dual Block Write
-#define GEN_DWORD_SCATTER         11//1011: DWord Scattered Write
-#define GEN_BYTE_SCATTER          12//1100: Byte Scattered Write
-#define GEN_UNTYPED_WRITE         13//1101: Untyped Surface Write
+/* Data port message type for gen7*/
+#define GEN7_OBLOCK_READ           0 //0000: OWord Block Read
+#define GEN7_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
+#define GEN7_ODBLOCK_READ          2 //0010: OWord Dual Block Read
+#define GEN7_DWORD_GATHER          3 //0011: DWord Scattered Read
+#define GEN7_BYTE_GATHER           4 //0100: Byte Scattered Read
+#define GEN7_UNTYPED_READ          5 //0101: Untyped Surface Read
+#define GEN7_UNTYPED_ATOMIC_READ   6 //0110: Untyped Atomic Operation
+#define GEN7_MEMORY_FENCE          7 //0111: Memory Fence
+#define GEN7_OBLOCK_WRITE          8 //1000: OWord Block Write
+#define GEN7_ODBLOCK_WRITE         10//1010: OWord Dual Block Write
+#define GEN7_DWORD_SCATTER         11//1011: DWord Scattered Write
+#define GEN7_BYTE_SCATTER          12//1100: Byte Scattered Write
+#define GEN7_UNTYPED_WRITE         13//1101: Untyped Surface Write
+
+/* Data port0 message type for Gen75*/
+#define GEN75_P0_OBLOCK_READ            0 //0000: OWord Block Read
+#define GEN75_P0_UNALIGNED_OBLOCK_READ  1 //0001: Unaligned OWord Block Read
+#define GEN75_P0_ODBLOCK_READ           2 //0010: OWord Dual Block Read
+#define GEN75_P0_DWORD_GATHER           3 //0011: DWord Scattered Read
+#define GEN75_P0_BYTE_GATHER            4 //0100: Byte Scattered Read
+#define GEN75_P0_MEMORY_FENCE           7 //0111: Memory Fence
+#define GEN75_P0_OBLOCK_WRITE           8 //1000: OWord Block Write
+#define GEN75_P0_ODBLOCK_WRITE         10 //1010: OWord Dual Block Write
+#define GEN75_P0_DWORD_SCATTER         11 //1011: DWord Scattered Write
+#define GEN75_P0_BYTE_SCATTER          12 //1100: Byte Scattered Write
+
+/* Data port1 message type for Gen75*/
+#define GEN75_P1_UNTYPED_READ           1 //0001: Untyped Surface Read
+#define GEN75_P1_UNTYPED_ATOMIC_OP      2 //0010: Untyped Atomic Operation
+#define GEN75_P1_UNTYPED_ATOMIC_OP_4X2  3 //0011: Untyped Atomic Operation SIMD4x2
+#define GEN75_P1_MEDIA_BREAD            4 //0100: Media Block Read
+#define GEN75_P1_TYPED_SURFACE_READ     5 //0101: Typed Surface Read
+#define GEN75_P1_TYPED_ATOMIC_OP        6 //0110: Typed Atomic Operation
+#define GEN75_P1_TYPED_ATOMIC_OP_4X2    7 //0111: Typed Atomic Operation SIMD4x2
+#define GEN75_P1_UNTYPED_SURFACE_WRITE  9 //1001: Untyped Surface Write
+#define GEN75_P1_MEDIA_TYPED_BWRITE    10 //1010: Media Block Write
+#define GEN75_P1_ATOMIC_COUNTER        11 //1011: Atomic Counter Operation
+#define GEN75_P1_ATOMIC_COUNTER_4X2    12 //1100: Atomic Counter Operation 4X2
+#define GEN75_P1_TYPED_SURFACE_WRITE   13 //1101: Typed Surface Write
 
 /* Data port data cache scratch messages*/
 #define GEN_SCRATCH_READ                  0
@@ -368,9 +396,9 @@ enum GenMessageTarget {
 #define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE  1
 #define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
 #define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO            2
-#define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                3
-#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  3
-#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 3
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                7
+#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  7
+#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 7
 
 #define GEN5_SAMPLER_MESSAGE_SAMPLE              0
 #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
@@ -435,470 +463,511 @@ enum GenMessageTarget {
 #define GEN_MAX_GRF 128
 
 /* Instruction format for the execution units */
-struct GenInstruction
-{
-  struct {
-    uint32_t opcode:7;
-    uint32_t pad:1;
-    uint32_t access_mode:1;
-    uint32_t mask_control:1;
-    uint32_t dependency_control:2;
-    uint32_t quarter_control:2;
-    uint32_t thread_control:2;
-    uint32_t predicate_control:4;
-    uint32_t predicate_inverse:1;
-    uint32_t execution_size:3;
-    uint32_t destreg_or_condmod:4;
-    uint32_t acc_wr_control:1;
-    uint32_t cmpt_control:1;
-    uint32_t debug_control:1;
-    uint32_t saturate:1;
-  } header;
-
-  union {
-    struct {
-      uint32_t dest_reg_file:2;
-      uint32_t dest_reg_type:3;
-      uint32_t src0_reg_file:2;
-      uint32_t src0_reg_type:3;
-      uint32_t src1_reg_file:2;
-      uint32_t src1_reg_type:3;
-      uint32_t nib_ctrl:1;
-      uint32_t dest_subreg_nr:5;
-      uint32_t dest_reg_nr:8;
-      uint32_t dest_horiz_stride:2;
-      uint32_t dest_address_mode:1;
-    } da1;
 
-    struct {
-      uint32_t dest_reg_file:2;
-      uint32_t dest_reg_type:3;
-      uint32_t src0_reg_file:2;
-      uint32_t src0_reg_type:3;
-      uint32_t src1_reg_file:2;        /* 0x00000c00 */
-      uint32_t src1_reg_type:3;        /* 0x00007000 */
-      uint32_t nib_ctrl:1;
-      int dest_indirect_offset:10;        /* offset against the deref'd address reg */
-      uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
-      uint32_t dest_horiz_stride:2;
-      uint32_t dest_address_mode:1;
-    } ia1;
-
-    struct {
-      uint32_t dest_reg_file:2;
-      uint32_t dest_reg_type:3;
-      uint32_t src0_reg_file:2;
-      uint32_t src0_reg_type:3;
-      uint32_t src1_reg_file:2;
-      uint32_t src1_reg_type:3;
-      uint32_t nib_ctrl:1;
-      uint32_t dest_writemask:4;
-      uint32_t dest_subreg_nr:1;
-      uint32_t dest_reg_nr:8;
-      uint32_t dest_horiz_stride:2;
-      uint32_t dest_address_mode:1;
-    } da16;
-
-    struct {
-      uint32_t dest_reg_file:2;
-      uint32_t dest_reg_type:3;
-      uint32_t src0_reg_file:2;
-      uint32_t src0_reg_type:3;
-      uint32_t nib_ctrl:1;
-      uint32_t dest_writemask:4;
-      int dest_indirect_offset:6;
-      uint32_t dest_subreg_nr:3;
-      uint32_t dest_horiz_stride:2;
-      uint32_t dest_address_mode:1;
-    } ia16;
+struct GenInstruction {
+  uint32_t low;
+  uint32_t high;
+};
 
+union GenCompactInstruction {
+  struct GenInstruction low;
+  struct {
     struct {
-      uint32_t dest_reg_file:2;
-      uint32_t dest_reg_type:3;
-      uint32_t src0_reg_file:2;
-      uint32_t src0_reg_type:3;
-      uint32_t src1_reg_file:2;
-      uint32_t src1_reg_type:3;
+      uint32_t opcode:7;
+      uint32_t debug_control:1;
+      uint32_t control_index:5;
+      uint32_t data_type_index:5;
+      uint32_t sub_reg_index:5;
+      uint32_t acc_wr_control:1;
+      uint32_t destreg_or_condmod:4;
       uint32_t pad:1;
-      int jump_count:16;
-    } branch_gen6;
-
+      uint32_t cmpt_control:1;
+      uint32_t src0_index_lo:2;
+    } bits1;
     struct {
-      uint32_t dest_reg_file:1;
-      uint32_t flag_subreg_num:1;
-      uint32_t pad0:2;
-      uint32_t src0_abs:1;
-      uint32_t src0_negate:1;
-      uint32_t src1_abs:1;
-      uint32_t src1_negate:1;
-      uint32_t src2_abs:1;
-      uint32_t src2_negate:1;
-      uint32_t pad1:7;
-      uint32_t dest_writemask:4;
-      uint32_t dest_subreg_nr:3;
+      uint32_t src0_index_hi:3;
+      uint32_t src1_index:5;
       uint32_t dest_reg_nr:8;
-    } da3src;
-  } bits1;
-
-  union {
-    struct {
-      uint32_t src0_subreg_nr:5;
       uint32_t src0_reg_nr:8;
-      uint32_t src0_abs:1;
-      uint32_t src0_negate:1;
-      uint32_t src0_address_mode:1;
-      uint32_t src0_horiz_stride:2;
-      uint32_t src0_width:3;
-      uint32_t src0_vert_stride:4;
-      uint32_t flag_sub_reg_nr:1;
-      uint32_t flag_reg_nr:1;
-      uint32_t pad:5;
-    } da1;
-
-    struct {
-      int src0_indirect_offset:10;
-      uint32_t src0_subreg_nr:3;
-      uint32_t src0_abs:1;
-      uint32_t src0_negate:1;
-      uint32_t src0_address_mode:1;
-      uint32_t src0_horiz_stride:2;
-      uint32_t src0_width:3;
-      uint32_t src0_vert_stride:4;
-      uint32_t flag_sub_reg_nr:1;
-      uint32_t flag_reg_nr:1;
-      uint32_t pad:5;
-    } ia1;
-
-    struct {
-      uint32_t src0_swz_x:2;
-      uint32_t src0_swz_y:2;
-      uint32_t src0_subreg_nr:1;
-      uint32_t src0_reg_nr:8;
-      uint32_t src0_abs:1;
-      uint32_t src0_negate:1;
-      uint32_t src0_address_mode:1;
-      uint32_t src0_swz_z:2;
-      uint32_t src0_swz_w:2;
-      uint32_t pad0:1;
-      uint32_t src0_vert_stride:4;
-      uint32_t flag_sub_reg_nr:1;
-      uint32_t flag_reg_nr:1;
-      uint32_t pad:5;
-    } da16;
-
-    struct {
-      uint32_t src0_swz_x:2;
-      uint32_t src0_swz_y:2;
-      int src0_indirect_offset:6;
-      uint32_t src0_subreg_nr:3;
-      uint32_t src0_abs:1;
-      uint32_t src0_negate:1;
-      uint32_t src0_address_mode:1;
-      uint32_t src0_swz_z:2;
-      uint32_t src0_swz_w:2;
-      uint32_t pad0:1;
-      uint32_t src0_vert_stride:4;
-      uint32_t flag_sub_reg_nr:1;
-      uint32_t flag_reg_nr:1;
-      uint32_t pad:5;
-    } ia16;
-
-    struct {
-      uint32_t src0_rep_ctrl:1;
-      uint32_t src0_swizzle:8;
-      uint32_t src0_subreg_nr:3;
-      uint32_t src0_reg_nr:8;
-      uint32_t pad0:1;
-      uint32_t src1_rep_ctrl:1;
-      uint32_t src1_swizzle:8;
-      uint32_t src1_subreg_nr_low:2;
-    } da3src;
-  } bits2;
-
-  union {
-    struct {
-      uint32_t src1_subreg_nr:5;
       uint32_t src1_reg_nr:8;
-      uint32_t src1_abs:1;
-      uint32_t src1_negate:1;
-      uint32_t src1_address_mode:1;
-      uint32_t src1_horiz_stride:2;
-      uint32_t src1_width:3;
-      uint32_t src1_vert_stride:4;
-      uint32_t pad0:7;
-    } da1;
-
-    struct {
-      uint32_t src1_swz_x:2;
-      uint32_t src1_swz_y:2;
-      uint32_t src1_subreg_nr:1;
-      uint32_t src1_reg_nr:8;
-      uint32_t src1_abs:1;
-      uint32_t src1_negate:1;
-      uint32_t src1_address_mode:1;
-      uint32_t src1_swz_z:2;
-      uint32_t src1_swz_w:2;
-      uint32_t pad1:1;
-      uint32_t src1_vert_stride:4;
-      uint32_t pad2:7;
-    } da16;
-
-    struct {
-      int  src1_indirect_offset:10;
-      uint32_t src1_subreg_nr:3;
-      uint32_t src1_abs:1;
-      uint32_t src1_negate:1;
-      uint32_t src1_address_mode:1;
-      uint32_t src1_horiz_stride:2;
-      uint32_t src1_width:3;
-      uint32_t src1_vert_stride:4;
-      uint32_t pad1:7;
-    } ia1;
-
-    struct {
-      uint32_t src1_swz_x:2;
-      uint32_t src1_swz_y:2;
-      int  src1_indirect_offset:6;
-      uint32_t src1_subreg_nr:3;
-      uint32_t src1_abs:1;
-      uint32_t src1_negate:1;
-      uint32_t pad0:1;
-      uint32_t src1_swz_z:2;
-      uint32_t src1_swz_w:2;
-      uint32_t pad1:1;
-      uint32_t src1_vert_stride:4;
-      uint32_t pad2:7;
-    } ia16;
-
-    struct {
-      uint32_t function_control:19;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad1:2;
-      uint32_t end_of_thread:1;
-    } generic_gen5;
-
-    struct {
-      uint32_t sub_function_id:3;
-      uint32_t pad0:11;
-      uint32_t ack_req:1;
-      uint32_t notify:2;
-      uint32_t pad1:2;
-      uint32_t header:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } msg_gateway;
-
-    struct {
-      uint32_t opcode:1;
-      uint32_t request:1;
-      uint32_t pad0:2;
-      uint32_t resource:1;
-      uint32_t pad1:14;
-      uint32_t header:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } spawner_gen5;
-
-    /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
-    struct {
-      uint32_t function:4;
-      uint32_t int_type:1;
-      uint32_t precision:1;
-      uint32_t saturate:1;
-      uint32_t data_type:1;
-      uint32_t snapshot:1;
-      uint32_t pad0:10;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad1:2;
-      uint32_t end_of_thread:1;
-    } math_gen5;
+    } bits2;
+  };
+};
 
+union GenNativeInstruction
+{
+  struct {
+    struct GenInstruction low;
+    struct GenInstruction high;
+  };
+  struct {
     struct {
-      uint32_t bti:8;
-      uint32_t sampler:4;
-      uint32_t msg_type:5;
-      uint32_t simd_mode:2;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad1:2;
-      uint32_t end_of_thread:1;
-    } sampler_gen7;
-
-    /**
-     * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
-     *
-     * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
-     **/
-    struct {
-      uint32_t bti:8;
-      uint32_t msg_control:5;
-      uint32_t msg_type:3;
-      uint32_t pad0:3;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad1:2;
-      uint32_t end_of_thread:1;
-    } gen6_dp_sampler_const_cache;
-
-    /*! Data port untyped read / write messages */
-    struct {
-      uint32_t bti:8;
-      uint32_t rgba:4;
-      uint32_t simd_mode:2;
-      uint32_t msg_type:4;
-      uint32_t category:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } gen7_untyped_rw;
-
-    /*! Data port byte scatter / gather */
-    struct {
-      uint32_t bti:8;
-      uint32_t simd_mode:1;
-      uint32_t ignored0:1;
-      uint32_t data_size:2;
-      uint32_t ignored1:2;
-      uint32_t msg_type:4;
-      uint32_t category:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } gen7_byte_rw;
-
-    /*! Data port Scratch Read/ write */
-    struct {
-      uint32_t offset:12;
-      uint32_t block_size:2;
-      uint32_t ignored0:1;
-      uint32_t invalidate_after_read:1;
-      uint32_t channel_mode:1;
-      uint32_t msg_type:1;
-      uint32_t category:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } gen7_scratch_rw;
-
-    /*! Data port OBlock read / write */
-    struct {
-      uint32_t bti:8;
-      uint32_t block_size:3;
-      uint32_t ignored:2;
-      uint32_t invalidate_after_read:1;
-      uint32_t msg_type:4;
-      uint32_t category:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } gen7_oblock_rw;
-
-    /*! Data port dword scatter / gather */
-    struct {
-      uint32_t bti:8;
-      uint32_t block_size:2;
-      uint32_t ignored0:3;
-      uint32_t invalidate_after_read:1;
-      uint32_t msg_type:4;
-      uint32_t ignored1:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad2:2;
-      uint32_t end_of_thread:1;
-    } gen7_dword_rw;
-
-    /*! Data port typed read / write messages */
-    struct {
-      uint32_t bti:8;
-      uint32_t chan_mask:4;
+      uint32_t opcode:7;
       uint32_t pad:1;
-      uint32_t slot:1;
-      uint32_t msg_type:4;
-      uint32_t pad2:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad3:2;
-      uint32_t end_of_thread:1;
-    } gen7_typed_rw;
-
-    /*! Memory fence */
-    struct {
-      uint32_t bti:8;
-      uint32_t pad:5;
-      uint32_t commit_enable:1;
-      uint32_t msg_type:4;
-      uint32_t pad2:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad3:2;
-      uint32_t end_of_thread:1;
-    } gen7_memory_fence;
-
-    /*! atomic messages */
-    struct {
-      uint32_t bti:8;
-      uint32_t aop_type:4;
-      uint32_t simd_mode:1;
-      uint32_t return_data:1;
-      uint32_t msg_type:4;
-      uint32_t category:1;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad3:2;
-      uint32_t end_of_thread:1;
-    } gen7_atomic_op;
-
-    struct {
-      uint32_t src1_subreg_nr_high:1;
-      uint32_t src1_reg_nr:8;
-      uint32_t pad0:1;
-      uint32_t src2_rep_ctrl:1;
-      uint32_t src2_swizzle:8;
-      uint32_t src2_subreg_nr:3;
-      uint32_t src2_reg_nr:8;
-      uint32_t pad1:2;
-    } da3src;
-
-    /*! Message gateway */
-    struct {
-      uint32_t subfunc:3;
-      uint32_t pad:11;
-      uint32_t ackreq:1;
-      uint32_t notify:2;
-      uint32_t pad2:2;
-      uint32_t header_present:1;
-      uint32_t response_length:5;
-      uint32_t msg_length:4;
-      uint32_t pad3:2;
-      uint32_t end_of_thread:1;
-    } gen7_msg_gw;
-
-    int d;
-    uint32_t ud;
-    float f;
-  } bits3;
+      uint32_t access_mode:1;
+      uint32_t mask_control:1;
+      uint32_t dependency_control:2;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t destreg_or_condmod:4;
+      uint32_t acc_wr_control:1;
+      uint32_t cmpt_control:1;
+      uint32_t debug_control:1;
+      uint32_t saturate:1;
+    } header;
+
+    union {
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_subreg_nr:5;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;        /* 0x00000c00 */
+        uint32_t src1_reg_type:3;        /* 0x00007000 */
+        uint32_t nib_ctrl:1;
+        int dest_indirect_offset:10;        /* offset against the deref'd address reg */
+        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:1;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_writemask:4;
+        int dest_indirect_offset:6;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t pad:1;
+        int jump_count:16;
+      } branch_gen6;
+
+      struct {
+        uint32_t dest_reg_file:1;
+        uint32_t flag_subreg_num:1;
+        uint32_t pad0:2;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src2_abs:1;
+        uint32_t src2_negate:1;
+        uint32_t pad1:7;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_reg_nr:8;
+      } da3src;
+    } bits1;
+
+    union {
+      struct {
+        uint32_t src0_subreg_nr:5;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da1;
+
+      struct {
+        int src0_indirect_offset:10;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia1;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        uint32_t src0_subreg_nr:1;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da16;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        int src0_indirect_offset:6;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia16;
+
+      struct {
+        uint32_t src0_rep_ctrl:1;
+        uint32_t src0_swizzle:8;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src1_rep_ctrl:1;
+        uint32_t src1_swizzle:8;
+        uint32_t src1_subreg_nr_low:2;
+      } da3src;
+    } bits2;
+
+    union {
+      struct {
+        uint32_t src1_subreg_nr:5;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad0:7;
+      } da1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        uint32_t src1_subreg_nr:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } da16;
+
+      struct {
+        int  src1_indirect_offset:10;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad1:7;
+      } ia1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        int  src1_indirect_offset:6;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t pad0:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } ia16;
+
+      struct {
+        uint32_t function_control:19;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } generic_gen5;
+
+      struct {
+        uint32_t sub_function_id:3;
+        uint32_t pad0:11;
+        uint32_t ack_req:1;
+        uint32_t notify:2;
+        uint32_t pad1:2;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } msg_gateway;
+
+      struct {
+        uint32_t opcode:1;
+        uint32_t request:1;
+        uint32_t pad0:2;
+        uint32_t resource:1;
+        uint32_t pad1:14;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } spawner_gen5;
+
+      /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
+      struct {
+        uint32_t function:4;
+        uint32_t int_type:1;
+        uint32_t precision:1;
+        uint32_t saturate:1;
+        uint32_t data_type:1;
+        uint32_t snapshot:1;
+        uint32_t pad0:10;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } math_gen5;
+
+      struct {
+        uint32_t bti:8;
+        uint32_t sampler:4;
+        uint32_t msg_type:5;
+        uint32_t simd_mode:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } sampler_gen7;
+
+      /**
+       * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
+       *
+       * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
+       **/
+      struct {
+        uint32_t bti:8;
+        uint32_t msg_control:5;
+        uint32_t msg_type:3;
+        uint32_t pad0:3;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } gen6_dp_sampler_const_cache;
+
+      /*! Data port untyped read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t rgba:4;
+        uint32_t simd_mode:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_untyped_rw;
+
+      /*! Data port byte scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t simd_mode:1;
+        uint32_t ignored0:1;
+        uint32_t data_size:2;
+        uint32_t ignored1:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_byte_rw;
+
+      /*! Data port Scratch Read/ write */
+      struct {
+        uint32_t offset:12;
+        uint32_t block_size:2;
+        uint32_t ignored0:1;
+        uint32_t invalidate_after_read:1;
+        uint32_t channel_mode:1;
+        uint32_t msg_type:1;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_scratch_rw;
+
+      /*! Data port OBlock read / write */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:3;
+        uint32_t ignored:2;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_oblock_rw;
+
+      /*! Data port dword scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:2;
+        uint32_t ignored0:3;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t ignored1:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_dword_rw;
+
+      /*! Data port typed read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t chan_mask:4;
+        uint32_t slot:2;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_typed_rw;
+
+      /*! Memory fence */
+      struct {
+        uint32_t bti:8;
+        uint32_t pad:5;
+        uint32_t commit_enable:1;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_memory_fence;
+
+      /*! atomic messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t aop_type:4;
+        uint32_t simd_mode:1;
+        uint32_t return_data:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_atomic_op;
+
+      struct {
+        uint32_t src1_subreg_nr_high:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src2_rep_ctrl:1;
+        uint32_t src2_swizzle:8;
+        uint32_t src2_subreg_nr:3;
+        uint32_t src2_reg_nr:8;
+        uint32_t pad1:2;
+      } da3src;
+
+      /*! Message gateway */
+      struct {
+        uint32_t subfunc:3;
+        uint32_t pad:11;
+        uint32_t ackreq:1;
+        uint32_t notify:2;
+        uint32_t pad2:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_msg_gw;
+
+      struct {
+        uint32_t jip:16;
+        uint32_t uip:16;
+      } gen7_branch;
+
+      int d;
+      uint32_t ud;
+      float f;
+    } bits3;
+  };
 };
 
 #endif /* __GEN_DEFS_HPP__ */
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index aaf7dce..182752a 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -51,8 +51,11 @@
 #include "backend/gen_encoder.hpp"
 #include <cstring>
 
+
 namespace gbe
 {
+  extern bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split);
+  extern bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split);
   //////////////////////////////////////////////////////////////////////////
   // Some helper functions to encode
   //////////////////////////////////////////////////////////////////////////
@@ -65,14 +68,17 @@ namespace gbe
   }
 
   INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 || src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
     if (isVectorOfBytes(dst) == true) return true;
     if (isVectorOfBytes(src) == true) return true;
     return false;
   }
 
   INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 ||
+         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
     if (isVectorOfBytes(dst) == true) return true;
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
@@ -80,7 +86,10 @@ namespace gbe
   }
 
   INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1) {
-    if (p->curr.execWidth != 16) return false;
+    if (p->curr.execWidth != 16 ||
+         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
     if (isVectorOfBytes(src0) == true) return true;
     if (isVectorOfBytes(src1) == true) return true;
     if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F)
@@ -90,15 +99,11 @@ namespace gbe
     return false;
   }
 
-  static void setMessageDescriptor(GenEncoder *p,
-                                   GenInstruction *inst,
-                                   enum GenMessageTarget sfid,
-                                   unsigned msg_length,
-                                   unsigned response_length,
-                                   bool header_present = false,
-                                   bool end_of_thread = false)
+  void GenEncoder::setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+                                        unsigned msg_length, unsigned response_length,
+                                        bool header_present, bool end_of_thread)
   {
-     p->setSrc1(inst, GenRegister::immd(0));
+     setSrc1(inst, GenRegister::immd(0));
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
@@ -106,29 +111,35 @@ namespace gbe
      inst->header.destreg_or_condmod = sfid;
   }
 
-  static void setDPUntypedRW(GenEncoder *p,
-                             GenInstruction *insn,
-                             uint32_t bti,
-                             uint32_t rgba,
-                             uint32_t msg_type,
-                             uint32_t msg_length,
-                             uint32_t response_length)
+  void GenEncoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                        unsigned char msg_type, uint32_t msg_length,
+                                        bool header_present)
+  {
+    const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+    insn->bits3.gen7_typed_rw.bti = bti;
+    insn->bits3.gen7_typed_rw.msg_type = msg_type;
+  }
+
+  void GenEncoder::setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti,
+                                  uint32_t rgba, uint32_t msg_type,
+                                  uint32_t msg_length, uint32_t response_length)
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-    setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_untyped_rw.msg_type = msg_type;
     insn->bits3.gen7_untyped_rw.bti = bti;
     insn->bits3.gen7_untyped_rw.rgba = rgba;
-    if (p->curr.execWidth == 8)
+    if (curr.execWidth == 8)
       insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
-    else if (p->curr.execWidth == 16)
+    else if (curr.execWidth == 16)
       insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
     else
       NOT_SUPPORTED;
   }
 
   static void setDPByteScatterGather(GenEncoder *p,
-                                     GenInstruction *insn,
+                                     GenNativeInstruction *insn,
                                      uint32_t bti,
                                      uint32_t elem_size,
                                      uint32_t msg_type,
@@ -136,7 +147,7 @@ namespace gbe
                                      uint32_t response_length)
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-    setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_byte_rw.msg_type = msg_type;
     insn->bits3.gen7_byte_rw.bti = bti;
     insn->bits3.gen7_byte_rw.data_size = elem_size;
@@ -149,7 +160,7 @@ namespace gbe
   }
 #if 0
   static void setOBlockRW(GenEncoder *p,
-                          GenInstruction *insn,
+                          GenNativeInstruction *insn,
                           uint32_t bti,
                           uint32_t size,
                           uint32_t msg_type,
@@ -157,7 +168,7 @@ namespace gbe
                           uint32_t response_length)
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-    setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     assert(size == 2 || size == 4);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
     insn->bits3.gen7_oblock_rw.bti = bti;
@@ -167,7 +178,7 @@ namespace gbe
 #endif
 
   static void setSamplerMessage(GenEncoder *p,
-                                GenInstruction *insn,
+                                GenNativeInstruction *insn,
                                 unsigned char bti,
                                 unsigned char sampler,
                                 uint32_t msg_type,
@@ -178,36 +189,27 @@ namespace gbe
                                 uint32_t return_format)
   {
      const GenMessageTarget sfid = GEN_SFID_SAMPLER;
-     setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
      insn->bits3.sampler_gen7.bti = bti;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   }
 
-
-  static void setTypedWriteMessage(GenEncoder *p,
-                                   GenInstruction *insn,
-                                   unsigned char bti,
-                                   unsigned char msg_type,
-                                   uint32_t msg_length,
-                                   bool header_present)
-  {
-     const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
-     setMessageDescriptor(p, insn, sfid, msg_length, 0, header_present);
-     insn->bits3.gen7_typed_rw.bti = bti;
-     insn->bits3.gen7_typed_rw.msg_type = msg_type;
-  }
   static void setDWordScatterMessgae(GenEncoder *p,
-                                     GenInstruction *insn,
+                                     GenNativeInstruction *insn,
                                      uint32_t bti,
                                      uint32_t block_size,
                                      uint32_t msg_type,
                                      uint32_t msg_length,
                                      uint32_t response_length)
   {
-    const GenMessageTarget sfid = GEN6_SFID_DATAPORT_CONSTANT_CACHE;
-    setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+    // FIXME there is a unknown issue with baytrail-t platform, the DWORD scatter
+    // message causes a hang at unit test case compiler_global_constant.
+    // We workaround it to use DATA CACHE instead.
+    const GenMessageTarget sfid = (p->deviceID == PCI_CHIP_BAYTRAIL_T) ?
+                                 GEN_SFID_DATAPORT_DATA_CACHE : GEN6_SFID_DATAPORT_CONSTANT_CACHE;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_dword_rw.msg_type = msg_type;
     insn->bits3.gen7_dword_rw.bti = bti;
     insn->bits3.gen7_dword_rw.block_size = block_size;
@@ -216,9 +218,10 @@ namespace gbe
   //////////////////////////////////////////////////////////////////////////
   // Gen Emitter encoding class
   //////////////////////////////////////////////////////////////////////////
-  GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen) :
-    stateNum(0), gen(gen)
+  GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID) :
+    stateNum(0), gen(gen), deviceID(deviceID)
   {
+    this->simdWidth = simdWidth;
     this->curr.execWidth = simdWidth;
     this->curr.quarterControl = GEN_COMPRESSION_Q1;
     this->curr.noMask = 0;
@@ -238,11 +241,13 @@ namespace gbe
     curr = stack[--stateNum];
   }
 
-  void GenEncoder::setHeader(GenInstruction *insn) {
+  void GenEncoder::setHeader(GenNativeInstruction *insn) {
     if (this->curr.execWidth == 8)
       insn->header.execution_size = GEN_WIDTH_8;
     else if (this->curr.execWidth == 16)
       insn->header.execution_size = GEN_WIDTH_16;
+    else if (this->curr.execWidth == 4)
+      insn->header.execution_size = GEN_WIDTH_4;
     else if (this->curr.execWidth == 1)
       insn->header.execution_size = GEN_WIDTH_1;
     else
@@ -260,7 +265,7 @@ namespace gbe
     insn->header.saturate = this->curr.saturate;
   }
 
-  void GenEncoder::setDst(GenInstruction *insn, GenRegister dest) {
+  void GenEncoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
      if (dest.file != GEN_ARCHITECTURE_REGISTER_FILE)
         assert(dest.nr < 128);
 
@@ -269,12 +274,18 @@ namespace gbe
      insn->bits1.da1.dest_address_mode = dest.address_mode;
      insn->bits1.da1.dest_reg_nr = dest.nr;
      insn->bits1.da1.dest_subreg_nr = dest.subnr;
-     if (dest.hstride == GEN_HORIZONTAL_STRIDE_0)
-       dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+     if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) {
+       if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
+         dest.hstride = GEN_HORIZONTAL_STRIDE_4;
+       else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
+         dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+       else
+         dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+     }
      insn->bits1.da1.dest_horiz_stride = dest.hstride;
   }
 
-  void GenEncoder::setSrc0(GenInstruction *insn, GenRegister reg) {
+  void GenEncoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
      if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
         assert(reg.nr < 128);
 
@@ -327,7 +338,7 @@ namespace gbe
     }
   }
 
-  void GenEncoder::setSrc1(GenInstruction *insn, GenRegister reg) {
+  void GenEncoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
      assert(reg.nr < 128);
      assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
 
@@ -371,78 +382,8 @@ namespace gbe
     0
   };
 
-  void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
-    src = GenRegister::retype(src, GEN_TYPE_UD);
-    addr = GenRegister::retype(addr, GEN_TYPE_UD);
-    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
-    uint32_t originSimdWidth = curr.execWidth;
-    uint32_t originPredicate = curr.predicate;
-    uint32_t originMask = curr.noMask;
-    push();
-    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
-          channels < originSimdWidth; channels += 8, currQuarter++) {
-      curr.predicate = GEN_PREDICATE_NONE;
-      curr.noMask = GEN_MASK_DISABLE;
-      curr.execWidth = 8;
-      /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
-         which is what we want here. */
-      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
-      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
-      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
-      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
-      // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
-      curr.execWidth = 16;
-      this->UNTYPED_READ(tmp, addr, bti, elemNum);
-      if (originSimdWidth == 16)
-        curr.quarterControl = currQuarter;
-      curr.predicate = originPredicate;
-      curr.noMask = originMask;
-      // Back to simd8 for correct predication flag.
-      curr.execWidth = 8;
-      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
-    }
-    pop();
-  }
-
-  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
-    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
-    GenRegister unpacked;
-    msg = GenRegister::retype(msg, GEN_TYPE_UD);
-    int originSimdWidth = curr.execWidth;
-    int originPredicate = curr.predicate;
-    int originMask = curr.noMask;
-    push();
-    for (uint32_t half = 0; half < 2; half++) {
-      curr.predicate = GEN_PREDICATE_NONE;
-      curr.noMask = GEN_MASK_DISABLE;
-      curr.execWidth = 8;
-      if (is_scalar) {
-        unpacked = data32;
-        unpacked.subnr += half * 4;
-      } else
-        unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
-      MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
-      if (originSimdWidth == 16) {
-        if (is_scalar) {
-          unpacked = data32;
-          unpacked.subnr += half * 4;
-        } else
-          unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
-        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
-        curr.execWidth = 16;
-      }
-      if (half == 1)
-        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
-      curr.predicate = originPredicate;
-      curr.noMask = originMask;
-      this->UNTYPED_WRITE(msg, bti, elemNum);
-    }
-    pop();
-  }
-
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -459,17 +400,16 @@ namespace gbe
     this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this,
-                   insn,
+    setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
-                   GEN_UNTYPED_READ,
+                   GEN7_UNTYPED_READ,
                    msg_length,
                    response_length);
   }
 
   void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
@@ -485,17 +425,16 @@ namespace gbe
       NOT_IMPLEMENTED;
     this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this,
-                   insn,
+    setDPUntypedRW(insn,
                    bti,
                    untypedRWMask[elemNum],
-                   GEN_UNTYPED_WRITE,
+                   GEN7_UNTYPED_WRITE,
                    msg_length,
                    response_length);
   }
 
   void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     if (this->curr.execWidth == 8) {
@@ -515,13 +454,13 @@ namespace gbe
                            insn,
                            bti,
                            elemSize,
-                           GEN_BYTE_GATHER,
+                           GEN7_BYTE_GATHER,
                            msg_length,
                            response_length);
   }
 
   void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     this->setHeader(insn);
@@ -539,13 +478,13 @@ namespace gbe
                            insn,
                            bti,
                            elemSize,
-                           GEN_BYTE_SCATTER,
+                           GEN7_BYTE_SCATTER,
                            msg_length,
                            response_length);
   }
 
   void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
     uint32_t block_size = 0;
@@ -568,14 +507,14 @@ namespace gbe
                            insn,
                            bti,
                            block_size,
-                           GEN_DWORD_GATHER,
+                           GEN7_DWORD_GATHER,
                            msg_length,
                            response_length);
 
   }
 
   void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
 
@@ -594,8 +533,8 @@ namespace gbe
     this->setSrc1(insn, GenRegister::immud(0));
 
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-    setMessageDescriptor(this, insn, sfid, msg_length, response_length);
-    insn->bits3.gen7_atomic_op.msg_type = GEN_UNTYPED_ATOMIC_READ;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
     insn->bits3.gen7_atomic_op.bti = bti;
     insn->bits3.gen7_atomic_op.return_data = 1;
     insn->bits3.gen7_atomic_op.aop_type = function;
@@ -608,21 +547,30 @@ namespace gbe
       NOT_SUPPORTED;
 
   }
+  GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
+    GenCompactInstruction insn;
+    std::memset(&insn, 0, sizeof(GenCompactInstruction));
+    insn.bits1.opcode = opcode;
+    this->store.push_back(insn.low);
+    return (GenCompactInstruction *)&this->store.back();
+  }
 
-  GenInstruction *GenEncoder::next(uint32_t opcode) {
-     GenInstruction insn;
-     std::memset(&insn, 0, sizeof(GenInstruction));
+  GenNativeInstruction *GenEncoder::next(uint32_t opcode) {
+     GenNativeInstruction insn;
+     std::memset(&insn, 0, sizeof(GenNativeInstruction));
      insn.header.opcode = opcode;
-     this->store.push_back(insn);
-     return &this->store.back();
+     this->store.push_back(insn.low);
+     this->store.push_back(insn.high);
+     return (GenNativeInstruction *)(&this->store.back()-1);
   }
 
   INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
                             GenRegister src0, GenRegister src1 = GenRegister::null()) {
        int w = p->curr.execWidth;
        p->push();
+       p->curr.execWidth = p->getDoubleExecWidth();
        p->curr.nibControl = 0;
-       GenInstruction *insn = p->next(opcode);
+       GenNativeInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
        p->setSrc0(insn, src0);
@@ -661,28 +609,27 @@ namespace gbe
       }
   }
 
-  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
+  void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src, uint32_t condition) {
      if (dst.isdf() && src.isdf()) {
        handleDouble(p, opcode, dst, src);
      } else if (dst.isint64() && src.isint64()) { // handle int64
-       int execWidth = p->curr.execWidth;
-       p->push();
-       p->curr.execWidth = 8;
-       for (int nib = 0; nib < execWidth / 4; nib ++) {
-         p->curr.chooseNib(nib);
-         p->MOV(dst.bottom_half(), src.bottom_half());
-         p->MOV(dst.top_half(), src.top_half());
-         dst = GenRegister::suboffset(dst, 4);
-         src = GenRegister::suboffset(src, 4);
-       }
-       p->pop();
+       p->MOV(dst.bottom_half(), src.bottom_half());
+       p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
      } else if (needToSplitAlu1(p, dst, src) == false) {
-       GenInstruction *insn = p->next(opcode);
+      if(compactAlu1(p, opcode, dst, src, condition, false))
+        return;
+       GenNativeInstruction *insn = p->next(opcode);
+       if (condition != 0) {
+         GBE_ASSERT(opcode == GEN_OPCODE_MOV ||
+                    opcode == GEN_OPCODE_NOT);
+         insn->header.destreg_or_condmod = condition;
+       }
        p->setHeader(insn);
        p->setDst(insn, dst);
        p->setSrc0(insn, src);
      } else {
-       GenInstruction *insnQ1, *insnQ2;
+       GenNativeInstruction *insnQ1, *insnQ2;
 
        // Instruction for the first quarter
        insnQ1 = p->next(opcode);
@@ -702,22 +649,31 @@ namespace gbe
      }
   }
 
-  INLINE void alu2(GenEncoder *p,
-                   uint32_t opcode,
-                   GenRegister dst,
-                   GenRegister src0,
-                   GenRegister src1)
+  void alu2(GenEncoder *p,
+            uint32_t opcode,
+            GenRegister dst,
+            GenRegister src0,
+            GenRegister src1,
+            uint32_t condition)
   {
     if (dst.isdf() && src0.isdf() && src1.isdf()) {
        handleDouble(p, opcode, dst, src0, src1);
     } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
-       GenInstruction *insn = p->next(opcode);
+       if(compactAlu2(p, opcode, dst, src0, src1, condition, false))
+         return;
+       GenNativeInstruction *insn = p->next(opcode);
+       if (condition != 0) {
+         GBE_ASSERT(opcode == GEN_OPCODE_OR ||
+                    opcode == GEN_OPCODE_XOR ||
+                    opcode == GEN_OPCODE_AND);
+         insn->header.destreg_or_condmod = condition;
+       }
        p->setHeader(insn);
        p->setDst(insn, dst);
        p->setSrc0(insn, src0);
        p->setSrc1(insn, src1);
     } else {
-       GenInstruction *insnQ1, *insnQ2;
+       GenNativeInstruction *insnQ1, *insnQ2;
 
        // Instruction for the first quarter
        insnQ1 = p->next(opcode);
@@ -741,14 +697,14 @@ namespace gbe
 
 #define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
 
-  static GenInstruction *alu3(GenEncoder *p,
+  static GenNativeInstruction *alu3(GenEncoder *p,
                               uint32_t opcode,
                               GenRegister dest,
                               GenRegister src0,
                               GenRegister src1,
                               GenRegister src2)
   {
-     GenInstruction *insn = p->next(opcode);
+     GenNativeInstruction *insn = p->next(opcode);
 
      assert(dest.file == GEN_GENERAL_REGISTER_FILE);
      assert(dest.nr < 128);
@@ -798,7 +754,7 @@ namespace gbe
 
      // Emit second half of the instruction
      if (p->curr.execWidth == 16) {
-      GenInstruction q1Insn = *insn;
+      GenNativeInstruction q1Insn = *insn;
       insn = p->next(opcode);
       *insn = q1Insn;
       insn->header.quarter_control = GEN_COMPRESSION_Q2;
@@ -817,15 +773,21 @@ namespace gbe
 #undef NO_SWIZZLE
 
 #define ALU1(OP) \
-  void GenEncoder::OP(GenRegister dest, GenRegister src0) { \
-    alu1(this, GEN_OPCODE_##OP, dest, src0); \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \
+    alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \
   }
 
 #define ALU2(OP) \
   void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \
-    alu2(this, GEN_OPCODE_##OP, dest, src0, src1); \
+    alu2(this, GEN_OPCODE_##OP, dest, src0, src1, 0); \
+  }
+
+#define ALU2_MOD(OP) \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition) { \
+    alu2(this, GEN_OPCODE_##OP, dest, src0, src1, condition); \
   }
 
+
 #define ALU3(OP) \
   void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \
     alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
@@ -837,6 +799,7 @@ namespace gbe
     GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
     push();
     curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
     curr.execWidth = 1;
     MOV(r, GenRegister::immud(u.u[1]));
     MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
@@ -885,51 +848,49 @@ namespace gbe
 
   void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
     GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
-    int execWidth = curr.execWidth;
-    push();
-    curr.execWidth = 8;
-    for(int nib = 0; nib < execWidth/4; nib ++) {
-      curr.chooseNib(nib);
-      MOV(dest.top_half(), u1);
-      MOV(dest.bottom_half(), u0);
-      dest = GenRegister::suboffset(dest, 4);
-    }
-    pop();
+    MOV(dest.bottom_half(), u0);
+    MOV(dest.top_half(this->simdWidth), u1);
   }
 
   void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
     int w = curr.execWidth;
-    if (src0.isdf()) {
-      GBE_ASSERT(0); // MOV DF is called from convert instruction,
-                     // We should never convert a df to a df.
+    GenRegister r0;
+    int factor = 1;
+    if (dest.type == GEN_TYPE_F) {
+      r0 = r;
+      r = GenRegister::h2(r);
+      factor = 2;
     } else {
-      GenRegister r0 = GenRegister::h2(r);
+      r0 = GenRegister::h2(r);
+    }
+    push();
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    MOV(r0, src0);
+    MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 4));
+    curr.noMask = 0;
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dest, r);
+    curr.nibControl = 1;
+    MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8 / factor));
+    pop();
+    if (w == 16) {
       push();
       curr.execWidth = 8;
       curr.predicate = GEN_PREDICATE_NONE;
-      MOV(r0, src0);
-      MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
-      curr.predicate = GEN_PREDICATE_NORMAL;
-      curr.quarterControl = 0;
+      curr.noMask = 1;
+      MOV(r0, GenRegister::suboffset(src0, 8));
+      MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 12));
+      curr.noMask = 0;
+      curr.quarterControl = 1;
       curr.nibControl = 0;
-      MOV(dest, r);
+      MOV(GenRegister::suboffset(dest, 8), r);
       curr.nibControl = 1;
-      MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
+      MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8 / factor));
       pop();
-      if (w == 16) {
-        push();
-        curr.execWidth = 8;
-        curr.predicate = GEN_PREDICATE_NONE;
-        MOV(r0, GenRegister::suboffset(src0, 8));
-        MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
-        curr.predicate = GEN_PREDICATE_NORMAL;
-        curr.quarterControl = 1;
-        curr.nibControl = 0;
-        MOV(GenRegister::suboffset(dest, 8), r);
-        curr.nibControl = 1;
-        MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8));
-        pop();
-      }
     }
   }
 
@@ -944,9 +905,9 @@ namespace gbe
   ALU1(F32TO16)
   ALU2(SEL)
   ALU1(NOT)
-  ALU2(AND)
-  ALU2(OR)
-  ALU2(XOR)
+  ALU2_MOD(AND)
+  ALU2_MOD(OR)
+  ALU2_MOD(XOR)
   ALU2(SHR)
   ALU2(SHL)
   ALU2(RSR)
@@ -959,6 +920,9 @@ namespace gbe
   ALU2(PLN)
   ALU2(MACH)
   ALU3(MAD)
+ // ALU2(BRC)
+ // ALU1(ENDIF)
+ //  ALU1(IF)
 
   void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
     push();
@@ -1023,42 +987,66 @@ namespace gbe
 
 
   void GenEncoder::NOP(void) {
-    GenInstruction *insn = this->next(GEN_OPCODE_NOP);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_NOP);
     this->setDst(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
     this->setSrc0(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
     this->setSrc1(insn, GenRegister::immud(0x0));
   }
 
   void GenEncoder::BARRIER(GenRegister src) {
-     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
      this->setHeader(insn);
      this->setDst(insn, GenRegister::null());
      this->setSrc0(insn, src);
-     setMessageDescriptor(this, insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
+     setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
      insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
      insn->bits3.msg_gateway.notify = 0x1;
   }
   void GenEncoder::FENCE(GenRegister dst) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setHeader(insn);
     this->setDst(insn, dst);
     this->setSrc0(insn, dst);
-    setMessageDescriptor(this, insn, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
+    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
     insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
     insn->bits3.gen7_memory_fence.commit_enable = 0x1;
   }
 
-  void GenEncoder::JMPI(GenRegister src) {
+  void GenEncoder::JMPI(GenRegister src, bool longjmp) {
     alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
-    NOP();
+    if (longjmp)
+      NOP();
   }
 
+#define ALU2_BRA(OP) \
+  void GenEncoder::OP(GenRegister src) { \
+    alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
+  }
+
+  ALU2_BRA(IF)
+  ALU2_BRA(ENDIF)
+  ALU2_BRA(BRD)
+  ALU2_BRA(BRC)
+
   void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
-    GenInstruction &insn = this->store[insnID];
+    GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
     GBE_ASSERT(insnID < this->store.size());
-    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI);
-    if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
-        this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+               insn.header.opcode == GEN_OPCODE_BRD  ||
+               insn.header.opcode == GEN_OPCODE_ENDIF ||
+               insn.header.opcode == GEN_OPCODE_IF ||
+               insn.header.opcode == GEN_OPCODE_BRC);
+
+    if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768))  {
+           if (insn.header.opcode == GEN_OPCODE_IF) {
+             this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+             return;
+           }
+           else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+             jumpDistance = jumpDistance - 2;
+           }
+
+           this->setSrc1(&insn, GenRegister::immd(jumpDistance));
     } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
       // For the conditional jump distance out of S15 range, we need to use an
       // inverted jmp followed by a add ip, ip, distance to implement.
@@ -1070,52 +1058,64 @@ namespace gbe
       // for all the branching instruction. And need to adjust the distance
       // for those branch instruction's start point and end point contains
       // this instruction.
+      GenNativeInstruction *insn2 = (GenNativeInstruction *)&this->store[insnID+2];
+      GBE_ASSERT(insn2->header.opcode == GEN_OPCODE_NOP);
+      insn2 = insn2;
       insn.header.opcode = GEN_OPCODE_ADD;
       this->setDst(&insn, GenRegister::ip());
       this->setSrc0(&insn, GenRegister::ip());
-      this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
+      this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
     } else {
+      GenNativeInstruction &insn2 = *(GenNativeInstruction *)&this->store[insnID+2];
       insn.header.predicate_inverse ^= 1;
       this->setSrc1(&insn, GenRegister::immd(2));
-      GenInstruction &insn2 = this->store[insnID+1];
       GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
       GBE_ASSERT(insnID < this->store.size());
       insn2.header.predicate_control = GEN_PREDICATE_NONE;
       insn2.header.opcode = GEN_OPCODE_ADD;
       this->setDst(&insn2, GenRegister::ip());
       this->setSrc0(&insn2, GenRegister::ip());
-      this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
+      this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
     }
   }
 
-  void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1) {
+  void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst) {
     if (needToSplitCmp(this, src0, src1) == false) {
-      GenInstruction *insn = this->next(GEN_OPCODE_CMP);
+      if(!GenRegister::isNull(dst) && compactAlu2(this, GEN_OPCODE_CMP, dst, src0, src1, conditional, false)) {
+        return;
+      }
+      GenNativeInstruction *insn = this->next(GEN_OPCODE_CMP);
       this->setHeader(insn);
       insn->header.destreg_or_condmod = conditional;
-      this->setDst(insn, GenRegister::null());
+      if (GenRegister::isNull(dst))
+        insn->header.thread_control = GEN_THREAD_SWITCH;
+      this->setDst(insn, dst);
       this->setSrc0(insn, src0);
       this->setSrc1(insn, src1);
     } else {
-      GenInstruction *insnQ1, *insnQ2;
+      GenNativeInstruction *insnQ1, *insnQ2;
 
       // Instruction for the first quarter
       insnQ1 = this->next(GEN_OPCODE_CMP);
       this->setHeader(insnQ1);
+      if (GenRegister::isNull(dst))
+        insnQ1->header.thread_control = GEN_THREAD_SWITCH;
       insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
       insnQ1->header.execution_size = GEN_WIDTH_8;
       insnQ1->header.destreg_or_condmod = conditional;
-      this->setDst(insnQ1, GenRegister::null());
+      this->setDst(insnQ1, dst);
       this->setSrc0(insnQ1, src0);
       this->setSrc1(insnQ1, src1);
 
       // Instruction for the second quarter
       insnQ2 = this->next(GEN_OPCODE_CMP);
       this->setHeader(insnQ2);
+      if (GenRegister::isNull(dst))
+        insnQ2->header.thread_control = GEN_THREAD_SWITCH;
       insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
       insnQ2->header.execution_size = GEN_WIDTH_8;
       insnQ2->header.destreg_or_condmod = conditional;
-      this->setDst(insnQ2, GenRegister::null());
+      this->setDst(insnQ2, GenRegister::Qn(dst, 1));
       this->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
       this->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
     }
@@ -1126,7 +1126,7 @@ namespace gbe
                            GenRegister src0,
                            GenRegister src1)
   {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEL);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEL);
     GBE_ASSERT(curr.predicate == GEN_PREDICATE_NONE);
     this->setHeader(insn);
     insn->header.destreg_or_condmod = conditional;
@@ -1136,7 +1136,7 @@ namespace gbe
   }
 
   void GenEncoder::WAIT(void) {
-     GenInstruction *insn = this->next(GEN_OPCODE_WAIT);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_WAIT);
      GenRegister src = GenRegister::notification1();
      this->setDst(insn, GenRegister::null());
      this->setSrc0(insn, src);
@@ -1147,11 +1147,11 @@ namespace gbe
   }
 
   void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1) {
-     GenInstruction *insn = this->next(GEN_OPCODE_MATH);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
      assert(dst.file == GEN_GENERAL_REGISTER_FILE);
      assert(src0.file == GEN_GENERAL_REGISTER_FILE);
      assert(src1.file == GEN_GENERAL_REGISTER_FILE);
-     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1);
+     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
 
      if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER ||
@@ -1171,11 +1171,11 @@ namespace gbe
 
      if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) {
-        insn->header.execution_size = GEN_WIDTH_8;
+        insn->header.execution_size = this->curr.execWidth == 1 ? GEN_WIDTH_1 : GEN_WIDTH_8;
         insn->header.quarter_control = GEN_COMPRESSION_Q1;
 
         if(this->curr.execWidth == 16) {
-          GenInstruction *insn2 = this->next(GEN_OPCODE_MATH);
+          GenNativeInstruction *insn2 = this->next(GEN_OPCODE_MATH);
           GenRegister new_dest, new_src0, new_src1;
           new_dest = GenRegister::QnPhysical(dst, 1);
           new_src0 = GenRegister::QnPhysical(src0, 1);
@@ -1193,10 +1193,10 @@ namespace gbe
   }
 
   void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src) {
-     GenInstruction *insn = this->next(GEN_OPCODE_MATH);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
      assert(dst.file == GEN_GENERAL_REGISTER_FILE);
      assert(src.file == GEN_GENERAL_REGISTER_FILE);
-     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1);
+     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
      assert(src.type == GEN_TYPE_F);
 
      insn->header.destreg_or_condmod = function;
@@ -1207,24 +1207,32 @@ namespace gbe
 
   void GenEncoder::SAMPLE(GenRegister dest,
                           GenRegister msg,
+                          unsigned int msg_len,
                           bool header_present,
                           unsigned char bti,
                           unsigned char sampler,
-                          unsigned int coord_cnt,
                           uint32_t simdWidth,
                           uint32_t writemask,
-                          uint32_t return_format)
+                          uint32_t return_format,
+                          bool isLD,
+                          bool isUniform)
   {
      if (writemask == 0) return;
-     uint32_t msg_type =  (simdWidth == 16) ?
-                            GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
+                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
      uint32_t response_length = (4 * (simdWidth / 8));
-     uint32_t msg_length = (coord_cnt * (simdWidth / 8));
+     uint32_t msg_length = (msg_len * (simdWidth / 8));
      if (header_present)
        msg_length++;
      uint32_t simd_mode = (simdWidth == 16) ?
                             GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
-     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    if(isUniform) {
+      response_length = 1;
+      msg_type = GEN_SAMPLER_MESSAGE_SIMD4X2_LD;
+      msg_length = 1;
+      simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD4X2;
+    }
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
      this->setHeader(insn);
      this->setDst(insn, dest);
      this->setSrc0(insn, msg);
@@ -1236,16 +1244,16 @@ namespace gbe
 
   void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
   {
-     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
      uint32_t msg_type = GEN_TYPED_WRITE;
      uint32_t msg_length = header_present ? 9 : 8;
      this->setHeader(insn);
      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
      this->setSrc0(insn, msg);
-     setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
+     setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
   }
   static void setScratchMessage(GenEncoder *p,
-                                   GenInstruction *insn,
+                                   GenNativeInstruction *insn,
                                    uint32_t offset,
                                    uint32_t block_size,
                                    uint32_t channel_mode,
@@ -1254,7 +1262,7 @@ namespace gbe
                                    uint32_t response_length)
   {
      const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-     setMessageDescriptor(p, insn, sfid, msg_length, response_length, true);
+     p->setMessageDescriptor(insn, sfid, msg_length, response_length, true);
      insn->bits3.gen7_scratch_rw.block_size = block_size;
      insn->bits3.gen7_scratch_rw.msg_type = msg_type;
      insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
@@ -1266,7 +1274,7 @@ namespace gbe
   {
      assert(src_num == 1 || src_num ==2);
      uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
-     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
      this->setHeader(insn);
      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
      this->setSrc0(insn, msg);
@@ -1279,7 +1287,7 @@ namespace gbe
   {
      assert(dst_num == 1 || dst_num ==2);
      uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
-     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
      this->setHeader(insn);
      this->setDst(insn, dst);
      this->setSrc0(insn, src);
@@ -1289,7 +1297,7 @@ namespace gbe
   }
 
   void GenEncoder::EOT(uint32_t msg) {
-    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
     this->setSrc0(insn, GenRegister::ud8grf(msg,0));
     this->setSrc1(insn, GenRegister::immud(0));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 13db6ae..d6e2b97 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -56,6 +56,7 @@
 #include "sys/platform.hpp"
 #include "sys/vector.hpp"
 #include <cassert>
+#include "src/cl_device_data.h"
 
 namespace gbe
 {
@@ -64,9 +65,13 @@ namespace gbe
   {
   public:
     /*! simdWidth is the default width for the instructions */
-    GenEncoder(uint32_t simdWidth, uint32_t gen);
+    GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID);
+
+    virtual ~GenEncoder(void) { }
     /*! Size of the stack (should be large enough) */
     enum { MAX_STATE_NUM = 16 };
+    /*! gen7 exec width of the double data type */
+    #define GEN7_DOUBLE_EXEC_WIDTH  8
     /*! Push the current instruction state */
     void push(void);
     /*! Pop the latest pushed state */
@@ -81,13 +86,17 @@ namespace gbe
     uint32_t stateNum;
     /*! Gen generation to encode */
     uint32_t gen;
-
+    /*! Device ID */
+    uint32_t deviceID;
+    /*! simd width for this codegen */
+    uint32_t simdWidth;
     ////////////////////////////////////////////////////////////////////////
     // Encoding functions
     ////////////////////////////////////////////////////////////////////////
 
-#define ALU1(OP) void OP(GenRegister dest, GenRegister src0);
+#define ALU1(OP) void OP(GenRegister dest, GenRegister src0, uint32_t condition = 0);
 #define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1);
+#define ALU2_MOD(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition = 0);
 #define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2);
     ALU1(MOV)
     ALU1(FBH)
@@ -103,9 +112,9 @@ namespace gbe
     ALU1(F32TO16)
     ALU2(SEL)
     ALU1(NOT)
-    ALU2(AND)
-    ALU2(OR)
-    ALU2(XOR)
+    ALU2_MOD(AND)
+    ALU2_MOD(OR)
+    ALU2_MOD(XOR)
     ALU2(SHR)
     ALU2(SHL)
     ALU2(RSR)
@@ -122,20 +131,33 @@ namespace gbe
     ALU2(PLN)
     ALU3(MAD)
     //ALU2(MOV_DF);
+    ALU2(BRC)
+    ALU1(BRD)
 #undef ALU1
 #undef ALU2
+#undef ALU2_MOD
 #undef ALU3
-    void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
-    void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+    /*! Get double/long exec width */
+    virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
+    virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+    virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     void LOAD_INT64_IMM(GenRegister dest, int64_t value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
     /*! Memory fence message (to order loads and stores between threads) */
     void FENCE(GenRegister dst);
     /*! Jump indexed instruction */
-    void JMPI(GenRegister src);
+    virtual void JMPI(GenRegister src, bool longjmp = false);
+    /*! IF indexed instruction */
+    void IF(GenRegister src);
+    /*! ENDIF indexed instruction */
+    void ENDIF(GenRegister src);
+    /*! BRC indexed instruction */
+    void BRC(GenRegister src);
+    /*! BRD indexed instruction */
+    void BRD(GenRegister src);
     /*! Compare instructions */
-    void CMP(uint32_t conditional, GenRegister src0, GenRegister src1);
+    void CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst = GenRegister::null());
     /*! Select with embedded compare (like sel.le ...) */
     void SEL_CMP(uint32_t conditional, GenRegister dst, GenRegister src0, GenRegister src1);
     /*! EOT is used to finish GPGPU threads */
@@ -145,15 +167,11 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Atomic instructions */
-    void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    /*! Read 64-bits float/int arrays */
-    void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
-    /*! Write 64-bits float/int arrays */
-    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
     /*! Untyped read (upto 4 channels) */
-    void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
-    void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
     void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
@@ -167,38 +185,55 @@ namespace gbe
     /*! Send instruction for the sampler */
     void SAMPLE(GenRegister dest,
                 GenRegister msg,
+                unsigned int msg_len,
                 bool header_present,
                 unsigned char bti,
                 unsigned char sampler,
-                unsigned int coord_cnt,
                 unsigned int simdWidth,
                 uint32_t writemask,
-                uint32_t return_format);
+                uint32_t return_format,
+                bool isLD,
+                bool isUniform);
 
     /*! TypedWrite instruction for texture */
-    void TYPED_WRITE(GenRegister header,
-                     bool header_present,
-                     unsigned char bti);
+    virtual void TYPED_WRITE(GenRegister header,
+                             bool header_present,
+                             unsigned char bti);
     /*! Extended math function (2 sources) */
     void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
     /*! Extended math function (1 source) */
     void MATH(GenRegister dst, uint32_t function, GenRegister src);
 
-    /*! Patch JMPI (located at index insnID) with the given jump distance */
-    void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
 
     ////////////////////////////////////////////////////////////////////////
     // Helper functions to encode
     ////////////////////////////////////////////////////////////////////////
-    void setHeader(GenInstruction *insn);
-    void setDst(GenInstruction *insn, GenRegister dest);
-    void setSrc0(GenInstruction *insn, GenRegister reg);
-    void setSrc1(GenInstruction *insn, GenRegister reg);
-    GenInstruction *next(uint32_t opcode);
+    virtual void setHeader(GenNativeInstruction *insn);
+    virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+                                uint32_t msg_type, uint32_t msg_length,
+                                uint32_t response_length);
+    virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                      unsigned char msg_type, uint32_t msg_length,
+                                      bool header_present);
+    void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+                              unsigned msg_length, unsigned response_length,
+                              bool header_present = false, bool end_of_thread = false);
+    void setDst(GenNativeInstruction *insn, GenRegister dest);
+    void setSrc0(GenNativeInstruction *insn, GenRegister reg);
+    void setSrc1(GenNativeInstruction *insn, GenRegister reg);
+    GenCompactInstruction *nextCompact(uint32_t opcode);
+    GenNativeInstruction *next(uint32_t opcode);
     uint32_t n_instruction(void) const { return store.size(); }
     GBE_CLASS(GenEncoder); //!< Use custom allocators
   };
 
+  void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src, uint32_t condition = 0);
+
+  void alu2(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src0, GenRegister src1, uint32_t condition = 0);
 } /* namespace gbe */
 
 #endif /* __GBE_GEN_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
new file mode 100644
index 0000000..f19c364
--- /dev/null
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling Song <ruiling.song at intel.com>
+ */
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include <cstring>
+
+namespace gbe {
+
+  struct compact_table_entry {
+    uint32_t bit_pattern;
+    uint32_t index;
+  };
+
+  static compact_table_entry control_table[] = {
+    {0b0000000000000000010, 0},
+    {0b0000100000000000000, 1},
+    {0b0000100000000000001, 2},
+    {0b0000100000000000010, 3},
+    {0b0000100000000000011, 4},
+    {0b0000100000000000100, 5},
+    {0b0000100000000000101, 6},
+    {0b0000100000000000111, 7},
+    {0b0000100000000001000, 8},
+    {0b0000100000000001001, 9},
+    {0b0000100000000001101, 10},
+    {0b0000110000000000000, 11},
+    {0b0000110000000000001, 12},
+    {0b0000110000000000010, 13},
+    {0b0000110000000000011, 14},
+    {0b0000110000000000100, 15},
+    {0b0000110000000000101, 16},
+    {0b0000110000000000111, 17},
+    {0b0000110000000001001, 18},
+    {0b0000110000000001101, 19},
+    {0b0000110000000010000, 20},
+    {0b0000110000100000000, 21},
+    {0b0001000000000000000, 22},
+    {0b0001000000000000010, 23},
+    {0b0001000000000000100, 24},
+    {0b0001000000100000000, 25},
+    {0b0010110000000000000, 26},
+    {0b0010110000000010000, 27},
+    {0b0011000000000000000, 28},
+    {0b0011000000100000000, 29},
+    {0b0101000000000000000, 30},
+    {0b0101000000100000000, 31},
+  };
+
+  static compact_table_entry data_type_table[] = {
+    {0b000000001000001100, 20},
+    {0b001000000000000001, 0},
+    {0b001000000000100000, 1},
+    {0b001000000000100001, 2},
+    {0b001000000000111101, 21},
+    {0b001000000001100001, 3},
+    {0b001000000010100101, 22},
+    {0b001000000010111101, 4},
+    {0b001000001011111101, 5},
+    {0b001000001110100001, 6},
+    {0b001000001110100101, 7},
+    {0b001000001110111101, 8},
+    {0b001000010000100000, 23},
+    {0b001000010000100001, 9},
+    {0b001000110000100000, 10},
+    {0b001000110000100001, 11},
+    {0b001001010010100100, 24},
+    {0b001001010010100101, 12},
+    {0b001001110010000100, 25},
+    {0b001001110010100100, 13},
+    {0b001001110010100101, 14},
+    {0b001010010100001001, 26},
+    {0b001010010100101000, 30},
+    {0b001010110100101000, 31},
+    {0b001011110110101100, 29},
+    {0b001101111110111101, 27},
+    {0b001111001110111101, 15},
+    {0b001111011110011101, 16},
+    {0b001111011110111100, 17},
+    {0b001111011110111101, 18},
+    {0b001111111110111100, 19},
+    {0b001111111110111101, 28},
+  };
+
+  static compact_table_entry data_type_decompact[] = {
+    {0b001000000000000001, 0},
+    {0b001000000000100000, 1},
+    {0b001000000000100001, 2},
+    {0b001000000001100001, 3},
+    {0b001000000010111101, 4},
+    {0b001000001011111101, 5},
+    {0b001000001110100001, 6},
+    {0b001000001110100101, 7},
+    {0b001000001110111101, 8},
+    {0b001000010000100001, 9},
+    {0b001000110000100000, 10},
+    {0b001000110000100001, 11},
+    {0b001001010010100101, 12},
+    {0b001001110010100100, 13},
+    {0b001001110010100101, 14},
+    {0b001111001110111101, 15},
+    {0b001111011110011101, 16},
+    {0b001111011110111100, 17},
+    {0b001111011110111101, 18},
+    {0b001111111110111100, 19},
+    {0b000000001000001100, 20},
+    {0b001000000000111101, 21},
+    {0b001000000010100101, 22},
+    {0b001000010000100000, 23},
+    {0b001001010010100100, 24},
+    {0b001001110010000100, 25},
+    {0b001010010100001001, 26},
+    {0b001101111110111101, 27},
+    {0b001111111110111101, 28},
+    {0b001011110110101100, 29},
+    {0b001010010100101000, 30},
+    {0b001010110100101000, 31},
+  };
+
+  static compact_table_entry subreg_table[] = {
+    {0b000000000000000, 0},
+    {0b000000000000001, 1},
+    {0b000000000001000, 2},
+    {0b000000000001111, 3},
+    {0b000000000010000, 4},
+    {0b000000010000000, 5},
+    {0b000000100000000, 6},
+    {0b000000110000000, 7},
+    {0b000001000000000, 8},
+    {0b000001000010000, 9},
+    {0b000001010000000, 10},
+    {0b001000000000000, 11},
+    {0b001000000000001, 12},
+    {0b001000010000001, 13},
+    {0b001000010000010, 14},
+    {0b001000010000011, 15},
+    {0b001000010000100, 16},
+    {0b001000010000111, 17},
+    {0b001000010001000, 18},
+    {0b001000010001110, 19},
+    {0b001000010001111, 20},
+    {0b001000110000000, 21},
+    {0b001000111101000, 22},
+    {0b010000000000000, 23},
+    {0b010000110000000, 24},
+    {0b011000000000000, 25},
+    {0b011110010000111, 26},
+    {0b100000000000000, 27},
+    {0b101000000000000, 28},
+    {0b110000000000000, 29},
+    {0b111000000000000, 30},
+    {0b111000000011100, 31},
+  };
+
+  static compact_table_entry srcreg_table[] = {
+    {0b000000000000, 0},
+    {0b000000000010, 1},
+    {0b000000010000, 2},
+    {0b000000010010, 3},
+    {0b000000011000, 4},
+    {0b000000100000, 5},
+    {0b000000101000, 6},
+    {0b000001001000, 7},
+    {0b000001010000, 8},
+    {0b000001110000, 9},
+    {0b000001111000, 10},
+    {0b001100000000, 11},
+    {0b001100000010, 12},
+    {0b001100001000, 13},
+    {0b001100010000, 14},
+    {0b001100010010, 15},
+    {0b001100100000, 16},
+    {0b001100101000, 17},
+    {0b001100111000, 18},
+    {0b001101000000, 19},
+    {0b001101000010, 20},
+    {0b001101001000, 21},
+    {0b001101010000, 22},
+    {0b001101100000, 23},
+    {0b001101101000, 24},
+    {0b001101110000, 25},
+    {0b001101110001, 26},
+    {0b001101111000, 27},
+    {0b010001101000, 28},
+    {0b010001101001, 29},
+    {0b010001101010, 30},
+    {0b010110001000, 31},
+  };
+
+  static int cmp_key(const void *p1, const void*p2) {
+    const compact_table_entry * px = (compact_table_entry *)p1;
+    const compact_table_entry * py = (compact_table_entry *)p2;
+    return (px->bit_pattern) - py->bit_pattern;
+  }
+  union ControlBits{
+    struct {
+      uint32_t access_mode:1;
+      uint32_t mask_control:1;
+      uint32_t dependency_control:2;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t saturate:1;
+      uint32_t flag_sub_reg_nr:1;
+      uint32_t flag_reg_nr:1;
+      uint32_t pad:23;
+    };
+    uint32_t data;
+  };
+  union DataTypeBits{
+    struct {
+      uint32_t dest_reg_file:2;
+      uint32_t dest_reg_type:3;
+      uint32_t src0_reg_file:2;
+      uint32_t src0_reg_type:3;
+      uint32_t src1_reg_file:2;
+      uint32_t src1_reg_type:3;
+      uint32_t dest_horiz_stride:2;
+      uint32_t dest_address_mode:1;
+      uint32_t pad:14;
+    };
+    uint32_t data;
+  };
+  union SubRegBits {
+    struct {
+      uint32_t dest_subreg_nr:5;
+      uint32_t src0_subreg_nr:5;
+      uint32_t src1_subreg_nr:5;
+      uint32_t pad:17;
+    };
+    uint32_t data;
+  };
+  union SrcRegBits {
+    struct {
+      uint32_t src_abs:1;
+      uint32_t src_negate:1;
+      uint32_t src_address_mode:1;
+      uint32_t src_horiz_stride:2;
+      uint32_t src_width:3;
+      uint32_t src_vert_stride:4;
+      uint32_t pad:20;
+    };
+    uint32_t data;
+  };
+
+  void decompactInstruction(GenCompactInstruction * p, GenNativeInstruction *pOut) {
+
+    memset(pOut, 0, sizeof(GenNativeInstruction));
+    union ControlBits control_bits;
+    control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
+    pOut->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
+    pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
+    pOut->header.saturate = control_bits.saturate;
+    pOut->header.acc_wr_control = p->bits1.acc_wr_control;
+    pOut->header.cmpt_control = p->bits1.cmpt_control;
+    pOut->header.debug_control = p->bits1.debug_control;
+
+    union DataTypeBits data_type_bits;
+    union SubRegBits subreg_bits;
+    union SrcRegBits src0_bits;
+    data_type_bits.data = data_type_decompact[(uint32_t)p->bits1.data_type_index].bit_pattern;
+    subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
+    src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
+
+    pOut->low.high |= data_type_bits.data & 0x7fff;
+    pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
+    pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
+    pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
+    pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
+
+    pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
+    pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
+    pOut->high.low |= (src0_bits.data << 13);
+    pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+    pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
+
+    if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
+      uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
+      pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+    } else {
+      union SrcRegBits src1_bits;
+      src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
+      pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
+      pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
+      pOut->high.high |= (src1_bits.data << 13);
+    }
+  }
+
+  int compactControlBits(GenEncoder *p, uint32_t quarter, uint32_t execWidth) {
+
+    const GenInstructionState *s = &p->curr;
+    // some quick check
+    if(s->nibControl != 0)
+      return -1;
+    if(s->predicate > GEN_PREDICATE_NORMAL)
+      return -1;
+    if(s->flag == 1)
+      return -1;
+
+    ControlBits b;
+    b.data = 0;
+
+    if (execWidth == 8)
+      b.execution_size = GEN_WIDTH_8;
+    else if (execWidth == 16)
+      b.execution_size = GEN_WIDTH_16;
+    else if (execWidth == 4)
+      b.execution_size = GEN_WIDTH_4;
+    else if (execWidth == 1)
+      b.execution_size = GEN_WIDTH_1;
+    else
+      NOT_IMPLEMENTED;
+
+    b.mask_control = s->noMask;
+    b.quarter_control = quarter;
+    b.predicate_control = s->predicate;
+    b.predicate_inverse = s->inversePredicate;
+
+    b.saturate = s->saturate;
+    b.flag_sub_reg_nr = s->subFlag;
+    b.flag_reg_nr = s->flag;
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, control_table,
+      sizeof(control_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+
+  int compactDataTypeBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+
+    // compact does not support any indirect acess
+    if(dst->address_mode != GEN_ADDRESS_DIRECT)
+      return -1;
+
+    if(src0->file == GEN_IMMEDIATE_VALUE)
+      return -1;
+
+    DataTypeBits b;
+    b.data = 0;
+
+    b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
+    b.dest_address_mode = dst->address_mode;
+    b.dest_reg_file = dst->file;
+    b.dest_reg_type = dst->type;
+
+    b.src0_reg_file = src0->file;
+    b.src0_reg_type = src0->type;
+
+    if(src1) {
+      b.src1_reg_type = src1->type;
+      b.src1_reg_file = src1->file;
+    } else {
+      // default to zero
+      b.src1_reg_type = 0;
+      b.src1_reg_file = 0;
+    }
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, data_type_table,
+                             sizeof(data_type_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+  int compactSubRegBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+    SubRegBits b;
+    b.data = 0;
+    b.dest_subreg_nr = dst->subnr;
+    b.src0_subreg_nr = src0->subnr;
+    if(src1)
+      b.src1_subreg_nr = src1->subnr;
+    else
+      b.src1_subreg_nr = 0;
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, subreg_table,
+                sizeof(subreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+  int compactSrcRegBits(GenEncoder *p, GenRegister *src) {
+    // As we only use GEN_ALIGN_1 and compact only supports direct register access,
+    // we only need to verify [hstride, width, vstride]
+    if(src->file == GEN_IMMEDIATE_VALUE)
+      return -1;
+    if(src->address_mode != GEN_ADDRESS_DIRECT)
+      return -1;
+
+    SrcRegBits b;
+    b.data = 0;
+    b.src_abs = src->absolute;
+    b.src_negate = src->negation;
+    b.src_address_mode = src->address_mode;
+    if(p->curr.execWidth == 1 && src->width == GEN_WIDTH_1) {
+      b.src_width = src->width;
+      b.src_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+      b.src_vert_stride = GEN_VERTICAL_STRIDE_0;
+    }
+    else {
+      b.src_horiz_stride = src->hstride;
+      b.src_width = src->width;
+      b.src_vert_stride = src->vstride;
+    }
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, srcreg_table,
+                    sizeof(srcreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+
+  bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split) {
+    if(split) {
+      // TODO support it
+      return false;
+    } else {
+      int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+      if(control_index == -1) return false;
+
+      int data_type_index = compactDataTypeBits(p, &dst, &src, NULL);
+      if(data_type_index == -1) return false;
+
+      int sub_reg_index = compactSubRegBits(p, &dst, &src, NULL);
+      if(sub_reg_index == -1) return false;
+
+      int src_reg_index = compactSrcRegBits(p, &src);
+      if(src_reg_index == -1) return false;
+
+      GenCompactInstruction * insn = p->nextCompact(opcode);
+      insn->bits1.control_index = control_index;
+      insn->bits1.data_type_index = data_type_index;
+      insn->bits1.sub_reg_index = sub_reg_index;
+      insn->bits1.acc_wr_control = p->curr.accWrEnable;
+      insn->bits1.destreg_or_condmod = condition;
+      insn->bits1.cmpt_control = 1;
+      insn->bits1.src0_index_lo = src_reg_index & 3;
+
+      insn->bits2.src0_index_hi = src_reg_index >> 2;
+      insn->bits2.src1_index = 0;
+      insn->bits2.dest_reg_nr = dst.nr;
+      insn->bits2.src0_reg_nr = src.nr;
+      insn->bits2.src1_reg_nr = 0;
+      return true;
+    }
+  }
+
+  bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split) {
+    if(split) {
+      // TODO support it
+      return false;
+    } else {
+      if(opcode == GEN_OPCODE_IF  || opcode == GEN_OPCODE_ENDIF || opcode == GEN_OPCODE_JMPI) return false;
+
+      int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+      if(control_index == -1) return false;
+
+      int data_type_index = compactDataTypeBits(p, &dst, &src0, &src1);
+      if(data_type_index == -1) return false;
+
+      int sub_reg_index = compactSubRegBits(p, &dst, &src0, &src1);
+      if(sub_reg_index == -1) return false;
+
+      int src0_reg_index = compactSrcRegBits(p, &src0);
+      if(src0_reg_index == -1) return false;
+
+      bool src1_imm = false;
+      int src1_reg_index;
+      if(src1.file == GEN_IMMEDIATE_VALUE) {
+        if(src1.absolute != 0 || src1.negation != 0 || src1.type == GEN_TYPE_F)
+          return false;
+        if(src1.value.d < -4096 || src1.value.d > 4095) // 13bit signed imm
+          return false;
+        src1_imm = true;
+      } else {
+        src1_reg_index = compactSrcRegBits(p, &src1);
+        if(src1_reg_index == -1) return false;
+      }
+      GenCompactInstruction * insn = p->nextCompact(opcode);
+      insn->bits1.control_index = control_index;
+      insn->bits1.data_type_index = data_type_index;
+      insn->bits1.sub_reg_index = sub_reg_index;
+      insn->bits1.acc_wr_control = p->curr.accWrEnable;
+      insn->bits1.destreg_or_condmod = condition;
+      insn->bits1.cmpt_control = 1;
+      insn->bits1.src0_index_lo = src0_reg_index & 3;
+
+      insn->bits2.src0_index_hi = src0_reg_index >> 2;
+      insn->bits2.src1_index = src1_imm ? (src1.value.ud & 8191)>> 8 : src1_reg_index;
+      insn->bits2.dest_reg_nr = dst.nr;
+      insn->bits2.src0_reg_nr = src0.nr;
+      insn->bits2.src1_reg_nr = src1_imm ? (src1.value.ud & 0xff): src1.nr;
+      return true;
+    }
+  }
+};
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 13cbd41..8535b4a 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,20 +1,20 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
-DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        4,        2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
-DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        4,        2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        40,      20)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64Shift,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64HADD,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64RHADD,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64ToFloat,      20,        4,        2)
-DECL_GEN7_SCHEDULE(FloatToI64,      20,        4,        2)
-DECL_GEN7_SCHEDULE(I64MULHI,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64MADSAT,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Shift,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64HADD,         20,        40,      20)
+DECL_GEN7_SCHEDULE(I64RHADD,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64ToFloat,      20,        40,      20)
+DECL_GEN7_SCHEDULE(FloatToI64,      20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MULHI,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MADSAT,       20,        40,      20)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64Compare,      20,        4,        2)
-DECL_GEN7_SCHEDULE(I64DIVREM,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Compare,      20,        80,      20)
+DECL_GEN7_SCHEDULE(I64DIVREM,       20,        80,      20)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
 DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
@@ -25,16 +25,18 @@ DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
 DECL_GEN7_SCHEDULE(Read64,          80,        1,        1)
 DECL_GEN7_SCHEDULE(Write64,         80,        1,        1)
-DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
-DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
-DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
-DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
-DECL_GEN7_SCHEDULE(DWordGather,     80,        1,        1)
-DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
+DECL_GEN7_SCHEDULE(UntypedRead,     160,       1,        1)
+DECL_GEN7_SCHEDULE(UntypedWrite,    160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteGather,      160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteScatter,     160,       1,        1)
+DECL_GEN7_SCHEDULE(DWordGather,     160,       1,        1)
+DECL_GEN7_SCHEDULE(PackByte,        40,        1,        1)
+DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
+DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
-DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
-DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
+DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
+DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
-DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
-DECL_GEN7_SCHEDULE(I64SATADD,       20,        4,        2)
-DECL_GEN7_SCHEDULE(I64SATSUB,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64MUL,          20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATADD,       20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATSUB,       20,        40,      20)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index a711f45..106d608 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -95,18 +95,26 @@ namespace gbe
   // Node for the schedule DAG
   struct ScheduleDAGNode;
 
+  typedef enum {
+    WRITE_AFTER_WRITE,
+    WRITE_AFTER_READ,
+    READ_AFTER_WRITE,
+    READ_AFTER_WRITE_MEMORY
+  } DepMode;
+
   /*! We need to chain together the node we point */
   struct ScheduleListNode : public intrusive_list_node
   {
-    INLINE ScheduleListNode(ScheduleDAGNode *node) : node(node) {}
+    INLINE ScheduleListNode(ScheduleDAGNode *node, DepMode m = READ_AFTER_WRITE) : node(node), depMode(m) {}
     ScheduleDAGNode *node;
+    DepMode depMode;
   };
 
   /*! Node of the DAG */
   struct ScheduleDAGNode
   {
     INLINE ScheduleDAGNode(SelectionInstruction &insn) :
-      insn(insn), refNum(0), retiredCycle(0) {}
+      insn(insn), refNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
     bool dependsOn(ScheduleDAGNode *node) const {
       GBE_ASSERT(node != NULL);
       for (auto child : node->children)
@@ -122,12 +130,15 @@ namespace gbe
     uint32_t refNum;
     /*! Cycle when the instruction is retired */
     uint32_t retiredCycle;
+    bool preRetired;
+    uint32_t readDistance;
   };
 
   /*! To track loads and stores */
   enum GenMemory : uint8_t {
     GLOBAL_MEMORY = 0,
     LOCAL_MEMORY,
+    SCRATCH_MEMORY,
     MAX_MEM_SYSTEM
   };
 
@@ -144,17 +155,17 @@ namespace gbe
   {
     DependencyTracker(const Selection &selection, SelectionScheduler &scheduler);
     /*! Reset it before scheduling a new block */
-    void clear(void);
+    void clear(bool fullClear = false);
     /*! Get an index in the node array for the given register */
     uint32_t getIndex(GenRegister reg) const;
     /*! Get an index in the node array for the given memory system */
     uint32_t getIndex(uint32_t bti) const;
     /*! Add a new dependency "node0 depends on node1" */
-    void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1);
+    void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode m);
     /*! Add a new dependency "node0 depends on node located at index" */
-    void addDependency(ScheduleDAGNode *node0, uint32_t index);
+    void addDependency(ScheduleDAGNode *node0, uint32_t index, DepMode m);
     /*! Add a new dependency "node located at index depends on node0" */
-    void addDependency(uint32_t index, ScheduleDAGNode *node0);
+    void addDependency(uint32_t index, ScheduleDAGNode *node0, DepMode m);
     /*! No dependency for null registers and immediate */
     INLINE bool ignoreDependency(GenRegister reg) const {
       if (reg.file == GEN_IMMEDIATE_VALUE)
@@ -168,23 +179,9 @@ namespace gbe
     /*! Owns the tracker */
     SelectionScheduler &scheduler;
     /*! Add a new dependency "node0 depends on node set for register reg" */
-    INLINE  void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
-      if (this->ignoreDependency(reg) == false) {
-        const uint32_t index = this->getIndex(reg);
-        this->addDependency(node0, index);
-        if (reg.isdf() || reg.isint64())
-          this->addDependency(node0, index + 1);
-      }
-    }
+    void addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m);
     /*! Add a new dependency "node set for register reg depends on node0" */
-    INLINE void addDependency(GenRegister reg, ScheduleDAGNode *node0) {
-      if (this->ignoreDependency(reg) == false) {
-        const uint32_t index = this->getIndex(reg);
-        this->addDependency(index, node0);
-        if (reg.isdf() || reg.isint64())
-          this->addDependency(index + 1, node0);
-      }
-    }
+    void addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m);
     /*! Make the node located at insnID a barrier */
     void makeBarrier(int32_t insnID, int32_t insnNum);
     /*! Update all the writes (memory, predicates, registers) */
@@ -195,6 +192,8 @@ namespace gbe
     static const uint32_t MAX_ACC_REGISTER = 1u;
     /*! Stores the last node that wrote to a register / memory ... */
     vector<ScheduleDAGNode*> nodes;
+    /*! Stores, for each node, the set of nodes it depends on */
+    map<ScheduleDAGNode *, vector<ScheduleDAGNode*>> deps;
     /*! Stores the nodes per instruction */
     vector<ScheduleDAGNode*> insnNodes;
     /*! Number of virtual register in the selection */
@@ -210,8 +209,11 @@ namespace gbe
     void clearLists(void);
     /*! Return the number of instructions to schedule in the DAG */
     int32_t buildDAG(SelectionBlock &bb);
-    /*! Schedule the DAG */
-    void scheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    /*! Traverse a read node and update the read distance for all its children. */
+    void traverseReadNode(ScheduleDAGNode *node, uint32_t degree = 0);
+    /*! Schedule the DAG, pre register allocation and post register allocation. */
+    void preScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    void postScheduleDAG(SelectionBlock &bb, int32_t insnNum);
     /*! To limit register pressure or limit insn latency problems */
     SchedulePolicy policy;
     /*! Make ScheduleListNode allocation faster */
@@ -245,22 +247,49 @@ namespace gbe
     insnNodes.resize(selection.getLargestBlockSize());
   }
 
-  void DependencyTracker::clear(void) { for (auto &x : nodes) x = NULL; }
+  void DependencyTracker::clear(bool fullClear) { for (auto &x : nodes) x = NULL; if (fullClear) deps.clear(); }
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(node0, index, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(node0, index + 1, m);
+    }
+  }
 
-  void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1) {
+  void DependencyTracker::addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(index, node0, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(index + 1, node0, m);
+    }
+  }
+
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode depMode) {
     if (node0 != NULL && node1 != NULL && node0 != node1 && node0->dependsOn(node1) == false) {
-      ScheduleListNode *dep = scheduler.newScheduleListNode(node0);
+      if (node1->insn.isRead())
+        depMode = depMode == READ_AFTER_WRITE ? READ_AFTER_WRITE_MEMORY : depMode;
+      ScheduleListNode *dep = scheduler.newScheduleListNode(node0, depMode);
       node0->refNum++;
       node1->children.push_back(dep);
+      auto it = deps.find(node0);
+      if (it != deps.end()) {
+        it->second.push_back(node1);
+      } else {
+        vector<ScheduleDAGNode*> vn;
+        vn.push_back(node1);
+        deps.insert(std::make_pair(node0, vn));
+      }
     }
   }
 
-  void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index) {
-    this->addDependency(node, this->nodes[index]);
+  void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index, DepMode m) {
+    this->addDependency(node, this->nodes[index], m);
   }
 
-  void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node) {
-    this->addDependency(this->nodes[index], node);
+  void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node, DepMode m) {
+    this->addDependency(this->nodes[index], node, m);
   }
 
   void DependencyTracker::makeBarrier(int32_t barrierID, int32_t insnNum) {
@@ -268,11 +297,11 @@ namespace gbe
 
     // The barrier depends on all nodes before it
     for (int32_t insnID = 0; insnID < barrierID; ++insnID)
-      this->addDependency(barrier, this->insnNodes[insnID]);
+      this->addDependency(barrier, this->insnNodes[insnID], WRITE_AFTER_WRITE);
 
     // All nodes after barriers depend on the barrier
     for (int32_t insnID = barrierID + 1; insnID < insnNum; ++insnID)
-      this->addDependency(this->insnNodes[insnID], barrier);
+      this->addDependency(this->insnNodes[insnID], barrier, WRITE_AFTER_WRITE);
   }
 
   static GenRegister getFlag(const SelectionInstruction &insn) {
@@ -320,7 +349,7 @@ namespace gbe
 
   uint32_t DependencyTracker::getIndex(uint32_t bti) const {
     const uint32_t memDelta = grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
-    return bti == 0xfe ? memDelta + LOCAL_MEMORY : memDelta + GLOBAL_MEMORY;
+    return bti == 0xfe ? memDelta + LOCAL_MEMORY : (bti == 0xff ? memDelta + SCRATCH_MEMORY : memDelta + GLOBAL_MEMORY);
   }
 
   void DependencyTracker::updateWrites(ScheduleDAGNode *node) {
@@ -332,13 +361,13 @@ namespace gbe
       if (this->ignoreDependency(dst) == false) {
         const uint32_t index = this->getIndex(dst);
         this->nodes[index] = node;
-        if (dst.isdf() || dst.isint64())
+        if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64()))
           this->nodes[index + 1] = node;
       }
     }
 
     // Track writes in predicates
-    if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
+    if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag) {
       const uint32_t index = this->getIndex(getFlag(insn));
       this->nodes[index] = node;
     }
@@ -351,10 +380,11 @@ namespace gbe
 
     // Track writes in memory
     if (insn.isWrite()) {
-      const uint32_t index = this->getIndex(insn.extra.function);
+      const uint32_t index = this->getIndex(insn.getbti());
       this->nodes[index] = node;
     }
 
+    // Track writes in scratch memory
     if(insn.opcode == SEL_OP_SPILL_REG) {
       const uint32_t index = this->getIndex(0xff);
       this->nodes[index] = node;
@@ -413,10 +443,27 @@ namespace gbe
     this->active.fast_clear();
   }
 
+  void SelectionScheduler::traverseReadNode(ScheduleDAGNode *node, uint32_t degree) {
+    GBE_ASSERT(degree != 0 || node->insn.isRead());
+    if (node->readDistance != 0x7FFFFFFF)
+      return;
+    node->readDistance = degree;
+    if (degree > 5)
+      return;
+    //printf("node id %d op %d degree %d \n", node->insn.ID, node->insn.opcode, degree);
+    auto it = tracker.deps.find(node);
+    if (it != tracker.deps.end()) {
+      for (auto &depNode : it->second) {
+        if (depNode && !depNode->insn.isRead())
+          traverseReadNode(depNode, degree + 1);
+      }
+    }
+  }
+
   int32_t SelectionScheduler::buildDAG(SelectionBlock &bb) {
     nodePool.rewind();
     listPool.rewind();
-    tracker.clear();
+    tracker.clear(true);
     this->clearLists();
 
     // Track write-after-write and read-after-write dependencies
@@ -428,21 +475,21 @@ namespace gbe
 
       // read-after-write in registers
       for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
-        tracker.addDependency(node, insn.src(srcID));
+        tracker.addDependency(node, insn.src(srcID), READ_AFTER_WRITE);
 
       // read-after-write for predicate
       if (insn.state.predicate != GEN_PREDICATE_NONE)
-        tracker.addDependency(node, getFlag(insn));
+        tracker.addDependency(node, getFlag(insn), READ_AFTER_WRITE);
 
       // read-after-write in memory
       if (insn.isRead()) {
-        const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(node, index);
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
       //read-after-write of scratch memory
       if (insn.opcode == SEL_OP_UNSPILL_REG) {
         const uint32_t index = tracker.getIndex(0xff);
-        tracker.addDependency(node, index);
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
 
       // Consider barriers and wait are reading memory (local and global)
@@ -451,42 +498,32 @@ namespace gbe
         insn.opcode == SEL_OP_WAIT) {
         const uint32_t local = tracker.getIndex(0xfe);
         const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(node, local);
-        tracker.addDependency(node, global);
+        tracker.addDependency(node, local, READ_AFTER_WRITE);
+        tracker.addDependency(node, global, READ_AFTER_WRITE);
       }
 
       // write-after-write in registers
       for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID)
-        tracker.addDependency(node, insn.dst(dstID));
+        tracker.addDependency(node, insn.dst(dstID), WRITE_AFTER_WRITE);
 
       // write-after-write for predicate
-      if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP)
-        tracker.addDependency(node, getFlag(insn));
+      if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag)
+        tracker.addDependency(node, getFlag(insn), WRITE_AFTER_WRITE);
 
       // write-after-write for accumulators
       if (insn.state.accWrEnable)
-        tracker.addDependency(node, GenRegister::acc());
+        tracker.addDependency(node, GenRegister::acc(), WRITE_AFTER_WRITE);
 
       // write-after-write in memory
       if (insn.isWrite()) {
-        const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(node, index);
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
       // write-after-write in scratch memory
       if (insn.opcode == SEL_OP_SPILL_REG) {
         const uint32_t index = tracker.getIndex(0xff);
-        tracker.addDependency(node, index);
-      }
-
-      // Consider barriers and wait are writing memory (local and global)
-    if (insn.opcode == SEL_OP_BARRIER ||
-        insn.opcode == SEL_OP_FENCE ||
-        insn.opcode == SEL_OP_WAIT) {
-        const uint32_t local = tracker.getIndex(0xfe);
-        const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(node, local);
-        tracker.addDependency(node, global);
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
       // Track all writes done by the instruction
@@ -501,16 +538,22 @@ namespace gbe
 
       // write-after-read in registers
       for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
-        tracker.addDependency(insn.src(srcID), node);
+        tracker.addDependency(insn.src(srcID), node, WRITE_AFTER_READ);
 
       // write-after-read for predicate
       if (insn.state.predicate != GEN_PREDICATE_NONE)
-        tracker.addDependency(getFlag(insn), node);
+        tracker.addDependency(getFlag(insn), node, WRITE_AFTER_READ);
 
       // write-after-read in memory
       if (insn.isRead()) {
-        const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(index, node);
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(index, node, WRITE_AFTER_READ);
+      }
+
+      // write-after-read in scratch memory
+      if (insn.opcode == SEL_OP_UNSPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(index, node, WRITE_AFTER_READ);
       }
 
       // Consider barriers and wait are reading memory (local and global)
@@ -519,18 +562,28 @@ namespace gbe
           insn.opcode == SEL_OP_WAIT) {
         const uint32_t local = tracker.getIndex(0xfe);
         const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(local, node);
-        tracker.addDependency(global, node);
+        tracker.addDependency(local, node, WRITE_AFTER_READ);
+        tracker.addDependency(global, node, WRITE_AFTER_READ);
       }
 
       // Track all writes done by the instruction
       tracker.updateWrites(node);
     }
 
+    // Update distance to read for each read node.
+    for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      const SelectionInstruction &insn = node->insn;
+      if (insn.isRead())
+        traverseReadNode(node);
+    }
+
     // Make labels and branches non-schedulable (i.e. they act as barriers)
     for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
       ScheduleDAGNode *node = tracker.insnNodes[insnID];
-      if (node->insn.isBranch() || node->insn.isLabel() || node->insn.opcode == SEL_OP_EOT)
+      if (node->insn.isBranch() || node->insn.isLabel()
+          || node->insn.opcode == SEL_OP_EOT || node->insn.opcode == SEL_OP_IF
+          || node->insn.opcode == SEL_OP_BARRIER)
         tracker.makeBarrier(insnID, insnNum);
     }
 
@@ -546,63 +599,99 @@ namespace gbe
     return insnNum;
   }
 
-  void SelectionScheduler::scheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+  void SelectionScheduler::preScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+    printf("Not implemented yet. \n");
+  }
+
+  void SelectionScheduler::postScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
     uint32_t cycle = 0;
     const bool isSIMD8 = this->ctx.getSimdWidth() == 8;
+    vector <ScheduleDAGNode *> scheduledNodes;
     while (insnNum) {
 
       // Retire all the instructions that finished
+      //printf("cycle = %d \n", cycle);
       for (auto toRetireIt = active.begin(); toRetireIt != active.end();) {
         ScheduleDAGNode *toRetireNode = toRetireIt.node()->node;
+        // First, move all write-after-read children to the ready list.
+        if (toRetireNode->preRetired == false) {
+          auto &children = toRetireNode->children;
+          toRetireNode->preRetired = true;
+          //printf("id %d pre retired \n", toRetireNode->insn.ID);
+          for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode != WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
+            if (--it->node->refNum == 0) {
+              //printf("pre push id %d to ready list. \n", listNode->node->insn.ID);
+              it = children.erase(it);
+              this->ready.push_back(listNode);
+            } else
+              ++it;
+          }
+          if (children.size() == 0) {
+            toRetireIt = this->active.erase(toRetireIt);
+            continue;
+          }
+        }
         // Instruction is now complete
         if (toRetireNode->retiredCycle <= cycle) {
           toRetireIt = this->active.erase(toRetireIt);
+          //printf("id %d retired \n", toRetireNode->insn.ID);
           // Traverse all children and make them ready if no more dependency
           auto &children = toRetireNode->children;
           for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode == WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
             if (--it->node->refNum == 0) {
-              ScheduleListNode *listNode = it.node();
               it = children.erase(it);
-              this->ready.push_back(listNode);
+              if (listNode->depMode != WRITE_AFTER_READ)
+                this->ready.push_back(listNode);
+              //printf("push id %d to ready list. \n", listNode->node->insn.ID);
             } else
               ++it;
           }
-        }
-        // Get the next one
-        else
+        } else
           ++toRetireIt;
       }
 
       // Try to schedule something from the ready list
       intrusive_list<ScheduleListNode>::iterator toSchedule;
-      if (policy == POST_ALLOC) // FIFO scheduling
-        toSchedule = this->ready.begin();
-      else                      // LIFO scheduling
-        toSchedule = this->ready.rbegin();
-        // toSchedule = this->ready.begin();
-
+      toSchedule = this->ready.begin();
+      float minCost = 1000;
+      for(auto it = this->ready.begin(); it != this->ready.end(); ++it) {
+        float cost = (it->depMode == WRITE_AFTER_READ) ?  0 : ((it->depMode == WRITE_AFTER_WRITE) ? 5 : 10)
+                     - 5.0 / (it->node->readDistance == 0 ? 0.1 : it->node->readDistance);
+        if (cost < minCost) {
+          toSchedule = it;
+          minCost = cost;
+        }
+      }
       if (toSchedule != this->ready.end()) {
+        //printf("get id %d  op %d to schedule \n", toSchedule->node->insn.ID, toSchedule->node->insn.opcode);
         // The instruction is instantaneously issued to simulate zero cycle
         // scheduling
-        if (policy == POST_ALLOC)
-          cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
+        cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
 
         this->ready.erase(toSchedule);
         this->active.push_back(toSchedule.node());
         // When we schedule before allocation, instruction is instantaneously
         // ready. This allows to have a real LIFO strategy
-        if (policy == POST_ALLOC)
-          toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
-        else
-          toSchedule->node->retiredCycle = cycle;
+        toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
         bb.append(&toSchedule->node->insn);
+        scheduledNodes.push_back(toSchedule->node);
         insnNum--;
       } else
         cycle++;
     }
   }
 
-  BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, false);
+  BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, true);
   BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, false);
 
   void schedulePostRegAllocation(GenContext &ctx, Selection &selection) {
@@ -611,7 +700,7 @@ namespace gbe
       for (auto &bb : *selection.blockList) {
         const int32_t insnNum = scheduler.buildDAG(bb);
         bb.insnList.clear();
-        scheduler.scheduleDAG(bb, insnNum);
+        scheduler.postScheduleDAG(bb, insnNum);
       }
     }
   }
@@ -619,10 +708,12 @@ namespace gbe
   void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
     if (OCL_PRE_ALLOC_INSN_SCHEDULE) {
       SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
+      // FIXME, need to implement proper pre reg allocation scheduling algorithm.
+      return;
       for (auto &bb : *selection.blockList) {
         const int32_t insnNum = scheduler.buildDAG(bb);
         bb.insnList.clear();
-        scheduler.scheduleDAG(bb, insnNum);
+        scheduler.preScheduleDAG(bb, insnNum);
       }
     }
   }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 54e5ebe..96d3965 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -76,8 +76,6 @@
  *
  * Also, there is some extra kludge to handle the predicates for JMPI.
  *
- * See TODO for a better idea for branching and masking
- *
  * TODO:
  * =====
  *
@@ -92,14 +90,9 @@
  * interesting approach which consists in traversing the dominator tree in post
  * order
  *
- * About masking and branching, a much better idea (that I found later unfortunately)
- * is to replace the use of the flag by uses of if/endif to enclose the basic
- * block. So, instead of using predication, we use auto-masking. The very cool
- * consequence is that we can reintegrate back the structured branches.
- * Basically, we will be able to identify branches that can be mapped to
- * structured branches and mix nicely unstructured branches (which will use
- * jpmi, if/endif to mask the blocks) and structured branches (which are pretty
- * fast)
+ * We already use if/endif to enclose each basic block. We will continue to identify
+ * those blocks which could match to structured branching and use pure structured
+ * instruction to handle them completely.
  */
 
 #include "backend/gen_insn_selection.hpp"
@@ -110,6 +103,7 @@
 #include "sys/cvar.hpp"
 #include "sys/vector.hpp"
 #include <algorithm>
+#include <climits>
 
 namespace gbe
 {
@@ -136,15 +130,32 @@ namespace gbe
     }
   }
 
-  uint32_t getGenCompare(ir::Opcode opcode) {
+  ir::Type getIRType(uint32_t genType) {
+    using namespace ir;
+    switch (genType) {
+      case GEN_TYPE_B: return TYPE_S8;
+      case GEN_TYPE_UB: return TYPE_U8;
+      case GEN_TYPE_W: return TYPE_S16;
+      case GEN_TYPE_UW: return TYPE_U16;
+      case GEN_TYPE_D: return TYPE_S32;
+      case GEN_TYPE_UD: return TYPE_U32;
+      case GEN_TYPE_L: return TYPE_S64;
+      case GEN_TYPE_UL: return TYPE_U64;
+      case GEN_TYPE_F: return TYPE_FLOAT;
+      case GEN_TYPE_DF: return TYPE_DOUBLE;
+      default: NOT_SUPPORTED; return TYPE_FLOAT;
+    }
+  }
+
+  uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
     using namespace ir;
     switch (opcode) {
-      case OP_LE: return GEN_CONDITIONAL_LE;
-      case OP_LT: return GEN_CONDITIONAL_L;
-      case OP_GE: return GEN_CONDITIONAL_GE;
-      case OP_GT: return GEN_CONDITIONAL_G;
-      case OP_EQ: return GEN_CONDITIONAL_EQ;
-      case OP_NE: return GEN_CONDITIONAL_NEQ;
+      case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
+      case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
+      case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
+      case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
+      case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
+      case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
       default: NOT_SUPPORTED; return 0u;
     };
   }
@@ -155,7 +166,9 @@ namespace gbe
 
   SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
     parent(NULL), opcode(op), dstNum(dst), srcNum(src)
-  {}
+  {
+    extra.function = 0;
+  }
 
   void SelectionInstruction::prepend(SelectionInstruction &other) {
     gbe::prepend(&other, this);
@@ -169,16 +182,19 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
-           this->opcode == SEL_OP_READ64 ||
+           this->opcode == SEL_OP_READ64       ||
            this->opcode == SEL_OP_ATOMIC       ||
-           this->opcode == SEL_OP_BYTE_GATHER;
+           this->opcode == SEL_OP_BYTE_GATHER  ||
+           this->opcode == SEL_OP_SAMPLE ||
+           this->opcode == SEL_OP_DWORD_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
-           this->opcode == SEL_OP_WRITE64 ||
+           this->opcode == SEL_OP_WRITE64       ||
            this->opcode == SEL_OP_ATOMIC        ||
-           this->opcode == SEL_OP_BYTE_SCATTER;
+           this->opcode == SEL_OP_BYTE_SCATTER  ||
+           this->opcode == SEL_OP_TYPED_WRITE;
   }
 
   bool SelectionInstruction::isBranch(void) const {
@@ -201,7 +217,7 @@ namespace gbe
   // SelectionBlock
   ///////////////////////////////////////////////////////////////////////////
 
-  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb) {}
+  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0){}
 
   void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
 
@@ -229,8 +245,11 @@ namespace gbe
   public:
     INLINE SelectionDAG(const ir::Instruction &insn) :
       insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0) {
+      GBE_ASSERT(insn.getSrcNum() < 127);
       for (uint32_t childID = 0; childID < childNum; ++childID)
         this->child[childID] = NULL;
+      computeBool = false;
+      isUsed = false;
     }
     /*! Mergeable are non-root instructions with valid sources */
     INLINE void setAsMergeable(uint32_t which) { mergeable|=(1<<which); }
@@ -243,9 +262,13 @@ namespace gbe
     /*! When sources have been overwritten, a child insn cannot be merged */
     uint32_t mergeable:ir::Instruction::MAX_SRC_NUM;
     /*! Number of children we have in the pattern */
-    uint32_t childNum:4;
+    uint32_t childNum:7;
     /*! A root must be generated, no matter what */
     uint32_t isRoot:1;
+    /*! A bool register is used as normal computing sources. */
+    bool computeBool;
+    /*! is used in this block */
+    bool isUsed;
   };
 
   /*! A pattern is a tree to match. This is the general interface for them. For
@@ -312,16 +335,30 @@ namespace gbe
     /*! Implement public class */
     INLINE uint32_t getVectorNum(void) const { return this->vectorNum; }
     /*! Implement public class */
-    INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
+    INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
     /*! Implement public class */
-    INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+    INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
     /*! spill a register (insert spill/unspill instructions) */
     INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+    /*! should add per thread offset to the local memory address when load/store/atomic */
+    bool needPatchSLMAddr() const { return patchSLMAddr; }
+    void setPatchSLMAddr(bool b) { patchSLMAddr = b; }
+    /*! indicate whether a register is a scalar/uniform register. */
+    INLINE bool isScalarReg(const ir::Register &reg) const {
+      const ir::RegisterData &regData = getRegisterData(reg);
+      return regData.isUniform();
+    }
+
+    INLINE GenRegister unpacked_uw(const ir::Register &reg) const {
+      return GenRegister::unpacked_uw(reg, isScalarReg(reg));
+    }
+
+    INLINE GenRegister unpacked_ub(const ir::Register &reg) const {
+      return GenRegister::unpacked_ub(reg, isScalarReg(reg));
+    }
     /*! Implement public class */
     INLINE uint32_t getRegNum(void) const { return file.regNum(); }
     /*! Implements public interface */
-    bool isScalarOrBool(ir::Register reg) const;
-    /*! Implements public interface */
     INLINE ir::RegisterData getRegisterData(ir::Register reg) const {
       return file.get(reg);
     }
@@ -350,9 +387,9 @@ namespace gbe
     /*! Create a new register in the register file and append it in the
      *  temporary list of the current block
      */
-    INLINE ir::Register reg(ir::RegisterFamily family) {
+    INLINE ir::Register reg(ir::RegisterFamily family, bool scalar = false) {
       GBE_ASSERT(block != NULL);
-      const ir::Register reg = file.append(family);
+      const ir::Register reg = file.append(family, scalar);
       block->append(reg);
       return reg;
     }
@@ -365,7 +402,7 @@ namespace gbe
     /*! Build a DAG for the basic block (return number of instructions) */
     uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
     /*! Perform the selection on the basic block */
-    void matchBasicBlock(uint32_t insnNum);
+    void matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum);
     /*! A root instruction needs to be generated */
     bool isRoot(const ir::Instruction &insn) const;
 
@@ -419,7 +456,7 @@ namespace gbe
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
 #define I64Shift(OP) \
-  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
+  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
     ALU1(MOV)
     ALU1WithTemp(MOV_DF)
     ALU1WithTemp(LOAD_DF_IMM)
@@ -473,13 +510,13 @@ namespace gbe
 #undef ALU3
 #undef I64Shift
     /*! Convert 64-bit integer to 32-bit float */
-    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[7]);
+    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]);
     /*! Convert 64-bit integer to 32-bit float */
-    void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[3]);
+    void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]);
     /*! Saturated 64bit x*y + z */
-    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]);
+    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]);
     /*! High 64bit of x*y */
-    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]);
+    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]);
     /*! (x+y)>>1 without mod. overflow */
     void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
     /*! (x+y+1)>>1 without mod. overflow */
@@ -489,19 +526,27 @@ namespace gbe
     /*! Compare 64-bit integer */
     void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
     /*! Saturated addition of 64-bit integer */
-    void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
     /*! Saturated subtraction of 64-bit integer */
-    void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
     /*! Encode a barrier instruction */
     void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
     /*! Encode a barrier instruction */
     void FENCE(GenRegister dst);
     /*! Encode a label instruction */
     void LABEL(ir::LabelIndex label);
-    /*! Jump indexed instruction */
-    void JMPI(Reg src, ir::LabelIndex target);
+    /*! Jump indexed instruction, return the encoded instruction count according to jump distance. */
+    int JMPI(Reg src, ir::LabelIndex target, ir::LabelIndex origin);
+    /*! IF indexed instruction */
+    void IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+    /*! ENDIF indexed instruction */
+    void ENDIF(Reg src, ir::LabelIndex jip);
+    /*! BRD indexed instruction */
+    void BRD(Reg src, ir::LabelIndex jip);
+    /*! BRC indexed instruction */
+    void BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
     /*! Compare instructions */
-    void CMP(uint32_t conditional, Reg src0, Reg src1);
+    void CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst = GenRegister::null());
     /*! Select instruction with embedded comparison */
     void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
     /* Constant buffer move instruction */
@@ -515,9 +560,9 @@ namespace gbe
     /*! Atomic instruction */
     void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
     /*! Read 64 bits float/int array */
-    void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+    void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Write 64 bits float/int array */
-    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
+    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -528,6 +573,10 @@ namespace gbe
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
+    /*! Unpack the uint to char4 */
+    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
+    /*! pack the char4 to uint */
+    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -543,21 +592,40 @@ namespace gbe
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
-    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
     /*! Encode typed write instructions */
-    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
+    void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Multiply 64-bit integers */
     void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! 64-bit integer division */
-    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
+    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
     /*! 64-bit integer remainder of division */
-    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
+    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+    /* common functions for both binary instruction and sel_cmp and compare instruction.
+       It will handle the IMM or normal register assignment, and will try to avoid LOADI
+       as much as possible. */
+    void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+                      GenRegister &src1, ir::Type type, bool &inverse);
+    void getSrcGenRegImm(SelectionDAG &dag,
+                      SelectionDAG *dag0, SelectionDAG *dag1,
+                      GenRegister &src0, GenRegister &src1,
+                      ir::Type type, bool &inverse);
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
     friend class SelectionInstruction;
+  private:
+    /*! Auxiliary label for if/endif. */ 
+    uint16_t currAuxLabel;
+    bool patchSLMAddr;
+    INLINE ir::LabelIndex newAuxLabel()
+    {
+      currAuxLabel++;
+      return (ir::LabelIndex)currAuxLabel;
+    }
+
   };
 
   ///////////////////////////////////////////////////////////////////////////
@@ -586,12 +654,11 @@ namespace gbe
     return src0DAG->child[src0ID] == src1DAG->child[src1ID];
   }
 
-
   Selection::Opaque::Opaque(GenContext &ctx) :
     ctx(ctx), block(NULL),
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
-    stateNum(0), vectorNum(0), bwdCodeGeneration(false)
+    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()), patchSLMAddr(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -647,6 +714,7 @@ namespace gbe
                                                       uint32_t dstNum,
                                                       uint32_t srcNum)
   {
+    GBE_ASSERT(dstNum <= SelectionInstruction::MAX_DST_NUM && srcNum <= SelectionInstruction::MAX_SRC_NUM);
     GBE_ASSERT(this->block != NULL);
     SelectionInstruction *insn = this->create(opcode, dstNum, srcNum);
     if (this->bwdCodeGeneration)
@@ -670,18 +738,9 @@ namespace gbe
     return vector;
   }
 
-  // FIXME, there is a risk need to be fixed here.
-  // as the instruction we spill here is the gen ir level not the final
-  // single instruction. If it will be translated to multiple instructions
-  // at gen_context stage, and as the destination registers and source registers
-  // may be spilled to the same register based on current implementation,
-  // then the source register may be modified within the final instruction and
-  // may lead to incorrect result.
   bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
                                     uint32_t registerPool) {
     GBE_ASSERT(registerPool != 0);
-    const uint32_t dstStart = registerPool + 1;
-    const uint32_t srcStart = registerPool + 1;
 
     for (auto &block : blockList)
       for (auto &insn : block.insnList) {
@@ -689,21 +748,24 @@ namespace gbe
         if(insn.opcode == SEL_OP_SPILL_REG
            || insn.opcode == SEL_OP_UNSPILL_REG)
           continue;
+        const int simdWidth = insn.state.execWidth;
 
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
         struct RegSlot {
           RegSlot(ir::Register _reg, uint8_t _srcID,
-                  bool _isTmp, uint32_t _addr)
-                 : reg(_reg), srcID(_srcID), isTmpReg(_isTmp), addr(_addr)
+                   uint8_t _poolOffset, bool _isTmp, uint32_t _addr)
+                 : reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr)
           {};
           ir::Register reg;
           union {
             uint8_t srcID;
             uint8_t dstID;
           };
+          uint8_t poolOffset;
           bool isTmpReg;
           int32_t addr;
         };
+        uint8_t poolOffset = 1; // keep one for scratch message header
         vector <struct RegSlot> regSet;
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister selReg = insn.src(srcID);
@@ -712,37 +774,59 @@ namespace gbe
           if(it != spilledRegs.end()
              && selReg.file == GEN_GENERAL_REGISTER_FILE
              && selReg.physical == 0) {
-            struct RegSlot regSlot(reg, srcID,
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += simdWidth / 8; // qword register fill could not share the scratch read message payload register
+            }
+            struct RegSlot regSlot(reg, srcID, poolOffset,
                                    it->second.isTmpReg,
                                    it->second.addr);
+            if(family == ir::FAMILY_QWORD) {
+              poolOffset += 2 * simdWidth / 8;
+            } else {
+              poolOffset += simdWidth / 8;
+            }
             regSet.push_back(regSlot);
           }
         }
 
-        if (regSet.size() > 5)
+        if (poolOffset > ctx.reservedSpillRegs) {
+          if (GBE_DEBUG)
+            std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+                      << ") src too large pooloffset "
+                      << (uint32_t)poolOffset << std::endl;
           return false;
-
+        }
+        // FIXME, to support post register allocation scheduling,
+        // put all the reserved register to the spill/unspill's destination registers.
+        // This is not the best way. We need to refine the spill/unspill instruction to
+        // only use passed in registers and don't access hard coded offset in the future.
         while(!regSet.empty()) {
-          uint32_t scratchID = regSet.size() - 1;
           struct RegSlot regSlot = regSet.back();
           regSet.pop_back();
           const GenRegister selReg = insn.src(regSlot.srcID);
           if (!regSlot.isTmpReg) {
           /* For temporary registers, we don't need to unspill. */
-            SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
-            unspill->state  = GenInstructionState(ctx.getSimdWidth());
+            SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG,
+                                            1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(), 0);
+            unspill->state = GenInstructionState(simdWidth);
+            unspill->state.noMask = 1;
             unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
-                                          srcStart + scratchID, 0,
+                                          registerPool + regSlot.poolOffset, 0,
                                           selReg.type, selReg.vstride,
                                           selReg.width, selReg.hstride);
-            unspill->extra.scratchOffset = regSlot.addr;
+            for(uint32_t i = 1; i < 1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+              unspill->dst(i) = ctx.getSimdWidth() == 8 ?
+                                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1), 0 ) :
+                                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1) * 2, 0);
+            unspill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
             unspill->extra.scratchMsgHeader = registerPool;
             insn.prepend(*unspill);
           }
 
           GenRegister src = insn.src(regSlot.srcID);
           // change nr/subnr, keep other register settings
-          src.nr = srcStart + scratchID; src.subnr = 0; src.physical = 1;
+          src.nr = registerPool + regSlot.poolOffset; src.subnr = 0; src.physical = 1;
           insn.src(regSlot.srcID) = src;
         };
 
@@ -756,7 +840,6 @@ namespace gbe
           instruction. Thus the registerPool + 1 still contain valid
           data.
          */
-
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const GenRegister selReg = insn.dst(dstID);
           const ir::Register reg = selReg.reg();
@@ -764,96 +847,118 @@ namespace gbe
           if(it != spilledRegs.end()
              && selReg.file == GEN_GENERAL_REGISTER_FILE
              && selReg.physical == 0) {
-            struct RegSlot regSlot(reg, dstID,
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += simdWidth / 8; // qword register spill could not share the scratch write message payload register
+            }
+            struct RegSlot regSlot(reg, dstID, poolOffset,
                                    it->second.isTmpReg,
                                    it->second.addr);
+            if (family == ir::FAMILY_QWORD) poolOffset += 2 * simdWidth / 8;
+            else poolOffset += simdWidth / 8;
             regSet.push_back(regSlot);
           }
         }
 
-        if (regSet.size() > 5)
+        if (poolOffset > ctx.reservedSpillRegs){
+          if (GBE_DEBUG)
+           std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+                     << ") dst too large pooloffset "
+                     << (uint32_t)poolOffset << std::endl;
           return false;
-
+        }
         while(!regSet.empty()) {
-          uint32_t scratchID = regSet.size() - 1;
           struct RegSlot regSlot = regSet.back();
           regSet.pop_back();
           const GenRegister selReg = insn.dst(regSlot.dstID);
           if(!regSlot.isTmpReg) {
             /* For temporary registers, we don't need to unspill. */
-            SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
-            spill->state  = GenInstructionState(ctx.getSimdWidth());
+            SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG,
+                                          (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth() , 1);
+            spill->state  = insn.state;//GenInstructionState(simdWidth);
+            spill->state.accWrEnable = 0;
+            spill->state.saturate = 0;
+            if (insn.opcode == SEL_OP_SEL)
+              spill->state.predicate = GEN_PREDICATE_NONE;
             spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
-                                        dstStart + scratchID, 0,
+                                        registerPool + regSlot.poolOffset, 0,
                                         selReg.type, selReg.vstride,
                                         selReg.width, selReg.hstride);
-            spill->extra.scratchOffset = regSlot.addr;
+            spill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
             spill->extra.scratchMsgHeader = registerPool;
+            for(uint32_t i = 0; i < 0 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+              spill->dst(i) = ctx.getSimdWidth() == 8 ?
+                                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i), 0 ) :
+                                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i) * 2, 0);
             insn.append(*spill);
           }
 
           GenRegister dst = insn.dst(regSlot.dstID);
           // change nr/subnr, keep other register settings
-          dst.physical =1; dst.nr = dstStart + scratchID; dst.subnr = 0;
+          dst.physical =1; dst.nr = registerPool + regSlot.poolOffset; dst.subnr = 0;
           insn.dst(regSlot.dstID)= dst;
-          scratchID++;
         }
       }
     return true;
   }
 
-  ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
+  ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
     SelectionBlock *block = insn->parent;
-    const uint32_t simdWidth = ctx.getSimdWidth();
+    const uint32_t simdWidth = insn->state.execWidth;
     ir::Register tmp;
+    GenRegister gr;
 
     // This will append the temporary register in the instruction block
     this->block = block;
-    tmp = this->reg(ir::FAMILY_DWORD);
-
-    // Generate the MOV instruction and replace the register in the instruction
-    SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
-    mov->src(0) = GenRegister::retype(insn->src(regID), GEN_TYPE_F);
-    mov->state = GenInstructionState(simdWidth);
-    insn->src(regID) = mov->dst(0) = GenRegister::fxgrf(simdWidth, tmp);
-    insn->prepend(*mov);
+    tmp = this->reg(ir::getFamily(type), simdWidth == 1);
+    gr =  this->selReg(tmp, type);
+    if (needMov) {
+      // Generate the MOV instruction and replace the register in the instruction
+      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+      mov->src(0) = GenRegister::retype(insn->src(regID), gr.type);
+      mov->state = GenInstructionState(simdWidth);
+      if (this->isScalarReg(insn->src(regID).reg()))
+        mov->state.noMask = 1;
+      mov->dst(0) = gr;
+      insn->prepend(*mov);
+    }
+    insn->src(regID) = gr;
 
     return tmp;
   }
 
-  ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
+  ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
     SelectionBlock *block = insn->parent;
-    uint32_t simdWidth = ctx.getSimdWidth();
+    uint32_t simdWidth;
+    if (!GenRegister::isNull(insn->dst(regID)))
+      simdWidth = this->isScalarReg(insn->dst(regID).reg()) ? 1 : insn->state.execWidth;
+    else {
+      GBE_ASSERT(needMov == false);
+      simdWidth = insn->state.execWidth;
+    }
     ir::Register tmp;
-    ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
-    int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
     GenRegister gr;
-
-    // This will append the temporary register in the instruction block
     this->block = block;
-    tmp = this->reg(f);
-
+    tmp = this->reg(ir::getFamily(type));
+    gr = this->selReg(tmp, type);
+    if (needMov) {
     // Generate the MOV instruction and replace the register in the instruction
-    SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
-    mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
-    mov->state = GenInstructionState(simdWidth);
-    gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
-    insn->dst(regID) = mov->src(0) = gr;
-    insn->append(*mov);
-    return tmp;
-  }
-
-  bool Selection::Opaque::isScalarOrBool(ir::Register reg) const {
-    if (ctx.isScalarReg(reg))
-      return true;
-    else {
-      const ir::RegisterFamily family = file.get(reg).family;
-      return family == ir::FAMILY_BOOL;
+      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+      mov->dst(0) = GenRegister::retype(insn->dst(regID), gr.type);
+      mov->state = GenInstructionState(simdWidth);
+      if (simdWidth == 1) {
+        mov->state.noMask = 1;
+        mov->src(0) = GenRegister::retype(GenRegister::vec1(GEN_GENERAL_REGISTER_FILE, gr.reg()), gr.type);
+      } else
+        mov->src(0) = gr;
+      insn->append(*mov);
     }
+    insn->dst(regID) = gr;
+    return tmp;
   }
 
 #define SEL_REG(SIMD16, SIMD8, SIMD1) \
-  if (ctx.sel->isScalarOrBool(reg) == true) \
+  if (ctx.sel->isScalarReg(reg) == true) \
     return GenRegister::retype(GenRegister::SIMD1(reg), genType); \
   else if (simdWidth == 8) \
     return GenRegister::retype(GenRegister::SIMD8(reg), genType); \
@@ -869,7 +974,7 @@ namespace gbe
     const RegisterData data = file.get(reg);
     const RegisterFamily family = data.family;
     switch (family) {
-      case FAMILY_BOOL: SEL_REG(uw1grf, uw1grf, uw1grf); break;
+      case FAMILY_BOOL: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
@@ -908,16 +1013,47 @@ namespace gbe
     insn->dst(0) = dst;
   }
 
-  void Selection::Opaque::JMPI(Reg src, ir::LabelIndex index) {
+  int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
     insn->src(0) = src;
     insn->index = uint16_t(index);
+    insn->extra.longjmp = abs(index - origin) > 800;
+    return insn->extra.longjmp ? 2 : 1;
+  }
+
+  void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+  }
+
+  void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+    insn->index1 = uint16_t(uip);
+  }
+
+  void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+    insn->index1 = uint16_t(uip);
   }
 
-  void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 0, 2);
+  void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
+    this->block->endifLabel = this->newAuxLabel();
+    this->LABEL(this->block->endifLabel);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(this->block->endifLabel);
+  }
+
+  void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 1, 2);
     insn->src(0) = src0;
     insn->src(1) = src1;
+    insn->dst(0) = dst;
     insn->extra.function = conditional;
   }
 
@@ -943,7 +1079,7 @@ namespace gbe
     if(srcNum > 1) insn->src(1) = src1;
     if(srcNum > 2) insn->src(2) = src2;
     insn->extra.function = function;
-    insn->extra.elem     = bti;
+    insn->setbti(bti);
     SelectionVector *vector = this->appendVector();
 
     vector->regNum = srcNum;
@@ -955,34 +1091,26 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
-  /* elemNum contains all the temporary register and the
-     real destination registers.*/
   void Selection::Opaque::READ64(Reg addr,
-                                 Reg tempAddr,
                                  const GenRegister *dst,
                                  uint32_t elemNum,
-                                 uint32_t valueNum,
                                  uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
-    /* temporary addr register is to be modified, set it to dst registers.*/
-    insn->dst(0) = tempAddr;
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-      insn->dst(elemID + 1) = dst[elemID];
+      insn->dst(elemID) = dst[elemID];
     insn->src(0) = addr;
-    insn->extra.function = bti;
-    insn->extra.elem = valueNum;
+    insn->setbti(bti);
+    insn->extra.elem = elemNum;
 
-    // Only the temporary registers need contiguous allocation
-    dstVector->regNum = elemNum - valueNum;
+    dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
-    dstVector->reg = &insn->dst(1);
+    dstVector->reg = &insn->dst(0);
 
-    // Source cannot be scalar (yet)
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
@@ -996,48 +1124,43 @@ namespace gbe
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
-
+    if (this->isScalarReg(dst[0].reg()))
+      insn->state.noMask = 1;
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
     insn->src(0) = addr;
-    insn->extra.function = bti;
+    insn->setbti(bti);
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
-    // Source cannot be scalar (yet)
+
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
   }
 
-  /* elemNum contains all the temporary register and the
-     real data registers.*/
   void Selection::Opaque::WRITE64(Reg addr,
                                   const GenRegister *src,
                                   uint32_t srcNum,
-                                  const GenRegister *dst,
-                                  uint32_t dstNum,
                                   uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
     SelectionVector *vector = this->appendVector();
 
     // Regular instruction to encode
     insn->src(0) = addr;
     for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
       insn->src(elemID + 1) = src[elemID];
-    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
-      insn->dst(elemID) = dst[elemID];
-    insn->extra.function = bti;
+
+    insn->setbti(bti);
     insn->extra.elem = srcNum;
 
-    // Only the addr + temporary registers need to be contiguous.
-    vector->regNum = dstNum;
-    vector->reg = &insn->dst(0);
+    vector->regNum = srcNum + 1;
+    vector->reg = &insn->src(0);
     vector->isSrc = 1;
   }
 
@@ -1053,7 +1176,7 @@ namespace gbe
     insn->src(0) = addr;
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->src(elemID+1) = src[elemID];
-    insn->extra.function = bti;
+    insn->setbti(bti);
     insn->extra.elem = elemNum;
 
     // Sends require contiguous allocation for the sources
@@ -1067,10 +1190,12 @@ namespace gbe
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
+    if (this->isScalarReg(dst.reg()))
+      insn->state.noMask = 1;
     // Instruction to encode
     insn->src(0) = addr;
     insn->dst(0) = dst;
-    insn->extra.function = bti;
+    insn->setbti(bti);
     insn->extra.elem = elemSize;
 
     // byte gather requires vector in the sense that scalar are not allowed
@@ -1090,7 +1215,7 @@ namespace gbe
     // Instruction to encode
     insn->src(0) = addr;
     insn->src(1) = src;
-    insn->extra.function = bti;
+    insn->setbti(bti);
     insn->extra.elem = elemSize;
 
     // value and address are contiguous in the send
@@ -1101,10 +1226,33 @@ namespace gbe
 
   void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
+    SelectionVector *vector = this->appendVector();
+    SelectionVector *srcVector = this->appendVector();
 
+    if (this->isScalarReg(dst.reg()))
+      insn->state.noMask = 1;
     insn->src(0) = addr;
     insn->dst(0) = dst;
-    insn->extra.function = bti;
+    insn->setbti(bti);
+    vector->regNum = 1;
+    vector->isSrc = 0;
+    vector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
+    insn->src(0) = src;
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->dst(i) = dst[i];
+  }
+  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->src(i) = src[i];
+    insn->dst(0) = dst;
   }
 
   void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
@@ -1131,21 +1279,21 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 15, 2);
+  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 14, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 14; i++)
+    for(int i = 0; i < 13; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 15, 2);
+  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 14, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 14; i++)
+    for(int i = 0; i < 13; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1194,56 +1342,56 @@ namespace gbe
     insn->extra.function = conditional;
   }
 
-  void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 7, 2);
+  void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 6, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i=0; i<6; i++)
+    for(int i=0; i<5; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 7, 2);
+  void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 6, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i=0; i<6; i++)
+    for(int i=0; i<5; i++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[7]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 8, 1);
+  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 7, 1);
     insn->dst(0) = dst;
     insn->src(0) = src;
-    for(int i = 0; i < 7; i ++)
+    for(int i = 0; i < 6; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[3]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 4, 1);
+  void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 3, 1);
     insn->dst(0) = dst;
     insn->src(0) = src;
-    for(int i = 0; i < 3; i ++)
+    for(int i = 0; i < 2; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 11, 3);
+  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 10, 3);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
     insn->src(2) = src2;
-    for(int i = 0; i < 10; i ++)
+    for(int i = 0; i < 9; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 11, 2);
+  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 10, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 10; i ++)
+    for(int i = 0; i < 9; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1265,12 +1413,12 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) {
-    SelectionInstruction *insn = this->appendInsn(opcode, 8, 2);
+  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 7; i ++)
+    for(int i = 0; i < 6; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1315,9 +1463,17 @@ namespace gbe
     for (uint32_t regID = 0; regID < this->regNum; ++regID)
       this->regDAG[regID] = NULL;
 
+    this->block->hasBarrier = false;
+    this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+                             bb.getLastInstruction()->getOpcode() == OP_RET;
+    if (!this->block->hasBranch)
+      this->block->endifOffset = -1;
+
     // Build the DAG on the fly
     uint32_t insnNum = 0;
     const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+      if (insn.getOpcode() == OP_SYNC)
+        this->block->hasBarrier = true;
 
       // Build a selectionDAG node for instruction
       SelectionDAG *dag = this->newSelectionDAG(insn);
@@ -1344,6 +1500,15 @@ namespace gbe
           }
           if (mergeable) dag->setAsMergeable(srcID);
           dag->child[srcID] = child;
+          // Check whether this bool is used as a normal source
+          // operand other than BRA/SEL.
+          if (getRegisterFamily(reg) == FAMILY_BOOL) {
+            if (insn.getOpcode() != OP_BRA &&
+                 (insn.getOpcode() != OP_SEL ||
+                   (insn.getOpcode() == OP_SEL && srcID != 0)))
+              child->computeBool = true;
+          }
+          child->isUsed = true;
         } else
           dag->child[srcID] = NULL;
       }
@@ -1365,9 +1530,16 @@ namespace gbe
     return insnNum;
   }
 
-  void Selection::Opaque::matchBasicBlock(uint32_t insnNum)
+  void Selection::Opaque::matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum)
   {
     // Bottom up code generation
+    bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
+
+    if(needEndif) {
+      const ir::BasicBlock *next = bb.getNextBlock();
+      this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+    }
+
     for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
       // Process all possible patterns for this instruction
       SelectionDAG &dag = *insnDAG[insnID];
@@ -1379,6 +1551,7 @@ namespace gbe
 
         // Start a new code fragment
         this->startBackwardGeneration();
+        // If there is no branch at the end of this block.
 
         // Try all the patterns from best to worst
         do {
@@ -1387,6 +1560,21 @@ namespace gbe
           ++it;
         } while (it != end);
         GBE_ASSERT(it != end);
+        // If we are in if/endif fix mode, and this block is
+        // large enough, we need to insert endif/if pair to eliminate
+        // the too long if/endif block.
+        if (this->ctx.getIFENDIFFix() &&
+            this->block->insnList.size() != 0 &&
+            this->block->insnList.size() % 1000 == 0 &&
+            (uint16_t)this->block->endifLabel != 0) {
+          ir::LabelIndex jip = this->block->endifLabel;
+          this->ENDIF(GenRegister::immd(0), jip);
+          this->push();
+            this->curr.predicate = GEN_PREDICATE_NORMAL;
+            this->IF(GenRegister::immd(0), jip, jip);
+          this->pop();
+          this->block->isLargeBlock = true;
+        }
 
         // Output the code in the current basic block
         this->endBackwardGeneration();
@@ -1404,15 +1592,14 @@ namespace gbe
       this->dagPool.rewind();
       this->appendBlock(bb);
       const uint32_t insnNum = this->buildBasicBlockDAG(bb);
-      this->matchBasicBlock(insnNum);
+      this->matchBasicBlock(bb, insnNum);
     });
    }
 
   void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
-                                 GenRegister *src, uint32_t srcNum,
                                  GenRegister *msgPayloads, uint32_t msgNum,
-                                 uint32_t bti, uint32_t sampler, bool is3D) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
+                                 uint32_t bti, uint32_t sampler, bool isLD, bool isUniform) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
     SelectionVector *dstVector = this->appendVector();
     SelectionVector *msgVector = this->appendVector();
 
@@ -1421,8 +1608,6 @@ namespace gbe
       insn->dst(elemID) = dst[elemID];
     for (uint32_t elemID = 0; elemID < msgNum; ++elemID)
       insn->src(elemID) = msgPayloads[elemID];
-    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
-      insn->src(msgNum + elemID) = src[elemID];
 
     // Sends require contiguous allocation
     dstVector->regNum = dstNum;
@@ -1434,9 +1619,11 @@ namespace gbe
     msgVector->isSrc = 1;
     msgVector->reg = &insn->src(0);
 
-    insn->extra.rdbti = bti;
+    insn->setbti(bti);
     insn->extra.sampler = sampler;
-    insn->extra.is3DRead = is3D;
+    insn->extra.rdmsglen = msgNum;
+    insn->extra.isLD = isLD;
+    insn->extra.isUniform = isUniform;
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -1448,20 +1635,21 @@ namespace gbe
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
   }
 
-  void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
-                                      GenRegister *msgs, uint32_t msgNum,
+  Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setPatchSLMAddr(true);
+  }
+
+  void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
     uint32_t i;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
     SelectionVector *msgVector = this->appendVector();;
 
     for( i = 0; i < msgNum; ++i, ++elemID)
       insn->src(elemID) = msgs[i];
-    for (i = 0; i < srcNum; ++i, ++elemID)
-      insn->src(elemID) = src[i];
 
-    insn->extra.bti = bti;
+    insn->setbti(bti);
     insn->extra.msglen = msgNum;
     insn->extra.is3DWrite = is3D;
     // Sends require contiguous allocation
@@ -1477,10 +1665,6 @@ namespace gbe
     this->blockList = &this->opaque->blockList;
   }
 
-  bool Selection::isScalarOrBool(ir::Register reg) const {
-    return this->opaque->isScalarOrBool(reg);
-  }
-
   uint32_t Selection::getLargestBlockSize(void) const {
     return this->opaque->getLargestBlockSize();
   }
@@ -1501,17 +1685,21 @@ namespace gbe
     return this->opaque->getRegisterData(reg);
   }
 
-  ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
-    return this->opaque->replaceSrc(insn, regID);
+  ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    return this->opaque->replaceSrc(insn, regID, type, needMov);
   }
 
-  ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID) {
-    return this->opaque->replaceDst(insn, regID);
+  ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    return this->opaque->replaceDst(insn, regID, type, needMov);
   }
   bool Selection::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) {
     return this->opaque->spillRegs(spilledRegs, registerPool);
   }
 
+  bool Selection::isScalarReg(const ir::Register &reg) const {
+    return this->opaque->isScalarReg(reg);
+  }
+
   SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
     return this->opaque->create(opcode, dstNum, srcNum);
   }
@@ -1524,28 +1712,89 @@ namespace gbe
     using namespace ir;
     const auto &childInsn = cast<LoadImmInstruction>(insn);
     const auto &imm = childInsn.getImmediate();
-    if(imm.type != TYPE_DOUBLE && imm.type != TYPE_S64 && imm.type != TYPE_U64)
+    if(imm.getType() != TYPE_DOUBLE && imm.getType() != TYPE_S64 && imm.getType() != TYPE_U64)
       return true;
     return false;
   }
 
-  GenRegister getRegisterFromImmediate(ir::Immediate imm)
+  GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false)
   {
     using namespace ir;
-    switch (imm.type) {
-      case TYPE_U32:   return GenRegister::immud(imm.data.u32);
-      case TYPE_S32:   return GenRegister::immd(imm.data.s32);
-      case TYPE_FLOAT: return GenRegister::immf(imm.data.f32);
-      case TYPE_U16: return GenRegister::immuw(imm.data.u16);
-      case TYPE_S16: return  GenRegister::immw(imm.data.s16);
-      case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
-      case TYPE_S8:  return GenRegister::immw(imm.data.s8);
-      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
-      case TYPE_BOOL: return GenRegister::immuw(-imm.data.b);  //return 0xffff when true
+    int sign = negate ? -1 : 1;
+    switch (type) {
+      case TYPE_U32:   return GenRegister::immud(imm.getIntegerValue() * sign);
+      case TYPE_S32:   return GenRegister::immd(imm.getIntegerValue() * sign);
+      case TYPE_FLOAT: return GenRegister::immf(imm.getFloatValue() * sign);
+      case TYPE_U16: return GenRegister::immuw(imm.getIntegerValue() * sign);
+      case TYPE_S16: return  GenRegister::immw((int16_t)imm.getIntegerValue() * sign);
+      case TYPE_U8:  return GenRegister::immuw(imm.getIntegerValue() * sign);
+      case TYPE_S8:  return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
+      case TYPE_BOOL: return GenRegister::immuw(-imm.getIntegerValue());  //return 0xffff when true
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
 
+  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
+  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
+                                          SelectionDAG *dag0, SelectionDAG *dag1,
+                                          GenRegister &src0, GenRegister &src1,
+                                          ir::Type type, bool &inverse) {
+    using namespace ir;
+    inverse = false;
+    // Right source can always be an immediate
+    const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
+    const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src1Index : 1;
+    if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
+        canGetRegisterFromImmediate(dag1->insn)) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
+      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+      if (dag0) dag0->isRoot = 1;
+    }
+    // Left source cannot be immediate but it is OK if we can commute
+    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && dag.insn.isMemberOf<BinaryInstruction>() &&
+             ((cast<BinaryInstruction>(dag.insn)).commutes() || dag.insn.getOpcode() == OP_SUB) &&
+             dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+      src0 = dag.insn.getOpcode() != OP_SUB ?
+             this->selReg(dag.insn.getSrc(src1Index), type) :
+             GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
+      Immediate imm = childInsn.getImmediate();
+      src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
+      if (dag1) dag1->isRoot = 1;
+    }
+    // If it's a compare instruction, theoretically, we can easily invert the condition code to
+    // switch the two operands. But we can't do that for floats because of NaN values.
+    // For a normal select instruction, we can always invert the predication to switch the two
+    // operands' position.
+    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
+             dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
+             ((dag.insn.isMemberOf<CompareInstruction>() && type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
+              (dag.insn.isMemberOf<SelectInstruction>()))) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+      src0 = this->selReg(dag.insn.getSrc(src1Index), type);
+      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+      inverse = true;
+      if (dag1) dag1->isRoot = 1;
+    }
+    // Just grab the two sources
+    else {
+      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+      src1 = this->selReg(dag.insn.getSrc(src1Index), type);
+      markAllChildren(dag);
+    }
+  }
+
+  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+                                       GenRegister &src1, ir::Type type,
+                                       bool &inverse) {
+    SelectionDAG *dag0 = dag.child[0];
+    SelectionDAG *dag1 = dag.child[1];
+    getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
+  }
+
+
   /*! Template for the one-to-many instruction patterns */
   template <typename T, typename U>
   class OneToManyPattern : public SelectionPattern
@@ -1561,8 +1810,10 @@ namespace gbe
     }
     /*! Call the child method with the proper prototype */
     virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
-      if (static_cast<const T*>(this)->emitOne(sel, ir::cast<U>(dag.insn))) {
-        markAllChildren(dag);
+      bool markChildren = true;
+      if (static_cast<const T*>(this)->emitOne(sel, ir::cast<U>(dag.insn), markChildren)) {
+        if (markChildren)
+          markAllChildren(dag);
         return true;
       }
       return false;
@@ -1591,68 +1842,126 @@ namespace gbe
       return ir::TYPE_FLOAT;
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn) const {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn, bool &markChildren) const {
       const ir::Opcode opcode = insn.getOpcode();
       const ir::Type insnType = insn.getType();
       const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType));
       const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType));
-      switch (opcode) {
-        case ir::OP_ABS:
-          if (insn.getType() == ir::TYPE_S32) {
-            const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
-            const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
-            sel.MOV(dst_, GenRegister::abs(src_));
-          } else {
-            GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
-            sel.MOV(dst, GenRegister::abs(src));
-          }
-          break;
-        case ir::OP_MOV:
-          if(insn.getType() == ir::TYPE_BOOL) {
-            GenRegister flagReg;
-            uint32_t predicate = sel.curr.predicate;
-            sel.push();
-              sel.curr.execWidth = 1;
-              sel.curr.predicate = GEN_PREDICATE_NONE;
-              sel.curr.noMask = 1;
-              if(predicate == GEN_PREDICATE_NONE)
-                sel.MOV(dst, src);
-              else {
-                if(sel.curr.physicalFlag)
-                  flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        switch (opcode) {
+          case ir::OP_ABS:
+            if (insn.getType() == ir::TYPE_S32) {
+              const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
+              const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
+              sel.MOV(dst_, GenRegister::abs(src_));
+            } else {
+              GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
+              sel.MOV(dst, GenRegister::abs(src));
+            }
+            break;
+          case ir::OP_MOV:
+            if (dst.isdf()) {
+              ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+              sel.MOV_DF(dst, src, sel.selReg(r));
+            } else {
+              sel.push();
+                auto dag = sel.regDAG[insn.getDst(0)];
+                if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+                    dag->isUsed) {
+                sel.curr.physicalFlag = 0;
+                sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+                sel.curr.modFlag = 1;
+              }
+              sel.MOV(dst, src);
+              sel.pop();
+            }
+            break;
+          case ir::OP_RNDD: sel.RNDD(dst, src); break;
+          case ir::OP_RNDE: sel.RNDE(dst, src); break;
+          case ir::OP_RNDU: sel.RNDU(dst, src); break;
+          case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
+          case ir::OP_FBH: sel.FBH(dst, src); break;
+          case ir::OP_FBL: sel.FBL(dst, src); break;
+          case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
+          case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
+          case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
+          case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break;
+          case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
+          case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
+          case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_SIMD_ANY:
+            {
+              const GenRegister constZero = GenRegister::immuw(0);;
+              const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+              const GenRegister flag01 = GenRegister::flag(0, 1);
+
+              sel.push();
+                int simdWidth = sel.curr.execWidth;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.execWidth = 1;
+                sel.curr.noMask = 1;
+                sel.MOV(flag01, constZero);
+                sel.curr.execWidth = simdWidth;
+                sel.curr.noMask = 0;
+
+                sel.curr.flag = 0;
+                sel.curr.subFlag = 1;
+                sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+                if (sel.curr.execWidth == 16)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+                else if (sel.curr.execWidth == 8)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
                 else
-                  flagReg = sel.selReg(ir::Register(sel.curr.flagIndex), ir::TYPE_U16);
+                  NOT_IMPLEMENTED;
+                sel.SEL(dst, regOne, constZero);
+              sel.pop();
+            }
+            break;
+          case ir::OP_SIMD_ALL:
+            {
+              const GenRegister constZero = GenRegister::immuw(0);
+              const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+              const GenRegister flag01 = GenRegister::flag(0, 1);
+
+              sel.push();
+                int simdWidth = sel.curr.execWidth;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.execWidth = 1;
+                sel.curr.noMask = 1;
+                sel.MOV(flag01, regOne);
+
+                sel.curr.execWidth = simdWidth;
+                sel.curr.noMask = 0;
+
+                sel.curr.flag = 0;
+                sel.curr.subFlag = 1;
+                sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+                if (sel.curr.execWidth == 16)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+                else if (sel.curr.execWidth == 8)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+                else
+                  NOT_IMPLEMENTED;
+                sel.SEL(dst, regOne, constZero);
+              sel.pop();
+            }
+            break;
 
-                sel.AND(dst, flagReg, src);
-              }
-            sel.pop();
-          } else if (dst.isdf()) {
-            ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
-            sel.MOV_DF(dst, src, sel.selReg(r));
-          } else
-            sel.MOV(dst, src);
-          break;
-        case ir::OP_RNDD: sel.RNDD(dst, src); break;
-        case ir::OP_RNDE: sel.RNDE(dst, src); break;
-        case ir::OP_RNDU: sel.RNDU(dst, src); break;
-        case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
-        case ir::OP_FBH: sel.FBH(dst, src); break;
-        case ir::OP_FBL: sel.FBL(dst, src); break;
-        case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
-        case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
-        case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
-        case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break;
-        case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
-        case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
-        case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
-        default: NOT_SUPPORTED;
-      }
+          default: NOT_SUPPORTED;
+        }
+      sel.pop();
       return true;
     }
     DECL_CTOR(UnaryInstruction, 1, 1)
   };
 
-  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
 
   /*! Binary regular instruction pattern */
   class BinaryInstructionPattern : public SelectionPattern
@@ -1681,7 +1990,7 @@ namespace gbe
       //bytes and shorts must be converted to int for DIV and REM per GEN restriction
       if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
         GenRegister tmp0, tmp1;
-        ir::Register reg = sel.reg(FAMILY_DWORD);
+        ir::Register reg = sel.reg(FAMILY_DWORD, simdWidth == 1);
 
         tmp0 = GenRegister::udxgrf(simdWidth, reg);
         tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
@@ -1694,9 +2003,9 @@ namespace gbe
         sel.MATH(tmp0, function, tmp0, tmp1);
         GenRegister unpacked;
         if(family == FAMILY_WORD) {
-          unpacked = GenRegister::unpacked_uw(reg);
+          unpacked = sel.unpacked_uw(reg);
         } else {
-          unpacked = GenRegister::unpacked_ub(reg);
+          unpacked = sel.unpacked_ub(reg);
         }
         unpacked = GenRegister::retype(unpacked, getGenType(type));
         sel.MOV(dst, unpacked);
@@ -1706,16 +2015,19 @@ namespace gbe
         GBE_ASSERT(op != OP_REM);
         sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
       } else if (type == TYPE_S64 || type == TYPE_U64) {
-        GenRegister tmp[14];
-        for(int i=0; i<13; i++) {
+        GenRegister tmp[13];
+        for(int i=0; i < 13; i++) {
           tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
           tmp[i].type = GEN_TYPE_UD;
         }
-        tmp[13] = sel.selReg(sel.reg(FAMILY_BOOL));
-        if(op == OP_DIV)
-          sel.I64DIV(dst, src0, src1, tmp);
-        else
-          sel.I64REM(dst, src0, src1, tmp);
+        sel.push();
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
+          if(op == OP_DIV)
+            sel.I64DIV(dst, src0, src1, tmp);
+          else
+            sel.I64REM(dst, src0, src1, tmp);
+        sel.pop();
       }
       markAllChildren(dag);
       return true;
@@ -1729,8 +2041,19 @@ namespace gbe
       const Type type = insn.getType();
       GenRegister dst  = sel.selReg(insn.getDst(0), type);
 
+      sel.push();
+
+      // Boolean values use scalars
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
       if(opcode == OP_DIV || opcode == OP_REM) {
-        return this->emitDivRemInst(sel, dag, opcode);
+        bool ret = this->emitDivRemInst(sel, dag, opcode);
+        sel.pop();
+        return ret;
       }
       // Immediates not supported
       if (opcode == OP_POW) {
@@ -1743,48 +2066,25 @@ namespace gbe
           NOT_IMPLEMENTED;
         }
         markAllChildren(dag);
+        sel.pop();
         return true;
       }
 
-      sel.push();
-
-      // Boolean values use scalars
-      if (sel.isScalarOrBool(insn.getDst(0)) == true) {
-        sel.curr.execWidth = 1;
-        sel.curr.predicate = GEN_PREDICATE_NONE;
-        sel.curr.noMask = 1;
-      }
-
       // Look for immediate values
       GenRegister src0, src1;
-      SelectionDAG *dag0 = dag.child[0];
-      SelectionDAG *dag1 = dag.child[1];
-
-      // Right source can always be an immediate
-      //logica ops of bool shouldn't use 0xffff, may use flag reg, so can't optimize
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
-          canGetRegisterFromImmediate(dag1->insn) && type != TYPE_BOOL) {
-        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
-        src0 = sel.selReg(insn.getSrc(0), type);
-        src1 = getRegisterFromImmediate(childInsn.getImmediate());
-        if (dag0) dag0->isRoot = 1;
-      }
-      // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI &&
-               canGetRegisterFromImmediate(dag0->insn) && type != TYPE_BOOL) {
-        const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
-        src0 = sel.selReg(insn.getSrc(1), type);
-        src1 = getRegisterFromImmediate(childInsn.getImmediate());
-        if (dag1) dag1->isRoot = 1;
-      }
-      // Just grab the two sources
-      else {
-        src0 = sel.selReg(insn.getSrc(0), type);
-        src1 = sel.selReg(insn.getSrc(1), type);
-        markAllChildren(dag);
+      bool inverse = false;
+      sel.getSrcGenRegImm(dag, src0, src1, type, inverse);
+      // Output the binary instruction
+      if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+          dag.isUsed) {
+        GBE_ASSERT(insn.getOpcode() == OP_AND ||
+                   insn.getOpcode() == OP_OR ||
+                   insn.getOpcode() == OP_XOR);
+        sel.curr.physicalFlag = 0;
+        sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+        sel.curr.modFlag = 1;
       }
 
-      // Output the binary instruction
       switch (opcode) {
         case OP_ADD:
           if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
@@ -1795,13 +2095,16 @@ namespace gbe
           break;
         case OP_ADDSAT:
           if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
-            GenRegister tmp[6];
+            GenRegister tmp[5];
             for(int i=0; i<5; i++) {
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               tmp[i].type = GEN_TYPE_UD;
             }
-            tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
-            sel.I64SATADD(dst, src0, src1, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SATADD(dst, src0, src1, tmp);
+            sel.pop();
             break;
           }
           sel.push();
@@ -1836,13 +2139,16 @@ namespace gbe
           break;
         case OP_SUBSAT:
           if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
-            GenRegister tmp[6];
+            GenRegister tmp[5];
             for(int i=0; i<5; i++) {
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               tmp[i].type = GEN_TYPE_UD;
             }
-            tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
-            sel.I64SATSUB(dst, src0, src1, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SATSUB(dst, src0, src1, tmp);
+            sel.pop();
             break;
           }
           sel.push();
@@ -1852,31 +2158,40 @@ namespace gbe
           break;
         case OP_SHL:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[7];
+            GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
-            sel.I64SHL(dst, src0, src1, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SHL(dst, src0, src1, tmp);
+            sel.pop();
           } else
             sel.SHL(dst, src0, src1);
           break;
         case OP_SHR:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[7];
+            GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
-            sel.I64SHR(dst, src0, src1, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SHR(dst, src0, src1, tmp);
+            sel.pop();
           } else
             sel.SHR(dst, src0, src1);
           break;
         case OP_ASR:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[7];
+            GenRegister tmp[6];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
-            sel.I64ASR(dst, src0, src1, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64ASR(dst, src0, src1, tmp);
+            sel.pop();
           } else
             sel.ASR(dst, src0, src1);
           break;
@@ -1887,13 +2202,16 @@ namespace gbe
           }
         case OP_I64_MUL_HI:
          {
-          GenRegister temp[10];
+          GenRegister temp[9];
           for(int i=0; i<9; i++) {
             temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
             temp[i].type = GEN_TYPE_UD;
           }
-          temp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
-          sel.I64_MUL_HI(dst, src0, src1, temp);
+          sel.push();
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
+            sel.I64_MUL_HI(dst, src0, src1, temp);
+          sel.pop();
           break;
          }
         case OP_MUL:
@@ -1957,6 +2275,7 @@ namespace gbe
     /*! Register the pattern for all opcodes of the family */
     MulAddInstructionPattern(void) : SelectionPattern(2, 1) {
        this->opcodes.push_back(ir::OP_ADD);
+       this->opcodes.push_back(ir::OP_SUB);
     }
 
     /*! Implements base class */
@@ -1966,7 +2285,8 @@ namespace gbe
 
       // XXX TODO: we need a clean support of FP_CONTRACT to remove below line 'return false'
       // if 'pragma FP_CONTRACT OFF' is used in cl kernel, we should not do mad optimization.
-      return false;
+      if (!sel.ctx.relaxMath || sel.ctx.getSimdWidth() == 16)
+        return false;
       // MAD tend to increase liveness of the sources (since there are three of
       // them). TODO refine this strategy. Well, we should be able at least to
       // evaluate per basic block register pressure and selectively enable
@@ -1983,9 +2303,16 @@ namespace gbe
       const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
       if (child0 && child0->insn.getOpcode() == OP_MUL) {
         GBE_ASSERT(cast<ir::BinaryInstruction>(child0->insn).getType() == TYPE_FLOAT);
+        SelectionDAG *child00 = child0->child[0];
+        SelectionDAG *child01 = child0->child[1];
+        if ((child00 && child00->insn.getOpcode() == OP_LOADI) ||
+            (child01 && child01->insn.getOpcode() == OP_LOADI) ||
+            (child1 && child1->insn.getOpcode() == OP_LOADI))
+          return false;
         const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
         const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
-        const GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
+        GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
+        if(insn.getOpcode() == ir::OP_SUB) src2 = GenRegister::negate(src2);
         sel.MAD(dst, src2, src0, src1); // order different on HW!
         if (child0->child[0]) child0->child[0]->isRoot = 1;
         if (child0->child[1]) child0->child[1]->isRoot = 1;
@@ -1994,9 +2321,16 @@ namespace gbe
       }
       if (child1 && child1->insn.getOpcode() == OP_MUL) {
         GBE_ASSERT(cast<ir::BinaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
-        const GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
+        SelectionDAG *child10 = child1->child[0];
+        SelectionDAG *child11 = child1->child[1];
+        if ((child10 && child10->insn.getOpcode() == OP_LOADI) ||
+            (child11 && child11->insn.getOpcode() == OP_LOADI) ||
+            (child0 && child0->insn.getOpcode() == OP_LOADI))
+          return false;
+        GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
         const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
         const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
+        if(insn.getOpcode() == ir::OP_SUB) src0 = GenRegister::negate(src0);
         sel.MAD(dst, src2, src0, src1); // order different on HW!
         if (child1->child[0]) child1->child[0]->isRoot = 1;
         if (child1->child[1]) child1->child[1]->isRoot = 1;
@@ -2038,35 +2372,31 @@ namespace gbe
       // So both sources must match
       if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
       if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
-
       // OK, we merge the instructions
       const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
       const ir::Opcode opcode = cmpInsn.getOpcode();
       if(opcode == OP_ORD) return false;
-      const uint32_t genCmp = getGenCompare(opcode);
-
-      // Like for regular selects, we need a temporary since we cannot predicate
-      // properly
+      GenRegister src0, src1;
       const ir::Type type = cmpInsn.getType();
-      const RegisterFamily family = getFamily(type);
-      const GenRegister tmp = sel.selReg(sel.reg(family), type);
-      const uint32_t simdWidth = sel.curr.execWidth;
-      const GenRegister dst  = sel.selReg(insn.getDst(0), type);
-      const GenRegister src0 = sel.selReg(cmpInsn.getSrc(0), type);
-      const GenRegister src1 = sel.selReg(cmpInsn.getSrc(1), type);
+      bool inverse = false;
+      sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse);
 
+      const uint32_t genCmp = getGenCompare(opcode, inverse);
       sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+
+        // Like for regular selects, we need a temporary since we cannot predicate
+        // properly
+        const uint32_t simdWidth = sel.curr.execWidth;
+        const GenRegister dst  = sel.selReg(insn.getDst(0), type);
         sel.curr.predicate = GEN_PREDICATE_NONE;
         sel.curr.execWidth = simdWidth;
-        sel.SEL_CMP(genCmp, tmp, src0, src1);
+        sel.SEL_CMP(genCmp, dst, src0, src1);
       sel.pop();
-
-      // Update the destination register properly now
-      sel.MOV(dst, tmp);
-
-      // We need the sources of the compare instruction
-      markAllChildren(*cmp);
-
       return true;
     }
   };
@@ -2085,15 +2415,20 @@ namespace gbe
     {
       using namespace ir;
       const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
-      const uint32_t simdWidth = sel.curr.execWidth;
       const Type type = insn.getType();
       if (type == TYPE_U32 || type == TYPE_S32) {
+        sel.push();
+          if (sel.isScalarReg(insn.getDst(0)) == true) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+        const uint32_t simdWidth = sel.curr.execWidth;
+
         GenRegister dst  = sel.selReg(insn.getDst(0), type);
         GenRegister src0 = sel.selReg(insn.getSrc(0), type);
         GenRegister src1 = sel.selReg(insn.getSrc(1), type);
 
-        sel.push();
-
         // Either left part of the 16-wide register or just a simd 8 register
         dst  = GenRegister::retype(dst,  GEN_TYPE_D);
         src0 = GenRegister::retype(src0, GEN_TYPE_D);
@@ -2104,7 +2439,13 @@ namespace gbe
         sel.curr.accWrEnable = 1;
         sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1);
         sel.curr.accWrEnable = 0;
-        sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc());
+        if (simdWidth == 1) {
+          sel.curr.execWidth = 1;
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::vec1(GenRegister::acc()));
+        } else {
+          sel.curr.execWidth = 8;
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc());
+        }
 
         // Right part of the 16-wide register now
         if (simdWidth == 16) {
@@ -2131,7 +2472,6 @@ namespace gbe
         }
 
         sel.pop();
-
         // All children are marked as root
         markAllChildren(dag);
         return true;
@@ -2172,20 +2512,36 @@ namespace gbe
         if (src0DAG->insn.getOpcode() == OP_LOADI) {
           const auto &loadimm = cast<LoadImmInstruction>(src0DAG->insn);
           const Immediate imm = loadimm.getImmediate();
-          const Type type = imm.type;
+          const Type type = imm.getType();
           GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32);
-          if (type == TYPE_U32 && imm.data.u32 <= 0xffff) {
-            sel.MUL(sel.selReg(dst, type),
-                    sel.selReg(src1, type),
-                    GenRegister::immuw(imm.data.u32));
+          if (type == TYPE_U32 && imm.getIntegerValue() <= 0xffff) {
+            sel.push();
+              if (sel.isScalarReg(insn.getDst(0)) == true) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+
+              sel.MUL(sel.selReg(dst, type),
+                      sel.selReg(src1, type),
+                      GenRegister::immuw(imm.getIntegerValue()));
+            sel.pop();
             if (dag.child[childID ^ 1] != NULL)
               dag.child[childID ^ 1]->isRoot = 1;
             return true;
           }
-          if (type == TYPE_S32 && (imm.data.s32 >= -32768 && imm.data.s32 <= 32767)) {
-            sel.MUL(sel.selReg(dst, type),
-                    sel.selReg(src1, type),
-                    GenRegister::immw(imm.data.s32));
+          if (type == TYPE_S32 && (imm.getIntegerValue() >= -32768 && imm.getIntegerValue() <= 32767)) {
+            sel.push();
+              if (sel.isScalarReg(insn.getDst(0)) == true) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+
+              sel.MUL(sel.selReg(dst, type),
+                      sel.selReg(src1, type),
+                      GenRegister::immw(imm.getIntegerValue()));
+            sel.pop();
             if (dag.child[childID ^ 1] != NULL)
               dag.child[childID ^ 1]->isRoot = 1;
             return true;
@@ -2204,9 +2560,16 @@ namespace gbe
       const Register src0 = insn.getSrc(childID);
       const Register src1 = insn.getSrc(childID ^ 1);
       if (is16BitSpecialReg(src0)) {
-        sel.MUL(sel.selReg(dst, type),
-                sel.selReg(src1, type),
-                sel.selReg(src0, TYPE_U32));
+        sel.push();
+          if (sel.isScalarReg(insn.getDst(0)) == true) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.MUL(sel.selReg(dst, type),
+                  sel.selReg(src1, type),
+                  sel.selReg(src0, TYPE_U32));
+        sel.pop();
         markAllChildren(dag);
         return true;
       }
@@ -2235,7 +2598,7 @@ namespace gbe
 #define DECL_NOT_IMPLEMENTED_ONE_TO_MANY(FAMILY) \
   struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>\
   {\
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn) const {\
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn, bool &markChildren) const {\
       NOT_IMPLEMENTED;\
       return false;\
     }\
@@ -2246,51 +2609,42 @@ namespace gbe
   /*! Load immediate pattern */
   DECL_PATTERN(LoadImmInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const Type type = insn.getType();
       const Immediate imm = insn.getImmediate();
       const GenRegister dst = sel.selReg(insn.getDst(0), type);
-      GenRegister flagReg;
 
       sel.push();
-      if (sel.isScalarOrBool(insn.getDst(0)) == true) {
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
         sel.curr.execWidth = 1;
-        if(type == TYPE_BOOL) {
-          if(imm.data.b) {
-            if(sel.curr.predicate == GEN_PREDICATE_NONE)
-              flagReg = GenRegister::immuw(0xffff);
-            else {
-              if(sel.curr.physicalFlag)
-                flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
-              else
-                flagReg = sel.selReg(Register(sel.curr.flagIndex), TYPE_U16);
-            }
-          } else
-            flagReg = GenRegister::immuw(0x0);
-        }
         sel.curr.predicate = GEN_PREDICATE_NONE;
         sel.curr.noMask = 1;
       }
 
       switch (type) {
         case TYPE_BOOL:
-          sel.MOV(dst, flagReg);
+          if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) {
+            sel.curr.modFlag = 1;
+            sel.curr.physicalFlag = 0;
+            sel.curr.flagIndex = (uint16_t) insn.getDst(0);
+          }
+          sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0));
         break;
         case TYPE_U32:
         case TYPE_S32:
         case TYPE_FLOAT:
           sel.MOV(GenRegister::retype(dst, GEN_TYPE_F),
-                  GenRegister::immf(imm.data.f32));
+                  GenRegister::immf(imm.asFloatValue()));
         break;
-        case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.data.u16)); break;
-        case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
-        case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
-        case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
-        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
-        case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.s64)); break;
-        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.u64)); break;
+        case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+        case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+        case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+        case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+        case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -2303,30 +2657,14 @@ namespace gbe
   /*! Sync instruction */
   DECL_PATTERN(SyncInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const ir::Register reg = sel.reg(FAMILY_DWORD);
-      const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
-      const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
-      const GenRegister flagReg = GenRegister::flag(0, 0);
       const uint32_t params = insn.getParameters();
 
-      sel.push();
-        sel.curr.predicate = GEN_PREDICATE_NONE;
-        sel.curr.noMask = 1;
-        sel.curr.execWidth = 1;
-        sel.OR(barrierMask, flagReg, barrierMask);
-        sel.MOV(tempFlag, barrierMask);
-      sel.pop();
-
       // A barrier is OK to start the thread synchronization *and* SLM fence
-      sel.push();
-      //sel.curr.predicate = GEN_PREDICATE_NONE;
-      sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
-      sel.curr.physicalFlag = 0;
       sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
-      sel.pop();
       return true;
     }
 
@@ -2344,6 +2682,7 @@ namespace gbe
       case TYPE_U32:
       case TYPE_S32:
         return GEN_BYTE_SCATTER_DWORD;
+      case TYPE_BOOL:
       case TYPE_U16:
       case TYPE_S16:
         return GEN_BYTE_SCATTER_WORD;
@@ -2358,81 +2697,190 @@ namespace gbe
   /*! Load instruction pattern */
   DECL_PATTERN(LoadInstruction)
   {
+    void readDWord(Selection::Opaque &sel,
+                   vector<GenRegister> &dst,
+                   vector<GenRegister> &dst2,
+                   GenRegister addr,
+                   uint32_t valueNum,
+                   ir::AddressSpace space,
+                   ir::BTI bti) const
+    {
+      for (uint32_t x = 0; x < bti.count; x++) {
+        if(x > 0)
+          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+
+        GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
+        if(x > 0) {
+          sel.push();
+            if(sel.isScalarReg(dst[0].reg())) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            for (uint32_t y = 0; y < valueNum; y++)
+              sel.ADD(dst[y], dst[y], dst2[y]);
+          sel.pop();
+        }
+      }
+    }
+
     void emitUntypedRead(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
+      vector<GenRegister> dst2(valueNum);
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+      readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
     }
 
     void emitDWordGather(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
-      const uint32_t valueNum = insn.getValueNum();
-      const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      GBE_ASSERT(valueNum == 1);
-      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
-      // get dword based address
-      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-      sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+      GBE_ASSERT(bti.count == 1);
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
+      GBE_ASSERT(insn.getValueNum() == 1);
+
+      if(simdWidth == 1) {
+        GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
+        sel.push();
+          sel.curr.noMask = 1;
+          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+        sel.pop();
+        return;
+      }
+
+      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+      // get dword based address
+      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
 
-      sel.DWORD_GATHER(dst, addrDW, bti);
+      sel.push();
+        if (sel.isScalarReg(addr.reg())) {
+          sel.curr.noMask = 1;
+        }
+        sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+      sel.pop();
+
+      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
     }
 
     void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
-                         uint32_t bti) const
+                         ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      uint32_t dstID;
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
+      GBE_ASSERT(bti.count == 1);
+      GenRegister dst[valueNum];
+      GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
+      for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
+      sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
+    }
 
-      // The first 16 DWORD register space is for temporary usage at encode stage.
-      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
-      GenRegister dst[valueNum + tmpRegNum];
-      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
-        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
-      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
-        dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
+    void readByteAsDWord(Selection::Opaque &sel,
+                        const uint32_t elemSize,
+                        GenRegister address,
+                        GenRegister dst,
+                        uint32_t simdWidth,
+                        uint8_t bti) const
+    {
+      using namespace ir;
+        Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
+        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+        GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+        // Get dword aligned addr
+        sel.push();
+          if (simdWidth == 1) {
+            sel.curr.execWidth = 1;
+            sel.curr.noMask = 1;
+          }
+          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+        sel.pop();
+        sel.push();
+          if (simdWidth == 1)
+            sel.curr.noMask = 1;
+          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+
+          if (simdWidth == 1)
+            sel.curr.execWidth = 1;
+          // Get the remaining offset from aligned addr
+          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+          sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
+          sel.SHR(tmpData, tmpData, tmpAddr);
+
+          if (elemSize == GEN_BYTE_SCATTER_WORD)
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+          else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+        sel.pop();
     }
 
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
                         GenRegister address,
-                        GenRegister value,
-                        uint32_t bti) const
+                        ir::BTI bti) const
     {
       using namespace ir;
-      GBE_ASSERT(insn.getValueNum() == 1);
-      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+                                 1 : sel.ctx.getSimdWidth();
+      RegisterFamily family = getFamily(insn.getValueType());
+
+      if(valueNum > 1) {
+        vector<GenRegister> dst(valueNum);
+        const uint32_t typeSize = getFamilySize(family);
+
+        for(uint32_t i = 0; i < valueNum; i++)
+          dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        vector<GenRegister> tmp2(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        }
 
-      // We need a temporary register if we read bytes or words
-      Register dst = Register(value.value.reg);
-      if (elemSize == GEN_BYTE_SCATTER_WORD ||
-          elemSize == GEN_BYTE_SCATTER_BYTE) {
-        dst = sel.reg(FAMILY_DWORD);
-        sel.BYTE_GATHER(GenRegister::fxgrf(simdWidth, dst), address, elemSize, bti);
-      }
+        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
 
-      // Repack bytes or words using a converting mov instruction
-      if (elemSize == GEN_BYTE_SCATTER_WORD)
-        sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
-      else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-        sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+        }
+      } else {
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
+        GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+        GenRegister tmp = value;
+
+        for (int x = 0; x < bti.count; x++) {
+          if (x > 0)
+            tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
+
+          GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+          readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
+          if (x > 0) {
+            sel.push();
+              if (simdWidth == 1) {
+                sel.curr.noMask = 1;
+                sel.curr.execWidth = 1;
+              }
+              sel.ADD(value, value, tmp);
+            sel.pop();
+          }
+        }
+      }
     }
 
     void emitIndirectMove(Selection::Opaque &sel,
@@ -2447,37 +2895,60 @@ namespace gbe
       sel.INDIRECT_MOVE(dst, src);
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
+    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
+      if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+        return address;
+
+      sel.push();
+        sel.curr.noMask = 1;
+        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+      sel.pop();
+      return temp;
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
       using namespace ir;
-      const GenRegister address = sel.selReg(insn.getAddress());
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
       const AddressSpace space = insn.getAddressSpace();
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL);
-      GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
+      //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.getAddressSpace() == MEM_CONSTANT) {
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
+      BTI bti;
+      if (space == MEM_CONSTANT || space == MEM_LOCAL) {
+        bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+        bti.count = 1;
+      } else {
+        bti = insn.getBTI();
+      }
+      if (space == MEM_CONSTANT) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
         if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-          this->emitRead64(sel, insn, address, 0x2);
+          this->emitRead64(sel, insn, address, bti);
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-          this->emitDWordGather(sel, insn, address, 0x2);
+          this->emitDWordGather(sel, insn, address, bti);
         else {
-          const GenRegister value = sel.selReg(insn.getValue(0));
-          this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+          this->emitByteGather(sel, insn, elemSize, address, bti);
+        }
+      } else {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, bti);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedRead(sel, insn, address, bti);
+        else {
+          this->emitByteGather(sel, insn, elemSize, address, bti);
         }
-      }
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
-      else {
-        const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteGather(sel, insn, elemSize, address, value, space == MEM_LOCAL ? 0xfe : 0x01);
       }
       return true;
     }
@@ -2489,83 +2960,114 @@ namespace gbe
   {
     void emitUntypedWrite(Selection::Opaque &sel,
                           const ir::StoreInstruction &insn,
+                          GenRegister addr,
                           uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t addrID = ir::StoreInstruction::addressIndex;
-      GenRegister addr;
       vector<GenRegister> value(valueNum);
 
-      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);;
+      addr = GenRegister::retype(addr, GEN_TYPE_F);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
     void emitWrite64(Selection::Opaque &sel,
-                          const ir::StoreInstruction &insn,
-                          uint32_t bti) const
+                     const ir::StoreInstruction &insn,
+                     GenRegister addr,
+                     uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t addrID = ir::StoreInstruction::addressIndex;
-      GenRegister addr;
-      uint32_t srcID;
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
-      // The first 16 DWORD register space is for temporary usage at encode stage.
-      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      addr = GenRegister::retype(addr, GEN_TYPE_UD);
       GenRegister src[valueNum];
-      GenRegister dst[tmpRegNum + 1];
-      /* dst 0 is for the temporary address register. */
-      dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
-      for (srcID = 0; srcID < tmpRegNum; ++srcID)
-        dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
+      sel.WRITE64(addr, src, valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
                          GenRegister addr,
-                         GenRegister value,
                          uint32_t bti) const
     {
       using namespace ir;
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister dst = value;
+      uint32_t valueNum = insn.getValueNum();
+
+      if(valueNum > 1) {
+        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
+        vector<GenRegister> value(valueNum);
+
+        if(elemSize == GEN_BYTE_SCATTER_WORD) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
+        } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
+        }
 
-      GBE_ASSERT(insn.getValueNum() == 1);
-      if (elemSize == GEN_BYTE_SCATTER_WORD) {
-        value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UW));
-      } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
-        value = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        sel.MOV(value, GenRegister::retype(dst, GEN_TYPE_UB));
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+        }
+
+        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+      } else {
+        const GenRegister value = sel.selReg(insn.getValue(0));
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        if (elemSize == GEN_BYTE_SCATTER_WORD) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+        } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
+        }
+        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
       }
-      sel.BYTE_SCATTER(addr, value, elemSize, bti);
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const AddressSpace space = insn.getAddressSpace();
-      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitWrite64(sel, insn, bti);
-      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedWrite(sel, insn, bti);
-      else {
-        const GenRegister address = sel.selReg(insn.getAddress());
-        const GenRegister value = sel.selReg(insn.getValue(0));
-        this->emitByteScatter(sel, insn, elemSize, address, value, bti);
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
+      if(space == MEM_LOCAL) {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitWrite64(sel, insn, address, 0xfe);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedWrite(sel, insn, address,  0xfe);
+        else
+          this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
+      } else {
+        BTI bti = insn.getBTI();
+        for (int x = 0; x < bti.count; x++) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+          sel.push();
+            sel.curr.noMask = 1;
+            sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+            this->emitWrite64(sel, insn, temp, bti.bti[x]);
+          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+            this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
+          else {
+            this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
+          }
+        }
       }
       return true;
     }
@@ -2589,76 +3091,63 @@ namespace gbe
       const Opcode opcode = insn.getOpcode();
       const Type type = insn.getType();
       const Register dst = insn.getDst(0);
-      Register tmpDst;
-
-      const ir::BasicBlock *insnBlock = insn.getParent();
+      GenRegister tmpDst;
+      const BasicBlock *curr = insn.getParent();
       const ir::Liveness &liveness = sel.ctx.getLiveness();
-      const ir::Liveness::UEVar &livein = liveness.getLiveIn(insnBlock);
-      if (!livein.contains(dst))
-        tmpDst = dst;
+      const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr);
+      bool needStoreBool = false;
+      if (liveOut.contains(dst) || dag.computeBool)
+        needStoreBool = true;
+
+      if(type == TYPE_S64 || type == TYPE_U64 ||
+         type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+         type == TYPE_U32 ||  type == TYPE_S32 /*||
+         (!needStoreBool)*/)
+        tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
       else
-        tmpDst = sel.reg(FAMILY_BOOL);
-
-      // Limit the compare to the active lanes. Use the same compare as for f0.0
-      sel.push();
-        const LabelIndex label = insn.getParent()->getLabelIndex();
-        const GenRegister blockip = sel.selReg(ocl::blockip, TYPE_U16);
-        const GenRegister labelReg = GenRegister::immuw(label);
-
-        sel.curr.predicate = GEN_PREDICATE_NONE;
-        sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = uint16_t(tmpDst);
-        if (tmpDst != dst) {
-          sel.CMP(GEN_CONDITIONAL_G, blockip, labelReg);
-          sel.curr.execWidth = 1;
-          sel.AND(sel.selReg(dst, TYPE_BOOL), sel.selReg(dst, TYPE_BOOL), sel.selReg(tmpDst, TYPE_BOOL));
-          sel.XOR(sel.selReg(tmpDst, TYPE_BOOL), sel.selReg(tmpDst, TYPE_BOOL), GenRegister::immuw(0xFFFF));
-        } else
-          sel.CMP(GEN_CONDITIONAL_LE, blockip, labelReg);
-      sel.pop();
+        tmpDst = sel.selReg(dst, TYPE_BOOL);
 
       // Look for immediate values for the right source
       GenRegister src0, src1;
-      SelectionDAG *dag0 = dag.child[0];
-      SelectionDAG *dag1 = dag.child[1];
-
-      // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
-          canGetRegisterFromImmediate(dag1->insn) && opcode != OP_ORD) {
-        const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
-        src0 = sel.selReg(insn.getSrc(0), type);
-        Immediate imm = childInsn.getImmediate();
-        if(imm.type != type)
-          imm.type = type;
-        src1 = getRegisterFromImmediate(imm);
-        if (dag0) dag0->isRoot = 1;
-      } else {
-        src0 = sel.selReg(insn.getSrc(0), type);
-        src1 = sel.selReg(insn.getSrc(1), type);
-        markAllChildren(dag);
-      }
-
+      bool inverseCmp = false;
+      sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
       sel.push();
+        if (sel.isScalarReg(dst))
+          sel.curr.noMask = 1;
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = uint16_t(tmpDst);
+        sel.curr.modFlag = 1;
+        sel.curr.flagIndex = (uint16_t)dst;
+        sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
         if (type == TYPE_S64 || type == TYPE_U64) {
           GenRegister tmp[3];
           for(int i=0; i<3; i++)
             tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-          sel.I64CMP(getGenCompare(opcode), src0, src1, tmp);
+          sel.curr.flagGen = 1;
+          sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
         } else if(opcode == OP_ORD) {
-          sel.CMP(GEN_CONDITIONAL_EQ, src0, src0);
-          sel.CMP(GEN_CONDITIONAL_EQ, src1, src1);
-        } else
-          sel.CMP(getGenCompare(opcode), src0, src1);
+          sel.push();
+            sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+            sel.curr.predicate = GEN_PREDICATE_NORMAL;
+            sel.curr.flagGen = 1;
+            sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
+          sel.pop();
+        } else {
+          if((type == TYPE_S64 || type == TYPE_U64 ||
+              type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+              type == TYPE_U32 ||  type == TYPE_S32))
+            sel.curr.flagGen = 1;
+          else if (sel.isScalarReg(dst)) {
+            // If the dest reg is a scalar bool, we can't set it as
+            // dst register, as the execution width is still 8 or 16.
+            // Instead, we set the needStoreBool to flagGen, and change
+            // the dst to null register. And let the flag reg allocation
+            // function to generate the flag grf on demand correctly later.
+            sel.curr.flagGen = needStoreBool;
+            tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+          }
+          sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
+        }
       sel.pop();
-      if (tmpDst != dst) {
-        sel.push();
-          sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.curr.execWidth = 1;
-          sel.OR(sel.selReg(dst, TYPE_U16), sel.selReg(dst, TYPE_U16), sel.selReg(tmpDst, TYPE_U16));
-        sel.pop();
-      }
       return true;
     }
   };
@@ -2666,7 +3155,7 @@ namespace gbe
   /*! Bit cast instruction pattern */
   DECL_PATTERN(BitCastInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const Type dstType = insn.getDstType();
@@ -2689,6 +3178,19 @@ namespace gbe
         narrowDst = 0;
       }
 
+      sel.push();
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
+      // As we store long/ulong low/high part separately,
+      // we need to deal with it separately, we need to change it back again
+      // when hardware supports a native long type.
+      const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
+      const int simdWidth = sel.curr.execWidth;
+
       for(int i = 0; i < narrowNum; i++, index++) {
         GenRegister narrowReg, wideReg;
         if(narrowDst) {
@@ -2698,27 +3200,53 @@ namespace gbe
           wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
           narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
         }
-        if(wideReg.hstride != GEN_VERTICAL_STRIDE_0) {
+
+        // set correct horizontal stride
+        if(wideReg.hstride != GEN_HORIZONTAL_STRIDE_0) {
           if(multiple == 2) {
-            wideReg = GenRegister::unpacked_uw(wideReg.reg());
+            wideReg = sel.unpacked_uw(wideReg.reg());
             wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+            if(isInt64) {
+              wideReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+              wideReg.vstride = GEN_VERTICAL_STRIDE_8;
+            }
           } else if(multiple == 4) {
-            wideReg = GenRegister::unpacked_ub(wideReg.reg());
+            wideReg = sel.unpacked_ub(wideReg.reg());
             wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
-          } else if(multiple == 8) {  //need to specail handle long to char
-            GBE_ASSERT(multiple == 8);
+            if(isInt64) {
+              wideReg.hstride = GEN_HORIZONTAL_STRIDE_2;
+              wideReg.vstride = GEN_VERTICAL_STRIDE_16;
+            }
+          } else if(multiple == 8) {
+            // we currently store high/low 32bit separately in register,
+            // so, its hstride is 4 here.
+            wideReg = sel.unpacked_ub(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+          } else {
+            GBE_ASSERT(0);
           }
         }
-        if(index % multiple) {
+
+        if(!isInt64 && index % multiple) {
           wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
           wideReg.subphysical = 1;
         }
+        if(isInt64) {
+          wideReg.subphysical = 1;
+          // Offset to next half
+          if((i % multiple) >= multiple/2)
+            wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+          // Offset to desired narrow element in wideReg
+          if(index % (multiple/2))
+            wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+        }
+
         GenRegister xdst = narrowDst ? narrowReg : wideReg;
         GenRegister xsrc = narrowDst ? wideReg : narrowReg;
 
-        if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
-           (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
-          const int simdWidth = sel.curr.execWidth;
+        if(isInt64) {
+          sel.MOV(xdst, xsrc);
+        } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
           sel.push();
             sel.curr.execWidth = 8;
             xdst.subphysical = 1;
@@ -2733,6 +3261,7 @@ namespace gbe
         } else
           sel.MOV(xdst, xsrc);
       }
+      sel.pop();
 
       return true;
     }
@@ -2742,7 +3271,48 @@ namespace gbe
   /*! Convert instruction pattern */
   DECL_PATTERN(ConvertInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn) const
+
+    INLINE bool lowerI64Reg(Selection::Opaque &sel, SelectionDAG *dag, GenRegister &src, uint32_t type) const {
+      using namespace ir;
+      GBE_ASSERT(type == GEN_TYPE_UD || type == GEN_TYPE_F);
+      if (dag->insn.getOpcode() == OP_LOADI) {
+        const auto &immInsn = cast<LoadImmInstruction>(dag->insn);
+        const auto imm = immInsn.getImmediate();
+        const Type immType = immInsn.getType();
+        if (immType == TYPE_S64 &&
+          imm.getIntegerValue() <= INT_MAX &&
+          imm.getIntegerValue() >= INT_MIN) {
+          src = GenRegister::immd((int32_t)imm.getIntegerValue());
+          return true;
+        } else if (immType == TYPE_U64 &&
+                   imm.getIntegerValue() <= UINT_MAX) {
+          src = GenRegister::immud((uint32_t)imm.getIntegerValue());
+          return true;
+        }
+      } else if (dag->insn.getOpcode() == OP_CVT) {
+        const auto cvtInsn = cast<ConvertInstruction>(dag->insn);
+        auto srcType = cvtInsn.getSrcType();
+        if (((srcType == TYPE_U32 || srcType == TYPE_S32) &&
+            (type == GEN_TYPE_UD || type == GEN_TYPE_D)) ||
+             ((srcType == TYPE_FLOAT) && type == GEN_TYPE_F)) {
+          src = GenRegister::retype(sel.selReg(cvtInsn.getSrc(0), srcType), type);
+          dag->isRoot = 1;
+          return true;
+        } else if (srcType == TYPE_FLOAT ||
+                   srcType == TYPE_U16 ||
+                   srcType == TYPE_S16 ||
+                   srcType == TYPE_U32 ||
+                   srcType == TYPE_S32) {
+          src = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32), type);
+          dag->isRoot = 1;
+          sel.MOV(src, sel.selReg(cvtInsn.getSrc(0), srcType));
+          return true;
+        }
+      }
+      return false;
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const Type dstType = insn.getDstType();
@@ -2752,60 +3322,126 @@ namespace gbe
       const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
       const Opcode opcode = insn.getOpcode();
-
-      if(opcode == ir::OP_SAT_CVT) {
-        sel.push();
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+      if(opcode == ir::OP_SAT_CVT)
         sel.curr.saturate = 1;
-      }
 
       // We need two instructions to make the conversion
       if (opcode == OP_F16TO32) {
         sel.F16TO32(dst, src);
       } else if (opcode == OP_F32TO16) {
         GenRegister unpacked;
-        unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
-        sel.F32TO16(unpacked, src);
+        unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+        sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.F32TO16(unpacked, src);
+        sel.pop();
         sel.MOV(dst, unpacked);
       } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
-          unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
-          unpacked = GenRegister::retype(unpacked, type);
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, type);
+          } else
+            unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
         } else {
           const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
-          unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
-          unpacked = GenRegister::retype(unpacked, type);
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, type);
+          } else
+            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
         }
         if(srcFamily == FAMILY_QWORD) {
           GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
           tmp.type = GEN_TYPE_D;
           sel.CONVI64_TO_I(tmp, src);
           sel.MOV(unpacked, tmp);
-        } else
-          sel.MOV(unpacked, src);
-        sel.MOV(dst, unpacked);
-      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
+        } else {
+          sel.push();
+            if (sel.isScalarReg(insn.getSrc(0))) {
+              sel.curr.execWidth = 1;
+              sel.curr.predicate = GEN_PREDICATE_NONE;
+              sel.curr.noMask = 1;
+            }
+            sel.MOV(unpacked, src);
+          sel.pop();
+        }
+        if (unpacked.reg() != dst.reg())
+          sel.MOV(dst, unpacked);
+      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
+                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64))
         sel.CONVI64_TO_I(dst, src);
-      } else if (dstType == ir::TYPE_FLOAT && srcFamily == FAMILY_QWORD) {
-        GenRegister tmp[7];
+      else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+        auto dag = sel.regDAG[src.reg()];
+        // FIXME, in the future, we need to do a common I64 lower to I32 analysis
+        // at llvm IR layer which could cover more cases than just this one.
+        SelectionDAG *dag0, *dag1;
+        if (dag && dag->child[0] && dag->child[1]) {
+          if (dag->child[0]->insn.getOpcode() == OP_LOADI) {
+            dag0 = dag->child[1];
+            dag1 = dag->child[0];
+          } else {
+            dag0 = dag->child[0];
+            dag1 = dag->child[1];
+          }
+          GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI &&
+                       dag->child[1]->insn.getOpcode() == OP_LOADI));
+          if (dag->insn.getOpcode() == OP_AND ||
+              dag->insn.getOpcode() == OP_OR  ||
+              dag->insn.getOpcode() == OP_XOR) {
+            GenRegister src0;
+            GenRegister src1;
+            if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) &&
+                lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) {
+              switch (dag->insn.getOpcode()) {
+                default:
+                case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+                case OP_OR:  sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+                case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+              }
+              sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD));
+              markChildren = false;
+              return true;
+            }
+          }
+        }
+        GenRegister tmp[6];
         for(int i=0; i<6; i++) {
           tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         }
-        tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
-        sel.CONVI64_TO_F(dst, src, tmp);
-      } else if (dst.isdf()) {
+        sel.push();
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
+          sel.CONVI64_TO_F(dst, src, tmp);
+        sel.pop();
+      } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+                 (src.isdf() && dstType == ir::TYPE_FLOAT)) {
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r));
       } else if (dst.isint64()) {
         switch(src.type) {
           case GEN_TYPE_F:
           {
-            GenRegister tmp[3];
+            GenRegister tmp[2];
             tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
             tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
-            tmp[2] = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
-            sel.CONVF_TO_I64(dst, src, tmp);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.CONVF_TO_I64(dst, src, tmp);
+            sel.pop();
             break;
           }
           case GEN_TYPE_DF:
@@ -2816,8 +3452,7 @@ namespace gbe
       } else
         sel.MOV(dst, src);
 
-      if(opcode == ir::OP_SAT_CVT)
-        sel.pop();
+      sel.pop();
 
       return true;
     }
@@ -2827,20 +3462,37 @@ namespace gbe
   /*! Convert instruction pattern */
   DECL_PATTERN(AtomicInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const AtomicOps atomicOp = insn.getAtomicOpcode();
       const AddressSpace space = insn.getAddressSpace();
-      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const uint32_t srcNum = insn.getSrcNum();
-      const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
+
+      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
       GenRegister src1 = src0, src2 = src0;
       if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
       if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
-      sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
+      if(space == MEM_LOCAL) {
+        if (sel.needPatchSLMAddr()) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+          src0 = temp;
+        }
+        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+      } else {
+        ir::BTI b = insn.getBTI();
+        for (int x = 0; x < b.count; x++) {
+          sel.push();
+            sel.curr.noMask = 1;
+            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
+        }
+      }
       return true;
     }
     DECL_CTOR(AtomicInstruction, 1, 1);
@@ -2871,47 +3523,35 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
       SelectionDAG *dag2 = dag.child[2];
 
-      // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
-        const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
-        src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
-        src1 = getRegisterFromImmediate(childInsn.getImmediate());
-        if (dag0) dag0->isRoot = 1;
-        if (dag1) dag1->isRoot = 1;
-      } else {
-        src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
-        src1 = sel.selReg(insn.getSrc(SelectInstruction::src1Index), type);
-        markAllChildren(dag);
-      }
-
-      // Since we cannot predicate the select instruction with our current mask,
-      // we need to perform the selection in two steps (one to select, one to
-      // update the destination register)
-      const RegisterFamily family = getFamily(type);
-      const GenRegister tmp = sel.selReg(sel.reg(family), type);
-      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      if (dag0) dag0->isRoot = 1;
+      bool inverse = false;
+      sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
       const Register pred = insn.getPredicate();
       sel.push();
-        sel.curr.predicate = GEN_PREDICATE_NORMAL;
-        sel.curr.execWidth = simdWidth;
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        sel.curr.inversePredicate ^= inverse;
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = uint16_t(pred);
-        sel.curr.noMask = 0;
+        sel.curr.flagIndex = (uint16_t) pred;
+        sel.curr.predicate = GEN_PREDICATE_NORMAL;
+        if (!dag0)
+          sel.curr.externFlag = 1;
         if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
-          sel.SEL_INT64(tmp, src0, src1);
+          sel.SEL_INT64(dst, src0, src1);
         else
-          sel.SEL(tmp, src0, src1);
+          sel.SEL(dst, src0, src1);
       sel.pop();
 
-      // Update the destination register properly now
-      sel.MOV(dst, tmp);
       return true;
     }
   };
 
   DECL_PATTERN(TernaryInstruction)
    {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn) const {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn, bool &markChildren) const {
       using namespace ir;
       const Type type = insn.getType();
       const GenRegister dst = sel.selReg(insn.getDst(0), type),
@@ -2921,13 +3561,16 @@ namespace gbe
       switch(insn.getOpcode()) {
         case OP_I64MADSAT:
          {
-          GenRegister tmp[10];
+          GenRegister tmp[9];
           for(int i=0; i<9; i++) {
             tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
             tmp[i].type = GEN_TYPE_UD;
           }
-          tmp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
-          sel.I64MADSAT(dst, src0, src1, src2, tmp);
+          sel.push();
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
+            sel.I64MADSAT(dst, src0, src1, src2, tmp);
+          sel.pop();
           break;
          }
         case OP_MAD:
@@ -2944,54 +3587,94 @@ namespace gbe
     DECL_CTOR(TernaryInstruction, 1, 1);
    };
 
+
   /*! Label instruction pattern */
   DECL_PATTERN(LabelInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const LabelIndex label = insn.getLabelIndex();
       const GenRegister src0 = sel.selReg(ocl::blockip);
       const GenRegister src1 = GenRegister::immuw(label);
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GBE_ASSERTM(label < GEN_MAX_LABEL, "We reached the maximum label number which is reserved for barrier handling");
       sel.LABEL(label);
 
-     // Do not emit any code for the "returning" block. There is no need for it
-     if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+      // Do not emit any code for the "returning" block. There is no need for it
+      if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
         return true;
 
+      LabelIndex jip;
+      const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+      if (sel.ctx.hasJIP(&insn))
+        jip = sel.ctx.getLabelIndex(&insn);
+      else
+        jip = nextLabel;
+
       // Emit the mask computation at the head of each basic block
       sel.push();
+        sel.curr.noMask = 1;
         sel.curr.predicate = GEN_PREDICATE_NONE;
-        sel.curr.flag = 0;
-        sel.curr.subFlag = 0;
-        sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1);
+        sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1,
+                GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
       sel.pop();
 
-      // If it is required, insert a JUMP to bypass the block
-      if (sel.ctx.hasJIP(&insn)) {
-        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+      if (sel.block->hasBarrier) {
+        // If this block has barrier, we don't execute the block until all lanes
+        // are 1s. Set each reached lane to 1, then check all lanes. If there is any
+        // lane not reached, we jump to jip. And no need to issue if/endif for
+        // this block, as it will always excute with all lanes activated.
         sel.push();
-
-          sel.curr.noMask = 1;
-          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
-          GenRegister flagReg = GenRegister::flag(0, 0);
-          sel.AND(flagReg, flagReg, emaskReg);
-
+          sel.curr.noMask = 1;
+          sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL),
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
           if (simdWidth == 8)
-            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
           else if (simdWidth == 16)
-            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
           else
             NOT_IMPLEMENTED;
+          sel.curr.noMask = 1;
+          sel.curr.execWidth = 1;
           sel.curr.inversePredicate = 1;
-          sel.curr.flag = 0;
-          sel.curr.subFlag = 0;
-          sel.JMPI(GenRegister::immd(0), jip);
+          sel.JMPI(GenRegister::immd(0), jip, label);
+        sel.pop();
+        // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
+        sel.push();
+         sel.curr.predicate = GEN_PREDICATE_NORMAL;
+         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+        sel.pop();
+      }
+      else {
+        if (sel.ctx.hasJIP(&insn) &&
+            // If jump to next label and the endif offset is -1, then
+            // We don't need to add a jmpi here, as the following IF will do the same
+            // thing if all channels are disabled.
+            (jip != nextLabel || sel.block->endifOffset != -1)) {
+          // If it is required, insert a JUMP to bypass the block
+          sel.push();
+            if (simdWidth == 8)
+              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+            else if (simdWidth == 16)
+              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+            else
+              NOT_IMPLEMENTED;
+            sel.curr.noMask = 1;
+            sel.curr.execWidth = 1;
+            sel.curr.inversePredicate = 1;
+            sel.JMPI(GenRegister::immd(0), jip, label);
+          sel.pop();
+        }
+        sel.push();
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel);
         sel.pop();
       }
+
       return true;
     }
     DECL_CTOR(LabelInstruction, 1, 1);
@@ -2999,27 +3682,52 @@ namespace gbe
 
   DECL_PATTERN(SampleInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       GenRegister msgPayloads[4];
-      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
+      GenRegister dst[insn.getDstNum()];
       uint32_t srcNum = insn.getSrcNum();
+      uint32_t valueID = 0;
+      uint32_t msgLen = 0;
 
-      for( int i = 0; i < 4; ++i)
-        msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-
-      for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
+      for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < srcNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
-
-      uint32_t bti = insn.getImageIndex();
-      /* We have the clamp border workaround. */
-      uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
+      GBE_ASSERT(srcNum == 3);
+      if (insn.getSrc(1) == ir::ocl::invalid) //not 3D
+        srcNum = 1;
+      else if (insn.getSrc(2) == ir::ocl::invalid)
+        srcNum = 2;
+
+      if (insn.getSamplerOffset() != 0) {
+        // U, lod, [V], [W]
+        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        if (srcNum > 1)
+          msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+        if (srcNum > 2)
+          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+        // Clear the lod to zero.
+        sel.MOV(msgPayloads[1], GenRegister::immud(0));
+        msgLen = srcNum + 1;
+      } else {
+        // U, V, [W]
+        GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
+        for (valueID = 0; valueID < srcNum; ++valueID)
+          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+        msgLen = srcNum;
+      }
+      // We switch to a fixup bti for linear filter on a image1d array sampling.
+      uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_MAX_IMAGE_NUM : 0);
+      if (bti > 253) {
+        std::cerr << "Too large bti " << bti;
+        return false;
+      }
+      uint32_t sampler = insn.getSamplerIndex();
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum, msgPayloads, 4, bti, sampler, insn.is3D());
+      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false);
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -3028,28 +3736,86 @@ namespace gbe
   /*! Typed write instruction pattern. */
   DECL_PATTERN(TypedWriteInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      uint32_t valueID;
       GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
-      GenRegister src[insn.getSrcNum()];
-      uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-      uint32_t coordNum = 3;
-
-      for(uint32_t i = 0; i < msgNum; i++)
-        msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
+      const uint32_t coordNum = 3;
 
-      // u, v, w coords should use coord type.
-      for (valueID = 0; valueID < coordNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
+      if (simdWidth == 16) {
+        for(uint32_t i = 0; i < msgNum; i++)
+          msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      } else {
+        uint32_t valueID = 0;
+        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+
+        // fake u.
+        if (insn.getSrc(1) == ir::ocl::invalid)
+          msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // fake w.
+        if (insn.getSrc(2) == ir::ocl::invalid)
+          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // LOD.
+        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      }
 
-      for (; valueID < insn.getSrcNum(); ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      sel.push();
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.noMask = 1;
+      sel.MOV(msgs[0], GenRegister::immud(0));
+      sel.curr.execWidth = 1;
+
+      GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
+      channelEn.subphysical = 1;
+      // Enable all channels.
+      sel.MOV(channelEn, GenRegister::immud(0xffff));
+      sel.curr.execWidth = 8;
+      // Set zero LOD.
+      if (simdWidth == 8)
+        sel.MOV(msgs[4], GenRegister::immud(0));
+      else
+        sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+      sel.pop();
 
       uint32_t bti = insn.getImageIndex();
-      sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum, bti, insn.is3D());
+      if (simdWidth == 8)
+        sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+      else {
+        sel.push();
+        sel.curr.execWidth = 8;
+        for( uint32_t quarter = 0; quarter < 2; quarter++)
+        {
+          #define QUARTER_MOV0(msgs, msgid, src) \
+                    sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
+                            GenRegister::Qn(src, quarter))
+
+          #define QUARTER_MOV1(msgs, msgid, src) \
+                  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
+                          GenRegister::Qn(src, quarter))
+          sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+          // Set U,V,W
+          QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
+          if (insn.getSrc(1) != ir::ocl::invalid) //not 2D
+            QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
+          if (insn.getSrc(2) != ir::ocl::invalid) //not 3D
+            QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
+          // Set R, G, B, A
+          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
+          sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+          #undef QUARTER_MOV0
+          #undef QUARTER_MOV1
+        }
+        sel.pop();
+      }
       return true;
     }
     DECL_CTOR(TypedWriteInstruction, 1, 1);
@@ -3058,7 +3824,7 @@ namespace gbe
   /*! get image info instruction pattern. */
   DECL_PATTERN(GetImageInfoInstruction)
   {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn, bool &markChildren) const
     {
       using namespace ir;
       GenRegister dst;
@@ -3071,55 +3837,14 @@ namespace gbe
     DECL_CTOR(GetImageInfoInstruction, 1, 1);
   };
 
-  /*! get sampler info instruction pattern. */
-  DECL_PATTERN(GetSamplerInfoInstruction)
-  {
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
-    {
-      using namespace ir;
-      GenRegister dst, src;
-      dst = sel.selReg(insn.getDst(0), TYPE_U16);
-      src = GenRegister::offset(GenRegister::uw1grf(insn.getSrc(0)), 0, insn.getSamplerIndex() * 2);
-      src.subphysical = 1;
-      sel.MOV(dst, src);
-      return true;
-    }
-    DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
-  };
-
   /*! Branch instruction pattern */
-  DECL_PATTERN(BranchInstruction)
+  class BranchInstructionPattern : public SelectionPattern
   {
-
-    // Get active pred.
-    const ir::Register getActivePred(Selection::Opaque &sel,
-                       const ir::BranchInstruction &insn,
-                       const ir::Register pred) const
-    {
-        using namespace ir;
-        GenRegister flagReg;
-        Register activePred;
-        const ir::BasicBlock *insnBlock = insn.getParent();
-        const ir::Liveness &liveness = sel.ctx.getLiveness();
-        const ir::Liveness::UEVar &livein = liveness.getLiveIn(insnBlock);
-       
-        /* If the pred is not in the livein set, then this pred should be defined
-           in this block and we don't need to validate it. */ 
-        if (!livein.contains(pred))
-          return pred;
-
-        activePred = sel.reg(FAMILY_BOOL);
-        sel.push();
-          sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
-          if(sel.curr.physicalFlag)
-             flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
-          else
-             flagReg = sel.selReg(ir::Register(sel.curr.flagIndex), ir::TYPE_U16);
-          sel.AND(sel.selReg(activePred, TYPE_U16), flagReg, sel.selReg(pred, TYPE_U16));
-        sel.pop();
-        return activePred;
+  public:
+    BranchInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::BranchInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
     }
 
     void emitForwardBranch(Selection::Opaque &sel,
@@ -3129,70 +3854,40 @@ namespace gbe
     {
       using namespace ir;
       const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
-      const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
-      const uint32_t simdWidth = sel.ctx.getSimdWidth();
 
       // We will not emit any jump if we must go the next block anyway
       const BasicBlock *curr = insn.getParent();
       const BasicBlock *next = curr->getNextBlock();
       const LabelIndex nextLabel = next->getLabelIndex();
-
       if (insn.isPredicated() == true) {
         const Register pred = insn.getPredicateIndex();
-        const Register activePred = getActivePred(sel, insn, pred);
-
-        // Update the PcIPs
         sel.push();
+          // we don't need to set next label to the pcip
+          // as if there is no backward jump latter, then obviously everything will work fine.
+          // If there is backward jump latter, then all the pcip will be updated correctly there.
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(activePred);
+          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-        sel.pop();
-
-        if (nextLabel == jip) return;
-
-        // It is slightly more complicated than for backward jump. We check that
-        // all PcIPs are greater than the next block IP to be sure that we can
-        // jump
-        // We set all the inactive channel to 1 as the GEN_PREDICATE_ALIGN1_ALL8/16
-        // will check those bits as well.
-
-        sel.push();
-          sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(activePred);
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
-
-          // Branch to the jump target
-          // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
-          // work correct, as flag register bits mapped to non-active lanes tend
-          // to be zero.
-
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
-          GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask);
-          sel.OR(sel.selReg(activePred, TYPE_U16), sel.selReg(activePred, TYPE_U16), notEmaskReg);
-
-          if (simdWidth == 8)
-            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
-          else if (simdWidth == 16)
-            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
-          else
-            NOT_SUPPORTED;
-
-          sel.JMPI(GenRegister::immd(0), jip);
+          if (!sel.block->hasBarrier)
+            sel.ENDIF(GenRegister::immd(0), nextLabel);
+          sel.block->endifOffset = -1;
         sel.pop();
-
       } else {
         // Update the PcIPs
+        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
         sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
-        // Do not emit branch when we go to the next block anyway
+        if (!sel.block->hasBarrier)
+          sel.ENDIF(GenRegister::immd(0), nextLabel);
+        sel.block->endifOffset = -1;
         if (nextLabel == jip) return;
+        // Branch to the jump target
         sel.push();
           sel.curr.execWidth = 1;
           sel.curr.noMask = 1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.JMPI(GenRegister::immd(0), jip);
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, curr->getLabelIndex());
         sel.pop();
       }
     }
@@ -3207,60 +3902,56 @@ namespace gbe
       const Function &fn = sel.ctx.getFunction();
       const BasicBlock &bb = fn.getBlock(src);
       const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+      const LabelIndex label = bb.getLabelIndex();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       GBE_ASSERT(bb.getNextBlock() != NULL);
 
       if (insn.isPredicated() == true) {
         const Register pred = insn.getPredicateIndex();
-        const Register activePred = getActivePred(sel, insn, pred);
 
         // Update the PcIPs for all the branches. Just put the IPs of the next
-        // block. Next instruction will properly reupdate the IPs of the lanes
+        // block. Next instruction will properly update the IPs of the lanes
         // that actually take the branch
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
-
+        GBE_ASSERT(jip == dst);
         sel.push();
-          // Re-update the PcIPs for the branches that takes the backward jump
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(activePred);
+          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
-        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
-        // will check those bits as well.
+          sel.block->endifOffset = -1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
+          if (!sel.block->hasBarrier)
+            sel.ENDIF(GenRegister::immd(0), next);
           sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
-          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
-          sel.AND(sel.selReg(activePred, TYPE_U16), sel.selReg(activePred, TYPE_U16), emaskReg);
-
-          // Branch to the jump target
-          if (simdWidth == 8)
-            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
-          else if (simdWidth == 16)
+          if (simdWidth == 16)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
           else
-            NOT_SUPPORTED;
-          sel.JMPI(GenRegister::immd(0), jip);
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+          sel.curr.noMask = 1;
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
         sel.pop();
-
       } else {
-
+        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         // Update the PcIPs
         sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-
+        sel.block->endifOffset = -1;
+        if (!sel.block->hasBarrier)
+          sel.ENDIF(GenRegister::immd(0), next);
         // Branch to the jump target
         sel.push();
           sel.curr.execWidth = 1;
           sel.curr.noMask = 1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.JMPI(GenRegister::immd(0), jip);
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
         sel.pop();
       }
     }
 
-    INLINE bool emitOne(Selection::Opaque &sel, const ir::BranchInstruction &insn) const {
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
       using namespace ir;
+      const ir::BranchInstruction &insn = cast<BranchInstruction>(dag.insn);
       const Opcode opcode = insn.getOpcode();
       if (opcode == OP_RET)
         sel.EOT();
@@ -3268,17 +3959,25 @@ namespace gbe
         const LabelIndex dst = insn.getLabelIndex();
         const LabelIndex src = insn.getParent()->getLabelIndex();
 
+        sel.push();
+        if (insn.isPredicated() == true) {
+          if (dag.child[0] == NULL)
+            sel.curr.externFlag = 1;
+        }
+
         // We handle foward and backward branches differently
         if (uint32_t(dst) <= uint32_t(src))
           this->emitBackwardBranch(sel, insn, dst, src);
         else
           this->emitForwardBranch(sel, insn, dst, src);
+        sel.pop();
       } else
         NOT_IMPLEMENTED;
+
+      markAllChildren(dag);
       return true;
     }
 
-    DECL_CTOR(BranchInstruction, 1, 1);
   };
 
   /*! Sort patterns */
@@ -3310,7 +4009,6 @@ namespace gbe
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
-    this->insert<GetSamplerInfoInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 09e6762..9bcce6f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -38,10 +38,14 @@ namespace gbe
 {
   /*! Translate IR type to Gen type */
   uint32_t getGenType(ir::Type type);
+  /*! Translate Gen type to IR type */
+  ir::Type getIRType(uint32_t genType);
 
   /*! Translate IR compare to Gen compare */
   uint32_t getGenCompare(ir::Opcode opcode);
 
+  #define GEN_MAX_LABEL 0xFFFF
+
   /*! Selection opcodes properly encoded from 0 to n for fast jump tables
    *  generations
    */
@@ -85,10 +89,10 @@ namespace gbe
     const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
     /*! Damn C++ */
     const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
-    /*! No more than 17 sources (used by typed writes on simd8 mode.) */
-    enum { MAX_SRC_NUM = 17 };
-    /*! No more than 11 destinations (used by samples and untyped reads) */
-    enum { MAX_DST_NUM = 11 };
+    /*! No more than 9 sources (used by typed writes on simd8 mode.) */
+    enum { MAX_SRC_NUM = 9 };
+    /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
+    enum { MAX_DST_NUM = 16 };
     /*! State of the instruction (extra fields neeed for the encoding) */
     GenInstructionState state;
     union {
@@ -120,21 +124,63 @@ namespace gbe
       struct {
         uint16_t rdbti:8;
         uint16_t sampler:5;
-        uint16_t is3DRead:1;
+        uint16_t rdmsglen:3;
+        bool     isLD;  // is this a ld message?
+        bool     isUniform;
       };
       uint32_t barrierType;
+      bool longjmp;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
     /*! Number of destinations */
-    uint8_t dstNum:4;
+    uint8_t dstNum:5;
     /*! Number of sources */
-    uint8_t srcNum:5;
+    uint8_t srcNum:4;
     /*! To store various indices */
     uint16_t index;
+    /*! For BRC/IF to store the UIP */
+    uint16_t index1;
+    /*! instruction ID used for vector allocation. */
+    uint32_t ID;
     /*! Variable sized. Destinations and sources go here */
     GenRegister regs[0];
+    INLINE uint32_t getbti() const {
+      GBE_ASSERT(isRead() || isWrite());
+      switch (opcode) {
+        case SEL_OP_ATOMIC: return extra.elem;
+        case SEL_OP_BYTE_SCATTER:
+        case SEL_OP_WRITE64:
+        case SEL_OP_DWORD_GATHER:
+        case SEL_OP_UNTYPED_WRITE:
+        case SEL_OP_UNTYPED_READ:
+        case SEL_OP_BYTE_GATHER:
+        case SEL_OP_READ64: return extra.function;
+        case SEL_OP_SAMPLE: return extra.rdbti;
+        case SEL_OP_TYPED_WRITE: return extra.bti;
+        default:
+          GBE_ASSERT(0);
+      }
+      return 0;
+    }
   private:
+    INLINE void setbti(uint32_t bti) {
+      GBE_ASSERT(isRead() || isWrite());
+      switch (opcode) {
+        case SEL_OP_ATOMIC: extra.elem = bti; return;
+        case SEL_OP_BYTE_SCATTER:
+        case SEL_OP_WRITE64:
+        case SEL_OP_UNTYPED_WRITE:
+        case SEL_OP_DWORD_GATHER:
+        case SEL_OP_UNTYPED_READ:
+        case SEL_OP_BYTE_GATHER:
+        case SEL_OP_READ64: extra.function = bti; return;
+        case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+        case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
+        default:
+          GBE_ASSERT(0);
+      }
+    }
     /*! Just Selection class can create SelectionInstruction */
     SelectionInstruction(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
     // Allocates (with a linear allocator) and owns SelectionInstruction
@@ -182,6 +228,11 @@ namespace gbe
     void append(SelectionInstruction *insn);
     /*! Append a new selection instruction at the beginning of the block */
     void prepend(SelectionInstruction *insn);
+    bool isLargeBlock;
+    ir::LabelIndex endifLabel;
+    int endifOffset;
+    bool hasBarrier;
+    bool hasBranch;
   };
 
   /*! Owns the selection engine */
@@ -196,8 +247,6 @@ namespace gbe
     ~Selection(void);
     /*! Implements the instruction selection itself */
     void select(void);
-    /*! Bool and scalar register use scalar physical registers */
-    bool isScalarOrBool(ir::Register reg) const;
     /*! Get the number of instructions of the largest block */
     uint32_t getLargestBlockSize(void) const;
     /*! Number of register vectors in the selection */
@@ -209,11 +258,13 @@ namespace gbe
     /*! Get the data for the given register */
     ir::RegisterData getRegisterData(ir::Register reg) const;
     /*! Replace a source by the returned temporary register */
-    ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
+    ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
     /*! Replace a destination to the returned temporary register */
-    ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+    ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
     /*! spill a register (insert spill/unspill instructions) */
     bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+    /*! Indicate if a register is scalar or not */
+    bool isScalarReg(const ir::Register &reg) const;
     /*! Create a new selection instruction */
     SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
     /*! List of emitted blocks */
@@ -226,6 +277,13 @@ namespace gbe
     GBE_CLASS(Selection);
   };
 
+  class Selection75: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      Selection75(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index e44b9d4..ddc9d5e 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -58,6 +58,8 @@ DECL_SELECTION_IR(WRITE64, Write64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
+DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
+DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
@@ -78,3 +80,7 @@ DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
 DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
 DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
 DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
+DECL_SELECTION_IR(BRD, UnaryInstruction)
+DECL_SELECTION_IR(IF, UnaryInstruction)
+DECL_SELECTION_IR(ENDIF, UnaryInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 22f4aa1..5324587 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -22,26 +22,56 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/Linker/Linker.h"
+#else
+#include "llvm/Linker.h"
+#endif
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IRReader/IRReader.h"
+
 #include "backend/program.h"
 #include "backend/gen_program.h"
 #include "backend/gen_program.hpp"
 #include "backend/gen_context.hpp"
+#include "backend/gen75_context.hpp"
 #include "backend/gen_defs.hpp"
 #include "backend/gen/gen_mesa_disasm.h"
 #include "backend/gen_reg_allocation.hpp"
 #include "ir/unit.hpp"
 #include "llvm/llvm_to_gen.hpp"
+#include "llvm/llvm_gen_backend.hpp"
+
+#include <clang/CodeGen/CodeGenAction.h>
 
 #include <cstring>
 #include <sstream>
 #include <memory>
 #include <iostream>
 #include <fstream>
+#include <mutex>
+#include <unistd.h>
 
 namespace gbe {
 
-  GenKernel::GenKernel(const std::string &name) :
-    Kernel(name), insns(NULL), insnNum(0)
+  GenKernel::GenKernel(const std::string &name, uint32_t deviceID) :
+    Kernel(name), deviceID(deviceID), insns(NULL), insnNum(0)
   {}
   GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
   const char *GenKernel::getCode(void) const { return (const char*) insns; }
@@ -52,14 +82,25 @@ namespace gbe {
   size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
 
   void GenKernel::printStatus(int indent, std::ostream& outs) {
+#ifdef GBE_COMPILER_AVAILABLE
     Kernel::printStatus(indent, outs);
 
     FILE *f = fopen("/dev/null", "w");
     char *buf = new char[4096];
     setbuffer(f, buf, 4096);
+    GenCompactInstruction * pCom = NULL;
+    GenNativeInstruction nativeInsn;
 
-    for (uint32_t i = 0; i < insnNum; i++) {
-      gen_disasm(f, insns+i);
+    for (uint32_t i = 0; i < insnNum;) {
+      pCom = (GenCompactInstruction*)(insns+i);
+      if(pCom->bits1.cmpt_control == 1) {
+        decompactInstruction(pCom, &nativeInsn);
+        gen_disasm(f, &nativeInsn, deviceID, 1);
+        i++;
+      } else {
+        gen_disasm(f, insns+i, deviceID, 0);
+        i = i + 2;
+      }
       outs << buf;
       fflush(f);
       setbuffer(f, NULL, 0);
@@ -69,58 +110,122 @@ namespace gbe {
     setbuffer(f, NULL, 0);
     delete [] buf;
     fclose(f);
+#endif
   }
 
-  GenProgram::GenProgram(void) {}
-  GenProgram::~GenProgram(void) {}
+  void GenProgram::CleanLlvmResource(void){
+#ifdef GBE_COMPILER_AVAILABLE
+    if(module){
+      delete (llvm::Module*)module;
+      module = NULL;
+    }
+
+    if(llvm_ctx){
+      delete (llvm::LLVMContext*)llvm_ctx;
+      llvm_ctx = NULL;
+    }
+#endif
+  }
 
   /*! We must avoid spilling at all cost with Gen */
   static const struct CodeGenStrategy {
     uint32_t simdWidth;
+    uint32_t reservedSpillRegs;
     bool limitRegisterPressure;
   } codeGenStrategy[] = {
-    {16,false},
-    {16,true},
-    {8,false},
-    {8,true},
+    {16, 0, false},
+    {16, 10, false},
+    {8, 0, false},
+    {8, 8, false},
+    {8, 16, false},
   };
 
-  Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name) {
-
+  Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) {
+#ifdef GBE_COMPILER_AVAILABLE
     // Be careful when the simdWidth is forced by the programmer. We can see it
     // when the function already provides the simd width we need to use (i.e.
     // non zero)
     const ir::Function *fn = unit.getFunction(name);
-    const uint32_t codeGenNum = fn->getSimdWidth() != 0 ? 2 : 4;
-    uint32_t codeGen = fn->getSimdWidth() == 8 ? 2 : 0;
+    uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
+    uint32_t codeGen = 0;
+    GenContext *ctx = NULL;
+    if (fn->getSimdWidth() == 8) {
+      codeGen = 2;
+    } else if (fn->getSimdWidth() == 16) {
+      codeGenNum = 2;
+    } else if (fn->getSimdWidth() == 0) {
+      codeGen = 0;
+    } else
+      GBE_ASSERT(0);
     Kernel *kernel = NULL;
 
     // Stop when compilation is successful
+    if (IS_IVYBRIDGE(deviceID)) {
+      ctx = GBE_NEW(GenContext, unit, name, deviceID, relaxMath);
+    } else if (IS_HASWELL(deviceID)) {
+      ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
+    }
+    GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
+
     for (; codeGen < codeGenNum; ++codeGen) {
       const uint32_t simdWidth = codeGenStrategy[codeGen].simdWidth;
       const bool limitRegisterPressure = codeGenStrategy[codeGen].limitRegisterPressure;
+      const uint32_t reservedSpillRegs = codeGenStrategy[codeGen].reservedSpillRegs;
 
       // Force the SIMD width now and try to compile
       unit.getFunction(name)->setSimdWidth(simdWidth);
-      Context *ctx = GBE_NEW(GenContext, unit, name, limitRegisterPressure);
+      ctx->startNewCG(simdWidth, reservedSpillRegs, limitRegisterPressure);
       kernel = ctx->compileKernel();
       if (kernel != NULL) {
+        GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
         break;
       }
-      GBE_DELETE(ctx);
       fn->getImageSet()->clearInfo();
+      // If we get a out of range if/endif error.
+      // We need to set the context to if endif fix mode and restart the previous compile.
+      if ( ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && !ctx->getIFENDIFFix() ) {
+        ctx->setIFENDIFFix(true);
+        codeGen--;
+      } else
+        GBE_ASSERT(!(ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && ctx->getIFENDIFFix()));
     }
 
-    // XXX spill must be implemented
-    GBE_ASSERTM(kernel != NULL, "Register spilling not supported yet!");
+    GBE_ASSERTM(kernel != NULL, "Fail to compile kernel, may need to increase reserved registers for spilling.");
     return kernel;
+#else
+    return NULL;
+#endif
   }
 
-  static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
+#define BINARY_HEADER_LENGTH 8
+#define IS_GEN_BINARY(binary) (*binary == '\0' && *(binary+1) == 'G'&& *(binary+2) == 'E' &&*(binary+3) == 'N' &&*(binary+4) == 'C')
+#define FILL_GEN_BINARY(binary) do{*binary = '\0'; *(binary+1) = 'G'; *(binary+2) = 'E'; *(binary+3) = 'N'; *(binary+4) = 'C';}while(0)
+#define FILL_DEVICE_ID(binary, src_hw_info) do {*(binary+5) = src_hw_info[0]; *(binary+6) = src_hw_info[1]; *(binary+7) = src_hw_info[2];}while(0)
+#define DEVICE_MATCH(typeA, src_hw_info) ((IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "IVB")) ||  \
+                                      (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
+                                      (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
+                                      (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) )
+
+  static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
     using namespace gbe;
     std::string binary_content;
-    binary_content.assign(binary, size);
-    GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
+    //the header length is 8 bytes: 1 byte for the binary type, 4 bytes for the bitcode header, and 3 bytes for the hw info.
+    char src_hw_info[4]="";
+    src_hw_info[0] = *(binary+5);
+    src_hw_info[1] = *(binary+6);
+    src_hw_info[2] = *(binary+7);
+
+    // check whether this is a gen binary ('\0GENC')
+    if(!IS_GEN_BINARY(binary)){
+        return NULL;
+    }
+    // check whether the current device ID matches the binary file's.
+    if(!DEVICE_MATCH(deviceID, src_hw_info)){
+      return NULL;
+    }
+
+    binary_content.assign(binary+BINARY_HEADER_LENGTH, size-BINARY_HEADER_LENGTH);
+    GenProgram *program = GBE_NEW(GenProgram, deviceID);
     std::istringstream ifs(binary_content, std::ostringstream::binary);
 
     if (!program->deserializeFromBin(ifs)) {
@@ -132,33 +237,100 @@ namespace gbe {
     return reinterpret_cast<gbe_program>(program);
   }
 
-  static size_t genProgramSerializeToBinary(gbe_program program, char **binary) {
+  static gbe_program genProgramNewFromLLVMBinary(uint32_t deviceID, const char *binary, size_t size) {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string binary_content;
+    //the first byte stands for binary_type.
+    binary_content.assign(binary+1, size-1);
+    llvm::StringRef llvm_bin_str(binary_content);
+    llvm::LLVMContext& c = llvm::getGlobalContext();
+    llvm::SMDiagnostic Err;
+    llvm::MemoryBuffer* memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
+    acquireLLVMContextLock();
+    llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
+    releaseLLVMContextLock();
+    if(module == NULL){
+      GBE_ASSERT(0);
+    }
+
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module);
+
+    //program->printStatus(0, std::cout);
+    return reinterpret_cast<gbe_program>(program);
+#else
+      return NULL;
+#endif
+  }
+
+  static size_t genProgramSerializeToBinary(gbe_program program, char **binary, int binary_type) {
     using namespace gbe;
     size_t sz;
     std::ostringstream oss;
     GenProgram *prog = (GenProgram*)program;
 
-    if ((sz = prog->serializeToBin(oss)) == 0) {
-      *binary = 0;
+    //0 means GEN binary, 1 means LLVM bitcode compiled object, 2 means LLVM bitcode library
+    if(binary_type == 0){
+      if ((sz = prog->serializeToBin(oss)) == 0) {
+        *binary = NULL;
+        return 0;
+      }
+
+      //add a header to differentiate it from an llvm bitcode binary.
+      //the header length is 8 bytes: 1 byte for the binary type, 4 bytes for the bitcode header, and 3 bytes for the hw info.
+      *binary = (char *)malloc(sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+      memset(*binary, 0, sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+      FILL_GEN_BINARY(*binary);
+      char src_hw_info[4]="";
+      if(IS_IVYBRIDGE(prog->deviceID)){
+        src_hw_info[0]='I';
+        src_hw_info[1]='V';
+        src_hw_info[2]='B';
+        if(IS_BAYTRAIL_T(prog->deviceID)){
+          src_hw_info[0]='B';
+          src_hw_info[1]='Y';
+          src_hw_info[2]='T';
+        }
+      }else if(IS_HASWELL(prog->deviceID)){
+        src_hw_info[0]='H';
+        src_hw_info[1]='S';
+        src_hw_info[2]='W';
+      }
+      FILL_DEVICE_ID(*binary, src_hw_info);
+      memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
+      return sz+BINARY_HEADER_LENGTH;
+    }else{
+#ifdef GBE_COMPILER_AVAILABLE
+      std::string str;
+      llvm::raw_string_ostream OS(str);
+      llvm::WriteBitcodeToFile((llvm::Module*)prog->module, OS);
+      std::string& bin_str = OS.str();
+      int llsz = bin_str.size();
+      *binary = (char *)malloc(sizeof(char) * (llsz+1) );
+      *(*binary) = binary_type;
+      memcpy(*binary+1, bin_str.c_str(), llsz);
+      return llsz+1;
+#else
       return 0;
+#endif
     }
-
-    *binary = (char *)malloc(sizeof(char) * sz);
-    memcpy(*binary, oss.str().c_str(), sz*sizeof(char));
-    return sz;
   }
 
-  static gbe_program genProgramNewFromLLVM(const char *fileName,
+  static gbe_program genProgramNewFromLLVM(uint32_t deviceID,
+                                           const char *fileName,
+                                           const void* module,
+                                           const void* llvm_ctx,
                                            size_t stringSize,
                                            char *err,
                                            size_t *errSize,
                                            int optLevel)
   {
     using namespace gbe;
-    GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+#ifdef GBE_COMPILER_AVAILABLE
     std::string error;
     // Try to compile the program
-    if (program->buildFromLLVMFile(fileName, error, optLevel) == false) {
+    if (program->buildFromLLVMFile(fileName, module, error, optLevel) == false) {
       if (err != NULL && errSize != NULL && stringSize > 0u) {
         const size_t msgSize = std::min(error.size(), stringSize-1u);
         std::memcpy(err, error.c_str(), msgSize);
@@ -167,14 +339,106 @@ namespace gbe {
       GBE_DELETE(program);
       return NULL;
     }
+#endif
+    // Everything run fine
+    return (gbe_program) program;
+  }
+
+  static gbe_program genProgramNewGenProgram(uint32_t deviceID, const void* module,
+                                             const void* llvm_ctx)  {
+    using namespace gbe;
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
     // Everything run fine
     return (gbe_program) program;
   }
+
+  static void genProgramLinkFromLLVM(gbe_program           dst_program,
+                                     gbe_program           src_program,
+                                     size_t                stringSize,
+                                     char *                err,
+                                     size_t *              errSize)
+  {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string errMsg;
+    if(((GenProgram*)dst_program)->module == NULL){
+      ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
+      errSize = 0;
+    }else{
+      //set the global variables and functions to link once to fix redefine.
+      llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
+      for (llvm::Module::global_iterator I = src->global_begin(), E = src->global_end(); I != E; ++I) {
+        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+      }
+
+      for (llvm::Module::iterator I = src->begin(), E = src->end(); I != E; ++I) {
+        llvm::Function *F = llvm::dyn_cast<llvm::Function>(I);
+        if (F && isKernelFunction(*F)) continue;
+        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+      }
+      llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
+      llvm::Linker::LinkModules( dst,
+                                 src,
+                                 llvm::Linker::PreserveSource,
+                                 &errMsg);
+      if (errMsg.c_str() != NULL) {
+        if (err != NULL && errSize != NULL && stringSize > 0u) {
+          if(errMsg.length() < stringSize )
+            stringSize = errMsg.length();
+          strcpy(err, errMsg.c_str());
+          err[stringSize+1] = '\0';
+        }
+      }
+    }
+    // Everything run fine
+#endif
+  }
+
+  static void genProgramBuildFromLLVM(gbe_program program,
+                                      size_t stringSize,
+                                      char *err,
+                                      size_t *errSize,
+                                      const char *          options)
+  {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string error;
+
+    int optLevel = 1;
+
+    if(options) {
+      char *p;
+      p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+      if (p)
+        optLevel = 0;
+    }
+
+    GenProgram* p = (GenProgram*) program;
+    // Try to compile the program
+    acquireLLVMContextLock();
+    llvm::Module* module = (llvm::Module*)p->module;
+
+    if (p->buildFromLLVMFile(NULL, module, error, optLevel) == false) {
+      if (err != NULL && errSize != NULL && stringSize > 0u) {
+        const size_t msgSize = std::min(error.size(), stringSize-1u);
+        std::memcpy(err, error.c_str(), msgSize);
+        *errSize = error.size();
+      }
+      GBE_DELETE(p);
+    }
+    releaseLLVMContextLock();
+#endif
+  }
+
 } /* namespace gbe */
 
 void genSetupCallBacks(void)
 {
   gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+  gbe_program_new_from_llvm_binary = gbe::genProgramNewFromLLVMBinary;
   gbe_program_serialize_to_binary = gbe::genProgramSerializeToBinary;
   gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
+  gbe_program_new_gen_program = gbe::genProgramNewGenProgram;
+  gbe_program_link_from_llvm = gbe::genProgramLinkFromLLVM;
+  gbe_program_build_from_llvm = gbe::genProgramBuildFromLLVM;
 }
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index f78e324..1b5136e 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -27,6 +27,7 @@
 
 #include "backend/program.h"
 #include "backend/program.hpp"
+#include "backend/gen_defs.hpp"
 
 // Gen ISA instruction
 struct GenInstruction;
@@ -37,7 +38,7 @@ namespace gbe
   {
   public:
     /*! Create an empty kernel with the given name */
-    GenKernel(const std::string &name);
+    GenKernel(const std::string &name, uint32_t deviceID);
     /*! Destroy it */
     virtual ~GenKernel(void);
     /*! Implements base class */
@@ -48,6 +49,7 @@ namespace gbe
     virtual size_t getCodeSize(void) const;
     /*! Implements printStatus*/
     virtual void printStatus(int indent, std::ostream& outs);
+    uint32_t deviceID;      //!< Current device ID
     GenInstruction *insns; //!< Instruction stream
     uint32_t insnNum;      //!< Number of instructions
     GBE_CLASS(GenKernel);  //!< Use custom allocators
@@ -58,19 +60,26 @@ namespace gbe
   {
   public:
     /*! Create an empty program */
-    GenProgram(void);
+    GenProgram(uint32_t deviceID, const void* mod = NULL, const void* ctx = NULL) : deviceID(deviceID),module((void*)mod), llvm_ctx((void*)ctx) {}
+    /*! Current device ID*/
+    uint32_t deviceID;
     /*! Destroy the program */
-    virtual ~GenProgram(void);
+    virtual ~GenProgram(void) {};
+    /*! Clean LLVM resource */
+    virtual void CleanLlvmResource(void);
     /*! Implements base class */
-    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name);
+    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath);
     /*! Allocate an empty kernel. */
     virtual Kernel *allocateKernel(const std::string &name) {
-      return GBE_NEW(GenKernel, name);
+      return GBE_NEW(GenKernel, name, deviceID);
     }
+    void* module;
+    void* llvm_ctx;
     /*! Use custom allocators */
     GBE_CLASS(GenProgram);
   };
-
+  /*! decompact GEN ASM if it is in compacted format */
+  extern void decompactInstruction(union GenCompactInstruction *p, union GenNativeInstruction *pOut);
 } /* namespace gbe */
 
 #endif /* __GBE_GEN_PROGRAM_HPP__ */
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 2aafdb1..b7fbc93 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -28,12 +28,12 @@
 #include "backend/gen_register.hpp"
 #include "backend/program.hpp"
 #include "sys/exception.hpp"
+#include "sys/cvar.hpp"
 #include <algorithm>
 #include <climits>
 #include <iostream>
 #include <iomanip>
 
-#define RESERVED_REG_NUM_FOR_SPILL 6
 
 namespace gbe
 {
@@ -54,19 +54,16 @@ namespace gbe
   };
 
   typedef struct GenRegIntervalKey {
-    GenRegIntervalKey(uint16_t reg, uint16_t maxID) {
-      if (maxID == INT_MAX)
-        maxID = 0xFFFF;
-      GBE_ASSERT(reg <= 0xFFFF && maxID <= 0xFFFF);
-      key = (maxID << 16) | reg;
+    GenRegIntervalKey(uint16_t reg, int32_t maxID) {
+      key = ((uint64_t)maxID << 16) | reg;
     }
     const ir::Register getReg() const {
       return (ir::Register)(key & 0xFFFF);
     }
-    const uint16_t getMaxID() const {
+    const int32_t getMaxID() const {
       return key >> 16;
     }
-    uint32_t key;
+    uint64_t key;
   } GenRegIntervalKey;
 
   struct spillCmp {
@@ -111,9 +108,9 @@ namespace gbe
       // Note that byte vector registers use two bytes per byte (and can be
       // interleaved)
       static const size_t familyVectorSize[] = {2,2,2,4,8};
-      static const size_t familyScalarSize[] = {2,1,2,4,8};
+      static const size_t familyScalarSize[] = {2,2,2,4,8};
       using namespace ir;
-      const bool isScalar = ctx.sel->isScalarOrBool(reg);
+      const bool isScalar = ctx.sel->isScalarReg(reg);
       const RegisterData regData = ctx.sel->getRegisterData(reg);
       const RegisterFamily family = regData.family;
       const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
@@ -128,6 +125,12 @@ namespace gbe
     bool expireFlag(const GenRegInterval &limit);
     /*! Allocate the virtual boolean (== flags) registers */
     void allocateFlags(Selection &selection);
+    /*! validated flags which contains valid value in the physical flag register */
+    set<uint16_t> validatedFlags;
+    /*! validated temp flag register which indicate the flag 0,1 contains which virtual flag register. */
+    uint16_t validTempFlagReg;
+    /*! validate flag for the current flag user instruction */
+    void validateFlag(Selection &selection, SelectionInstruction &insn);
     /*! Allocate the GRF registers */
     bool allocateGRFs(Selection &selection);
     /*! Create gen registers for all preallocated curbe registers. */
@@ -157,8 +160,14 @@ namespace gbe
     vector<SelectionVector*> vectors;
     /*! The set of booleans that will go to GRF (cannot be kept into flags) */
     set<ir::Register> grfBooleans;
+    /*! The set of booleans which be held in flags, don't need to allocate grf */
+    set<ir::Register> flagBooleans;
     /*! All the register intervals */
     vector<GenRegInterval> intervals;
+    /*! All the boolean register intervals on the corresponding BB*/
+    typedef map<ir::Register, GenRegInterval> RegIntervalMap;
+    set<SelectionBlock *> flag0ReservedBlocks;
+    map<SelectionBlock *, RegIntervalMap *> boolIntervalsMap;
     /*! Intervals sorting based on starting point positions */
     vector<GenRegInterval*> starting;
     /*! Intervals sorting based on ending point positions */
@@ -178,7 +187,22 @@ namespace gbe
     INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
     INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
     INLINE bool vectorCanSpill(SelectionVector *vector);
-
+    INLINE void allocateScratchForSpilled();
+
+    /*! replace specified source/dst register with temporary register and update interval */
+    INLINE ir::Register replaceReg(Selection &sel, SelectionInstruction *insn,
+                                   uint32_t regID, bool isSrc,
+                                   ir::Type type = ir::TYPE_FLOAT, bool needMov = true) {
+      ir::Register reg;
+      if (isSrc)
+        reg = sel.replaceSrc(insn, regID, type, needMov);
+      else
+        reg = sel.replaceDst(insn, regID, type, needMov);
+      intervals.push_back(reg);
+      intervals[reg].minID = insn->ID;
+      intervals[reg].maxID = insn->ID;
+      return reg;
+    }
     /*! Use custom allocator */
     GBE_CLASS(Opaque);
   };
@@ -227,14 +251,11 @@ namespace gbe
     const ir::Register reg = interval.reg;
     if (RA.contains(reg) == true)
       return true; // already allocated
-    GBE_ASSERT(ctx.isScalarReg(reg) == false);
     uint32_t regSize;
     ir::RegisterFamily family;
     getRegAttrib(reg, regSize, &family);
     uint32_t grfOffset = allocateReg(interval, regSize, regSize);
     if (grfOffset == 0) {
-      /* this register is going to be spilled. */
-      GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
       return false;
     }
     insertNewReg(reg, grfOffset);
@@ -275,9 +296,12 @@ namespace gbe
       // case 1: the register is not already in a vector, so it can stay in this
       // vector. Note that local IDs are *non-scalar* special registers but will
       // require a MOV anyway since pre-allocated in the CURBE
+      // If an element has very long interval, we don't want to put it into a
+      // vector as it will add more pressure to the register allocation.
       if (it == vectorMap.end() &&
-          ctx.sel->isScalarOrBool(reg) == false &&
-          ctx.isSpecialReg(reg) == false)
+          ctx.sel->isScalarReg(reg) == false &&
+          ctx.isSpecialReg(reg) == false &&
+          (intervals[reg].maxID - intervals[reg].minID) < 2048)
       {
         const VectorLocation location = std::make_pair(vector, regID);
         this->vectorMap.insert(std::make_pair(reg, location));
@@ -290,10 +314,8 @@ namespace gbe
       // the MOVs
       else {
         ir::Register tmp;
-        if (vector->isSrc)
-          tmp = selection.replaceSrc(vector->insn, regID);
-        else
-          tmp = selection.replaceDst(vector->insn, regID);
+        ir::Type type = getIRType(vector->reg[regID].type);
+        tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
         const VectorLocation location = std::make_pair(vector, regID);
         this->vectorMap.insert(std::make_pair(tmp, location));
       }
@@ -351,11 +373,6 @@ namespace gbe
         this->expiringID++;
         continue;
       }
-      // Ignore booleans that were allocated with flags
-      if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
-        this->expiringID++;
-        continue;
-      }
 
       if (toExpire->maxID >= limit.minID)
         break;
@@ -369,162 +386,271 @@ namespace gbe
     return ret;
   }
 
-  void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
-
-    // Store the registers allocated in the map
-    map<ir::Register, uint32_t> allocatedFlags;
-    GenRegInterval spill = ir::Register(ir::RegisterFile::MAX_INDEX);
 
-    // we have two flags we use for booleans f1.0 and f1.1
-    const uint32_t flagNum = 2;
-    uint32_t freeFlags[] = {0,1};
-    uint32_t freeNum = flagNum;
+  #define IS_IMPLICITLY_MOD_FLAG(insn) (insn.state.modFlag == 1 &&      \
+                                         (insn.opcode == SEL_OP_MOV ||  \
+                                          insn.opcode == SEL_OP_AND  || \
+                                          insn.opcode == SEL_OP_OR  ||  \
+                                          insn.opcode == SEL_OP_XOR))
+
+  #define IS_SCALAR_FLAG(insn) selection.isScalarReg(ir::Register(insn.state.flagIndex))
+  #define GET_FLAG_REG(insn) GenRegister::uwxgrf(IS_SCALAR_FLAG(insn) ? 1 : 8,\
+                                                 ir::Register(insn.state.flagIndex));
+  #define IS_TEMP_FLAG(insn) (insn.state.flag == 0 && insn.state.subFlag == 1)
+  // Flag is a virtual flag, this function is to validate the virtual flag
+  // to a physical flag. It is used to validate both temporary flag and the
+  // non-temporary flag registers.
+  // We track the last temporary validate register, if it's the same as
+  // current, we can avoid the revalidation.
+  void GenRegAllocator::Opaque::validateFlag(Selection &selection,
+                                             SelectionInstruction &insn) {
+    GBE_ASSERT(insn.state.physicalFlag == 1);
+    if (!IS_TEMP_FLAG(insn) && validatedFlags.find(insn.state.flagIndex) != validatedFlags.end())
+      return;
+    else if (IS_TEMP_FLAG(insn) && validTempFlagReg == insn.state.flagIndex)
+      return;
+    SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2);
+    cmp0->state = GenInstructionState(ctx.getSimdWidth());
+    cmp0->state.flag = insn.state.flag;
+    cmp0->state.subFlag = insn.state.subFlag;
+    if (IS_SCALAR_FLAG(insn))
+      cmp0->state.noMask = 1;
+    cmp0->src(0) = GET_FLAG_REG(insn);
+    cmp0->src(1) = GenRegister::immuw(0);
+    cmp0->dst(0) = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+    cmp0->extra.function = GEN_CONDITIONAL_NEQ;
+    insn.prepend(*cmp0);
+    if (!IS_TEMP_FLAG(insn))
+      validatedFlags.insert(insn.state.flagIndex);
+    else {
+      if (insn.state.modFlag == 0)
+        validTempFlagReg = insn.state.flagIndex;
+      else
+        validTempFlagReg = 0;
+    }
+  }
 
-    // Perform the linear scan allocator on the flag registers only. We only use
-    // two flags registers for the booleans right now: f1.0 and f1.1 
-    const uint32_t regNum = ctx.sel->getRegNum();
-    uint32_t endID = 0; // interval to expire
-    for (uint32_t startID = 0; startID < regNum; ++startID) {
-      const GenRegInterval &interval = *this->starting[startID];
-      const ir::Register reg = interval.reg;
-      if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
-        continue; // Not a flag. We don't care
-      if (grfBooleans.contains(reg))
-        continue; // Cannot use a flag register
-      if (interval.maxID == -INT_MAX)
-        continue; // Unused register
-      if (freeNum != 0) {
-        spill = interval;
-        allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+  
+  void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
+    // Previously, we had a global flag allocation implementation.
+    // After some analysis, I found the global flag allocation is not
+    // the best solution here.
+    // As for the cross block reference of bool value, we have to
+    // combine it with current emask. There is no obvious advantage to
+    // allocating a dedicated physical flag register for such cross-block usage.
+    // We just need to allocate physical flag within each BB. We need to handle
+    // the following cases:
+    //
+    // 1. The bool's liveness never beyond this BB. And the bool is only used as
+    //    a dst register or a pred register. This bool value could be
+    //    allocated in physical flag only if there is enough physical flag.
+    //    We already identified those bool at the instruction select stage, and
+    //    put them in the flagBooleans set.
+    // 2. The bool is defined in another BB and used in this BB, then we need
+    //    to prepend an instruction at the position where we use it.
+    // 3. The bool is defined in this BB but is also used as some instruction's
+    //    source registers rather than the pred register. We have to keep the normal
+    //    grf (UW8/UW16) register for this bool. For some CMP instruction, we need to
+    //    append a SEL instruction convert the flag to the grf register.
+    // 4. Even for the spilling flag, if there is only one spilling flag, we will also
+    //    try to reuse the temporary flag register later. This requires all the
+    //    instructions to get their flag at the instruction selection stage, and to
+    //    not use the flag physical number directly at the gen_context stage. Otherwise,
+    //    may break the algorithm here.
+    // We will track all the validated bool value and to avoid any redundant
+    // validation for the same flag. But if there are not enough physical flags,
+    // we have to spill the previous allocated physical flag. And the spilling
+    // policy is to spill the allocated flag which lives to the latest end point.
+
+    // we have three flags we use for booleans f0.0 , f1.0 and f1.1
+    for (auto &block : *selection.blockList) {
+      // Store the registers allocated in the map
+      map<ir::Register, uint32_t> allocatedFlags;
+      map<const GenRegInterval*, uint32_t> allocatedFlagIntervals;
+
+      const uint32_t flagNum = flag0ReservedBlocks.contains(&block) ?  2 : 3;
+      uint32_t freeFlags[] = {2, 3, 0};
+      uint32_t freeNum = flagNum;
+      if (boolIntervalsMap.find(&block) == boolIntervalsMap.end())
+        continue;
+      const auto boolsMap = boolIntervalsMap[&block];
+      vector<const GenRegInterval*> flagStarting;
+      vector<const GenRegInterval*> flagEnding;
+      GBE_ASSERT(boolsMap->size() > 0);
+      uint32_t regNum = boolsMap->size();
+      flagStarting.resize(regNum);
+      flagEnding.resize(regNum);
+      uint32_t id = 0;
+      for (auto &interval : *boolsMap) {
+        flagStarting[id] = flagEnding[id] = &interval.second;
+        id++;
       }
-      else {
+      std::sort(flagStarting.begin(), flagStarting.end(), cmp<true>);
+      std::sort(flagEnding.begin(), flagEnding.end(), cmp<false>);
+
+      uint32_t endID = 0; // interval to expire
+      for (uint32_t startID = 0; startID < regNum; ++startID) {
+        const GenRegInterval *interval = flagStarting[startID];
+        const ir::Register reg = interval->reg;
+        GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+        if (freeNum != 0) {
+          allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+          allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+        } else {
         // Try to expire one register
-        while (endID != ending.size()) {
-          const GenRegInterval *toExpire = this->ending[endID];
-          const ir::Register reg = toExpire->reg;
+        while (endID != flagEnding.size()) {
+          const GenRegInterval *toExpire = flagEnding[endID];
           // Dead code produced by the insn selection -> we skip it
           if (toExpire->minID > toExpire->maxID) {
             endID++;
             continue;
           }
           // We cannot expire this interval and the next ones
-          if (toExpire->maxID >= interval.minID)
+          if (toExpire->maxID >= interval->minID)
             break;
-          // Must be a boolean allocated with a flag register
-          if (ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL || grfBooleans.contains(reg)) {
+          // We reuse a flag from a previous interval (the oldest one)
+          auto it = allocatedFlags.find(toExpire->reg);
+          if (it == allocatedFlags.end()) {
             endID++;
             continue;
           }
-          // We reuse a flag from a previous interval (the oldest one)
-          auto it = allocatedFlags.find(toExpire->reg);
-          GBE_ASSERT(it != allocatedFlags.end());
           freeFlags[freeNum++] = it->second;
           endID++;
           break;
         }
-
-        // We need to spill one of the previous boolean values
-        if (freeNum == 0) {
-          GBE_ASSERT(uint16_t(spill.reg) != ir::RegisterFile::MAX_INDEX);
-          // We spill the last inserted boolean and use its flag instead for
-          // this one
-          if (spill.maxID > interval.maxID) {
-            auto it = allocatedFlags.find(spill.reg);
-            GBE_ASSERT(it != allocatedFlags.end());
-            allocatedFlags.insert(std::make_pair(reg, it->second));
-            allocatedFlags.erase(spill.reg);
-            grfBooleans.insert(spill.reg);
-            spill = interval;
+        if (freeNum != 0) {
+          allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+          allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+        }
+        else {
+          // FIXME we may sort the allocated flags before doing the spilling in the future.
+          int32_t spill = -1;
+          const GenRegInterval *spillInterval = NULL;
+          int32_t maxID = 0;
+          for (auto &it : allocatedFlagIntervals) {
+            if (it.first->maxID <= interval->minID)
+              continue;
+            if (it.first->maxID > maxID && it.second != 0) {
+              maxID = it.first->maxID;
+              spill = it.second;
+              spillInterval = it.first;
+            }
           }
-          // We will use a grf for the current register
-          else {
-            grfBooleans.insert(reg);
+          if (spill != -1) {
+            allocatedFlags.insert(std::make_pair(reg, spill));
+            allocatedFlagIntervals.insert(std::make_pair(interval, spill));
+            allocatedFlags.erase(spillInterval->reg);
+            allocatedFlagIntervals.erase(spillInterval);
+            // We spill this flag booleans register, so erase it from the flag boolean set.
+            if (flagBooleans.contains(spillInterval->reg))
+              flagBooleans.erase(spillInterval->reg);
+          } else {
+            GBE_ASSERT(0);
           }
         }
-        else
-          allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
-      }
-    }
-
-    // Now, we traverse all the selection instructions and we patch them to make
-    // them use flag registers
-    for (auto &block : *selection.blockList)
-    for (auto &insn : block.insnList) {
-      const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
-
-      // Patch the source booleans
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const GenRegister selReg = insn.src(srcID);
-        const ir::Register reg = selReg.reg();
-        if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
-          continue;
-        auto it = allocatedFlags.find(reg);
-        if (it == allocatedFlags.end())
-          continue;
-        // Use a flag register for it now
-        insn.src(srcID) = GenRegister::flag(1,it->second);
-      }
-
-      // Patch the destination booleans
-      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
-        const GenRegister selReg = insn.dst(dstID);
-        const ir::Register reg = selReg.reg();
-        if (selReg.physical || ctx.sel->getRegisterFamily(reg) != ir::FAMILY_BOOL)
-          continue;
-        auto it = allocatedFlags.find(reg);
-        if (it == allocatedFlags.end())
-          continue;
-        // Use a flag register for it now
-        insn.dst(dstID) = GenRegister::flag(1,it->second);
+        }
       }
+      delete boolsMap;
 
-      // Patch the predicate now. Note that only compares actually modify it (it
-      // is called a "conditional modifier"). The other instructions just read
-      // it
-      if (insn.state.physicalFlag == 0) {
-        auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
-        // Just patch it if we can use a flag directly
-        if (it != allocatedFlags.end()) {
-          insn.state.flag = 1;
-          insn.state.subFlag = it->second;
-          insn.state.physicalFlag = 1;
-        }
-        // When we let the boolean in a GRF, use f0.1 as a temporary
-        else {
-          // Mov the GRF to the flag such that the flag can be read
-          SelectionInstruction *mov0 = selection.create(SEL_OP_MOV,1,1);
-          mov0->state = GenInstructionState(1);
-          mov0->state.predicate = GEN_PREDICATE_NONE;
-          mov0->state.noMask = 1;
-          mov0->src(0) = GenRegister::uw1grf(ir::Register(insn.state.flagIndex));
-          mov0->dst(0) = GenRegister::flag(0,1);
-
-          // Do not prepend if the flag is not read (== used only as a
-          // conditional modifier)
-          if (insn.state.predicate != GEN_PREDICATE_NONE)
-            insn.prepend(*mov0);
-
-          // We can use f0.1 (our "backdoor" flag)
-          insn.state.flag = 0;
-          insn.state.subFlag = 1;
-          insn.state.physicalFlag = 1;
-
-          // Compare instructions update the flags so we must copy it back to
-          // the GRF
-          if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
-            SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1);
-            mov1->state = mov0->state;
-            mov1->dst(0) = mov0->src(0);
-            mov1->src(0) = mov0->dst(0);
-            insn.append(*mov1);
+      // Now, we traverse all the selection instructions and we patch them to make
+      // them use flag registers
+      validTempFlagReg = 0;
+      validatedFlags.clear();
+      for (auto &insn : block.insnList) {
+        // Patch the predicate now. Note that only compares actually modify it (it
+        // is called a "conditional modifier"). The other instructions just read
+        // it
+        if (insn.state.physicalFlag == 0) {
+          auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
+          if (it != allocatedFlags.end()) {
+            insn.state.physicalFlag = 1;
+            insn.state.flag = it->second / 2;
+            insn.state.subFlag = it->second & 1;
+
+            // modFlag is for the LOADI/MOV/AND/OR/XOR instructions which will modify a
+            // flag register. We set the condition for them to save one instruction if possible.
+            if (IS_IMPLICITLY_MOD_FLAG(insn)) {
+              // If this is a modFlag on a scalar bool, we need to remove it
+              // from the allocated flags map. Then latter, the user could
+              // validate the flag from the scalar value correctly.
+              if (IS_SCALAR_FLAG(insn)) {
+                allocatedFlags.erase(ir::Register(insn.state.flagIndex));
+                continue;
+              }
+              insn.extra.function = GEN_CONDITIONAL_NEQ;
+            }
+            // If this is an external bool, we need to validate it if it is not validated yet.
+            if ((insn.state.externFlag &&
+                 insn.state.predicate != GEN_PREDICATE_NONE))
+              validateFlag(selection, insn);
+          } else {
+            insn.state.physicalFlag = 1;
+            insn.state.flag = 0;
+            insn.state.subFlag = 1;
+
+            // If this is for MOV/AND/OR/... we don't need to waste an extra instruction
+            // to generate the flag here, just continue to next instruction. And the validTempFlagReg
+            // will not be destroyed.
+            if (IS_IMPLICITLY_MOD_FLAG(insn))
+              continue;
+            // This bool doesn't have a dedicated flag; we use a temporary flag here.
+            // each time we need to validate it from the grf register.
+            if (insn.state.predicate != GEN_PREDICATE_NONE)
+              validateFlag(selection, insn);
           }
+          // This is a CMP for a pure flag booleans, we don't need to write result to
+          // the grf. And latter, we will not allocate grf for it.
+          if (insn.opcode == SEL_OP_CMP &&
+              (flagBooleans.contains(insn.dst(0).reg()) ||
+               GenRegister::isNull(insn.dst(0)))) {
+            // set a temporary register to avoid switch in this block.
+            bool isSrc = false;
+            bool needMov = false;
+            this->replaceReg(selection, &insn, 0, isSrc, ir::TYPE_FLOAT, needMov);
+          }
+          // If the instruction requires to generate (CMP for long/int/float..)
+          // the flag value to the register, and it's not a pure flag boolean,
+          // we need to use SEL instruction to generate the flag value to the UW8
+          // register.
+          if (insn.state.flagGen == 1 &&
+              !flagBooleans.contains((ir::Register)(insn.state.flagIndex))) {
+            SelectionInstruction *sel0 = selection.create(SEL_OP_SEL, 1, 2);
+            uint32_t simdWidth;
+            simdWidth = IS_SCALAR_FLAG(insn) ? 1 : ctx.getSimdWidth();
+
+            sel0->state = GenInstructionState(simdWidth);
+            if (IS_SCALAR_FLAG(insn))
+              sel0->state.noMask = 1;
+            sel0->state.flag = insn.state.flag;
+            sel0->state.subFlag = insn.state.subFlag;
+            sel0->state.predicate = GEN_PREDICATE_NORMAL;
+            sel0->src(0) = GenRegister::uw1grf(ir::ocl::one);
+            sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero);
+            sel0->dst(0) = GET_FLAG_REG(insn);
+            insn.append(*sel0);
+            // We use the zero one after the liveness analysis, we have to update
+            // the liveness data manually here.
+            GenRegInterval &interval0 = intervals[ir::ocl::zero];
+            GenRegInterval &interval1 = intervals[ir::ocl::one];
+            interval0.minID = std::min(interval0.minID, (int32_t)insn.ID);
+            interval0.maxID = std::max(interval0.maxID, (int32_t)insn.ID);
+            interval1.minID = std::min(interval1.minID, (int32_t)insn.ID);
+            interval1.maxID = std::max(interval1.maxID, (int32_t)insn.ID);
+          }
+        } else {
+          // If the instruction use the temporary flag register manually,
+          // we should invalidate the temp flag reg here.
+          if (insn.state.flag == 0 && insn.state.subFlag == 1)
+            validTempFlagReg = 0;
         }
       }
     }
   }
 
+  IVAR(OCL_SIMD16_SPILL_THRESHOLD, 0, 16, 256);
   bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
     // Perform the linear scan allocator
+    ctx.errCode = REGISTER_ALLOCATION_FAIL;
     const uint32_t regNum = ctx.sel->getRegNum();
     for (uint32_t startID = 0; startID < regNum; ++startID) {
       const GenRegInterval &interval = *this->starting[startID];
@@ -534,7 +660,7 @@ namespace gbe
       if (RA.contains(reg))
         continue; // already allocated
 
-      if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg))
+      if (flagBooleans.contains(reg))
         continue;
 
       // Case 1: the register belongs to a vector, allocate all the registers in
@@ -548,25 +674,31 @@ namespace gbe
           continue;
 
         uint32_t alignment;
-        ir::RegisterFamily family;
-        getRegAttrib(reg, alignment, &family);
-        const uint32_t size = vector->regNum * alignment;
-        const uint32_t grfOffset = allocateReg(interval, size, alignment);
+        uint32_t size = 0;
+        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+          getRegAttrib(vector->reg[regID].reg(), alignment, NULL);
+          size += alignment;
+        }
+        // FIXME this is workaround for scheduling limitation, which requires 2*GEN_REG_SIZE under SIMD16.
+        const uint32_t maxAlignment = ctx.getSimdWidth()/8*GEN_REG_SIZE;
+        const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
         if(grfOffset == 0) {
-          GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
-          GBE_ASSERT(vector->regNum < RESERVED_REG_NUM_FOR_SPILL);
           for(int i = vector->regNum-1; i >= 0; i--) {
             if (!spillReg(vector->reg[i].reg()))
               return false;
           }
           continue;
         }
+        uint32_t subOffset = 0;
         for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
           const ir::Register reg = vector->reg[regID].reg();
-          GBE_ASSERT(RA.contains(reg) == false
-                     && ctx.sel->getRegisterData(reg).family == family);
-          insertNewReg(reg, grfOffset + alignment * regID, true);
-          ctx.splitBlock(grfOffset, alignment * regID);  //splitBlock will not split if regID == 0
+          GBE_ASSERT(RA.contains(reg) == false);
+          getRegAttrib(reg, alignment, NULL);
+          // check all sub registers aligned correctly
+          GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
+          insertNewReg(reg, grfOffset + subOffset, true);
+          ctx.splitBlock(grfOffset, subOffset);  //splitBlock will not split if regID == 0
+          subOffset += alignment;
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
@@ -577,18 +709,65 @@ namespace gbe
     }
     if (!spilledRegs.empty()) {
       GBE_ASSERT(reservedReg != 0);
+      if (ctx.getSimdWidth() == 16) {
+        if (spilledRegs.size() > (unsigned int)OCL_SIMD16_SPILL_THRESHOLD) {
+          ctx.errCode = REGISTER_SPILL_EXCEED_THRESHOLD;
+          return false;
+        }
+      }
+      allocateScratchForSpilled();
       bool success = selection.spillRegs(spilledRegs, reservedReg);
       if (!success) {
-        std::cerr << "Fail to spill registers." << std::endl;
+        ctx.errCode = REGISTER_SPILL_FAIL;
         return false;
       }
     }
+    ctx.errCode = NO_ERROR;
     return true;
   }
 
+  INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+  {
+    const uint32_t regNum = spilledRegs.size();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    uint32_t regID = 0;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+      this->starting[regID] = this->ending[regID] = &intervals[it->first];
+      regID++;
+    }
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+    int toExpire = 0;
+    for(uint32_t i = 0; i < regNum; i++) {
+      const GenRegInterval * cur = starting[i];
+      const GenRegInterval * exp = ending[toExpire];
+      if (exp->maxID < cur->minID) {
+        auto it = spilledRegs.find(exp->reg);
+        GBE_ASSERT(it != spilledRegs.end());
+        if(it->second.addr != -1) {
+          ctx.deallocateScratchMem(it->second.addr);
+        }
+        toExpire++;
+      }
+      auto it = spilledRegs.find(cur->reg);
+      GBE_ASSERT(it != spilledRegs.end());
+      if(cur->minID == cur->maxID) {
+        it->second.addr = -1;
+        continue;
+      }
+
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+      it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+                                             * ctx.getSimdWidth());
+      }
+  }
+
   INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
   {
     auto it = RA.find(reg);
+    if (flagBooleans.contains(reg))
+      return false;
     GBE_ASSERT(it != RA.end());
     // offset less than 32 means it is not managed by our reg allocator.
     if (it->second < 32)
@@ -616,8 +795,14 @@ namespace gbe
        uint32_t regSize;
        ir::RegisterFamily family;
        getRegAttrib(reg, regSize, &family);
-
-       if (regSize == GEN_REG_SIZE && family == ir::FAMILY_DWORD /*&& !isVector*/) {
+       // At simd16 mode, we may introduce some simd8 registers in the instruction selection stage.
+       // Spilling those simd8 temporary registers would introduce unnecessary complexity, so we
+       // simply avoid spilling those temporary registers here.
+       if (ctx.getSimdWidth() == 16 && reg.value() >= ctx.getFunction().getRegisterFile().regNum())
+         return;
+
+       if ((regSize == ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_DWORD)
+          || (regSize == 2 * ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_QWORD)) {
          GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
          offsetReg.insert(std::make_pair(grfOffset, reg));
          spillCandidate.insert(intervals[reg]);
@@ -634,29 +819,38 @@ namespace gbe
                                                 bool isAllocated) {
     if (reservedReg == 0)
       return false;
+
+    if (interval.reg.value() >= ctx.getFunction().getRegisterFile().regNum() &&
+        ctx.getSimdWidth() == 16)
+      return false;
+
+    ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
+    // we currently only support DWORD/QWORD spill
+    if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
+      return false;
+
     SpillRegTag spillTag;
     spillTag.isTmpReg = interval.maxID == interval.minID;
-    if (!spillTag.isTmpReg) {
-      // FIXME, we can optimize scratch allocation according to
-      // the interval information.
-      spillTag.addr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)
-                                             * ctx.getSimdWidth());
-    } else
-      spillTag.addr = -1;
+    spillTag.addr = -1;
+
     if (isAllocated) {
       // If this register is allocated, we need to expire it and erase it
       // from the RA map.
       bool success = expireReg(interval.reg);
       GBE_ASSERT(success);
+      success = success;
       RA.erase(interval.reg);
     }
     spilledRegs.insert(std::make_pair(interval.reg, spillTag));
     return true;
   }
 
+  // Check whether an allocated vector can be spilled out.
+  // If part of a vector has expired, the vector is currently unspillable.
+  // FIXME we may need to fix those unspillable vectors in the future.
   INLINE bool GenRegAllocator::Opaque::vectorCanSpill(SelectionVector *vector) {
     for(uint32_t id = 0; id < vector->regNum; id++)
-      if (spillCandidate.find(intervals[(ir::Register)(vector->reg[id]).value.reg])
+      if (spillCandidate.find(intervals[(ir::Register)(vector->reg[id].value.reg)])
           == spillCandidate.end())
         return false;
     return true;
@@ -671,8 +865,12 @@ namespace gbe
     // If there is no spill candidate or current register is spillable and current register's
     // endpoint is after all the spillCandidate register's endpoint we return false. The
     // caller will spill current register.
+    // At simd16 mode, we will always try to spill here rather than return to the caller.
+    // The reason is that the caller may have a vector to allocate, and some element may be
+    // temporary registers which could not be spilled.
     if (it == spillCandidate.end()
-        || (it->getMaxID() <= interval.maxID && alignment == GEN_REG_SIZE))
+        || (ctx.getSimdWidth() == 8 && (it->getMaxID() <= interval.maxID
+            && alignment == ctx.getSimdWidth()/8 * GEN_REG_SIZE)))
       return false;
 
     ir::Register reg = it->getReg();
@@ -682,6 +880,7 @@ namespace gbe
       auto vectorIt = vectorMap.find(reg);
       bool isVector = vectorIt != vectorMap.end();
       bool needRestart = false;
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg);
       if (isVector
           && (vectorCanSpill(vectorIt->second.first))) {
         const SelectionVector *vector = vectorIt->second.first;
@@ -690,11 +889,14 @@ namespace gbe
                      == spilledRegs.end());
           spillSet.insert(vector->reg[id].reg());
           reg = vector->reg[id].reg();
-          size -= GEN_REG_SIZE;
+          family = ctx.sel->getRegisterFamily(reg);
+          size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+                                             : GEN_REG_SIZE * ctx.getSimdWidth()/8;
         }
       } else if (!isVector) {
         spillSet.insert(reg);
-        size -= GEN_REG_SIZE;
+        size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+                                           : GEN_REG_SIZE * ctx.getSimdWidth()/8;
       } else
         needRestart = true; // is a vector which could not be spilled.
 
@@ -702,7 +904,9 @@ namespace gbe
         break;
       if (!needRestart) {
         uint32_t offset = RA.find(reg)->second;
-        auto nextRegIt = offsetReg.find(offset + GEN_REG_SIZE);
+        uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? (offset + 2 * GEN_REG_SIZE * ctx.getSimdWidth() / 8)
+                                                           : (offset + GEN_REG_SIZE * ctx.getSimdWidth() / 8);
+        auto nextRegIt = offsetReg.find(nextOffset);
         if (nextRegIt != offsetReg.end())
           reg = nextRegIt->second;
         else
@@ -710,9 +914,18 @@ namespace gbe
       }
 
       if (needRestart) {
+#if 0
+        // FIXME, we should enable this code block in the future.
+        // If the spill set is not zero and we need a restart, we can
+        // simply return to try to allocate the registers at first.
+        // As some vectors which have expired elements may be marked as
+        // unspillable vector.
+        if (spillSet.size() > 0)
+          break;
+#endif
+        it++;
         // next register is not in spill candidate.
         // let's move to next candidate and start over.
-        it++;
         if (it == spillCandidate.end())
           return false;
         reg = it->getReg();
@@ -730,6 +943,19 @@ namespace gbe
                                                        uint32_t size,
                                                        uint32_t alignment) {
     uint32_t grfOffset;
+    static uint32_t tick = 0;
+    // Doing expireGRF too frequently makes post-register-allocation scheduling
+    // very hard, as it causes a very high register conflict rate.
+    // The tradeoff is to reduce that frequency here. If we are spilling, there is
+    // no need to reduce the frequency, as register pressure is then the most
+    // important factor.
+    if (tick % 12 == 0 || ctx.reservedSpillRegs != 0)
+      this->expireGRF(interval);
+    tick++;
+    // For some scalar byte register, it may be used as a destination register
+    // and the source is a scalar Dword. If that is the case, the byte register
+    // must get 4byte alignment register offset.
+    alignment = (alignment + 3) & ~3;
     while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
       const bool success = this->expireGRF(interval);
       if (success == false) {
@@ -742,14 +968,12 @@ namespace gbe
 
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
-    if (ctx.getSimdWidth() == 8) {
-      reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
+    if (ctx.reservedSpillRegs != 0) {
+      reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE);
       reservedReg /= GEN_REG_SIZE;
     } else {
       reservedReg = 0;
     }
-    // Allocate all the vectors first since they need to be contiguous
-    this->allocateVector(selection);
     // schedulePreRegAllocation(ctx, selection);
 
     // Now start the linear scan allocation
@@ -774,8 +998,12 @@ namespace gbe
       int32_t firstID = insnID;
       // Update the intervals of each used register. Note that we do not
       // register allocate R0, so we skip all sub-registers in r0
+      RegIntervalMap *boolsMap = new RegIntervalMap;
+      if (block.isLargeBlock)
+        flag0ReservedBlocks.insert(&block);
       for (auto &insn : block.insnList) {
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+        insn.ID  = insnID;
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister &selReg = insn.src(srcID);
           const ir::Register reg = selReg.reg();
@@ -801,22 +1029,43 @@ namespace gbe
           this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
         }
 
-        // Flag registers can only go to src[0]
-        const SelectionOpcode opcode = SelectionOpcode(insn.opcode);
-        if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR
-            || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) {
-          if (insn.src(1).physical == 0) {
-            const ir::Register reg = insn.src(1).reg();
-            if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL)
-              grfBooleans.insert(reg);
-          }
-        }
-
         // OK, a flag is used as a predicate or a conditional modifier
         if (insn.state.physicalFlag == 0) {
           const ir::Register reg = ir::Register(insn.state.flagIndex);
           this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
           this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+          // Check whether this is a pure flag booleans candidate.
+          if (insn.state.grfFlag == 0)
+            flagBooleans.insert(reg);
+          GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+          // update the bool register's per-BB's interval data
+          if (boolsMap->find(reg) == boolsMap->end()) {
+            GenRegInterval boolInterval(reg);
+            boolsMap->insert(std::make_pair(reg, boolInterval));
+          }
+          boolsMap->find(reg)->second.minID = std::min(boolsMap->find(reg)->second.minID, insnID);
+          boolsMap->find(reg)->second.maxID = std::max(boolsMap->find(reg)->second.maxID, insnID);
+          if (&insn == block.insnList.back() &&
+              insn.opcode == SEL_OP_JMPI &&
+              insn.state.predicate != GEN_PREDICATE_NONE) {
+            // If this is the last instruction and is a predicated JMPI,
+            // we must extend its liveness to before any other instruction,
+            // as we need to allocate f0 to it and keep f0 unchanged
+            // throughout the block. The root cause is that this instruction
+            // is outside the if/endif region, so we have to borrow f0
+            // to get correct bits for all channels.
+            boolsMap->find(reg)->second.minID = 0;
+            if (flag0ReservedBlocks.contains(&block))
+              flag0ReservedBlocks.erase(&block);
+          }
+        } else {
+          // Make sure that the instruction selection stage didn't use physical flags incorrectly.
+          GBE_ASSERT ((insn.opcode == SEL_OP_LABEL ||
+                       insn.opcode == SEL_OP_IF ||
+                       insn.opcode == SEL_OP_JMPI ||
+                       insn.state.predicate == GEN_PREDICATE_NONE ||
+                       (block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
+                       (insn.state.flag == 0 && insn.state.subFlag == 1)));
         }
         lastID = insnID;
         insnID++;
@@ -825,26 +1074,28 @@ namespace gbe
       // All registers alive at the begining of the block must update their intervals.
       const ir::BasicBlock *bb = block.bb;
       for (auto reg : ctx.getLiveIn(bb))
-          this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
+        this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
 
-      for (auto reg : ctx.getExtraLiveIn(bb))
-          this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
       // All registers alive at the end of the block must have their intervals
       // updated as well
       for (auto reg : ctx.getLiveOut(bb))
         this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
 
-      for (auto reg : ctx.getExtraLiveOut(bb))
-        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
+      if (boolsMap->size() > 0)
+        boolIntervalsMap.insert(std::make_pair(&block, boolsMap));
+      else
+        delete boolsMap;
     }
 
-    this->intervals[ocl::emask].minID = 0;
-    this->intervals[ocl::emask].maxID = INT_MAX;
-    this->intervals[ocl::notemask].minID = 0;
-    this->intervals[ocl::notemask].maxID = INT_MAX;
     this->intervals[ocl::retVal].minID = INT_MAX;
     this->intervals[ocl::retVal].maxID = -INT_MAX;
 
+    // Allocate all the vectors first since they need to be contiguous
+    this->allocateVector(selection);
+
+    // First we try to put all booleans registers into flags
+    this->allocateFlags(selection);
+
     // Sort both intervals in starting point and ending point increasing orders
     const uint32_t regNum = ctx.sel->getRegNum();
     this->starting.resize(regNum);
@@ -864,9 +1115,6 @@ namespace gbe
         break;
     }
 
-    // First we try to put all booleans registers into flags
-    this->allocateFlags(selection);
-
     // Allocate all the GRFs now (regular register and boolean that are not in
     // flag registers)
     return this->allocateGRFs(selection);
@@ -960,5 +1208,11 @@ namespace gbe
     this->opaque->outputAllocation();
   }
 
+  uint32_t GenRegAllocator::getRegSize(ir::Register reg) {
+     uint32_t regSize; 
+     this->opaque->getRegAttrib(reg, regSize); 
+     return regSize;
+  }
+
 } /* namespace gbe */
 
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
index 84b0f9c..e41f503 100644
--- a/backend/src/backend/gen_reg_allocation.hpp
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -56,6 +56,8 @@ namespace gbe
     GenRegister genReg(const GenRegister &reg);
     /*! Output the register allocation */
     void outputAllocation(void);
+    /*! Get the register's actual size in bytes. */
+    uint32_t getRegSize(ir::Register reg);
   private:
     /*! Actual implementation of the register allocator (use Pimpl) */
     class Opaque;
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 8794318..da58c06 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -118,16 +118,25 @@ namespace gbe
       this->noMask = 0;
       this->flag = 0;
       this->subFlag = 0;
-      this->predicate = GEN_PREDICATE_NORMAL;
+      this->grfFlag = 1;
+      this->externFlag = 0;
+      this->modFlag = 0;
+      this->flagGen = 0;
+      this->predicate = GEN_PREDICATE_NONE;
       this->inversePredicate = 0;
       this->physicalFlag = 1;
       this->flagIndex = 0;
       this->saturate = GEN_MATH_SATURATE_NONE;
     }
     uint32_t physicalFlag:1; //!< Physical or virtual flag register
-    uint32_t flag:1;         //!< Only if physical flag
+    uint32_t flag:1;         //!< Only if physical flag,
     uint32_t subFlag:1;      //!< Only if physical flag
     uint32_t flagIndex:16;   //!< Only if virtual flag (index of the register)
+    uint32_t grfFlag:1;      //!< Only if virtual flag, 0 means we do not need to allocate GRF.
+    uint32_t externFlag:1;   //!< Only if virtual flag, 1 means this flag is from external BB.
+    uint32_t modFlag:1;      //!< Only if virtual flag, 1 means will modify flag.
+    uint32_t flagGen:1;      //!< Only if virtual flag, 1 means the gen_context stage may need to
+                             //!< generate the flag.
     uint32_t execWidth:5;
     uint32_t quarterControl:1;
     uint32_t nibControl:1;
@@ -255,6 +264,31 @@ namespace gbe
       return r;
     }
 
+    // split a DWORD register into unpacked Byte or Short register
+    static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) {
+      GenRegister r = reg;
+      GBE_ASSERT(count == 4 || count == 2);
+      GBE_ASSERT(reg.type == GEN_TYPE_UD || reg.type == GEN_TYPE_D);
+
+      if(reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        GBE_ASSERT(reg.hstride == GEN_HORIZONTAL_STRIDE_1);
+        r.hstride = count == 4 ? GEN_HORIZONTAL_STRIDE_4 : GEN_HORIZONTAL_STRIDE_2;
+      }
+      if(count == 4) {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UB : GEN_TYPE_B;
+        r.vstride = GEN_VERTICAL_STRIDE_32;
+      } else {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UW : GEN_TYPE_W;
+        r.vstride = GEN_VERTICAL_STRIDE_16;
+      }
+
+      r.subnr += sub_part*typeSize(r.type);
+      r.nr += r.subnr / 32;
+      r.subnr %= 32;
+
+      return r;
+    }
+
     INLINE bool isint64(void) const {
       if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
         return true;
@@ -267,20 +301,25 @@ namespace gbe
       return false;
     }
 
-    INLINE GenRegister top_half(void) const {
-      GenRegister r = bottom_half();
-      r.subnr += 4;
-      r.nr += r.subnr / 32;
-      r.subnr %= 32;
-      return r;
+    INLINE GenRegister top_half(int simdWidth) const {
+      GBE_ASSERT(isint64());
+      GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      } else {
+        reg.subnr += typeSize(reg.type);
+        reg.nr += reg.subnr/32;
+        reg.subnr %= 32;
+      }
+      return reg;
     }
 
     INLINE GenRegister bottom_half(void) const {
       GBE_ASSERT(isint64());
-      GenRegister r = h2(*this);
-      r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
-      if(r.vstride != GEN_VERTICAL_STRIDE_0)
-       r.vstride = GEN_VERTICAL_STRIDE_16;
+      GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
       return r;
     }
 
@@ -457,22 +496,22 @@ namespace gbe
       return retype(vec1(file, reg), GEN_TYPE_UB);
     }
 
-    static INLINE GenRegister unpacked_uw(ir::Register reg) {
+    static INLINE GenRegister unpacked_uw(ir::Register reg, bool uniform = false) {
         return GenRegister(GEN_GENERAL_REGISTER_FILE,
                            reg,
                            GEN_TYPE_UW,
-                           GEN_VERTICAL_STRIDE_16,
-                           GEN_WIDTH_8,
-                           GEN_HORIZONTAL_STRIDE_2);
+                           uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_16,
+                           uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+                           uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_2);
     }
 
-    static INLINE GenRegister unpacked_ub(ir::Register reg) {
+    static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false) {
       return GenRegister(GEN_GENERAL_REGISTER_FILE,
                          reg,
                          GEN_TYPE_UB,
-                         GEN_VERTICAL_STRIDE_32,
-                         GEN_WIDTH_8,
-                         GEN_HORIZONTAL_STRIDE_4);
+                         uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_32,
+                         uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+                         uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_4);
     }
 
     static INLINE GenRegister imm(uint32_t type) {
@@ -517,13 +556,13 @@ namespace gbe
 
     static INLINE GenRegister immuw(uint16_t uw) {
       GenRegister immediate = imm(GEN_TYPE_UW);
-      immediate.value.ud = uw | (uw << 16);
+      immediate.value.ud = uw;
       return immediate;
     }
 
     static INLINE GenRegister immw(int16_t w) {
       GenRegister immediate = imm(GEN_TYPE_W);
-      immediate.value.d = w | (w << 16);
+      immediate.value.d = w;
       return immediate;
     }
 
@@ -632,11 +671,29 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_1);
     }
 
+    static INLINE GenRegister nullud(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_NULL,
+                         0,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+
     static INLINE bool isNull(GenRegister reg) {
       return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
               && reg.nr == GEN_ARF_NULL);
     }
 
+    static INLINE GenRegister vec1(GenRegister reg) {
+      reg.width = GEN_WIDTH_1;
+      reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+      reg.vstride = GEN_VERTICAL_STRIDE_0;
+      return reg;
+    }
+
     static INLINE GenRegister acc(void) {
       return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                          GEN_ARF_ACCUMULATOR,
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 2492a8b..787d111 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -30,10 +30,13 @@
 #include "ir/liveness.hpp"
 #include "ir/value.hpp"
 #include "ir/unit.hpp"
+#include "ir/printf.hpp"
 #include "llvm/llvm_to_gen.hpp"
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/LLVMContext.h"
 #include <cstring>
 #include <algorithm>
 #include <fstream>
@@ -65,7 +68,6 @@
 #include <clang/Basic/TargetInfo.h>
 #include <clang/Basic/TargetOptions.h>
 #include <llvm/ADT/IntrusiveRefCntPtr.h>
-#include <llvm/ADT/OwningPtr.h>
 #if LLVM_VERSION_MINOR <= 2
 #include <llvm/Module.h>
 #else
@@ -78,12 +80,13 @@
 namespace gbe {
 
   Kernel::Kernel(const std::string &name) :
-    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL)
-  {}
+    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
+        slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL) {}
   Kernel::~Kernel(void) {
     if(ctx) GBE_DELETE(ctx);
     if(samplerSet) GBE_DELETE(samplerSet);
     if(imageSet) GBE_DELETE(imageSet);
+    if(printfSet) GBE_DELETE(printfSet);
     GBE_SAFE_DELETE_ARRAY(args);
   }
   int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -100,18 +103,43 @@ namespace gbe {
     if (constantSet) delete constantSet;
   }
 
+#ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_GEN_IR, false);
 
-  bool Program::buildFromLLVMFile(const char *fileName, std::string &error, int optLevel) {
-    ir::Unit unit;
-    if (llvmToGen(unit, fileName, optLevel) == false) {
-      error = std::string(fileName) + " not found";
+  bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
+    ir::Unit *unit = new ir::Unit();
+    llvm::Module * cloned_module = NULL;
+    if(module){
+      cloned_module = llvm::CloneModule((llvm::Module*)module);
+    }
+    if (llvmToGen(*unit, fileName, module, optLevel) == false) {
+      if (fileName)
+        error = std::string(fileName) + " not found";
+      delete unit;
       return false;
     }
-    this->buildFromUnit(unit, error);
+    // If the unit is not valid, maybe something unsupported by the backend was
+    // introduced by some passes; use optLevel 0 to try again.
+    if(!unit->getValid()) {
+      delete unit;   //clear unit
+      unit = new ir::Unit();
+      if(cloned_module){
+        llvmToGen(*unit, fileName, cloned_module, 0);  //suppose file exists and llvmToGen will not return false.
+      }else{
+        llvmToGen(*unit, fileName, module, 0);  //suppose file exists and llvmToGen will not return false.
+      }
+    }
+    assert(unit->getValid());
+    this->buildFromUnit(*unit, error);
+    delete unit;
+    if(cloned_module){
+      delete (llvm::Module*) cloned_module;
+    }
     return true;
   }
 
+  BVAR(OCL_STRICT_CONFORMANCE, false);
+
   bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
     constantSet = new ir::ConstantSet(unit.getConstantSet());
     const auto &set = unit.getFunctionSet();
@@ -120,14 +148,17 @@ namespace gbe {
     if (kernelNum == 0) return true;
     for (const auto &pair : set) {
       const std::string &name = pair.first;
-      Kernel *kernel = this->compileKernel(unit, name);
+      Kernel *kernel = this->compileKernel(unit, name, !OCL_STRICT_CONFORMANCE);
       kernel->setSamplerSet(pair.second->getSamplerSet());
       kernel->setImageSet(pair.second->getImageSet());
+      kernel->setPrintfSet(pair.second->getPrintfSet());
       kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
+      kernel->setFunctionAttributes(pair.second->getFunctionAttributes());
       kernels.insert(std::make_pair(name, kernel));
     }
     return true;
   }
+#endif
 
 #define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
 #define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
@@ -232,7 +263,7 @@ namespace gbe {
       OUT_UPDATE_SZ(arg.type);
       OUT_UPDATE_SZ(arg.size);
       OUT_UPDATE_SZ(arg.align);
-      OUT_UPDATE_SZ(arg.bufSize);
+      OUT_UPDATE_SZ(arg.bti);
     }
 
     OUT_UPDATE_SZ(patches.size());
@@ -256,7 +287,7 @@ namespace gbe {
     OUT_UPDATE_SZ(compileWgSize[1]);
     OUT_UPDATE_SZ(compileWgSize[2]);
     /* samplers. */
-    if (samplerSet) {
+    if (!samplerSet->empty()) {   //samplerSet is always valid, allocated in Function::Function
       has_samplerset = 1;
       OUT_UPDATE_SZ(has_samplerset);
       size_t sz = samplerSet->serializeToBin(outs);
@@ -269,7 +300,7 @@ namespace gbe {
     }
 
     /* images. */
-    if (imageSet) {
+    if (!imageSet->empty()) {   //imageSet is always valid, allocated in Function::Function
       has_imageset = 1;
       OUT_UPDATE_SZ(has_imageset);
       size_t sz = imageSet->serializeToBin(outs);
@@ -321,7 +352,7 @@ namespace gbe {
       IN_UPDATE_SZ(arg.type);
       IN_UPDATE_SZ(arg.size);
       IN_UPDATE_SZ(arg.align);
-      IN_UPDATE_SZ(arg.bufSize);
+      IN_UPDATE_SZ(arg.bti);
     }
 
     IN_UPDATE_SZ(patch_num);
@@ -358,6 +389,8 @@ namespace gbe {
 
       total_size += sz;
     }
+    else
+      samplerSet = NULL;
 
     IN_UPDATE_SZ(has_imageset);
     if (has_imageset) {
@@ -369,6 +402,8 @@ namespace gbe {
 
       total_size += sz;
     }
+    else
+      imageSet = NULL;
 
     IN_UPDATE_SZ(code_size);
     if (code_size) {
@@ -433,7 +468,7 @@ namespace gbe {
       outs << spaces_nl << "      type value: "<< arg.type << "\n";
       outs << spaces_nl << "      size: "<< arg.size << "\n";
       outs << spaces_nl << "      align: "<< arg.align << "\n";
-      outs << spaces_nl << "      bufSize: "<< arg.bufSize << "\n";
+      outs << spaces_nl << "      bti: "<< arg.bti << "\n";
     }
 
     outs << spaces_nl << "  Patches Number is " << patches.size() << "\n";
@@ -459,16 +494,54 @@ namespace gbe {
 
   /*********************** End of Program class member function *************************/
 
+#define REDEF_MATH_FUNC(x) "#ifdef "#x"\n#undef "#x"\n#endif\n#define "#x" __gen_ocl_internal_fastpath_"#x"\n"
+  std::string ocl_mathfunc_fastpath_str =
+    REDEF_MATH_FUNC(acosh)
+    REDEF_MATH_FUNC(asinh)
+    REDEF_MATH_FUNC(atanh)
+    REDEF_MATH_FUNC(cbrt)
+    REDEF_MATH_FUNC(cos)
+    REDEF_MATH_FUNC(cosh)
+    REDEF_MATH_FUNC(cospi)
+    REDEF_MATH_FUNC(exp)
+    REDEF_MATH_FUNC(exp10)
+    REDEF_MATH_FUNC(expm1)
+    REDEF_MATH_FUNC(fmod)
+    REDEF_MATH_FUNC(hypot)
+    REDEF_MATH_FUNC(ilogb)
+    REDEF_MATH_FUNC(ldexp)
+    REDEF_MATH_FUNC(log)
+    REDEF_MATH_FUNC(log2)
+    REDEF_MATH_FUNC(log10)
+    REDEF_MATH_FUNC(log1p)
+    REDEF_MATH_FUNC(logb)
+    REDEF_MATH_FUNC(remainder)
+    REDEF_MATH_FUNC(rootn)
+    REDEF_MATH_FUNC(sin)
+    REDEF_MATH_FUNC(sincos)
+    REDEF_MATH_FUNC(sinh)
+    REDEF_MATH_FUNC(sinpi)
+    REDEF_MATH_FUNC(tan)
+    REDEF_MATH_FUNC(tanh)
+    "\n"
+  ;
+
   static void programDelete(gbe_program gbeProgram) {
     gbe::Program *program = (gbe::Program*)(gbeProgram);
     GBE_SAFE_DELETE(program);
   }
 
+  static void programCleanLlvmResource(gbe_program gbeProgram) {
+    gbe::Program *program = (gbe::Program*)(gbeProgram);
+    program->CleanLlvmResource();
+  }
+
+#ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_BUILD_LOG, false);
   SVAR(OCL_PCH_PATH, PCH_OBJECT_DIR);
   SVAR(OCL_PCM_PATH, PCM_OBJECT_DIR);
 
-  static bool buildModuleFromSource(const char* input, const char* output, std::string options,
+  static bool buildModuleFromSource(const char* input, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx, std::string options,
                                     size_t stringSize, char *err, size_t *errSize) {
     // Arguments to pass to the clang frontend
     vector<const char *> args;
@@ -485,6 +558,7 @@ namespace gbe {
     //Handle -cl-opt-disable in llvmToGen, skip here
     const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
                                          "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
+    bool useDefaultCLCVersion = true;
     while (end != std::string::npos) {
       end = options.find(' ', start);
       std::string str = options.substr(start, end - start);
@@ -494,9 +568,25 @@ namespace gbe {
       if(str == "-cl-fast-relaxed-math") bFastMath = true;
       if(unsupportedOptions.find(str) != std::string::npos)
         continue;
+      if(str.find("-cl-std=") != std::string::npos) {
+        useDefaultCLCVersion = false;
+        if (str == "-cl-std=CL1.1")
+          args.push_back("-D__OPENCL_C_VERSION__=110");
+        else if (str == "-cl-std=CL1.2")
+          args.push_back("-D__OPENCL_C_VERSION__=120");
+        else {
+          if (err && stringSize > 0 && errSize)
+            *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
+          return false;
+        }
+      }
       useless.push_back(str);
       args.push_back(str.c_str());
     }
+    if (useDefaultCLCVersion) {
+      args.push_back("-D__OPENCL_C_VERSION__=120");
+      args.push_back("-cl-std=CL1.2");
+    }
     args.push_back("-mllvm");
     args.push_back("-inline-threshold=200000");
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
@@ -525,6 +615,7 @@ namespace gbe {
     llvm::raw_string_ostream ErrorInfo(ErrorString);
     llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
     DiagOpts->ShowCarets = false;
+    DiagOpts->ShowPresumedLoc = true;
 #if LLVM_VERSION_MINOR <= 1
     args.push_back("-triple");
     args.push_back("ptx32");
@@ -542,7 +633,7 @@ namespace gbe {
     clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
 #endif /* LLVM_VERSION_MINOR <= 1 */
     // Create the compiler invocation
-    llvm::OwningPtr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
+    std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
     clang::CompilerInvocation::CreateFromArgs(*CI,
                                               &args[0],
                                               &args[0] + args.size(),
@@ -550,7 +641,7 @@ namespace gbe {
 
     // Create the compiler instance
     clang::CompilerInstance Clang;
-    Clang.setInvocation(CI.take());
+    Clang.setInvocation(CI.release());
     // Get ready to report problems
 #if LLVM_VERSION_MINOR <= 2
     Clang.createDiagnostics(args.size(), &args[0]);
@@ -583,7 +674,7 @@ namespace gbe {
     }
 
     // Create an action and make the compiler instance carry it out
-    llvm::OwningPtr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction());
+    std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
 
     std::string dirs = OCL_PCM_PATH;
     std::string pcmFileName;
@@ -592,7 +683,7 @@ namespace gbe {
 
     while (getline(idirs, pcmFileName, ':')) {
       if(access(pcmFileName.c_str(), R_OK) == 0) {
-        findPcm = true;
+        findPcm |= true;
         break;
       }
     }
@@ -618,49 +709,23 @@ namespace gbe {
 
     llvm::Module *module = Act->takeModule();
 
-#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
-    auto mode = llvm::sys::fs::F_Binary;
-#else
-    auto mode = llvm::raw_fd_ostream::F_Binary;
-#endif
-    llvm::raw_fd_ostream OS(output, ErrorString, mode);
-    //still write to temp file for code simply, otherwise need add another function.
-    //because gbe_program_new_from_llvm also be used by cl_program_create_from_llvm, can't be removed
-    //TODO: Pass module to llvmToGen, if use module, should return Act and use OwningPtr out of this funciton
-    llvm::WriteBitcodeToFile(module, OS);
-    if (err != NULL && *errSize < stringSize - 1 && ErrorString.size() > 0) {
-      size_t errLen;
-      errLen = ErrorString.copy(err + *errSize, stringSize - *errSize - 1, 0);
-      *errSize += errLen;
-    }
-
-    if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
-      // flush the error messages to the errs() if there is no
-      // error string buffer.
-      llvm::errs() << ErrorString;
-    }
-    OS.close();
+    *out_module = module;
     return true;
   }
 
   extern std::string ocl_stdlib_str;
 
   BVAR(OCL_USE_PCH, true);
-  static gbe_program programNewFromSource(const char *source,
-                                          size_t stringSize,
-                                          const char *options,
-                                          char *err,
-                                          size_t *errSize)
+  static void processSourceAndOption(const char *source,
+                                     const char *options,
+                                     const char *temp_header_path,
+                                     std::string& clOpt,
+                                     std::string& clName,
+                                     int& optLevel)
   {
     char clStr[] = "/tmp/XXXXXX.cl";
-    char llStr[] = "/tmp/XXXXXX.ll";
     int clFd = mkstemps(clStr, 3);
-    int llFd = mkstemps(llStr, 3);
-    close(llFd);
-    const std::string clName = std::string(clStr);
-    const std::string llName = std::string(llStr);
-    std::string clOpt;
-    int optLevel = 1;
+    clName = std::string(clStr);
 
     FILE *clFile = fdopen(clFd, "w");
     FATAL_IF(clFile == NULL, "Failed to open temporary file");
@@ -699,7 +764,12 @@ namespace gbe {
 
        So we just disable the PCH validation of Clang and do the judgement by ourself. */
 
-    if(options) {
+    /* We always add -cl-kernel-arg-info to the options. This option just generates the arg
+       information for the backend; it has no other side effects and no performance impact. */
+    if (!options || !strstr(const_cast<char *>(options), "-cl-kernel-arg-info"))
+      clOpt += "-cl-kernel-arg-info ";
+
+    if (options) {
       char *p;
       /* FIXME: Though we can disable the pch valid check, and load pch successfully,
          but these language opts and pre-defined macro will still generate the diag msg
@@ -709,7 +779,7 @@ namespace gbe {
           "-cl-single-precision-constant",
 //        "-cl-denorms-are-zero",
           "-cl-fast-relaxed-math",
-          "-cl-std=",
+          "-cl-std=CL1.1"
       };
       const char * incompatible_defs[] = {
           "GET_FLOAT_WORD",
@@ -738,6 +808,14 @@ namespace gbe {
       p = strstr(const_cast<char *>(options), "-cl-opt-disable");
       if (p)
         optLevel = 0;
+      // XXX enabling cl_khr_fp64 may cause some potential bugs.
+      // We may need to revisit this later when we want to support fp64 completely.
+      // For now, as we don't actually support fp64, just disable it by default.
+#if 0
+      #define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      if (!strstr(const_cast<char *>(options), "-cl-std=CL1.1"))
+        fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
+#endif
 
       clOpt += options;
     }
@@ -760,16 +838,50 @@ namespace gbe {
     } else
       fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
 
+    //for clCompilerProgram usage.
+    if(temp_header_path){
+      clOpt += " -I ";
+      clOpt += temp_header_path;
+      clOpt += " ";
+    }
+
+    if (!OCL_STRICT_CONFORMANCE) {
+        fwrite(ocl_mathfunc_fastpath_str.c_str(), strlen(ocl_mathfunc_fastpath_str.c_str()), 1, clFile);
+    }
+
+    // reset the file number in case we have inserted something into the kernel
+    std::string resetFileNum = "#line 1\n";
+    fwrite(resetFileNum.c_str(), strlen(resetFileNum.c_str()), 1, clFile);
+
     // Write the source to the cl file
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
+  }
+
+  static gbe_program programNewFromSource(uint32_t deviceID,
+                                          const char *source,
+                                          size_t stringSize,
+                                          const char *options,
+                                          char *err,
+                                          size_t *errSize)
+  {
+    int optLevel = 1;
+    std::string clOpt;
+    std::string clName;
+    processSourceAndOption(source, options, NULL, clOpt, clName, optLevel);
 
     gbe_program p;
-    if (buildModuleFromSource(clName.c_str(), llName.c_str(), clOpt.c_str(),
+    // will delete the module and act in GenProgram::CleanLlvmResource().
+    llvm::Module * out_module;
+    llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
+
+    static std::mutex llvm_mutex;
+    if (!llvm::llvm_is_multithreaded())
+      llvm_mutex.lock();
+
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
                               stringSize, err, errSize)) {
     // Now build the program from llvm
-      static std::mutex gbe_mutex;
-      gbe_mutex.lock();
       size_t clangErrSize = 0;
       if (err != NULL) {
         GBE_ASSERT(errSize != NULL);
@@ -777,19 +889,83 @@ namespace gbe {
         err += *errSize;
         clangErrSize = *errSize;
       }
-      p = gbe_program_new_from_llvm(llName.c_str(), stringSize,
+
+      p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx, stringSize,
                                     err, errSize, optLevel);
       if (err != NULL)
         *errSize += clangErrSize;
-      gbe_mutex.unlock();
       if (OCL_OUTPUT_BUILD_LOG && options)
         llvm::errs() << options;
-      remove(llName.c_str());
+    } else
+      p = NULL;
+
+    if (!llvm::llvm_is_multithreaded())
+      llvm_mutex.unlock();
+
+    remove(clName.c_str());
+    return p;
+  }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+
+  static gbe_program programCompileFromSource(uint32_t deviceID,
+                                          const char *source,
+                                          const char *temp_header_path,
+                                          size_t stringSize,
+                                          const char *options,
+                                          char *err,
+                                          size_t *errSize)
+  {
+    int optLevel = 1;
+    std::string clOpt;
+    std::string clName;
+    processSourceAndOption(source, options, temp_header_path, clOpt, clName, optLevel);
+
+    gbe_program p;
+    acquireLLVMContextLock();
+    // FIXME: using a newly allocated context to link two modules causes a context mismatch
+    // for some functions, so we use the global context now; switch to a new context later.
+    llvm::Module * out_module;
+    llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+                              stringSize, err, errSize)) {
+    // Now build the program from llvm
+      if (err != NULL) {
+        GBE_ASSERT(errSize != NULL);
+        stringSize -= *errSize;
+        err += *errSize;
+      }
+
+      p = gbe_program_new_gen_program(deviceID, out_module, NULL);
+
+      if (OCL_OUTPUT_BUILD_LOG && options)
+        llvm::errs() << options;
     } else
       p = NULL;
     remove(clName.c_str());
+    releaseLLVMContextLock();
     return p;
   }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+  static void programLinkProgram(gbe_program           dst_program,
+                                 gbe_program           src_program,
+                                 size_t                stringSize,
+                                 char *                err,
+                                 size_t *              errSize)
+  {
+    acquireLLVMContextLock();
+
+    gbe_program_link_from_llvm(dst_program, src_program, stringSize, err, errSize);
+
+    releaseLLVMContextLock();
+
+    if (OCL_OUTPUT_BUILD_LOG && err)
+      llvm::errs() << err;
+  }
+#endif
 
   static size_t programGetGlobalConstantSize(gbe_program gbeProgram) {
     if (gbeProgram == NULL) return 0;
@@ -827,6 +1003,12 @@ namespace gbe {
     return kernel->getName();
   }
 
+  static const char *kernelGetAttributes(gbe_kernel genKernel) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getFunctionAttributes();
+  }
+
   static const char *kernelGetCode(gbe_kernel genKernel) {
     if (genKernel == NULL) return NULL;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -845,12 +1027,41 @@ namespace gbe {
     return kernel->getArgNum();
   }
 
+  static void *kernelGetArgInfo(gbe_kernel genKernel, uint32_t argID, uint32_t value) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    ir::FunctionArgument::InfoFromLLVM* info = kernel->getArgInfo(argID);
+
+    switch (value) {
+      case GBE_GET_ARG_INFO_ADDRSPACE:
+        return (void*)((unsigned long)info->addrSpace);
+      case GBE_GET_ARG_INFO_TYPE:
+        return (void *)(info->typeName.c_str());
+      case GBE_GET_ARG_INFO_ACCESS:
+        return (void *)(info->accessQual.c_str());
+      case GBE_GET_ARG_INFO_TYPEQUAL:
+        return (void *)(info->typeQual.c_str());
+      case GBE_GET_ARG_INFO_NAME:
+        return (void *)(info->argName.c_str());
+      default:
+        assert(0);
+    }
+
+    return NULL;
+  }
+
   static uint32_t kernelGetArgSize(gbe_kernel genKernel, uint32_t argID) {
     if (genKernel == NULL) return 0u;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
     return kernel->getArgSize(argID);
   }
 
+  static uint8_t kernelGetArgBTI(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgBTI(argID);
+  }
+
   static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
     if (genKernel == NULL) return 0u;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -904,12 +1115,6 @@ namespace gbe {
     return kernel->getSLMSize();
   }
 
-  static int32_t kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
-    if (genKernel == NULL) return -1;
-    gbe::Kernel *kernel = (gbe::Kernel*) genKernel;
-    return kernel->setConstBufSize(argID, sz);
-  }
-
   static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
     if (gbeKernel == NULL) return 0;
     const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -922,6 +1127,52 @@ namespace gbe {
     kernel->getSamplerData(samplers);
   }
 
+  static uint32_t kernelGetPrintfNum(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getPrintfNum();
+  }
+
+  static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->dupPrintfSet();
+  }
+
+  static uint8_t kernelGetPrintfBufBTI(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getBufBTI();
+  }
+
+  static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getIndexBufBTI();
+  }
+
+  static void kernelReleasePrintfSet(void * printf_info) {
+    if (printf_info == NULL) return;
+    ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    delete ps;
+  }
+
+  static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getPrintfSizeOfSize();
+  }
+
+  static void kernelOutputPrintf(void * printf_info, void* index_addr,
+                                 void* buf_addr, size_t global_wk_sz0,
+                                 size_t global_wk_sz1, size_t global_wk_sz2)
+  {
+    if (printf_info == NULL) return;
+    ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+                         global_wk_sz1, global_wk_sz2);
+  }
+
   static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
     if (gbeKernel == NULL) return;
     const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -940,35 +1191,47 @@ namespace gbe {
     kernel->getImageData(images);
   }
 
-  static uint32_t gbeImageBaseIndex = 0;
-  static void setImageBaseIndex(uint32_t baseIdx) {
-     gbeImageBaseIndex = baseIdx;
-  }
-
-  static uint32_t getImageBaseIndex() {
-    return gbeImageBaseIndex;
-  }
-
   static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
     return 0u;
   }
 } /* namespace gbe */
 
+std::mutex llvm_ctx_mutex;
+void acquireLLVMContextLock()
+{
+  llvm_ctx_mutex.lock();
+}
+
+void releaseLLVMContextLock()
+{
+  llvm_ctx_mutex.unlock();
+}
+
 GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_compile_from_source_cb *gbe_program_compile_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_program_cb *gbe_program_link_program = NULL;
 GBE_EXPORT_SYMBOL gbe_program_new_from_binary_cb *gbe_program_new_from_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary = NULL;
 GBE_EXPORT_SYMBOL gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary = NULL;
 GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_gen_program_cb *gbe_program_new_gen_program = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
 GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
@@ -976,7 +1239,6 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset =
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
-GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
@@ -985,9 +1247,15 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data =
 GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
-GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
-GBE_EXPORT_SYMBOL gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
-
+GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
+GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size = NULL;
+GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+
+#ifdef GBE_COMPILER_AVAILABLE
 namespace gbe
 {
   /* Use pre-main to setup the call backs */
@@ -995,17 +1263,23 @@ namespace gbe
   {
     CallBackInitializer(void) {
       gbe_program_new_from_source = gbe::programNewFromSource;
+      gbe_program_compile_from_source = gbe::programCompileFromSource;
+      gbe_program_link_program = gbe::programLinkProgram;
       gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
       gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+      gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
       gbe_program_delete = gbe::programDelete;
       gbe_program_get_kernel_num = gbe::programGetKernelNum;
       gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
       gbe_program_get_kernel = gbe::programGetKernel;
       gbe_kernel_get_name = gbe::kernelGetName;
+      gbe_kernel_get_attributes = gbe::kernelGetAttributes;
       gbe_kernel_get_code = gbe::kernelGetCode;
       gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
       gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+      gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
       gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+      gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
       gbe_kernel_get_arg_type = gbe::kernelGetArgType;
       gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
       gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
@@ -1013,7 +1287,6 @@ namespace gbe
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
       gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
       gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
-      gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
       gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
@@ -1022,14 +1295,17 @@ namespace gbe
       gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
       gbe_kernel_get_image_size = gbe::kernelGetImageSize;
       gbe_kernel_get_image_data = gbe::kernelGetImageData;
-      gbe_get_image_base_index = gbe::getImageBaseIndex;
-      gbe_set_image_base_index = gbe::setImageBaseIndex;
+      gbe_get_printf_num = gbe::kernelGetPrintfNum;
+      gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+      gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+      gbe_dup_printfset = gbe::kernelDupPrintfSet;
+      gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+      gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+      gbe_output_printf = gbe::kernelOutputPrintf;
       genSetupCallBacks();
-      llvm::llvm_start_multithreaded();
     }
 
     ~CallBackInitializer() {
-      llvm::llvm_stop_multithreaded();
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
       llvm::llvm_shutdown();
 #endif
@@ -1038,4 +1314,4 @@ namespace gbe
 
   static CallBackInitializer cbInitializer;
 } /* namespace gbe */
-
+#endif
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index e6436c3..1421993 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -52,6 +52,23 @@ enum gbe_arg_type {
   GBE_ARG_INVALID = 0xffffffff
 };
 
+/*! Get argument info values */
+enum gbe_get_arg_info_value {
+  GBE_GET_ARG_INFO_ADDRSPACE = 0,
+  GBE_GET_ARG_INFO_ACCESS = 1,
+  GBE_GET_ARG_INFO_TYPE = 2,
+  GBE_GET_ARG_INFO_TYPEQUAL = 3,
+  GBE_GET_ARG_INFO_NAME = 4,
+  GBE_GET_ARG_INFO_INVALID = 0xffffffff
+};
+
+// BTI magic number
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+#define BTI_MAX_IMAGE_NUM 128
+#define BTI_MAX_ID (BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM - 1)
+
 /*! Constant buffer values (ie values to setup in the constant buffer) */
 enum gbe_curbe_type {
   GBE_CURBE_LOCAL_ID_X = 0,
@@ -70,16 +87,17 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
-  GBE_CURBE_SAMPLER_INFO,
   GBE_CURBE_IMAGE_INFO,
   GBE_CURBE_STACK_POINTER,
+  GBE_CURBE_PRINTF_BUF_POINTER,
+  GBE_CURBE_PRINTF_INDEX_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
   GBE_CURBE_BLOCK_IP,
   GBE_CURBE_THREAD_NUM,
-  GBE_CURBE_EMASK,
-  GBE_CURBE_NOT_EMASK,
-  GBE_CURBE_BARRIER_MASK,
+  GBE_CURBE_ZERO,
+  GBE_CURBE_ONE,
+  GBE_CURBE_SLM_OFFSET,
 };
 
 /*! Extra arguments use the negative range of sub-values */
@@ -113,30 +131,108 @@ extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
 typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
 extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
 
+/*! Get the printf number */
+typedef uint32_t (gbe_get_printf_num_cb)(void* printf_info);
+extern gbe_get_printf_num_cb *gbe_get_printf_num;
+
+/*! Get the printf buffer bti */
+typedef uint8_t (gbe_get_printf_buf_bti_cb)(void* printf_info);
+extern gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti;
+
+typedef uint8_t (gbe_get_printf_indexbuf_bti_cb)(void* printf_info);
+extern gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti;
+
+/*! Release the printfset */
+typedef void (gbe_release_printf_info_cb)(void* printf_info);
+extern gbe_release_printf_info_cb *gbe_release_printf_info;
+
+/*! Dup the printf set */
+typedef void* (gbe_dup_printfset_cb)(gbe_kernel gbeKernel);
+extern gbe_dup_printfset_cb *gbe_dup_printfset;
+
+/*! Get the printf buffer const offset */
+typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
+extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
+
+typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, void* buf_addr,
+                         size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2);
+extern gbe_output_printf_cb* gbe_output_printf;
+
 /*! Create a new program from the given source code (zero terminated string) */
-typedef gbe_program (gbe_program_new_from_source_cb)(const char *source,
+typedef gbe_program (gbe_program_new_from_source_cb)(uint32_t deviceID,
+                                                     const char *source,
                                                      size_t stringSize,
                                                      const char *options,
                                                      char *err,
                                                      size_t *err_size);
 extern gbe_program_new_from_source_cb *gbe_program_new_from_source;
+/*! Create a new program from the given source code and compile it (zero terminated string) */
+typedef gbe_program (gbe_program_compile_from_source_cb)(uint32_t deviceID,
+                                                         const char *source,
+                                                         const char *temp_header_path,
+                                                         size_t stringSize,
+                                                         const char *options,
+                                                         char *err,
+                                                         size_t *err_size);
+extern gbe_program_compile_from_source_cb *gbe_program_compile_from_source;
+/*! link the programs. */
+typedef void (gbe_program_link_program_cb)(gbe_program           dst_program,
+                                           gbe_program           src_program,
+                                           size_t                stringSize,
+                                           char *                err,
+                                           size_t *              errSize);
+extern gbe_program_link_program_cb *gbe_program_link_program;
+
+/*! create s new genprogram for link. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+                                                     const void *module,
+                                                     const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
 
 /*! Create a new program from the given blob */
-typedef gbe_program (gbe_program_new_from_binary_cb)(const char *binary, size_t size);
+typedef gbe_program (gbe_program_new_from_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
 extern gbe_program_new_from_binary_cb *gbe_program_new_from_binary;
 
-/*! Serialize a program to a bin */
-typedef size_t (gbe_program_serialize_to_binary_cb)(gbe_program program, char **binary);
+/*! Create a new program from the llvm bitcode*/
+typedef gbe_program (gbe_program_new_from_llvm_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
+extern gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary;
+
+/*! Serialize a program to a bin, 0 means executable, 1 means llvm bitcode*/
+typedef size_t (gbe_program_serialize_to_binary_cb)(gbe_program program, char **binary, int binary_type);
 extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
 
 /*! Create a new program from the given LLVM file */
-typedef gbe_program (gbe_program_new_from_llvm_cb)(const char *fileName,
+typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
+                                                   const char *fileName,
+                                                   const void *module,
+                                                   const void *llvm_ctx,
                                                    size_t string_size,
                                                    char *err,
                                                    size_t *err_size,
                                                    int optLevel);
 extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
 
+/*! create s new genprogram for link. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+                                                   const void *module,
+                                                   const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
+
+/*! link the programs from llvm level. */
+typedef void (gbe_program_link_from_llvm_cb)(gbe_program dst_program,
+                                             gbe_program src_program,
+                                             size_t      stringSize,
+                                             char *      err,
+                                             size_t *    errSize);
+extern gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm;
+/* build the program to gen binary */
+typedef void gbe_program_build_from_llvm_cb(gbe_program program,
+                                      size_t stringSize,
+                                      char *err,
+                                      size_t *errSize,
+                                      const char *          options);
+extern gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm;
+
 /*! Get the size of global constants */
 typedef size_t (gbe_program_get_global_constant_size_cb)(gbe_program gbeProgram);
 extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size;
@@ -157,6 +253,10 @@ extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
 typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
 extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
 
+/*! Clean LLVM resource of the given program */
+typedef void (gbe_program_clean_llvm_resource_cb)(gbe_program);
+extern gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource;
+
 /*! Destroy and deallocate the given program */
 typedef void (gbe_program_delete_cb)(gbe_program);
 extern gbe_program_delete_cb *gbe_program_delete;
@@ -177,6 +277,10 @@ extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
 typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
 extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
 
+/*! Get the kernel attributes*/
+typedef const char *(gbe_kernel_get_attributes_cb)(gbe_kernel);
+extern gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes;
+
 /*! Get the kernel source code */
 typedef const char *(gbe_kernel_get_code_cb)(gbe_kernel);
 extern gbe_kernel_get_code_cb *gbe_kernel_get_code;
@@ -189,10 +293,18 @@ extern gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size;
 typedef uint32_t (gbe_kernel_get_arg_num_cb)(gbe_kernel);
 extern gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num;
 
+/*! Get the argument info */
+typedef void* (gbe_kernel_get_arg_info_cb)(gbe_kernel, uint32_t argID, uint32_t value);
+extern gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info;
+
 /*! Get the size of the given argument */
 typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
 extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
 
+/*! Get the the bti of a __global buffer */
+typedef uint8_t (gbe_kernel_get_arg_bti_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti;
+
 /*! Get the type of the given argument */
 typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
 extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
@@ -221,10 +333,6 @@ extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
 typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
 extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
 
-/*! Set the constant pointer arg size and return the cb offset in curbe */
-typedef int32_t (gbe_kernel_set_const_buffer_size_cb)(gbe_kernel, uint32_t argID, size_t sz);
-extern gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size;
-
 /*! Indicates if a work group size is required. Return the required width or 0
  *  if none
  */
@@ -238,6 +346,10 @@ extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
 typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
 extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
 
+/*mutex to lock global llvmcontext access.*/
+extern void acquireLLVMContextLock();
+extern void releaseLLVMContextLock();
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e6fc411..56f60af 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -30,6 +30,7 @@
 #include "ir/constant.hpp"
 #include "ir/unit.hpp"
 #include "ir/function.hpp"
+#include "ir/printf.hpp"
 #include "ir/sampler.hpp"
 #include "sys/hash_map.hpp"
 #include "sys/vector.hpp"
@@ -48,7 +49,8 @@ namespace gbe {
     gbe_arg_type type; //!< Pointer, structure, image, regular value?
     uint32_t size;     //!< Size of the argument
     uint32_t align;    //!< addr alignment of the argument
-    uint32_t bufSize;  //!< Contant buffer size
+    uint8_t bti;      //!< binding table index for __global buffer
+    ir::FunctionArgument::InfoFromLLVM info;
   };
 
   /*! Stores the offset where to patch where to patch */
@@ -89,6 +91,11 @@ namespace gbe {
     INLINE uint32_t getArgSize(uint32_t argID) const {
       return argID >= argNum ? 0u : args[argID].size;
     }
+    /*! Return the bti for __global buffer */
+    INLINE uint8_t getArgBTI(uint32_t argID) const {
+      return argID >= argNum ? 0u : args[argID].bti;
+    }
+    /*! Return the alignment of buffer argument */
     INLINE uint32_t getArgAlign(uint32_t argID) const {
       return argID >= argNum ? 0u : args[argID].align;
     }
@@ -110,28 +117,53 @@ namespace gbe {
     INLINE bool getUseSLM(void) const { return this->useSLM; }
     /*! get slm size for kernel local variable */
     INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
-    /*! set constant buffer size and return the cb curbe offset */
-    int32_t setConstBufSize(uint32_t argID, size_t sz) {
-      if(argID >= argNum) return -1;
-      if(args[argID].type != GBE_ARG_CONSTANT_PTR) return -1;
-      if(args[argID].bufSize != sz) {
-        args[argID].bufSize = sz;
-        return ctx->allocConstBuf(argID);
-      }
-      return -1;
-    }
     /*! Set sampler set. */
     void setSamplerSet(ir::SamplerSet *from) {
       samplerSet = from;
     }
     /*! Get defined sampler size */
-    size_t getSamplerSize(void) const { return samplerSet->getDataSize(); }
+    size_t getSamplerSize(void) const { return (samplerSet == NULL ? 0 : samplerSet->getDataSize()); }
     /*! Get defined sampler value array */
     void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
     /*! Set image set. */
     void setImageSet(ir::ImageSet * from) {
       imageSet = from;
     }
+    /*! Set printf set. */
+    void setPrintfSet(ir::PrintfSet * from) {
+      printfSet = from;
+    }
+    /* ! Return the offset in the sizeof(xxx). */
+    uint32_t getPrintfSizeOfSize(void) const {
+      return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
+    }
+    uint32_t getPrintfNum() const {
+      return printfSet ? printfSet->getPrintfNum() : 0;
+    }
+
+    void * dupPrintfSet() const {
+      void* ptr = printfSet ? (void *)(new ir::PrintfSet(*printfSet)) : NULL;
+      return ptr;
+    }
+    uint8_t getPrintfBufBTI() const {
+      GBE_ASSERT(printfSet);
+      return printfSet->getBufBTI();
+    }
+
+    uint8_t getPrintfIndexBufBTI() const {
+      GBE_ASSERT(printfSet);
+      return printfSet->getIndexBufBTI();
+    }
+
+    void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                      size_t global_wk_sz1, size_t global_wk_sz2) {
+      if(printfSet)
+        printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+                                global_wk_sz1, global_wk_sz2);
+    }
+
+    ir::FunctionArgument::InfoFromLLVM* getArgInfo(uint32_t id) const { return &args[id].info; }
+
     /*! Set compile work group size */
     void setCompileWorkGroupSize(const size_t wg_sz[3]) {
        compileWgSize[0] = wg_sz[0];
@@ -144,8 +176,13 @@ namespace gbe {
        wg_sz[1] = compileWgSize[1];
        wg_sz[2] = compileWgSize[2];
     }
+    /*! Set function attributes string. */
+    void setFunctionAttributes(const std::string& functionAttributes) {  this->functionAttributes= functionAttributes; }
+    /*! Get function attributes string. */
+    const char* getFunctionAttributes(void) const {return this->functionAttributes.c_str();}
+
     /*! Get defined image size */
-    size_t getImageSize(void) const { return imageSet->getDataSize(); }
+    size_t getImageSize(void) const { return (imageSet == NULL ? 0 : imageSet->getDataSize()); }
     /*! Get defined image value array */
     void getImageData(ImageInfo *images) const { imageSet->getData(images); }
 
@@ -180,6 +217,7 @@ namespace gbe {
 
   protected:
     friend class Context;      //!< Owns the kernels
+    friend class GenContext;
     std::string name;    //!< Kernel name
     KernelArgument *args;      //!< Each argument
     vector<PatchInfo> patches; //!< Indicates how to build the curbe
@@ -193,7 +231,9 @@ namespace gbe {
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
     ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
+    ir::PrintfSet *printfSet;  //!< Copy from the corresponding function.
     size_t compileWgSize[3];   //!< required work group size by kernel attribute.
+    std::string functionAttributes; //!< function attribute qualifiers combined.
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
@@ -205,6 +245,8 @@ namespace gbe {
     Program(void);
     /*! Destroy the program */
     virtual ~Program(void);
+    /*! Clean LLVM resource of the program */
+    virtual void CleanLlvmResource() = 0;
     /*! Get the number of kernels in the program */
     uint32_t getKernelNum(void) const { return kernels.size(); }
     /*! Get the kernel from its name */
@@ -231,7 +273,7 @@ namespace gbe {
     /*! Build a program from a ir::Unit */
     bool buildFromUnit(const ir::Unit &unit, std::string &error);
     /*! Buils a program from a LLVM source code */
-    bool buildFromLLVMFile(const char *fileName, std::string &error, int optLevel);
+    bool buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel);
     /*! Buils a program from a OCL string */
     bool buildFromSource(const char *source, std::string &error);
     /*! Get size of the global constant arrays */
@@ -261,7 +303,7 @@ namespace gbe {
 
   protected:
     /*! Compile a kernel */
-    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name) = 0;
+    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) = 0;
     /*! Allocate an empty kernel. */
     virtual Kernel *allocateKernel(const std::string &name) = 0;
     /*! Kernels sorted by their name */
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
index 4393ad5..18d23ca 100644
--- a/backend/src/builtin_vector_proto.def
+++ b/backend/src/builtin_vector_proto.def
@@ -129,6 +129,45 @@ gentype tanpi (gentype x)
 gentype tgamma (gentype)
 gentype trunc (gentype)
 
+##math function fast path
+gentype __gen_ocl_internal_fastpath_acosh (gentype x)
+gentype __gen_ocl_internal_fastpath_asinh (gentype x)
+gentype __gen_ocl_internal_fastpath_atanh (gentype x)
+gentype __gen_ocl_internal_fastpath_cbrt (gentype x)
+gentype __gen_ocl_internal_fastpath_cos (gentype x)
+gentype __gen_ocl_internal_fastpath_cosh (gentype x)
+gentype __gen_ocl_internal_fastpath_cospi (gentype x)
+gentype __gen_ocl_internal_fastpath_exp (gentype x)
+gentype __gen_ocl_internal_fastpath_exp10 (gentype x)
+gentype __gen_ocl_internal_fastpath_expm1 (gentype x)
+gentype __gen_ocl_internal_fastpath_fmod (gentype x, gentype y)
+gentype __gen_ocl_internal_fastpath_hypot (gentype x, gentype y)
+intn __gen_ocl_internal_fastpath_ilogb (floatn x)
+int __gen_ocl_internal_fastpath_ilogb (float x)
+intn __gen_ocl_internal_fastpath_ilogb (doublen x)
+int __gen_ocl_internal_fastpath_ilogb (double x)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, intn k)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, int k)
+float __gen_ocl_internal_fastpath_ldexp (float x, int k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, intn k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, int k)
+double __gen_ocl_internal_fastpath_ldexp (double x, int k)
+gentype __gen_ocl_internal_fastpath_log (gentype x)
+gentype __gen_ocl_internal_fastpath_log2 (gentype x)
+gentype __gen_ocl_internal_fastpath_log10 (gentype x)
+gentype __gen_ocl_internal_fastpath_log1p (gentype x)
+gentype __gen_ocl_internal_fastpath_logb (gentype x)
+gentype __gen_ocl_internal_fastpath_remainder (gentype x, gentype y)
+floatn __gen_ocl_internal_fastpath_rootn (floatn x, intn k)
+gentype __gen_ocl_internal_fastpath_sin (gentype x)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __global gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __local gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __private gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sinh (gentype x)
+gentype __gen_ocl_internal_fastpath_sinpi (gentype x)
+gentype __gen_ocl_internal_fastpath_tan (gentype x)
+gentype __gen_ocl_internal_fastpath_tanh (gentype x)
+
 ##half_native_math
 #gentype half_cos (gentype x)
 #gentype half_divide (gentype x, gentype y)
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index f813775..79e3935 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -38,6 +38,8 @@
 
 #include "backend/program.h"
 #include "backend/program.hpp"
+#include "backend/src/sys/platform.hpp"
+#include "src/cl_device_data.h"
 
 using namespace std;
 
@@ -46,6 +48,8 @@ using namespace std;
 #define FILE_BUILD_FAILED 3
 #define FILE_SERIALIZATION_FAILED 4
 
+static uint32_t gen_pci_id = 0;
+
 class program_build_instance {
 
 protected:
@@ -146,40 +150,99 @@ public:
 
 string program_build_instance::bin_path;
 bool program_build_instance::str_fmt_out = false;
+#define OUTS_UPDATE_SZ(elt) SERIALIZE_OUT(elt, oss, header_sz)
+#define OUTF_UPDATE_SZ(elt) SERIALIZE_OUT(elt, ofs, header_sz)
 
 void program_build_instance::serialize_program(void) throw(int)
 {
     ofstream ofs;
     ostringstream oss;
-    size_t sz;
+    size_t sz = 0, header_sz = 0;
     ofs.open(bin_path, ofstream::out | ofstream::trunc | ofstream::binary);
 
-    if (str_fmt_out) {
-        string array_name = "Unkown_name_array";
-        unsigned long last_slash = bin_path.rfind("/");
-        unsigned long last_dot = bin_path.rfind(".");
-
-        if (last_slash != string::npos &&  last_dot != string::npos)
-            array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
+    char src_hw_info[4]="";
+    if(IS_IVYBRIDGE(gen_pci_id)){
+      src_hw_info[0]='I';
+      src_hw_info[1]='V';
+      src_hw_info[2]='B';
+      if(IS_BAYTRAIL_T(gen_pci_id)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='Y';
+        src_hw_info[2]='T';
+      }
+    }else if(IS_HASWELL(gen_pci_id)){
+        src_hw_info[0]='H';
+        src_hw_info[1]='S';
+        src_hw_info[2]='W';
+    }
 
-        ofs << "char " << array_name << "[] = {" << "\n";
+    if (str_fmt_out) {
 
+      if(gen_pci_id){
+        //add header to differentiate from llvm bitcode binary.
+        // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
+        char gen_header[6] = "\0GENC";
+        OUTS_UPDATE_SZ(gen_header[0]);
+        OUTS_UPDATE_SZ(gen_header[1]);
+        OUTS_UPDATE_SZ(gen_header[2]);
+        OUTS_UPDATE_SZ(gen_header[3]);
+        OUTS_UPDATE_SZ(gen_header[4]);
+        OUTS_UPDATE_SZ(src_hw_info[0]);
+        OUTS_UPDATE_SZ(src_hw_info[1]);
+        OUTS_UPDATE_SZ(src_hw_info[2]);
+      }
+
+      string array_name = "Unknown_name_array";
+      unsigned long last_slash = bin_path.rfind("/");
+      unsigned long last_dot = bin_path.rfind(".");
+
+      if (last_slash != string::npos &&  last_dot != string::npos)
+        array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
+
+      ofs << "#include <stddef.h>" << "\n";
+      ofs << "char " << array_name << "[] = {" << "\n";
+
+      if(gen_pci_id){
         sz = gbe_prog->serializeToBin(oss);
-
-        for (size_t i = 0; i < sz; i++) {
-            unsigned char c = oss.str().c_str()[i];
-            char asic_str[9];
-            sprintf(asic_str, "%2.2x", c);
-            ofs << "0x";
-            ofs << asic_str << ((i == sz - 1) ? "" : ", ");
-        }
-
-        ofs << "};\n";
-
-	string array_size = array_name + "_size";
-	ofs << "int " << array_size << " = " << sz << ";" << "\n";
+        sz += header_sz;
+      }else{
+        char *llvm_binary;
+        size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+        oss.write(llvm_binary, bin_length);
+        sz += bin_length;
+      }
+
+      for (size_t i = 0; i < sz; i++) {
+        unsigned char c = oss.str().c_str()[i];
+        char asic_str[9];
+        sprintf(asic_str, "%2.2x", c);
+        ofs << "0x";
+        ofs << asic_str << ((i == sz - 1) ? "" : ", ");
+      }
+      ofs << "};\n";
+
+      string array_size = array_name + "_size";
+      ofs << "size_t " << array_size << " = " << sz << ";" << "\n";
     } else {
+      if(gen_pci_id){
+        //add header to differentiate from llvm bitcode binary.
+        // (5 bytes: 1 byte for binary type, 4 byte for bc code, 'GENC' is for gen binary.)
+        char gen_header[6] = "\0GENC";
+        OUTF_UPDATE_SZ(gen_header[0]);
+        OUTF_UPDATE_SZ(gen_header[1]);
+        OUTF_UPDATE_SZ(gen_header[2]);
+        OUTF_UPDATE_SZ(gen_header[3]);
+        OUTF_UPDATE_SZ(gen_header[4]);
+        OUTF_UPDATE_SZ(src_hw_info[0]);
+        OUTF_UPDATE_SZ(src_hw_info[1]);
+        OUTF_UPDATE_SZ(src_hw_info[2]);
         sz = gbe_prog->serializeToBin(ofs);
+      }else{
+        char *llvm_binary;
+        size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+        ofs.write(llvm_binary, bin_length);
+        sz+=bin_length;
+      }
     }
 
     ofs.close();
@@ -192,13 +255,20 @@ void program_build_instance::serialize_program(void) throw(int)
 
 void program_build_instance::build_program(void) throw(int)
 {
-    gbe_program opaque = gbe_program_new_from_source(code, 0, build_opt.c_str(), NULL, NULL);
+    gbe_program  opaque = NULL;
+    if(gen_pci_id){
+      opaque = gbe_program_new_from_source(gen_pci_id, code, 0, build_opt.c_str(), NULL, NULL);
+    }else{
+      opaque = gbe_program_compile_from_source(0, code, NULL, 0, build_opt.c_str(), NULL, NULL);
+    }
     if (!opaque)
         throw FILE_BUILD_FAILED;
 
     gbe_prog = reinterpret_cast<gbe::Program*>(opaque);
 
-    assert(gbe_program_get_kernel_num(opaque));
+    if(gen_pci_id){
+      assert(gbe_program_get_kernel_num(opaque));
+    }
 }
 
 const char* program_build_instance::file_map_open(void) throw(int)
@@ -247,7 +317,7 @@ int main (int argc, const char **argv)
         argv_saved.push_back(string(argv[i]));
     }
 
-    while ( (oc = getopt(argc, (char * const *)argv, "o:p:s")) != -1 ) {
+    while ( (oc = getopt(argc, (char * const *)argv, "t:o:p:s")) != -1 ) {
         switch (oc) {
         case 'p':
         {
@@ -281,6 +351,24 @@ int main (int argc, const char **argv)
             used_index[optind-1] = 1;
             break;
 
+        case 't':
+        {
+            char *s = optarg;
+            if (optarg[0] == '0' && (optarg[1] == 'x' || optarg[1] == 'X'))
+            s += 2;
+
+            if (s[0] < '0' || s[0] > '9') {
+                cout << "Invalid target option argument" << endl;
+                return 1;
+            }
+
+            std::stringstream str(s);
+            str >> std::hex >> gen_pci_id;
+
+            used_index[optind-1] = 1;
+            break;
+        }
+
         case 's':
             program_build_instance::set_str_fmt_out(true);
             used_index[optind-1] = 1;
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
new file mode 100644
index 0000000..1c67a4b
--- /dev/null
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/alloc.cpp"
+#include "sys/cvar.cpp"
+#include "sys/assert.cpp"
+#include "sys/platform.cpp"
+#include "ir/constant.cpp"
+#include "ir/printf.cpp"
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#undef GBE_COMPILER_AVAILABLE
+#include "backend/program.cpp"
+#include "backend/gen_program.cpp"
+#include "ir/sampler.cpp"
+#include "ir/image.cpp"
+
+struct BinInterpCallBackInitializer
+{
+  BinInterpCallBackInitializer() {
+    gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+    gbe_program_get_kernel_num = gbe::programGetKernelNum;
+    gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
+    gbe_program_get_kernel = gbe::programGetKernel;
+    gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
+    gbe_kernel_get_code = gbe::kernelGetCode;
+    gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+    gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
+    gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+    gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
+    gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+    gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+    gbe_kernel_get_name = gbe::kernelGetName;
+    gbe_kernel_get_attributes = gbe::kernelGetAttributes;
+    gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+    gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+    gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
+    gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
+    gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
+    gbe_kernel_use_slm = gbe::kernelUseSLM;
+    gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
+    gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
+    gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
+    gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
+    gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
+    gbe_program_delete = gbe::programDelete;
+    gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+    gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+    gbe_kernel_get_image_data = gbe::kernelGetImageData;
+    gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+    gbe_get_printf_num = gbe::kernelGetPrintfNum;
+    gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+    gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+    gbe_dup_printfset = gbe::kernelDupPrintfSet;
+    gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+    gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+    gbe_output_printf = gbe::kernelOutputPrintf;
+  }
+
+  ~BinInterpCallBackInitializer() {
+  }
+};
+
+static struct BinInterpCallBackInitializer binInterpCB;
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index b100bbf..2d602c8 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -314,9 +314,9 @@ class builtinProto():
                     formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
                 else:
                     if (self.functionName == 'select' and n == 2):
-                        formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+                        formatStr += '({0})(param{1}.s{2:X} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
                     else:
-                        formatStr += 'param{0}.s{1:x}'.format(n, j)
+                        formatStr += 'param{0}.s{1:X}'.format(n, j)
 
             formatStr += ')'
 
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
index f0562a7..b940222 100755
--- a/backend/src/gen_convert.sh
+++ b/backend/src/gen_convert.sh
@@ -99,7 +99,7 @@ DEF(uint, float);
 
 #define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x > MAX ? (DSTTYPE)MAX : x < MIN ? (DSTTYPE)MIN : x; \
+    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
   }
 DEF(char, long, -128, 127);
 DEF(uchar, long, 0, 255);
@@ -113,7 +113,7 @@ DEF(ulong, float, 0, 1.8446744073709552e+19f);
 
 #define DEF(DSTTYPE, SRCTYPE, MAX) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x > MAX ? (DSTTYPE)MAX : x; \
+    return x >= MAX ? (DSTTYPE)MAX : x; \
   }
 DEF(char, ulong, 127);
 DEF(uchar, ulong, 255);
@@ -125,12 +125,12 @@ DEF(uint, ulong, 0xffffffffu);
 
 INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
   ulong MAX = 0x7ffffffffffffffful;
-  return x > MAX ? MAX : x;
+  return x >= MAX ? MAX : x;
 }
 
 #define DEF(DSTTYPE, SRCTYPE) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x < 0 ? 0 : x; \
+    return x <= 0 ? 0 : x; \
   }
 DEF(ushort, char);
 DEF(uint, char);
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index d6815e1..1528a8d 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -75,14 +75,16 @@ namespace ir {
     // Remove all returns and insert one unique return block at the end of the
     // function
     lowerReturn(unit, fn->getName());
+    // check if there is empty labels at first
+    fn->checkEmptyLabels();
+    // Properly order labels and compute the CFG, it's needed by FunctionArgumentLower
+    fn->sortLabels();
+    fn->computeCFG();
 
     // Spill function argument to the stack if required and identify which
     // function arguments can use constant push
     lowerFunctionArguments(unit, fn->getName());
 
-    // Properly order labels and compute the CFG
-    fn->sortLabels();
-    fn->computeCFG();
     const StackElem elem = fnStack.back();
     fnStack.pop_back();
     fn = elem.fn;
@@ -90,9 +92,9 @@ namespace ir {
     usedLabels = elem.usedLabels;
   }
 
-  Register Context::reg(RegisterFamily family) {
+  Register Context::reg(RegisterFamily family, bool uniform) {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
-    return fn->newRegister(family);
+    return fn->newRegister(family, uniform);
   }
 
   LabelIndex Context::label(void) {
@@ -105,10 +107,11 @@ namespace ir {
     return index;
   }
 
-  void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize, uint32_t align) {
+  void Context::input(const std::string &name, FunctionArgument::Type type, Register reg,
+                      FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align, unsigned char bti) {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
     GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
-    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align);
+    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
     fn->args.push_back(arg);
   }
 
@@ -157,7 +160,8 @@ namespace ir {
     bb->append(*insnPtr);
 #if GBE_DEBUG
     std::string whyNot;
-    GBE_ASSERTM(insnPtr->wellFormed(whyNot), whyNot.c_str());
+    if(getUnit().getValid())
+      GBE_ASSERTM(insnPtr->wellFormed(whyNot), whyNot.c_str());
 #endif /* GBE_DEBUG */
 
     // Close the current block if this is a branch
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index adeaf6f..cd09413 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -53,6 +53,8 @@ namespace ir {
     INLINE Unit &getUnit(void) { return unit; }
     /*! Get the current processed function */
     Function &getFunction(void);
+    /*! Get the current processed block */
+    BasicBlock *getBlock(void) { return bb; }
     /*! Set the SIMD width of the function */
     void setSimdWidth(uint32_t width) const {
       GBE_ASSERT(width == 8 || width == 16);
@@ -61,12 +63,24 @@ namespace ir {
     /*! Append a new pushed constant */
     void appendPushedConstant(Register reg, const PushLocation &pushed);
     /*! Create a new register with the given family for the current function */
-    Register reg(RegisterFamily family);
+    Register reg(RegisterFamily family, bool uniform = false);
     /*! Create a new immediate value */
     template <typename T> INLINE ImmediateIndex newImmediate(T value) {
       const Immediate imm(value);
       return fn->newImmediate(imm);
     }
+    template <typename T> INLINE ImmediateIndex newImmediate(T value, uint32_t num) {
+      const Immediate imm(value, num);
+      return fn->newImmediate(imm);
+    }
+    /*! Create a new immediate value */
+    INLINE ImmediateIndex newImmediate(vector<ImmediateIndex>indexVector) {
+      vector<const Immediate*> immVector;
+      for( uint32_t i = 0; i < indexVector.size(); i++)
+        immVector.push_back(&fn->getImmediate(indexVector[i]));
+      const Immediate imm(immVector);
+      return fn->newImmediate(imm);
+    }
     /*! Create an integer immediate value */
     INLINE ImmediateIndex newIntegerImmediate(int64_t x, Type type) {
       switch (type) {
@@ -89,6 +103,20 @@ namespace ir {
       return this->newImmediate(x);
     }
 
+    INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src, Type type) {
+      const Immediate &imm = fn->getImmediate(src);
+      const Immediate &dstImm = Immediate(op, imm, type);
+      return fn->newImmediate(dstImm);
+    }
+
+    INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src0,
+                                     ImmediateIndex src1, Type type) {
+      const Immediate &imm0 = fn->getImmediate(src0);
+      const Immediate &imm1 = fn->getImmediate(src1);
+      const Immediate &dstImm = Immediate(op, imm0, imm1, type);
+      return fn->newImmediate(dstImm);
+    }
+
     /*! Set an immediate value */
     template <typename T> INLINE void setImmediate(ImmediateIndex index, T value) {
       const Immediate imm(value);
@@ -99,15 +127,16 @@ namespace ir {
       GBE_ASSERTM(fn != NULL, "No function currently defined");
       const Immediate imm(value);
       const ImmediateIndex index = fn->newImmediate(imm);
-      const RegisterFamily family = getFamily(imm.type);
+      const RegisterFamily family = getFamily(imm.getType());
       const Register reg = this->reg(family);
-      this->LOADI(imm.type, reg, index);
+      this->LOADI(imm.getType(), reg, index);
       return reg;
     }
     /*! Create a new label for the current function */
     LabelIndex label(void);
     /*! Append a new input register for the function */
-    void input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elemSz = 0u, uint32_t align = 0);
+    void input(const std::string &name, FunctionArgument::Type type, Register reg,
+               FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0, uint8_t bti = 0);
     /*! Append a new output register for the function */
     void output(Register reg);
     /*! Get the immediate value */
@@ -160,23 +189,24 @@ namespace ir {
 
     /*! LOAD with the destinations directly specified */
     template <typename... Args>
-    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->LOAD(type, index, offset, space, valueNum, dwAligned);
+      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
     }
 
     /*! STORE with the sources directly specified */
     template <typename... Args>
-    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
     {
       const Tuple index = this->tuple(values...);
       const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
       GBE_ASSERT(valueNum > 0);
-      this->STORE(type, index, offset, space, valueNum, dwAligned);
+      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
     }
+    void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
 
   protected:
     /*! A block must be started with a label */
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 71dcc1f..85e7934 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -48,10 +48,12 @@ namespace ir {
     initProfile(*this);
     samplerSet = GBE_NEW(SamplerSet);
     imageSet = GBE_NEW(ImageSet);
+    printfSet = GBE_NEW(PrintfSet);
   }
 
   Function::~Function(void) {
     for (auto block : blocks) GBE_DELETE(block);
+    for (auto loop : loops) GBE_DELETE(loop);
     for (auto arg : args) GBE_DELETE(arg);
   }
 
@@ -59,6 +61,22 @@ namespace ir {
     return unit.getPointerFamily();
   }
 
+  void Function::addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits) {
+    loops.push_back(GBE_NEW(Loop, bbs, exits));
+  }
+
+  void Function::checkEmptyLabels(void) {
+    // Empty label map, we map the removed label to the next label.
+    map<LabelIndex, LabelIndex> labelMap;
+    map<LabelIndex, LabelIndex> revLabelMap;
+    foreachBlock([&](BasicBlock &BB) {
+      Instruction * insn = BB.getLastInstruction();
+      if (insn->getOpcode() == OP_LABEL) {
+        GBE_ASSERTM(0, "Found empty block. ");
+      }
+    });
+  }
+
   void Function::sortLabels(void) {
     uint32_t last = 0;
 
@@ -96,6 +114,17 @@ namespace ir {
       }
     });
 
+    // fix labels for loops
+    for (auto &x : loops) {
+      for (auto &y : x->bbs)
+        y = labelMap[y];
+
+      for (auto &z : x->exits) {
+        z.first = labelMap[z.first];
+        z.second = labelMap[z.second];
+      }
+    }
+
     // Reset the label to block mapping
     this->labels.resize(last);
     foreachBlock([&](BasicBlock &bb) {
@@ -117,19 +146,21 @@ namespace ir {
   void Function::outImmediate(std::ostream &out, ImmediateIndex index) const {
     GBE_ASSERT(index < immediates.size());
     const Immediate imm = immediates[index];
-    switch (imm.type) {
-      case TYPE_BOOL: out << !!imm.data.u8; break;
-      case TYPE_S8: out << imm.data.s8; break;
-      case TYPE_U8: out << imm.data.u8; break;
-      case TYPE_S16: out << imm.data.s16; break;
-      case TYPE_U16: out << imm.data.u16; break;
-      case TYPE_S32: out << imm.data.s32; break;
-      case TYPE_U32: out << imm.data.u32; break;
-      case TYPE_S64: out << imm.data.s64; break;
-      case TYPE_U64: out << imm.data.u64; break;
-      case TYPE_HALF: out << "half(" << imm.data.u16 << ")"; break;
-      case TYPE_FLOAT: out << imm.data.f32; break;
-      case TYPE_DOUBLE: out << imm.data.f64; break;
+    switch (imm.getType()) {
+      case TYPE_BOOL: out << !!imm.getIntegerValue(); break;
+      case TYPE_S8:
+      case TYPE_U8:
+      case TYPE_S16:
+      case TYPE_U16:
+      case TYPE_S32:
+      case TYPE_U32:
+      case TYPE_S64: out << imm.getIntegerValue(); break;
+      case TYPE_U64: out << (uint64_t)imm.getIntegerValue(); break;
+      case TYPE_HALF: out << "half(" << imm.getIntegerValue() << ")"; break;
+      case TYPE_FLOAT: out << imm.getFloatValue(); break;
+      case TYPE_DOUBLE: out << imm.getDoubleValue(); break;
+      default:
+        GBE_ASSERT(0 && "unsupported imm type.\n");
     }
   }
 
@@ -191,6 +222,15 @@ namespace ir {
     const uint32_t specialNum = this->getSpecialRegNum();
     return ID >= firstID && ID < firstID + specialNum;
   }
+  Register Function::getSurfaceBaseReg(uint8_t bti) const {
+    map<uint8_t, Register>::const_iterator iter = btiRegMap.find(bti);
+    GBE_ASSERT(iter != btiRegMap.end());
+    return iter->second;
+  }
+
+  void Function::appendSurface(uint8_t bti, Register reg) {
+    btiRegMap.insert(std::make_pair(bti, reg));
+  }
 
   void Function::computeCFG(void) {
     // Clear possible previously computed CFG and compute the direct
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 2468e73..9aa1e8d 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -29,6 +29,7 @@
 #include "ir/instruction.hpp"
 #include "ir/profile.hpp"
 #include "ir/sampler.hpp"
+#include "ir/printf.hpp"
 #include "ir/image.hpp"
 #include "sys/vector.hpp"
 #include "sys/set.hpp"
@@ -81,6 +82,8 @@ namespace ir {
         functor(*curr);
       }
     }
+    set <Register> undefPhiRegs;
+    set <Register> definedPhiRegs;
   private:
     friend class Function; //!< Owns the basic blocks
     BlockSet predecessors; //!< Incoming blocks
@@ -105,14 +108,26 @@ namespace ir {
       IMAGE             = 5,  // image*d_t
       SAMPLER           = 6
     };
+
+    struct InfoFromLLVM { // All the info about passed by llvm, using -cl-kernel-arg-info
+      uint32_t addrSpace;
+      std::string typeName;
+      std::string accessQual;
+      std::string typeQual;
+      std::string argName; // May be different from arg->getName()
+    };
+
     /*! Create a function input argument */
-    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align) :
-      type(type), reg(reg), size(size), align(align), name(name) {}
+    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info, uint8_t bti) :
+      type(type), reg(reg), size(size), align(align), name(name), info(info), bti(bti) { }
+
     Type type;     //!< Gives the type of argument we have
     Register reg;  //!< Holds the argument
     uint32_t size; //!< == sizeof(void*) for ptr, sizeof(elem) for the rest
     uint32_t align; //!< address alignment for the argument
     const std::string name; //!< Holds the function name for IR output
+    InfoFromLLVM info;  //!< Holds the llvm passed info
+    uint8_t bti; //!< binding table index
     GBE_STRUCT(FunctionArgument); // Use custom allocator
   };
 
@@ -134,6 +149,17 @@ namespace ir {
     return arg0.offset < arg1.offset;
   }
 
+  /*! CFG loops */
+  struct Loop : public NonCopyable
+  {
+  public:
+    Loop(const vector<LabelIndex> &in, const vector<std::pair<LabelIndex, LabelIndex>> &exit) :
+    bbs(in), exits(exit) {}
+    vector<LabelIndex> bbs;
+    vector<std::pair<LabelIndex, LabelIndex>> exits;
+    GBE_STRUCT(Loop);
+  };
+
   /*! A function is :
    *  - a register file
    *  - a set of basic block layout into a CGF
@@ -153,8 +179,8 @@ namespace ir {
     /*! Get the function profile */
     INLINE Profile getProfile(void) const { return profile; }
     /*! Get a new valid register */
-    INLINE Register newRegister(RegisterFamily family) {
-      return this->file.append(family);
+    INLINE Register newRegister(RegisterFamily family, bool uniform = false) {
+      return this->file.append(family, uniform);
     }
     /*! Get the function name */
     const std::string &getName(void) const { return name; }
@@ -164,6 +190,10 @@ namespace ir {
     uint32_t getSimdWidth(void) const { return simdWidth; }
     /*! Extract the register from the register file */
     INLINE RegisterData getRegisterData(Register reg) const { return file.get(reg); }
+    /*! set a register to uniform or nonuniform type. */
+    INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); }
+    /*! return true if the specified register is uniform type */
+    INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); }
     /*! Get the register family from the register itself */
     INLINE RegisterFamily getRegisterFamily(Register reg) const {
       return this->getRegisterData(reg).family;
@@ -179,7 +209,7 @@ namespace ir {
     /*! Get the register file */
     INLINE const RegisterFile &getRegisterFile(void) const { return file; }
     /*! Get the given value ie immediate from the function */
-    INLINE Immediate getImmediate(ImmediateIndex ID) const {
+    INLINE const Immediate &getImmediate(ImmediateIndex ID) const {
       return immediates[ID];
     }
     /*! Create a new immediate and returns its index */
@@ -270,6 +300,8 @@ namespace ir {
     void computeCFG(void);
     /*! Sort labels in increasing orders (top block has the smallest label) */
     void sortLabels(void);
+    /*! check empty Label. */
+    void checkEmptyLabels(void);
     /*! Get the pointer family */
     RegisterFamily getPointerFamily(void) const;
     /*! Number of registers in the register file */
@@ -310,14 +342,26 @@ namespace ir {
     SamplerSet* getSamplerSet(void) const {return samplerSet; }
     /*! Get image set in this function */
     ImageSet* getImageSet(void) const {return imageSet; }
+    /*! Get printf set in this function */
+    PrintfSet* getPrintfSet(void) const {return printfSet; }
     /*! Set required work group size. */
     void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
     /*! Get required work group size. */
     const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+    /*! Set function attributes string. */
+    void setFunctionAttributes(const std::string& functionAttributes) {  this->functionAttributes= functionAttributes; }
+    /*! Get function attributes string. */
+    const std::string& getFunctionAttributes(void) const {return this->functionAttributes;}
     /*! Get stack size. */
     INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
     /*! Push stack size. */
     INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
+    /*! add the loop info for later liveness analysis */
+    void addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits);
+    INLINE const vector<Loop * > &getLoops() { return loops; }
+    /*! Get surface starting address register from bti */
+    Register getSurfaceBaseReg(uint8_t bti) const;
+    void appendSurface(uint8_t bti, Register reg);
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -327,6 +371,8 @@ namespace ir {
     vector<BasicBlock*> labels;     //!< Each label points to a basic block
     vector<Immediate> immediates;   //!< All immediate values in the function
     vector<BasicBlock*> blocks;     //!< All chained basic blocks
+    vector<Loop *> loops;           //!< Loops info of the function
+    map<uint8_t, Register> btiRegMap;//!< map bti to surface base address
     RegisterFile file;              //!< RegisterDatas used by the instructions
     Profile profile;                //!< Current function profile
     PushMap pushMap;                //!< Pushed function arguments (reg->loc)
@@ -337,8 +383,10 @@ namespace ir {
     uint32_t stackSize;             //!< stack size for private memory.
     SamplerSet *samplerSet;         //!< samplers used in this function.
     ImageSet* imageSet;             //!< Image set in this function's arguments..
+    PrintfSet *printfSet;           //!< printfSet store the printf info.
     size_t compileWgSize[3];        //!< required work group size specified by
                                     //   __attribute__((reqd_work_group_size(X, Y, Z))).
+    std::string functionAttributes; //!< function attribute qualifiers combined.
     GBE_CLASS(Function);            //!< Use custom allocator
   };
 
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 8c34d70..a9b1563 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -64,16 +64,6 @@ namespace ir {
     setInfoOffset4Type(imageInfo, key.type, offset);
   }
 
-  Register ImageSet::appendInfo(ImageInfoKey key, Context *ctx)
-  {
-    auto it = infoRegMap.find(key.data);
-    if (it != infoRegMap.end())
-      return it->second;
-    Register reg = ctx->reg(FAMILY_DWORD);
-    infoRegMap.insert(std::make_pair(key.data, reg));
-    return reg;
-  }
-
   void ImageSet::clearInfo()
   {
     struct ImageInfo *imageInfo;
@@ -87,26 +77,6 @@ namespace ir {
     }
   }
 
-  void ImageSet::append(Register imageReg, Context *ctx)
-  {
-    ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
-    GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
-    GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
-
-    int32_t id = ctx->getFunction().getArgID(arg);
-    struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
-    imageInfo->arg_idx = id;
-    imageInfo->idx = regMap.size() + gbe_get_image_base_index();
-    imageInfo->wSlot = -1;
-    imageInfo->hSlot = -1;
-    imageInfo->depthSlot = -1;
-    imageInfo->dataTypeSlot = -1;
-    imageInfo->channelOrderSlot = -1;
-    imageInfo->dimOrderSlot = -1;
-    regMap.insert(std::make_pair(imageReg, imageInfo));
-    indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
-  }
-
   const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
   {
     auto it = indexMap.find(key.index);
@@ -124,8 +94,9 @@ namespace ir {
   }
 
   void ImageSet::getData(struct ImageInfo *imageInfos) const {
+      int id = 0;
       for(auto &it : regMap)
-        imageInfos[it.second->idx - gbe_get_image_base_index()] = *it.second;
+        imageInfos[id++] = *it.second;
   }
 
   ImageSet::~ImageSet() {
@@ -216,7 +187,7 @@ namespace ir {
       IN_UPDATE_SZ(img_info->channelOrderSlot);
       IN_UPDATE_SZ(img_info->dimOrderSlot);
 
-      indexMap.insert(std::make_pair(index, img_info));
+      indexMap.insert(std::make_pair(img_info->idx, img_info));
     }
 
     IN_UPDATE_SZ(magic);
@@ -271,6 +242,37 @@ namespace ir {
    outs << spaces << "------------- End ImageSet -------------" << "\n";
   }
 
+#ifdef GBE_COMPILER_AVAILABLE
+  Register ImageSet::appendInfo(ImageInfoKey key, Context *ctx)
+  {
+    auto it = infoRegMap.find(key.data);
+    if (it != infoRegMap.end())
+      return it->second;
+    Register reg = ctx->reg(FAMILY_DWORD);
+    infoRegMap.insert(std::make_pair(key.data, reg));
+    return reg;
+  }
+
+  void ImageSet::append(Register imageReg, Context *ctx, uint8_t bti)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
+    GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
+    GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
+
+    int32_t id = ctx->getFunction().getArgID(arg);
+    struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
+    imageInfo->arg_idx = id;
+    imageInfo->idx = bti;
+    imageInfo->wSlot = -1;
+    imageInfo->hSlot = -1;
+    imageInfo->depthSlot = -1;
+    imageInfo->dataTypeSlot = -1;
+    imageInfo->channelOrderSlot = -1;
+    imageInfo->dimOrderSlot = -1;
+    regMap.insert(std::make_pair(imageReg, imageInfo));
+    indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
+  }
+#endif
 
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index cf388d4..b31c7da 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -44,7 +44,7 @@ namespace ir {
   {
   public:
     /*! Append an image argument. */
-    void append(Register imageReg, Context *ctx);
+    void append(Register imageReg, Context *ctx, uint8_t bti);
     /*! Append an image info slot. */
     void appendInfo(ImageInfoKey key, uint32_t offset);
     /*! Append an image info register. */
@@ -61,6 +61,9 @@ namespace ir {
     void operator = (const ImageSet& other) {
       regMap.insert(other.regMap.begin(), other.regMap.end());
     }
+
+    bool empty() const { return regMap.empty(); }
+
     ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
     ImageSet() {}
     ~ImageSet();
@@ -90,7 +93,7 @@ namespace ir {
   private:
     map<Register, struct ImageInfo *> regMap;
     map<uint32_t, struct ImageInfo *> indexMap;
-    map<uint32_t, Register> infoRegMap;
+    map<uint16_t, Register> infoRegMap;
     GBE_CLASS(ImageSet);
   };
 } /* namespace ir */
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
new file mode 100644
index 0000000..3a6b9a2
--- /dev/null
+++ b/backend/src/ir/immediate.cpp
@@ -0,0 +1,263 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
#include <math.h>
#include <stdlib.h>
#include "immediate.hpp"
+
+using namespace gbe;
+using namespace ir;
+
/* All binary operators below require two scalar (elemNum == 1) operands of
 * the same non-bool type; constant folding never mixes operand types. */
#define SCALAR_SAME_TYPE_ASSERT()                           \
      GBE_ASSERT(this->getType() == right.getType()       && \
                 this->getElemNum() == right.getElemNum() && \
                 this->getElemNum() == 1                  && \
                 this->getType() != TYPE_BOOL);

/* Emit Immediate::operator OP for every arithmetic type (integers and
 * floats). On an invalid type the assert fires and, in release builds,
 * control deliberately falls through to the TYPE_S8 case.
 * NOTE(review): sub-int operands promote to int, so the resulting
 * Immediate may be tagged S32 rather than the narrow type — confirm
 * callers normalize the type afterwards. */
#define DECLAR_BINARY_ALL_TYPE_OP(OP) \
    Immediate Immediate::operator OP (const Immediate &right) const { \
      SCALAR_SAME_TYPE_ASSERT(); \
      switch (this->getType()) { \
        default: \
          GBE_ASSERT(0); \
        case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
        case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
        case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
        case TYPE_U16:    return Immediate(*this->data.u16 OP *right.data.u16); \
        case TYPE_S32:    return Immediate(*this->data.s32 OP *right.data.s32); \
        case TYPE_U32:    return Immediate(*this->data.u32 OP *right.data.u32); \
        case TYPE_S64:    return Immediate(*this->data.s64 OP *right.data.s64); \
        case TYPE_U64:    return Immediate(*this->data.u64 OP *right.data.u64); \
        case TYPE_FLOAT:  return Immediate(*this->data.f32 OP *right.data.f32); \
        case TYPE_DOUBLE: return Immediate(*this->data.f64 OP *right.data.f64); \
      }\
      return *this;\
    }

    DECLAR_BINARY_ALL_TYPE_OP(+)
    DECLAR_BINARY_ALL_TYPE_OP(-)
    DECLAR_BINARY_ALL_TYPE_OP(*)
    DECLAR_BINARY_ALL_TYPE_OP(/)

#undef DECLAR_BINARY_ALL_TYPE_OP
+
/* Emit Immediate::operator OP for integer types only (%, &, |, ^ have no
 * float form here; float/double remainder is folded via fmod in the
 * binary-op constructor instead). Default falls through to TYPE_S8 in
 * release builds, mirroring DECLAR_BINARY_ALL_TYPE_OP. */
#define DECLAR_BINARY_INT_TYPE_OP(OP) \
    Immediate Immediate::operator OP (const Immediate &right) const { \
      SCALAR_SAME_TYPE_ASSERT(); \
      switch (this->getType()) { \
        default: \
          GBE_ASSERT(0); \
        case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
        case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
        case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
        case TYPE_U16:    return Immediate(*this->data.u16 OP *right.data.u16); \
        case TYPE_S32:    return Immediate(*this->data.s32 OP *right.data.s32); \
        case TYPE_U32:    return Immediate(*this->data.u32 OP *right.data.u32); \
        case TYPE_S64:    return Immediate(*this->data.s64 OP *right.data.s64); \
        case TYPE_U64:    return Immediate(*this->data.u64 OP *right.data.u64); \
      }\
      return *this;\
    }
    DECLAR_BINARY_INT_TYPE_OP(%)
    DECLAR_BINARY_INT_TYPE_OP(&)
    DECLAR_BINARY_INT_TYPE_OP(|)
    DECLAR_BINARY_INT_TYPE_OP(^)
#undef DECLAR_BINARY_INT_TYPE_OP
+
+
/* Emit the shift operators (>> keeps the operand's own signedness, i.e.
 * arithmetic shift on signed types). The shift amount comes from the right
 * operand as a plain integer; a zero shift returns *this unchanged.
 * NOTE(review): left-shifting a negative value, or shifting by >= the
 * promoted width, is undefined behavior in C++ — assumed not to occur in
 * the folded IR; confirm upstream validation. */
#define DECLAR_BINARY_ASHIFT_OP(OP) \
    Immediate Immediate::operator OP (const Immediate &right) const { \
      GBE_ASSERT(this->getType() > TYPE_BOOL && this->getType() <= TYPE_U64); \
      int32_t shift = right.getIntegerValue(); \
      if (shift == 0) \
        return *this; \
      else \
        switch (this->getType()) { \
          default: \
            GBE_ASSERT(0); \
          case TYPE_S8:  return Immediate((*this->data.s8 OP shift));  \
          case TYPE_U8:  return Immediate((*this->data.u8 OP shift));  \
          case TYPE_S16: return Immediate((*this->data.s16 OP shift)); \
          case TYPE_U16: return Immediate((*this->data.u16 OP shift)); \
          case TYPE_S32: return Immediate((*this->data.s32 OP shift)); \
          case TYPE_U32: return Immediate((*this->data.u32 OP shift)); \
          case TYPE_S64: return Immediate((*this->data.s64 OP shift)); \
          case TYPE_U64: return Immediate((*this->data.u64 OP shift)); \
        } \
    }

    DECLAR_BINARY_ASHIFT_OP(>>)
    DECLAR_BINARY_ASHIFT_OP(<<)

#undef DECLAR_BINARY_ASHIFT_OP
+    Immediate Immediate::lshr (const Immediate &left, const Immediate &right) {
+      GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64);
+      int32_t shift = right.getIntegerValue();
+      if (shift == 0)
+        return left;
+      else
+        switch (left.getType()) {
+          default:
+            GBE_ASSERT(0);
+          case TYPE_S8:  
+          case TYPE_U8:  return Immediate((*left.data.u8 >> shift));
+          case TYPE_S16: 
+          case TYPE_U16: return Immediate((*left.data.u16 >> shift));
+          case TYPE_S32: 
+          case TYPE_U32: return Immediate((*left.data.u32 >> shift));
+          case TYPE_S64: 
+          case TYPE_U64: return Immediate((*left.data.u64 >> shift));
+        }
+    }
+
+    Immediate::Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType) {
+      switch (op) {
+        default:
+          GBE_ASSERT(0 && "unsupported imm op\n");
+        case IMM_ADD: *this = left + right; break;
+        case IMM_SUB: *this = left - right; break;
+        case IMM_MUL: *this = left * right; break;
+        case IMM_DIV: *this = left / right; break;
+        case IMM_AND: *this = left & right; break;
+        case IMM_OR:  *this = left | right; break;
+        case IMM_XOR: *this = left ^ right; break;
+        case IMM_REM:
+        {
+          if (left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64)
+            *this = left % right;
+          else if (left.getType() == TYPE_FLOAT && right.getType() == TYPE_FLOAT) {
+            *this = Immediate(left);
+            *this->data.f32 = fmodf(left.getFloatValue(), right.getFloatValue());
+          }
+          else if (left.getType() == TYPE_DOUBLE && right.getType() == TYPE_DOUBLE) {
+            *this = Immediate(left);
+            *this->data.f64 += fmod(left.getDoubleValue(), right.getDoubleValue());
+          }
+          else
+            GBE_ASSERT(0);
+          break;
+        }
+        case IMM_LSHR:
+        {
+          if (left.getElemNum() == 1)
+            lshr(left, right);
+          else {
+            GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+            GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+            copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+        case IMM_ASHR:
+        {
+          if (left.getElemNum() == 1)
+            *this = left >> right;
+          else {
+            GBE_ASSERT(0 && "Doesn't support ashr on array constant.");
+            copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+        case IMM_SHL:
+        {
+          if (left.getElemNum() == 1)
+            *this = left << right;
+          else {
+            GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+            GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+            copy(left, -right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+      }
+      // If the dst type is large int, we will not change the imm type to large int.
+      GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT);
+    }
+
+    Immediate::Immediate(const vector<const Immediate*> immVec) {
+      if (immVec.size() == 1) {
+        *this = *immVec[0];
+      } else if (!(immVec[0]->isCompType()) && immVec[0]->elemNum == 1) {
+        this->type = immVec[0]->type;
+        this->elemNum = immVec.size();
+        if (immVec[0]->getTypeSize() * immVec.size() < 8)
+          this->data.p = &this->defaultData;
+        else
+          this->data.p = malloc(immVec[0]->getTypeSize() * immVec.size());
+        uint8_t *p = (uint8_t*)this->data.p;
+        for(uint32_t i = 0; i < immVec.size(); i++) {
+          GBE_ASSERT(immVec[i]->type == immVec[0]->type && immVec[i]->elemNum == 1);
+          memcpy(p, immVec[i]->data.p, immVec[i]->getTypeSize());
+          p += immVec[i]->getTypeSize();
+        }
+      } else {
+        this->type = IMM_TYPE_COMP;
+        if (immVec.size() * sizeof(Immediate*) < 8)
+          this->data.p = &this->defaultData;
+        else
+          this->data.p = malloc(immVec.size() * sizeof(Immediate*));
+        this->elemNum = immVec.size();
+        for(uint32_t i = 0; i < immVec.size(); i++)
+          this->data.immVec[i] = immVec[i];
+      }
+    }
+
+
+    // operator = and copy() are only called from constructor functions
+    // which this never hold a memory pointer, we don't need to bother
+    // to check the data.p before assignment.
+    Immediate & Immediate::operator= (const Immediate & other) {
+      if (this != &other) {
+        type = other.type;
+        elemNum = other.elemNum;
+        if (other.data.p != &other.defaultData) {
+          data.p = malloc(other.elemNum * other.getTypeSize());
+          memcpy(data.p, other.data.p, other.elemNum * other.getTypeSize());
+        }
+        else {
+          defaultData = other.defaultData;
+          data.p = &defaultData;
+        }
+      }
+      return *this;
+    }
+
    /*! Copy (a slice of) another immediate into this one.
     *  \param other  source immediate
     *  \param offset element offset into the source; a negative value shifts
     *                the copy window inside the DESTINATION instead (used by
     *                whole-element SHL on vector constants)
     *  \param num    number of destination elements
     */
    void Immediate::copy(const Immediate &other, int32_t offset, uint32_t num) {
      if (this != &other) {
        if (other.type == IMM_TYPE_COMP && num == 1) {
          // Extracting a single element out of a compound immediate.
          GBE_ASSERT(offset >= 0 && offset <= (int32_t)other.elemNum);
          *this = *other.data.immVec[offset];
          return;
        }
        type = other.type;
        elemNum = num;
        // Payloads below 8 bytes reuse the inline buffer, others go on the heap.
        if (num * other.getTypeSize() < 8)
          data.p = &defaultData;
        else
          data.p = malloc(num * other.getTypeSize());
        uint8_t* datap = (uint8_t*)data.p;
        // Zero-fill first: elements outside the copied window must read as 0.
        memset(datap, 0, num * other.getTypeSize());
        if (offset < 0) {
          // Negative offset: advance the destination pointer and shrink the
          // element count so the tail of the window is not written.
          datap += (-offset) * other.getTypeSize();
          num -= num < (uint32_t)(-offset) ? num : (-offset);
          offset = 0;
        } else if (offset > 0 && num > 1) {
          // Positive offset: skip leading source elements.
          GBE_ASSERT((int32_t)num > offset);
          num -= offset;
        }
        memcpy(datap, (uint8_t*)other.data.p + offset * other.getTypeSize(),
               num * other.getTypeSize());
      }
    }
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 67dd03f..6a5c819 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -18,30 +18,97 @@
  */
 
 /**
- * \file value.hpp
+ * \file Immediate.hpp
  *
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 #ifndef __GBE_IR_IMMEDIATE_HPP__
 #define __GBE_IR_IMMEDIATE_HPP__
 
+#include <string.h>
 #include "ir/type.hpp"
 #include "sys/platform.hpp"
 
 namespace gbe {
 namespace ir {
 
+  typedef enum {
+    IMM_TRUNC = 0,
+    IMM_BITCAST,
+    IMM_ADD,
+    IMM_SUB,
+    IMM_MUL,
+    IMM_DIV,
+    IMM_REM,
+    IMM_SHL,
+    IMM_ASHR,
+    IMM_LSHR,
+    IMM_AND,
+    IMM_OR,
+    IMM_XOR
+  } ImmOpCode;
+
+  typedef enum {
+    IMM_TYPE_BOOL = TYPE_BOOL,
+    IMM_TYPE_S8 = TYPE_S8,
+    IMM_TYPE_U8 = TYPE_U8,
+    IMM_TYPE_S16 = TYPE_S16,
+    IMM_TYPE_U16 = TYPE_U16,
+    IMM_TYPE_S32 = TYPE_S32,
+    IMM_TYPE_U32 = TYPE_U32,
+    IMM_TYPE_S64 = TYPE_S64,
+    IMM_TYPE_U64 = TYPE_U64,
+    IMM_TYPE_FLOAT = TYPE_FLOAT,
+    IMM_TYPE_DOUBLE = TYPE_DOUBLE,
+    IMM_TYPE_COMP             // compond immediate which consist many immediates.
+  } ImmType;
+
   /*! The value as stored in the instruction */
   class Immediate
   {
   public:
-    INLINE Immediate(void) {}
-#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE)  \
-    Immediate(TYPE FIELD) {                     \
-      this->type = IR_TYPE;                     \
-      this->data.u64 = 0llu;                    \
-      this->data.FIELD = FIELD;                 \
+    INLINE Immediate(void) { }
+
+    INLINE Type getType(void) const {
+      return (Type)type;
+    }
+
+    INLINE bool isCompType(void) const {
+      return type == IMM_TYPE_COMP;
+    }
+
+    INLINE uint32_t getElemNum(void) const {
+      return elemNum;
+    }
+
+    uint32_t getTypeSize(void) const {
+      switch(type) {
+        default:
+          GBE_ASSERT(0 && "Invalid immeidate type.\n");
+        case TYPE_BOOL:
+        case TYPE_S8:
+        case TYPE_U8:   return 1;
+        case TYPE_S16:
+        case TYPE_U16:  return 2;
+        case TYPE_FLOAT:
+        case TYPE_S32:
+        case TYPE_U32:  return 4;
+        case TYPE_DOUBLE:
+        case TYPE_S64:
+        case TYPE_U64:  return 8;
+        case IMM_TYPE_COMP: return sizeof(Immediate*);
+      }
     }
+
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE)                  \
+    Immediate(TYPE FIELD) {                                     \
+      this->type = (ImmType)IR_TYPE;                            \
+      this->elemNum = 1;                                        \
+      this->data.p = &defaultData;                              \
+      defaultData = 0ull;                                       \
+      *this->data.FIELD = FIELD;                                \
+    }
+
     DECL_CONSTRUCTOR(bool, b, TYPE_BOOL)
     DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8)
     DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8)
@@ -54,28 +121,137 @@ namespace ir {
     DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT)
     DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE)
 #undef DECL_CONSTRUCTOR
+
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE, ELEMNUM)         \
+    Immediate(TYPE *FIELD, uint32_t ELEMNUM) {                  \
+      this->type = (ImmType)IR_TYPE;                            \
+      this->elemNum = ELEMNUM;                                  \
+      if (elemNum * ELEMNUM > 8)                                \
+        this->data.p = malloc(ELEMNUM * getTypeSize());         \
+      else                                                      \
+        this->data.p = &defaultData;                            \
+      defaultData = 0ull;                                       \
+      memcpy(this->data.FIELD, FIELD, ELEMNUM * getTypeSize()); \
+    }
+
+    DECL_CONSTRUCTOR(bool, b, TYPE_BOOL, elemNum)
+    DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8, elemNum)
+    DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8, elemNum)
+    DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16, elemNum)
+    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16, elemNum)
+    DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32, elemNum)
+    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32, elemNum)
+    DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64, elemNum)
+    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64, elemNum)
+    DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT, elemNum)
+    DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
+#undef DECL_CONSTRUCTOR
+
+    Immediate(const vector<const Immediate*> immVec);
+
    /*! Widen the scalar integer payload to a signed 64-bit value.
     *  On an invalid type the assert fires and, in release builds, control
     *  deliberately falls through to the TYPE_BOOL case. */
    INLINE int64_t getIntegerValue(void) const {
      switch (type) {
        default:
          GBE_ASSERT(0 && "Invalid immediate type.\n");
        case TYPE_BOOL: return *data.b;
        case TYPE_S8:   return *data.s8;
        case TYPE_U8:   return *data.u8;
        case TYPE_S16:  return *data.s16;
        case TYPE_U16:  return *data.u16;
        case TYPE_S32:  return *data.s32;
        case TYPE_U32:  return *data.u32;
        case TYPE_S64:  return *data.s64;
        case TYPE_U64:  return *data.u64;
      }
    }
+
    /*! Typed access to a float immediate. */
    INLINE float getFloatValue(void) const {
      GBE_ASSERT(type == IMM_TYPE_FLOAT);
      return *data.f32;
    }

    /*! Reinterpret the low 32 bits of the payload as a float (a bitcast,
     *  not a numeric conversion) — also accepted on 32-bit integer types. */
    INLINE float asFloatValue(void) const {
      GBE_ASSERT(type == IMM_TYPE_FLOAT || type == IMM_TYPE_U32 || type == IMM_TYPE_S32);
      return *data.f32;
    }

    /*! Raw 64-bit view of a scalar payload, whatever its type.
     *  NOTE(review): reads 8 bytes even for narrower types — presumably safe
     *  because scalar payloads live in the 8-byte inline buffer; confirm for
     *  heap-allocated payloads. */
    INLINE int64_t asIntegerValue(void) const {
      GBE_ASSERT(elemNum == 1);
      return *data.s64;
    }

    /*! Typed access to a double immediate. */
    INLINE double getDoubleValue(void) const {
      GBE_ASSERT(type == IMM_TYPE_DOUBLE);
      return *data.f64;
    }

    /*! Deep-copying copy constructor; delegates to operator=, which is safe
     *  on the uninitialized object because it never frees the old payload. */
    INLINE Immediate(const Immediate & other) {
      *this = other;
    }

    /*! Unary constant folding: truncate to one element, or bitcast.
     *  NOTE(review): any other opcode leaves the object uninitialized —
     *  callers are assumed to pass only IMM_TRUNC or IMM_BITCAST. */
    Immediate(ImmOpCode op, const Immediate &other, Type dstType) {
      if (op == IMM_TRUNC) {
        copy(other, 0, 1);
      } else if (op == IMM_BITCAST) {
        *this = other;
        type = (ImmType)dstType;
      }
    }

    /*! Binary constant folding; defined in immediate.cpp. */
    Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType);

    /*! Release the payload only when it was heap-allocated (the inline
     *  defaultData buffer must never be freed). */
    ~Immediate() {
      if (data.p != &defaultData) {
        free(data.p);
        data.p = NULL;
      }
    }
+
+  private:
     union {
-      bool b;
-      int8_t s8;
-      uint8_t u8;
-      int16_t s16;
-      uint16_t u16;
-      int32_t s32;
-      uint32_t u32;
-      int64_t s64;
-      uint64_t u64;
-      float f32;
-      double f64;
+      bool *b;
+      int8_t *s8;
+      uint8_t *u8;
+      int16_t *s16;
+      uint16_t *u16;
+      int32_t *s32;
+      uint32_t *u32;
+      int64_t *s64;
+      uint64_t *u64;
+      float *f32;
+      double *f64;
+      const Immediate *immVec[];
+      void *p;
     } data;     //!< Value to store
-    Type type;  //!< Type of the value
+    ImmType type;  //!< Type of the value
+    uint32_t elemNum; //!< vector imm data type
+    uint64_t defaultData;
+    Immediate & operator= (const Immediate &);
+    Immediate operator+ (const Immediate &) const; 
+    Immediate operator- (const Immediate &) const; 
+    Immediate operator* (const Immediate &) const; 
+    Immediate operator/ (const Immediate &) const; 
+    Immediate operator% (const Immediate &) const; 
+    Immediate operator& (const Immediate &) const; 
+    Immediate operator| (const Immediate &) const; 
+    Immediate operator^ (const Immediate &) const; 
+    Immediate operator<< (const Immediate &) const; 
+    Immediate operator>> (const Immediate &) const; 
+    static Immediate lshr (const Immediate &left, const Immediate &right);
+
+
+    void copy(const Immediate &other, int32_t offset, uint32_t num);
     GBE_CLASS(Immediate);
   };
 
   /*! Compare two immediates */
   INLINE bool operator< (const Immediate &imm0, const Immediate &imm1) {
-    if (imm0.type != imm1.type)
-      return uint32_t(imm0.type) < uint32_t(imm1.type);
-    return imm0.data.u64 < imm1.data.u64;
+    if (imm0.getType() != imm1.getType())
+      return uint32_t(imm0.getType()) < uint32_t(imm1.getType());
+    else if (imm0.getType() == TYPE_FLOAT || imm0.getType() == TYPE_DOUBLE)
+      return imm0.asIntegerValue() < imm1.asIntegerValue();
+    else
+      return imm0.getIntegerValue() < imm1.getIntegerValue();
   }
 
   /*! A value is stored in a per-function vector. This is the index to it */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 95bcff5..5fc1535 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -314,6 +314,7 @@ namespace ir {
       AtomicInstruction(AtomicOps atomicOp,
                          Register dst,
                          AddressSpace addrSpace,
+                         BTI bti,
                          Tuple src)
       {
         this->opcode = OP_ATOMIC;
@@ -321,6 +322,7 @@ namespace ir {
         this->dst[0] = dst;
         this->src = src;
         this->addrSpace = addrSpace;
+        this->bti = bti;
         srcNum = 2;
         if((atomicOp == ATOMIC_OP_INC) ||
           (atomicOp == ATOMIC_OP_DEC))
@@ -329,12 +331,14 @@ namespace ir {
           srcNum = 3;
       }
       INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register dst[1];
       Tuple src;
       AddressSpace addrSpace; //!< Address space
+      BTI bti;               //!< bti
       uint8_t srcNum:2;     //!<Source Number
       AtomicOps atomicOp:6;     //!<Source Number
     };
@@ -400,7 +404,8 @@ namespace ir {
                       Register offset,
                       AddressSpace addrSpace,
                       uint32_t valueNum,
-                      bool dwAligned)
+                      bool dwAligned,
+                      BTI bti)
       {
         GBE_ASSERT(valueNum < 128);
         this->opcode = OP_LOAD;
@@ -410,6 +415,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
       }
       INLINE Register getDst(const Function &fn, uint32_t ID) const {
         GBE_ASSERTM(ID < valueNum, "Out-of-bound source register");
@@ -423,6 +429,7 @@ namespace ir {
       INLINE Type getValueType(void) const { return type; }
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
@@ -431,6 +438,7 @@ namespace ir {
       Register offset;        //!< Alias to make it similar to store
       Tuple values;           //!< Values to load
       AddressSpace addrSpace; //!< Where to load
+      BTI bti;
       uint8_t valueNum:7;     //!< Number of values to load
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
     };
@@ -444,7 +452,8 @@ namespace ir {
                        Register offset,
                        AddressSpace addrSpace,
                        uint32_t valueNum,
-                       bool dwAligned)
+                       bool dwAligned,
+                       BTI bti)
       {
         GBE_ASSERT(valueNum < 255);
         this->opcode = OP_STORE;
@@ -454,6 +463,7 @@ namespace ir {
         this->addrSpace = addrSpace;
         this->valueNum = valueNum;
         this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
       }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
         GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
@@ -473,6 +483,7 @@ namespace ir {
       INLINE uint32_t getValueNum(void) const { return valueNum; }
       INLINE Type getValueType(void) const { return type; }
       INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       INLINE bool isAligned(void) const { return !!dwAligned; }
@@ -480,6 +491,7 @@ namespace ir {
       Register offset;        //!< First source is the offset where to store
       Tuple values;           //!< Values to store
       AddressSpace addrSpace; //!< Where to store
+      BTI bti;                //!< Which btis need access
       uint8_t valueNum:7;     //!< Number of values to store
       uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
       Register dst[0];        //!< No destination
@@ -491,7 +503,7 @@ namespace ir {
       public TupleDstPolicy<SampleInstruction>
     {
     public:
-      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D) {
+      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
         this->opcode = OP_SAMPLE;
         this->dst = dstTuple;
         this->src = srcTuple;
@@ -500,7 +512,6 @@ namespace ir {
         this->samplerIdx = sampler;
         this->imageIdx = imageIdx;
         this->samplerOffset = samplerOffset;
-        this->is3DRead = is3D;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
@@ -525,12 +536,10 @@ namespace ir {
       INLINE Type getDstType(void) const { return this->dstIsFloat ? TYPE_FLOAT : TYPE_U32; }
       INLINE const uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
       INLINE const uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
-      INLINE const bool is3D(void) const { return !!this->is3DRead; }
       uint8_t srcIsFloat:1;
       uint8_t dstIsFloat:1;
       uint8_t samplerIdx:4;
-      uint8_t samplerOffset:1;
-      uint8_t is3DRead:1;
+      uint8_t samplerOffset:2;
       uint8_t imageIdx;
       static const uint32_t srcNum = 3;
       static const uint32_t dstNum = 4;
@@ -543,13 +552,12 @@ namespace ir {
     {
     public:
 
-      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType, bool is3D) {
+      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType) {
         this->opcode = OP_TYPED_WRITE;
         this->src = srcTuple;
         this->coordType = coordType;
         this->srcType = srcType;
         this->imageIdx = imageIdx;
-        this->is3DWrite = is3D;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
@@ -569,9 +577,6 @@ namespace ir {
       uint8_t srcType;
       uint8_t coordType;
       uint8_t imageIdx;
-      uint8_t is3DWrite;
-
-      INLINE const bool is3D(void) const { return !!this->is3DWrite; }
 
       INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE Type getSrcType(void) const { return (Type)this->srcType; }
@@ -581,39 +586,6 @@ namespace ir {
       Register dst[0];               //!< No dest register
     };
 
-    class ALIGNED_INSTRUCTION GetSamplerInfoInstruction :
-      public BasePolicy,
-      public NSrcPolicy<GetSamplerInfoInstruction, 1>,
-      public NDstPolicy<GetSamplerInfoInstruction, 1>
-    {
-    public:
-      GetSamplerInfoInstruction( Register dst,
-                                 Register samplerInfo,
-                                 uint8_t samplerIdx)
-      {
-        this->opcode = OP_GET_SAMPLER_INFO;
-        this->dst[0] = dst;
-        this->src[0] = samplerInfo;
-        this->samplerIdx = samplerIdx;
-      }
-
-      INLINE bool wellFormed(const Function &fn, std::string &why) const;
-      INLINE void out(std::ostream &out, const Function &fn) const {
-        this->outOpcode(out);
-        out  << " %" << this->getDst(fn, 0)
-             << " %" << this->getSrc(fn, 0)
-             << " sampler idx " << (int)this->samplerIdx;
-      }
-      INLINE const uint8_t getSamplerIndex() const {
-        return this->samplerIdx;
-      }
-
-      Register src[1];                  //!< sampler to get info
-      Register dst[1];                  //!< return value
-      uint8_t samplerIdx;               //!< sampler slot index.
-      static const uint32_t dstNum = 1;
-    };
-
     class ALIGNED_INSTRUCTION GetImageInfoInstruction :
       public BasePolicy,
       public NSrcPolicy<GetImageInfoInstruction, 1>,
@@ -971,7 +943,6 @@ namespace ir {
         if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
           return false;
       }
-      CHECK_TYPE(insn.type, allButBool);
       return true;
     }
 
@@ -1007,8 +978,6 @@ namespace ir {
     { return true; }
     INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
-    INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const
-    { return true; }
 
 
     // Ensure that types and register family match
@@ -1018,7 +987,7 @@ namespace ir {
         whyNot = "Out-of-bound immediate value index";
         return false;
       }
-      const ir::Type immType = fn.getImmediate(immediateIndex).type;
+      const ir::Type immType = fn.getImmediate(immediateIndex).getType();
       if (UNLIKELY(type != immType)) {
         whyNot = "Inconsistant type for the immediate value to load";
         return false;
@@ -1112,6 +1081,9 @@ namespace ir {
       out << " {" << "%" << this->getSrc(fn, 0) << "}";
       for (uint32_t i = 1; i < srcNum; ++i)
         out << " %" << this->getSrc(fn, i);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
 
@@ -1146,6 +1118,9 @@ namespace ir {
         out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
       out << "}";
       out << " %" << this->getSrc(fn, 0);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1155,6 +1130,9 @@ namespace ir {
       for (uint32_t i = 0; i < valueNum; ++i)
         out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
       out << "}";
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
     }
 
     INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1226,7 +1204,7 @@ namespace ir {
   return HelperIntrospection<CLASS, RefClass>::value == 1;
 
 #define START_INTROSPECTION(CLASS) \
-  static_assert(sizeof(internal::CLASS) == sizeof(uint64_t), \
+  static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
                 "Bad instruction size"); \
   static_assert(offsetof(internal::CLASS, opcode) == 0, \
                 "Bad opcode offset"); \
@@ -1288,10 +1266,6 @@ START_INTROSPECTION(GetImageInfoInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(GetImageInfoInstruction)
 
-START_INTROSPECTION(GetSamplerInfoInstruction)
-#include "ir/instruction.hxx"
-END_INTROSPECTION(GetSamplerInfoInstruction)
-
 START_INTROSPECTION(LoadImmInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LoadImmInstruction)
@@ -1474,14 +1448,17 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
 DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
 DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
 DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
@@ -1491,16 +1468,13 @@ DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
 DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
-DECL_MEM_FN(SampleInstruction, const bool, is3D(void), is3D())
 DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerOffset(void), getSamplerOffset())
 DECL_MEM_FN(SampleInstruction, const uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
 DECL_MEM_FN(TypedWriteInstruction, const uint8_t, getImageIndex(void), getImageIndex())
-DECL_MEM_FN(TypedWriteInstruction, const bool, is3D(void), is3D())
 DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
 DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImageIndex())
-DECL_MEM_FN(GetSamplerInfoInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
 
 #undef DECL_MEM_FN
 
@@ -1628,8 +1602,8 @@ DECL_MEM_FN(GetSamplerInfoInstruction, const uint8_t, getSamplerIndex(void), get
   }
 
   // For all unary functions with given opcode
-  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Tuple src) {
-    return internal::AtomicInstruction(atomicOp, dst, space, src).convert();
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
   }
 
   // BRA
@@ -1657,9 +1631,10 @@ DECL_MEM_FN(GetSamplerInfoInstruction, const uint8_t, getSamplerIndex(void), get
                    Register offset, \
                    AddressSpace space, \
                    uint32_t valueNum, \
-                   bool dwAligned) \
+                   bool dwAligned, \
+                   BTI bti) \
   { \
-    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned).convert(); \
+    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
   }
 
   DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
@@ -1678,22 +1653,18 @@ DECL_MEM_FN(GetSamplerInfoInstruction, const uint8_t, getSamplerIndex(void), get
   }
 
   // SAMPLE
-  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D) {
-    return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset, is3D).convert();
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+    return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
   }
 
-  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType, bool is3D) {
-    return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType, is3D).convert();
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType) {
+    return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType).convert();
   }
 
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg) {
     return internal::GetImageInfoInstruction(infoType, dst, imageIndex, infoReg).convert();
   }
 
-  Instruction GET_SAMPLER_INFO(Register dst, Register samplerInfo, uint8_t samplerIdx) {
-    return internal::GetSamplerInfoInstruction(dst, samplerInfo, samplerIdx).convert();
-  }
-
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
     switch (insn.getOpcode()) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 8e2cd11..a75a441 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -31,9 +31,18 @@
 #include "sys/intrusive_list.hpp"
 
 #include <ostream>
+#define MAX_MIXED_POINTER 4
 
 namespace gbe {
 namespace ir {
+  struct BTI {
+    uint8_t bti[MAX_MIXED_POINTER];
+    uint8_t count;
+    BTI() : count(0) {
+      memset(bti, 0, MAX_MIXED_POINTER);
+    }
+    ~BTI() {}
+  };
 
   /*! All opcodes */
   enum Opcode : uint8_t {
@@ -95,7 +104,7 @@ namespace ir {
   ///////////////////////////////////////////////////////////////////////////
 
   /*! Stores instruction internal data and opcode */
-  class ALIGNED(sizeof(uint64_t)) InstructionBase
+  class ALIGNED(sizeof(uint64_t)*2) InstructionBase
   {
   public:
     /*! Initialize the instruction from a 8 bytes stream */
@@ -109,7 +118,7 @@ namespace ir {
     /*! Get the instruction opcode */
     INLINE Opcode getOpcode(void) const { return opcode; }
   protected:
-    enum { opaqueSize = sizeof(uint64_t)-sizeof(uint8_t) };
+    enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
     Opcode opcode;               //!< Idendifies the instruction
     char opaque[opaqueSize];     //!< Remainder of it
     GBE_CLASS(InstructionBase);  //!< Use internal allocators
@@ -178,7 +187,8 @@ namespace ir {
     template <typename T> INLINE bool isMemberOf(void) const {
       return T::isClassOf(*this);
     }
-    static const uint32_t MAX_SRC_NUM = 16;
+    /*! max_src for store instruction (vec16 + addr) */
+    static const uint32_t MAX_SRC_NUM = 17;
     static const uint32_t MAX_DST_NUM = 16;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
@@ -272,6 +282,7 @@ namespace ir {
     static const uint32_t addressIndex = 0;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
+    BTI getBTI(void) const;
     /*! Return the atomic function code */
     AtomicOps getAtomicOpcode(void) const;
     /*! Return the register that contains the addresses */
@@ -291,6 +302,7 @@ namespace ir {
     Type getValueType(void) const;
     /*! Give the number of values the instruction is storing (srcNum-1) */
     uint32_t getValueNum(void) const;
+    BTI getBTI(void) const;
     /*! Address space that is manipulated here */
     AddressSpace getAddressSpace(void) const;
     /*! DWORD aligned means untyped read for Gen. That is what matters */
@@ -322,6 +334,7 @@ namespace ir {
     bool isAligned(void) const;
     /*! Return the register that contains the addresses */
     INLINE Register getAddress(void) const { return this->getSrc(0u); }
+    BTI getBTI(void) const;
     /*! Return the register that contain value valueID */
     INLINE Register getValue(uint32_t valueID) const {
       return this->getDst(valueID);
@@ -350,7 +363,6 @@ namespace ir {
   class TypedWriteInstruction : public Instruction {
   public:
     /*! Return true if the given instruction is an instance of this class */
-    const bool is3D() const;
     static bool isClassOf(const Instruction &insn);
     const uint8_t getImageIndex() const;
     Type getSrcType(void) const;
@@ -360,7 +372,6 @@ namespace ir {
   /*! Load texels from a texture */
   class SampleInstruction : public Instruction {
   public:
-    const bool is3D() const;
     const uint8_t getImageIndex() const;
     const uint8_t getSamplerIndex(void) const;
     const uint8_t getSamplerOffset(void) const;
@@ -376,7 +387,7 @@ namespace ir {
      uint8_t index; /*! the allocated image index */
      uint8_t  type;  /*! the information type */
     };
-    uint32_t data;
+    uint16_t data;
   } ImageInfoKey;
 
   /*! Get image information */
@@ -411,15 +422,6 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
-  /*! Get image information */
-  class GetSamplerInfoInstruction : public Instruction {
-  public:
-
-    const uint8_t getSamplerIndex(void) const;
-    /*! Return true if the given instruction is an instance of this class */
-    static bool isClassOf(const Instruction &insn);
-  };
-
   /*! Branch instruction is the unified way to branch (with or without
    *  predicate)
    */
@@ -575,6 +577,10 @@ namespace ir {
   Instruction RCP(Type type, Register dst, Register src);
   /*! abs.type dst src */
   Instruction ABS(Type type, Register dst, Register src);
+  /*! simd_all.type dst src */
+  Instruction SIMD_ALL(Type type, Register dst, Register src);
+  /*! simd_any.type dst src */
+  Instruction SIMD_ANY(Type type, Register dst, Register src);
   /*! log.type dst src */
   Instruction LOG(Type type, Register dst, Register src);
   /*! exp.type dst src */
@@ -650,7 +656,7 @@ namespace ir {
   /*! F32TO16.{dstType <- srcType} dst src */
   Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
-  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
@@ -658,21 +664,19 @@ namespace ir {
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
   /*! store.type.space offset {src1,...,src_valueNum} value */
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
   Instruction SYNC(uint32_t parameters);
   /*! typed write */
-  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType, bool is3D);
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType);
   /*! sample textures */
-  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D);
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
   /*! get image information , such as width/height/depth/... */
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
-  /*! get sampler information  */
-  Instruction GET_SAMPLER_INFO(Register dst, Register samplerInfo, uint8_t index);
   /*! label labelIndex */
   Instruction LABEL(LabelIndex labelIndex);
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index bb5229a..587517b 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -38,6 +38,8 @@ DECL_INSN(RNDD, UnaryInstruction)
 DECL_INSN(RNDE, UnaryInstruction)
 DECL_INSN(RNDU, UnaryInstruction)
 DECL_INSN(RNDZ, UnaryInstruction)
+DECL_INSN(SIMD_ANY, UnaryInstruction)
+DECL_INSN(SIMD_ALL, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
@@ -78,7 +80,6 @@ DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
-DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
 DECL_INSN(I64_MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 724d5c3..afed476 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -38,26 +38,57 @@ namespace ir {
       if (op == OP_RET) {
         workSet.insert(info);
         info->liveOut.insert(ocl::retVal);
-      } else if (op == OP_BRA) {
-        // If this is a backward jump, put it to the extra work list.
-        if (((BranchInstruction*)lastInsn)->getLabelIndex() < bb.getLabelIndex())
-          extraWorkSet.insert(info);
       }
     });
     // Now with iterative analysis, we compute liveout and livein sets
     this->computeLiveInOut();
-    for (auto it : extraWorkSet) {
-      for (auto reg : it->liveOut) {
-        it->extraLiveIn.insert(reg);
-      }
-    }
-    this->computeExtraLiveInOut();
+    // extend register (def in loop, use out-of-loop) liveness to the whole loop
+    set<Register> extentRegs;
+    this->computeExtraLiveInOut(extentRegs);
+    // Analyze uniform values. The extentRegs set contains all the values that are
+    // defined in a loop and used out-of-loop, and which therefore cannot be uniform.
+    // The reason is that when the loop is re-entered, it may activate different lanes,
+    // so re-entering many times may leave different values in different lanes.
+    this->analyzeUniform(&extentRegs);
   }
 
   Liveness::~Liveness(void) {
     for (auto &pair : liveness) GBE_SAFE_DELETE(pair.second);
   }
 
+  void Liveness::analyzeUniform(set<Register> *extentRegs) {
+    fn.foreachBlock([this, extentRegs](const BasicBlock &bb) {
+      const_cast<BasicBlock&>(bb).foreach([this, extentRegs](const Instruction &insn) {
+        const uint32_t srcNum = insn.getSrcNum();
+        const uint32_t dstNum = insn.getDstNum();
+        bool uniform = true;
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const Register reg = insn.getSrc(srcID);
+          if (!fn.isUniformRegister(reg))
+            uniform = false;
+        }
+        // A destination is a killed value
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const Register reg = insn.getDst(dstID);
+          int opCode = insn.getOpcode();
+          // FIXME, ADDSAT and uniform vector should be supported.
+          if (uniform &&
+              fn.getRegisterFamily(reg) != ir::FAMILY_QWORD &&
+              !insn.getParent()->definedPhiRegs.contains(reg) &&
+              opCode != ir::OP_ATOMIC &&
+              opCode != ir::OP_MUL_HI &&
+              opCode != ir::OP_HADD &&
+              opCode != ir::OP_RHADD &&
+              opCode != ir::OP_ADDSAT &&
+              (dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) &&
+              !extentRegs->contains(reg)
+             )
+            fn.setRegisterUniform(reg, true);
+        }
+      });
+    });
+  }
+
   void Liveness::initBlock(const BasicBlock &bb) {
     GBE_ASSERT(liveness.contains(&bb) == false);
     BlockInfo *info = GBE_NEW(BlockInfo, bb);
@@ -97,8 +128,10 @@ namespace ir {
       for (auto prev : currInfo->bb.getPredecessorSet()) {
         BlockInfo *prevInfo = liveness[prev];
         for (auto currInVar : currInfo->upwardUsed) {
-          auto changed = prevInfo->liveOut.insert(currInVar);
-          if (changed.second) isChanged = true;
+          if (!prevInfo->bb.undefPhiRegs.contains(currInVar)) {
+            auto changed = prevInfo->liveOut.insert(currInVar);
+            if (changed.second) isChanged = true;
+          }
         }
         if (isChanged )
           workSet.insert(prevInfo);
@@ -110,8 +143,6 @@ namespace ir {
       BlockInfo *info = liveness[&bb];
       auto &outVarSet = info->liveOut;
       auto &inVarSet = info->upwardUsed;
-      auto &extraInVarSet = info->extraLiveIn;
-      auto &extraOutVarSet = info->extraLiveOut;
       printf("\n\tin Lives: ");
       for (auto inVar : inVarSet) {
         printf("%d ", inVar);
@@ -126,87 +157,63 @@ namespace ir {
     });
 #endif
    }
-
 /*
-  Consider the following scenario, %100's normal liveness will start from Ln-1's
-  position. In normal analysis, the Ln-1 is not Ln's predecessor, thus the liveness
-  of %100 will be passed to Ln and then will not be passed to L0.
-
-  But considering we are running on a multilane with predication's vector machine.
-  The unconditional BR in Ln-1 may be removed and it will enter Ln with a subset of
-  the revert set of Ln-1's predication. For example when running Ln-1, the active lane
-  is 0-7, then at Ln the active lane is 8-15. Then at the end of Ln, a subset of 8-15
-  will jump to L0. If a register %10 is allocated the same GRF as %100, given the fact
-  that their normal liveness doesn't overlapped, the a subset of 8-15 lanes will be
-  modified. If the %10 and %100 are the same vector data type, then we are fine. But if
-  %100 is a float vector, and the %10 is a bool or short vector, then we hit a bug here.
-
-L0:
-  ...
-  %10 = 5
-  ...
-Ln-1:
-  %100 = 2
-  BR Ln+1
-
-Ln:
-  ...
-  BR(%xxx) L0
-
-Ln+1:
-  %101 = %100 + 2;
-  ...
-
-  The solution to fix this issue is to build another liveness data. We will start with
-  those BBs with backward jump. Then pass all the liveOut register as extra liveIn
-  of current BB and then forward this extra liveIn to all the blocks. This is very similar
-  to the normal liveness analysis just with reverse direction.
+  As we run in SIMD mode with prediction mask to indicate active lanes.
+  If a vreg is defined in a loop, and there are some uses of the vreg out of the loop,
+  the define point may be run several times under *different* prediction mask.
+  For these kinds of vreg, we must extend the vreg liveness into the whole loop.
+  If we don't do this, its liveness is killed before the def point inside the loop.
+  If the vreg's corresponding physical reg is assigned to other vreg during the
+  killed period, and the instructions before kill point were re-executed with different prediction,
+  the inactive lanes of the vreg may be overwritten. Then the out-of-loop use will get wrong data.
 */
-  void Liveness::computeExtraLiveInOut(void) {
-    while(!extraWorkSet.empty()) {
-      struct BlockInfo *currInfo = *extraWorkSet.begin();
-      extraWorkSet.erase(currInfo);
-      for (auto currInVar : currInfo->extraLiveIn)
-        currInfo->extraLiveOut.insert(currInVar);
-      bool isChanged = false;
-      for (auto succ : currInfo->bb.getSuccessorSet()) {
-        BlockInfo *succInfo = liveness[succ];
-        for (auto currOutVar : currInfo->extraLiveOut) {
-          bool changed = false;
-          if (!succInfo->upwardUsed.contains(currOutVar)) {
-            auto it  = succInfo->extraLiveIn.insert(currOutVar);
-            changed = it.second;
+  void Liveness::computeExtraLiveInOut(set<Register> &extentRegs) {
+    const vector<Loop *> &loops = fn.getLoops();
+    extentRegs.clear();
+    if(loops.size() == 0) return;
+
+    for (auto l : loops) {
+      for (auto x : l->exits) {
+        const BasicBlock &a = fn.getBlock(x.first);
+        const BasicBlock &b = fn.getBlock(x.second);
+        BlockInfo * exiting = liveness[&a];
+        BlockInfo * exit = liveness[&b];
+        std::vector<Register> toExtend;
+
+        if(b.getPredecessorSet().size() > 1) {
+          for (auto p : exit->upwardUsed)
+            toExtend.push_back(p);
+        } else {
+          std::set_intersection(exiting->liveOut.begin(), exiting->liveOut.end(), exit->upwardUsed.begin(), exit->upwardUsed.end(), std::back_inserter(toExtend));
+        }
+        if (toExtend.size() == 0) continue;
+        for(auto r : toExtend)
+          extentRegs.insert(r);
+        for (auto bb : l->bbs) {
+          BlockInfo * bI = liveness[&fn.getBlock(bb)];
+          for(auto r : toExtend) {
+            if(!bI->upwardUsed.contains(r))
+              bI->upwardUsed.insert(r);
+            bI->liveOut.insert(r);
           }
-          if (changed) isChanged = true;
         }
-        if (isChanged)
-          extraWorkSet.insert(succInfo);}
-    };
+      }
+    }
 #if 0
     fn.foreachBlock([this](const BasicBlock &bb){
       printf("label %d:\n", bb.getLabelIndex());
       BlockInfo *info = liveness[&bb];
       auto &outVarSet = info->liveOut;
       auto &inVarSet = info->upwardUsed;
-      auto &extraInVarSet = info->extraLiveIn;
-      auto &extraOutVarSet = info->extraLiveOut;
-      printf("\n\tin Lives: ");
+      printf("\n\tLive Ins: ");
       for (auto inVar : inVarSet) {
         printf("%d ", inVar);
       }
-      printf("\n\textra in Lives: ");
-      for (auto inVar : extraInVarSet) {
-        printf("%d ", inVar);
-      }
       printf("\n");
-      printf("\tout Lives: ");
+      printf("\tLive outs: ");
       for (auto outVar : outVarSet) {
         printf("%d ", outVar);
       }
-      printf("\n\textra out Lives: ");
-      for (auto outVar : extraOutVarSet) {
-        printf("%d ", outVar);
-      }
       printf("\n");
 
     });
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
index 9198eae..d55e00d 100644
--- a/backend/src/ir/liveness.hpp
+++ b/backend/src/ir/liveness.hpp
@@ -69,8 +69,6 @@ namespace ir {
       INLINE bool inVarKill(Register reg) const {
         return varKill.contains(reg);
       }
-      UEVar extraLiveIn;
-      LiveOut extraLiveOut;
       UEVar upwardUsed;
       LiveOut liveOut;
       VarKill varKill;
@@ -96,17 +94,6 @@ namespace ir {
       return info.upwardUsed;
     }
 
-    /*! Get the set of extra registers alive at the end of the block */
-    const LiveOut &getExtraLiveOut(const BasicBlock *bb) const {
-      const BlockInfo &info = this->getBlockInfo(bb);
-      return info.extraLiveOut;
-    }
-    /*! Get the set of extra registers alive at the beginning of the block */
-    const UEVar &getExtraLiveIn(const BasicBlock *bb) const {
-      const BlockInfo &info = this->getBlockInfo(bb);
-      return info.extraLiveIn;
-    }
-
     /*! Return the function the liveness was computed on */
     INLINE const Function &getFunction(void) const { return fn; }
     /*! Actually do something for each successor / predecessor of *all* blocks */
@@ -140,10 +127,11 @@ namespace ir {
     void initInstruction(BlockInfo &info, const Instruction &insn);
     /*! Now really compute LiveOut based on UEVar and VarKill */
     void computeLiveInOut(void);
-    void computeExtraLiveInOut(void);
+    void computeExtraLiveInOut(set<Register> &extentRegs);
+    void analyzeUniform(set<Register> *extentRegs);
     /*! Set of work list block which has exit(return) instruction */
     typedef set <struct BlockInfo*> WorkSet;
-    WorkSet workSet, extraWorkSet;
+    WorkSet workSet;
 
     /*! Use custom allocators */
     GBE_CLASS(Liveness);
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index ad1ea32..f71fd72 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -120,21 +120,23 @@ namespace ir {
   };
 
   INLINE uint64_t getOffsetFromImm(const Immediate &imm) {
-    switch (imm.type) {
+    switch (imm.getType()) {
       // bit-cast these ones
       case TYPE_DOUBLE:
-      case TYPE_FLOAT:
+      case TYPE_FLOAT: NOT_SUPPORTED; return 0;
       case TYPE_S64:
       case TYPE_U64:
       case TYPE_U32:
       case TYPE_U16:
-      case TYPE_U8: return imm.data.u64;
+      case TYPE_U8:
       // sign extend these ones
-      case TYPE_S32: return int64_t(imm.data.s32);
-      case TYPE_S16: return int64_t(imm.data.s16);
-      case TYPE_S8: return int64_t(imm.data.s8);
+      case TYPE_S32:
+      case TYPE_S16:
+      case TYPE_S8: return imm.getIntegerValue();
       case TYPE_BOOL:
       case TYPE_HALF: NOT_SUPPORTED; return 0;
+      default:
+        GBE_ASSERT(0 && "Unsupported imm type.\n");
     }
     return 0;
   }
@@ -239,7 +241,8 @@ namespace ir {
           if(inserted.contains(argLocation)) {
             pushed = argLocation.getRegister();
           } else {
-            pushed = fn->newRegister(family);
+            // pushed register should be uniform register.
+            pushed = fn->newRegister(family, true);
             this->appendPushedConstant(pushed, argLocation);
             inserted.insert(argLocation);
           }
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
new file mode 100644
index 0000000..9d60402
--- /dev/null
+++ b/backend/src/ir/printf.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.cpp
+ *
+ */
+
+#include <stdarg.h>
+#include "printf.hpp"
+
+namespace gbe
+{
+  namespace ir
+  {
+
+    pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
+
+    uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
+    {
+      fmts.push_back(*fmt);
+
+      for (auto &f : fmts.back()) {
+        if (f.type == PRINTF_SLOT_TYPE_STRING)
+          continue;
+
+        slots.push_back(&f);
+      }
+
+      /* Update the total size of the printf data buffer. */
+      if (slots.size() > 0)
+        sizeOfSize = slots.back()->state->out_buf_sizeof_offset
+                     + getPrintfBufferElementSize(slots.size() - 1);
+
+      return (uint32_t)fmts.size();
+    }
+
+    static void generatePrintfFmtString(PrintfState& state, std::string& str)
+    {
+      char num_str[16];
+      str += "%";
+
+      if (state.left_justified) {
+        str += "-";
+      }
+
+      if (state.sign_symbol == 1) {
+        str += "+";
+      } else if (state.sign_symbol == 2) {
+        str += " ";
+      }
+
+      if (state.alter_form) {
+        str += "#";
+      }
+
+      if (state.zero_padding) {
+        str += "0";
+      }
+
+      if (state.min_width >= 0) {
+        snprintf(num_str, 16, "%d", state.min_width);
+        str += num_str;
+      }
+
+      if (state.precision >= 0) {
+        str += ".";
+        snprintf(num_str, 16, "%d", state.precision);
+        str += num_str;
+      }
+
+      switch (state.length_modifier) {
+        case PRINTF_LM_HH:
+          str += "hh";
+          break;
+        case PRINTF_LM_H:
+          str += "h";
+          break;
+        case PRINTF_LM_L:
+          str += "l";
+          break;
+        case PRINTF_LM_HL:
+          str += "";
+          break;
+        default:
+          assert(state.length_modifier == PRINTF_LM_NONE);
+      }
+    }
+
+#define PRINT_SOMETHING(target_ty, conv)  do {                          \
+      if (!vec_i)                                                       \
+        pf_str = pf_str + std::string(#conv);                           \
+      printf(pf_str.c_str(),                                            \
+             ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
+                            global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
+             [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
+    } while (0)
+
+
+    void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                                 size_t global_wk_sz1, size_t global_wk_sz2)
+    {
+      LockOutput lock;
+      size_t i, j, k;
+      std::string pf_str;
+      int stmt = 0;
+
+      for (auto &pf : fmts) {
+        for (i = 0; i < global_wk_sz0; i++) {
+          for (j = 0; j < global_wk_sz1; j++) {
+            for (k = 0; k < global_wk_sz2; k++) {
+
+              int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
+                                             + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
+              if (flag) {
+                for (auto &slot : pf) {
+                  pf_str = "";
+                  int vec_num;
+
+                  if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+                    printf("%s", slot.str);
+                    continue;
+                  }
+                  assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+                  generatePrintfFmtString(*slot.state, pf_str);
+
+                  vec_num = slot.state->vector_n > 0 ? slot.state->vector_n : 1;
+
+                  for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+                    if (vec_i)
+                      printf(",");
+
+                    switch (slot.state->conversion_specifier) {
+                      case PRINTF_CONVERSION_D:
+                      case PRINTF_CONVERSION_I:
+                        PRINT_SOMETHING(int, d);
+                        break;
+
+                      case PRINTF_CONVERSION_O:
+                        PRINT_SOMETHING(int, o);
+                        break;
+                      case PRINTF_CONVERSION_U:
+                        PRINT_SOMETHING(int, u);
+                        break;
+                      case PRINTF_CONVERSION_X:
+                        PRINT_SOMETHING(int, X);
+                        break;
+                      case PRINTF_CONVERSION_x:
+                        PRINT_SOMETHING(int, x);
+                        break;
+
+                      case PRINTF_CONVERSION_C:
+                        PRINT_SOMETHING(char, c);
+                        break;
+
+                      case PRINTF_CONVERSION_F:
+                        PRINT_SOMETHING(float, F);
+                        break;
+                      case PRINTF_CONVERSION_f:
+                        PRINT_SOMETHING(float, f);
+                        break;
+                      case PRINTF_CONVERSION_E:
+                        PRINT_SOMETHING(float, E);
+                        break;
+                      case PRINTF_CONVERSION_e:
+                        PRINT_SOMETHING(float, e);
+                        break;
+                      case PRINTF_CONVERSION_G:
+                        PRINT_SOMETHING(float, G);
+                        break;
+                      case PRINTF_CONVERSION_g:
+                        PRINT_SOMETHING(float, g);
+                        break;
+                      case PRINTF_CONVERSION_A:
+                        PRINT_SOMETHING(float, A);
+                        break;
+                      case PRINTF_CONVERSION_a:
+                        PRINT_SOMETHING(float, a);
+                        break;
+                      case PRINTF_CONVERSION_P:
+                        PRINT_SOMETHING(int, p);
+                        break;
+
+                      case PRINTF_CONVERSION_S:
+                        pf_str = pf_str + "s";
+                        printf(pf_str.c_str(), slot.state->str.c_str());
+                        break;
+
+                      default:
+                        assert(0);
+                        return;
+                    }
+                  }
+
+                  pf_str = "";
+                }
+              }
+            }
+          }
+        }
+        stmt++;
+      }
+    }
+  } /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
new file mode 100644
index 0000000..4db7245
--- /dev/null
+++ b/backend/src/ir/printf.hpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.hpp
+ *
+ */
+#ifndef __GBE_IR_PRINTF_HPP__
+#define __GBE_IR_PRINTF_HPP__
+
+#include <string.h>
+#include "sys/map.hpp"
+#include "sys/vector.hpp"
+#include "unit.hpp"
+
+namespace gbe
+{
+  namespace ir
+  {
+
+    /* Things about printf info. */
+    enum {
+      PRINTF_LM_NONE,
+      PRINTF_LM_HH,
+      PRINTF_LM_H,
+      PRINTF_LM_L,
+      PRINTF_LM_HL,
+    };
+
+    enum {
+      PRINTF_CONVERSION_INVALID,
+      PRINTF_CONVERSION_D,
+      PRINTF_CONVERSION_I,
+      PRINTF_CONVERSION_O,
+      PRINTF_CONVERSION_U,
+      PRINTF_CONVERSION_X,
+      PRINTF_CONVERSION_x,
+      PRINTF_CONVERSION_F,
+      PRINTF_CONVERSION_f,
+      PRINTF_CONVERSION_E,
+      PRINTF_CONVERSION_e,
+      PRINTF_CONVERSION_G,
+      PRINTF_CONVERSION_g,
+      PRINTF_CONVERSION_A,
+      PRINTF_CONVERSION_a,
+      PRINTF_CONVERSION_C,
+      PRINTF_CONVERSION_S,
+      PRINTF_CONVERSION_P
+    };
+
+    struct PrintfState {
+      char left_justified;
+      char sign_symbol; //0 for nothing, 1 for sign, 2 for space.
+      char alter_form;
+      char zero_padding;
+      char vector_n;
+      int min_width;
+      int precision;
+      int length_modifier;
+      char conversion_specifier;
+      int out_buf_sizeof_offset;  // Should *global_total_size to get the full offset.
+      std::string str;            // if %s, the string is stored here.
+    };
+
+    enum {
+      PRINTF_SLOT_TYPE_NONE,
+      PRINTF_SLOT_TYPE_STRING,
+      PRINTF_SLOT_TYPE_STATE
+    };
+
+    struct PrintfSlot {
+      int type;
+      union {
+        char* str;
+        PrintfState* state;
+        void *ptr;
+      };
+
+      PrintfSlot(void) {
+        type = PRINTF_SLOT_TYPE_NONE;
+        ptr = NULL;
+      }
+
+      PrintfSlot(const char * s) {
+        type = PRINTF_SLOT_TYPE_STRING;
+        int len = strlen(s);
+        str = (char*)malloc((len + 1) * sizeof(char));
+        memcpy(str, s, (len + 1) * sizeof(char));
+        str[len] = 0;
+      }
+
+      PrintfSlot(PrintfState * st) {
+        type = PRINTF_SLOT_TYPE_STATE;
+        state = (PrintfState *)malloc(sizeof(PrintfState));
+        memcpy(state, st, sizeof(PrintfState));
+      }
+
+      PrintfSlot(const PrintfSlot & other) {
+        if (other.type == PRINTF_SLOT_TYPE_STRING) {
+          int len = strlen(other.str);
+          str = (char*)malloc((len + 1) * sizeof(char));
+          memcpy(str, other.str, (len + 1) * sizeof(char));
+          str[len] = 0;
+          type = PRINTF_SLOT_TYPE_STRING;
+        } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
+          type = PRINTF_SLOT_TYPE_STATE;
+          state = (PrintfState *)malloc(sizeof(PrintfState));
+          memcpy(state, other.state, sizeof(PrintfState));
+        } else {
+          type = PRINTF_SLOT_TYPE_NONE;
+          ptr = NULL;
+        }
+      }
+
+      PrintfSlot(PrintfSlot && other) {
+        void *p = other.ptr;
+        type = other.type;
+        other.ptr = ptr;
+        ptr = p;
+      }
+
+      ~PrintfSlot(void) {
+        if (ptr)
+          free(ptr);
+      }
+    };
+
+    class Context;
+
+    class PrintfSet //: public Serializable
+    {
+    public:
+      PrintfSet(const PrintfSet& other) {
+        for (auto &f : other.fmts) {
+          fmts.push_back(f);
+        }
+
+        for (auto &s : other.slots) {
+          slots.push_back(s);
+        }
+
+        sizeOfSize = other.sizeOfSize;
+        btiBuf = other.btiBuf;
+        btiIndexBuf = other.btiIndexBuf;
+      }
+
+      PrintfSet(void) = default;
+
+      struct LockOutput {
+        LockOutput(void) {
+          pthread_mutex_lock(&lock);
+        }
+
+        ~LockOutput(void) {
+          pthread_mutex_unlock(&lock);
+        }
+      };
+
+      typedef vector<PrintfSlot> PrintfFmt;
+      uint32_t append(PrintfFmt* fmt, Unit &unit);
+
+      uint32_t getPrintfNum(void) const {
+        return fmts.size();
+      }
+
+      uint32_t getPrintfSizeOfSize(void) const {
+        return sizeOfSize;
+      }
+
+      void setBufBTI(uint8_t b)      { btiBuf = b; }
+      void setIndexBufBTI(uint8_t b) { btiIndexBuf = b; }
+      uint8_t getBufBTI() const      { return btiBuf; }
+      uint8_t getIndexBufBTI() const { return btiIndexBuf; }
+
+      uint32_t getPrintfBufferElementSize(uint32_t i) {
+        PrintfSlot* slot = slots[i];
+        int vec_num = 1;
+        if (slot->state->vector_n > 0) {
+          vec_num = slot->state->vector_n;
+        }
+
+        assert(vec_num > 0 && vec_num <= 16);
+
+        switch (slot->state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_X:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_P:
+          /* Char will be aligned to sizeof(int) here. */
+          case PRINTF_CONVERSION_C:
+            return (uint32_t)(sizeof(int) * vec_num);
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            return (uint32_t)(sizeof(float) * vec_num);
+          case PRINTF_CONVERSION_S:
+            return (uint32_t)0;
+          default:
+            break;
+        }
+        assert(0);
+        return 0;
+      }
+
+      void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                        size_t global_wk_sz1, size_t global_wk_sz2);
+
+    private:
+      vector<PrintfFmt> fmts;
+      vector<PrintfSlot*> slots;
+      uint32_t sizeOfSize; // Total sizeof size.
+      friend struct LockOutput;
+      uint8_t btiBuf;
+      uint8_t btiIndexBuf;
+      static pthread_mutex_t lock;
+      GBE_CLASS(PrintfSet);
+    };
+  } /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PRINTF_HPP__ */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ef3ea28..fc69367 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -37,51 +37,56 @@ namespace ir {
         "local_size_0", "local_size_1", "local_size_2",
         "global_size_0", "global_size_1", "global_size_2",
         "global_offset_0", "global_offset_1", "global_offset_2",
-        "stack_pointer",
+        "stack_pointer", "stack_buffer",
         "block_ip",
-        "barrier_id", "thread_number",
-        "work_dimension", "sampler_info",
-        "emask", "notemask", "barriermask", "retVal"
+        "barrier_id", "thread_number", "work_dimension",
+        "zero", "one",
+        "retVal", "slm_offset",
+        "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "invalid"
     };
 
 #if GBE_DEBUG
-#define DECL_NEW_REG(FAMILY, REG) \
-   r = fn.newRegister(FAMILY_DWORD); \
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+   r = fn.newRegister(FAMILY_DWORD, UNIFORM); \
    GBE_ASSERT(r == REG);
 #else
-#define DECL_NEW_REG(FAMILY, REG) \
-   fn.newRegister(FAMILY_DWORD);
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+   fn.newRegister(FAMILY_DWORD, UNIFORM);
 #endif /* GBE_DEBUG */
     static void init(Function &fn) {
       IF_DEBUG(Register r);
-      DECL_NEW_REG(FAMILY_DWORD, lid0);
-      DECL_NEW_REG(FAMILY_DWORD, lid1);
-      DECL_NEW_REG(FAMILY_DWORD, lid2);
-      DECL_NEW_REG(FAMILY_DWORD, groupid0);
-      DECL_NEW_REG(FAMILY_DWORD, groupid1);
-      DECL_NEW_REG(FAMILY_DWORD, groupid2);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup0);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup1);
-      DECL_NEW_REG(FAMILY_DWORD, numgroup2);
-      DECL_NEW_REG(FAMILY_DWORD, lsize0);
-      DECL_NEW_REG(FAMILY_DWORD, lsize1);
-      DECL_NEW_REG(FAMILY_DWORD, lsize2);
-      DECL_NEW_REG(FAMILY_DWORD, gsize0);
-      DECL_NEW_REG(FAMILY_DWORD, gsize1);
-      DECL_NEW_REG(FAMILY_DWORD, gsize2);
-      DECL_NEW_REG(FAMILY_DWORD, goffset0);
-      DECL_NEW_REG(FAMILY_DWORD, goffset1);
-      DECL_NEW_REG(FAMILY_DWORD, goffset2);
-      DECL_NEW_REG(FAMILY_DWORD, stackptr);
-      DECL_NEW_REG(FAMILY_WORD, blockip);
-      DECL_NEW_REG(FAMILY_DWORD, barrierid);
-      DECL_NEW_REG(FAMILY_DWORD, threadn);
-      DECL_NEW_REG(FAMILY_DWORD, workdim);
-      DECL_NEW_REG(FAMILY_WORD, samplerinfo);
-      DECL_NEW_REG(FAMILY_WORD, emask);
-      DECL_NEW_REG(FAMILY_WORD, notemask);
-      DECL_NEW_REG(FAMILY_WORD, barriermask);
-      DECL_NEW_REG(FAMILY_WORD, retVal);
+      DECL_NEW_REG(FAMILY_DWORD, lid0, 0);
+      DECL_NEW_REG(FAMILY_DWORD, lid1, 0);
+      DECL_NEW_REG(FAMILY_DWORD, lid2, 0);
+      DECL_NEW_REG(FAMILY_DWORD, groupid0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, groupid1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, groupid2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+      DECL_NEW_REG(FAMILY_DWORD, stackbuffer, 1);
+      DECL_NEW_REG(FAMILY_WORD,  blockip, 0);
+      DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
+      DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
+      DECL_NEW_REG(FAMILY_DWORD, workdim, 1);
+      DECL_NEW_REG(FAMILY_DWORD, zero, 1);
+      DECL_NEW_REG(FAMILY_DWORD, one, 1);
+      DECL_NEW_REG(FAMILY_WORD, retVal, 1);
+      DECL_NEW_REG(FAMILY_WORD, slmoffset, 1);
+      DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index d84c48a..4e89bdd 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -60,16 +60,19 @@ namespace ir {
     static const Register goffset1 = Register(16); // get_global_offset(1)
     static const Register goffset2 = Register(17); // get_global_offset(2)
     static const Register stackptr = Register(18); // stack pointer
-    static const Register blockip = Register(19);  // blockip
-    static const Register barrierid = Register(20);// barrierid
-    static const Register threadn = Register(21);  // number of threads
-    static const Register workdim = Register(22);  // work dimention.
-    static const Register samplerinfo = Register(23); // store sampler info.
-    static const Register emask = Register(24);    // store the emask bits for the branching fix.
-    static const Register notemask = Register(25); // store the !emask bits for the branching fix.
-    static const Register barriermask = Register(26); // software mask for barrier.
-    static const Register retVal = Register(27);   // helper register to do data flow analysis.
-    static const uint32_t regNum = 28;             // number of special registers
+    static const Register stackbuffer = Register(19); // stack buffer base address.
+    static const Register blockip = Register(20);  // blockip
+    static const Register barrierid = Register(21);// barrierid
+    static const Register threadn = Register(22);  // number of threads
+    static const Register workdim = Register(23);  // work dimension.
+    static const Register zero = Register(24);     //  scalar register holds zero.
+    static const Register one = Register(25);     // scalar register holds one.
+    static const Register retVal = Register(26);   // helper register to do data flow analysis.
+    static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
+    static const Register printfbptr = Register(28); // printf buffer address.
+    static const Register printfiptr = Register(29); // printf index buffer address.
+    static const Register invalid = Register(30);  // used for valid comparison.
+    static const uint32_t regNum = 31;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 4f36c2e..7bd4f6e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -70,17 +70,23 @@ namespace ir {
   {
   public:
     /*! Build a register. All fields will be immutable */
-    INLINE RegisterData(RegisterFamily family = FAMILY_DWORD) : family(family) {}
+    INLINE RegisterData(RegisterFamily family,
+                        bool uniform = false) : family(family), uniform(uniform) {}
     /*! Copy constructor */
-    INLINE RegisterData(const RegisterData &other) : family(other.family) {}
+    INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform) {}
     /*! Copy operator */
     INLINE RegisterData &operator= (const RegisterData &other) {
       this->family = other.family;
+      this->uniform = other.uniform;
       return *this;
     }
     /*! Nothing really happens here */
     INLINE ~RegisterData(void) {}
     RegisterFamily family;            //!< Register size or if it is a flag
+    INLINE const bool isUniform() const { return uniform; }
+    INLINE void setUniform(bool uni) { uniform = uni; }
+  private:
+    bool uniform;
     GBE_CLASS(RegisterData);
   };
 
@@ -107,11 +113,11 @@ namespace ir {
   {
   public:
     /*! Return the index of a newly allocated register */
-    INLINE Register append(RegisterFamily family) {
+    INLINE Register append(RegisterFamily family, bool uniform = false) {
       GBE_ASSERTM(regNum() < MAX_INDEX,
                   "Too many defined registers (only 65535 are supported)");
       const uint16_t index = regNum();
-      const RegisterData reg(family);
+      const RegisterData reg(family, uniform);
       regs.push_back(reg);
       return Register(index);
     }
@@ -130,6 +136,10 @@ namespace ir {
     INLINE void appendTuple(void) {}
     /*! Return a copy of the register at index */
     INLINE RegisterData get(Register index) const { return regs[index]; }
+    /*! Return true if the specified register is uniform type. */
+    INLINE bool isUniform(Register index) { return regs[index].isUniform(); }
+    /*! Set a register to uniform or varying data type*/
+    INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
     /*! Get the register index from the tuple */
     INLINE Register get(Tuple index, uint32_t which) const {
       return regTuples[uint16_t(index) + which];
@@ -150,9 +160,6 @@ namespace ir {
     GBE_CLASS(RegisterFile);
   };
 
-  /*! Useful to encode anything special */
-  static const Register invalidRegister(RegisterFile::MAX_INDEX);
-
   /*! Output the register file string in the given stream */
   std::ostream &operator<< (std::ostream &out, const RegisterFile &file);
 
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index b67c1b7..7e8355f 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -27,6 +27,7 @@
 namespace gbe {
 namespace ir {
 
+#ifdef GBE_COMPILER_AVAILABLE
   uint8_t SamplerSet::appendReg(uint32_t key, Context *ctx) {
     uint8_t samplerSlot = samplerMap.size();
     samplerMap.insert(std::make_pair(key, samplerSlot));
@@ -52,6 +53,7 @@ namespace ir {
     // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
     // work around.
     arg->type = ir::FunctionArgument::SAMPLER;
+    arg->info.typeName = "sampler_t";
     int32_t id = ctx->getFunction().getArgID(arg);
     GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
 
@@ -61,6 +63,7 @@ namespace ir {
     }
     return appendReg(SAMPLER_ID(id), ctx);
   }
+#endif
 
 #define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
 #define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index dd1f3b6..2b51ce3 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -56,6 +56,8 @@ namespace ir {
       samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
     }
 
+    bool empty() const { return samplerMap.empty(); }
+
     SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
     SamplerSet() {}
 
diff --git a/backend/src/ir/type.cpp b/backend/src/ir/type.cpp
index a6a2e44..56f5c12 100644
--- a/backend/src/ir/type.cpp
+++ b/backend/src/ir/type.cpp
@@ -40,6 +40,8 @@ namespace ir {
       case TYPE_HALF: return out << "half";
       case TYPE_FLOAT: return out << "float";
       case TYPE_DOUBLE: return out << "double";
+      default :
+        GBE_ASSERT(0 && "Unsupported type\n");
     };
     return out;
   }
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
index 1e24906..8bfbdc8 100644
--- a/backend/src/ir/type.hpp
+++ b/backend/src/ir/type.hpp
@@ -46,7 +46,8 @@ namespace ir {
     TYPE_U64,      //!< unsigned 64 bits integer
     TYPE_HALF,     //!< 16 bits floating point value
     TYPE_FLOAT,    //!< 32 bits floating point value
-    TYPE_DOUBLE    //!< 64 bits floating point value
+    TYPE_DOUBLE,   //!< 64 bits floating point value
+    TYPE_LARGE_INT //!< integer larger than 64 bits.
   };
 
   /*! Output a string for the type in the given stream */
@@ -72,8 +73,9 @@ namespace ir {
       case TYPE_U64:
       case TYPE_DOUBLE:
         return FAMILY_QWORD;
+      default:
+        return FAMILY_DWORD;
     };
-    return FAMILY_DWORD;
   }
 
   /*! Return a type for each register family */
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 4aeffe9..4f9d740 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -27,7 +27,7 @@
 namespace gbe {
 namespace ir {
 
-  Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize) {}
+  Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {}
   Unit::~Unit(void) {
     for (const auto &pair : functions) GBE_DELETE(pair.second);
   }
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index d8eab79..adebd3f 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -72,12 +72,15 @@ namespace ir {
     ConstantSet& getConstantSet(void) { return constantSet; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
+    void setValid(bool value) { valid = value; }
+    bool getValid() { return valid; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
     GBE_CLASS(Unit);
+    bool valid;
   };
 
   /*! Output the unit string in the given stream */
@@ -87,4 +90,3 @@ namespace ir {
 } /* namespace gbe */
 
 #endif /* __GBE_IR_UNIT_HPP__ */
-
diff --git a/backend/src/ir/value.cpp b/backend/src/ir/value.cpp
index 11eb0a2..a055bdf 100644
--- a/backend/src/ir/value.cpp
+++ b/backend/src/ir/value.cpp
@@ -97,6 +97,8 @@ namespace ir {
     // Iterate over all the predecessors
     const auto &preds = bb.getPredecessorSet();
     for (const auto &pred : preds) {
+      if (pred->undefPhiRegs.contains(reg))
+        continue;
       RegDefSet &predDef = this->getDefSet(pred, reg);
       for (auto def : predDef) udChain.insert(def);
     }
@@ -521,6 +523,17 @@ namespace ir {
   const DefSet &FunctionDAG::getDef(const Instruction *insn, uint32_t srcID) const {
     return this->getDef(ValueUse(insn, srcID));
   }
+  const UseSet *FunctionDAG::getRegUse(const Register &reg) const {
+    auto it = regUse.find(reg);
+    GBE_ASSERT(it != regUse.end());
+    return it->second;
+  }
+  const DefSet *FunctionDAG::getRegDef(const Register &reg) const {
+    auto it = regDef.find(reg);
+    GBE_ASSERT(it != regDef.end());
+    return it->second;
+  }
+
   const ValueDef *FunctionDAG::getDefAddress(const ValueDef &def) const {
     auto it = defName.find(def);
     GBE_ASSERT(it != defName.end() && it->second != NULL);
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
new file mode 100644
index 0000000..791df00
--- /dev/null
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_barrier_nodup.cpp
+ *
+ *  This pass is to remove or add noduplicate function attribute for barrier functions.
+ *  Basically, we want to set NoDuplicate for those __gen_barrier_xxx functions. But if
+ *  a sub function calls those barrier functions, the sub function will not be inlined
+ *  in llvm's inlining pass. This is what we don't want. As inlining such a function in
+ *  the caller is safe, we just don't want it to duplicate the call. So Introduce this
+ *  pass to remove the NoDuplicate function attribute before the inlining pass and restore
+ *  it after.
+ *  
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class BarrierNodup : public ModulePass
+    {
+    public:
+      static char ID;
+      BarrierNodup(bool nodup) :
+        ModulePass(ID), nodup(nodup) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: set barrier no duplicate attr";
+      }
+
+      virtual bool runOnModule(Module &M)
+      {
+        using namespace llvm;
+        bool changed = false;
+        for (auto &F : M) {
+          if (F.getName() == "__gen_ocl_barrier_local_and_global" ||
+              F.getName() == "__gen_ocl_barrier_local"            ||
+              F.getName() == "__gen_ocl_barrier_global") {
+            if (nodup) {
+              if (!F.hasFnAttribute(Attribute::NoDuplicate)) {
+                F.addFnAttr(Attribute::NoDuplicate);
+                changed = true;
+              }
+            } else {
+              if (F.hasFnAttribute(Attribute::NoDuplicate)) {
+                auto attrs = F.getAttributes();
+                F.setAttributes(attrs.removeAttribute(M.getContext(),
+                                AttributeSet::FunctionIndex,
+                                Attribute::NoDuplicate));
+                changed = true;
+              }
+            }
+          }
+        }
+
+        return changed;
+      }
+    private:
+      bool nodup;
+    };
+
+
+    ModulePass *createBarrierNodupPass(bool Nodup) {
+      return new BarrierNodup(Nodup);
+    }
+
+    char BarrierNodup::ID = 0;
+} // end namespace
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 2d4fb0a..6cb3834 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -71,7 +71,7 @@
  *   is intercepted, we just abort
  */
 
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
@@ -127,13 +127,21 @@
 #else
 #include "llvm/IR/DataLayout.h"
 #endif
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
+#endif
+
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
 #include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
 #else
 #include "llvm/InstVisitor.h"
 #endif
@@ -142,14 +150,16 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/Config/config.h"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "ir/context.hpp"
 #include "ir/unit.hpp"
 #include "ir/liveness.hpp"
+#include "ir/value.hpp"
 #include "sys/set.hpp"
 #include "sys/cvar.hpp"
+#include "backend/program.h"
+#include <sstream>
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -178,7 +188,7 @@ namespace gbe
   }
 
   /*! LLVM IR Type to Gen IR type translation */
-  static ir::Type getType(const ir::Context &ctx, const Type *type)
+  static ir::Type getType(ir::Context &ctx, const Type *type)
   {
     GBE_ASSERT(isScalarType(type));
     if (type->isFloatTy() == true)
@@ -202,12 +212,11 @@ namespace gbe
       return ir::TYPE_S32;
     if (type == Type::getInt64Ty(type->getContext()))
       return ir::TYPE_S64;
-    GBE_ASSERT(0);
-    return ir::TYPE_S64;
+    return ir::TYPE_LARGE_INT;
   }
 
   /*! LLVM IR Type to Gen IR unsigned type translation */
-  static ir::Type getUnsignedType(const ir::Context &ctx, const Type *type)
+  static ir::Type getUnsignedType(ir::Context &ctx, const Type *type)
   {
     GBE_ASSERT(type->isIntegerTy() == true);
     if (type == Type::getInt1Ty(type->getContext()))
@@ -220,12 +229,12 @@ namespace gbe
       return ir::TYPE_U32;
     if (type == Type::getInt64Ty(type->getContext()))
       return ir::TYPE_U64;
-    GBE_ASSERT(0);
+    ctx.getUnit().setValid(false);
     return ir::TYPE_U64;
   }
 
   /*! Type to register family translation */
-  static ir::RegisterFamily getFamily(const ir::Context &ctx, const Type *type)
+  static ir::RegisterFamily getFamily(ir::Context &ctx, const Type *type)
   {
     GBE_ASSERT(isScalarType(type) == true);
     if (type == Type::getInt1Ty(type->getContext()))
@@ -240,14 +249,14 @@ namespace gbe
       return ir::FAMILY_QWORD;
     if (type->isPointerTy())
       return ctx.getPointerFamily();
-    GBE_ASSERT(0);
+    ctx.getUnit().setValid(false);
     return ir::FAMILY_BOOL;
   }
 
   /*! Get number of element to process dealing either with a vector or a scalar
    *  value
    */
-  static ir::Type getVectorInfo(const ir::Context &ctx, Type *llvmType, Value *value, uint32_t &elemNum, bool useUnsigned = false)
+  static ir::Type getVectorInfo(ir::Context &ctx, Type *llvmType, Value *value, uint32_t &elemNum, bool useUnsigned = false)
   {
     ir::Type type;
     if (llvmType->isVectorTy() == true) {
@@ -329,7 +338,7 @@ namespace gbe
       scalarMap[key] = reg;
     }
     /*! Allocate a new scalar register */
-    ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u)
+    ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u, bool uniform = false)
     {
       // we don't allow normal constant, but GlobalValue is a special case,
       // it needs a register to store its address
@@ -342,7 +351,7 @@ namespace gbe
         case Type::DoubleTyID:
         case Type::PointerTyID:
           GBE_ASSERT(index == 0);
-          return this->newScalar(value, key, type, index);
+          return this->_newScalar(value, key, type, index, uniform);
           break;
         case Type::VectorTyID:
         {
@@ -353,7 +362,7 @@ namespace gbe
               elementTypeID != Type::FloatTyID &&
               elementTypeID != Type::DoubleTyID)
             GBE_ASSERTM(false, "Vectors of elements are not supported");
-            return this->newScalar(value, key, elementType, index);
+            return this->_newScalar(value, key, elementType, index, uniform);
           break;
         }
         default: NOT_SUPPORTED;
@@ -411,9 +420,9 @@ namespace gbe
     /*! This creates a scalar register for a Value (index is the vector index when
      *  the value is a vector of scalars)
      */
-    ir::Register newScalar(Value *value, Value *key, Type *type, uint32_t index) {
+    ir::Register _newScalar(Value *value, Value *key, Type *type, uint32_t index, bool uniform) {
       const ir::RegisterFamily family = getFamily(ctx, type);
-      const ir::Register reg = ctx.reg(family);
+      const ir::Register reg = ctx.reg(family, uniform);
       key = key == NULL ? value : key;
       this->insertRegister(reg, key, index);
       return reg;
@@ -441,6 +450,11 @@ namespace gbe
      *  compare instructions we need to invert to decrease branch complexity
      */
     set<const Value*> conditionSet;
+    map<const Value*, int> globalPointer;
+    /*!
+     *  <phi,phiCopy> node information for later optimization
+     */
+    map<const ir::Register, const ir::Register> phiMap;
     /*! We visit each function twice. Once to allocate the registers and once to
      *  emit the Gen IR instructions
      */
@@ -449,9 +463,15 @@ namespace gbe
       PASS_EMIT_INSTRUCTIONS = 1
     } pass;
 
+    typedef enum {
+      CONST_INT,
+      CONST_FLOAT,
+      CONST_DOUBLE
+    } ConstTypeId;
+
     LoopInfo *LI;
     const Module *TheModule;
-
+    int btiBase;
   public:
     static char ID;
     explicit GenWriter(ir::Unit &unit)
@@ -460,7 +480,8 @@ namespace gbe
         ctx(unit),
         regTranslator(ctx),
         LI(0),
-        TheModule(0)
+        TheModule(0),
+        btiBase(BTI_RESERVED_NUM)
     {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
       pass = PASS_EMIT_REGISTERS;
@@ -477,6 +498,13 @@ namespace gbe
     /*! helper function for parsing global constant data */
     void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
     void collectGlobalConstant(void) const;
+    ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
+    const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
+
+    uint32_t incBtiBase() {
+      GBE_ASSERT(btiBase <= BTI_MAX_ID);
+      return btiBase++;
+    }
 
     bool runOnFunction(Function &F) {
      // Do not codegen any 'available_externally' functions at all, they have
@@ -489,14 +517,19 @@ namespace gbe
       if(!bKernel) return false;
 
       LI = &getAnalysis<LoopInfo>();
-
       emitFunction(F);
+      phiMap.clear();
+      globalPointer.clear();
+      // Reset for next function
+      btiBase = BTI_RESERVED_NUM;
       return false;
     }
 
     virtual bool doFinalization(Module &M) { return false; }
     /*! handle global variable register allocation (local, constant space) */
     void allocateGlobalVariableRegister(Function &F);
+    /*! gather all the loops in the function and add them to ir::Function */
+    void gatherLoopInfo(ir::Function &fn);
     /*! Emit the complete function code and declaration */
     void emitFunction(Function &F);
     /*! Handle input and output function parameters */
@@ -506,9 +539,11 @@ namespace gbe
     /*! Each block end may require to emit MOVs for further PHIs */
     void emitMovForPHI(BasicBlock *curr, BasicBlock *succ);
     /*! Alocate one or several registers (if vector) for the value */
-    INLINE void newRegister(Value *value, Value *key = NULL);
+    INLINE void newRegister(Value *value, Value *key = NULL, bool uniform = false);
     /*! get the register for a llvm::Constant */
     ir::Register getConstantRegister(Constant *c, uint32_t index = 0);
+    /*! get constant pointer */
+    ir::Register getConstantPointerRegister(ConstantExpr *ce, uint32_t index = 0);
     /*! Return a valid register from an operand (can use LOADI to make one) */
     INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
     /*! Create a new immediate from a constant */
@@ -523,6 +558,8 @@ namespace gbe
     template <bool isLoad, typename T> void emitLoadOrStore(T &I);
     /*! Will try to remove MOVs due to PHI resolution */
     void removeMOVs(const ir::Liveness &liveness, ir::Function &fn);
+    /*! Optimize phi move based on liveness information */
+    void optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn);
     /*! Will try to remove redundants LOADI in basic blocks */
     void removeLOADIs(const ir::Liveness &liveness, ir::Function &fn);
     /*! To avoid lost copy, we need two values for PHI. This function create a
@@ -579,7 +616,18 @@ namespace gbe
     void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
     template <bool isLoad, typename T> void visitLoadOrStore(T &I);
 
+    INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
+    // batch vec4/8/16 load/store
+    INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                  Value *llvmValue, const ir::Register ptr,
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+    private:
+      ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
+      template <typename T, typename P = T>
+      ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
+                                            int index, ConstTypeId tid);
+      ir::ImmediateIndex processConstantVector(ConstantVector *cv, int index);
   };
 
   char GenWriter::ID = 0;
@@ -651,8 +699,11 @@ namespace gbe
       case Type::TypeID::VectorTyID:
         {
           const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+          const VectorType *vecTy = cast<VectorType>(type);
           GBE_ASSERT(cds);
           getSequentialData(cds, mem, offset);
+          if(vecTy->getNumElements() == 3) // OCL spec requires alignment to vec4
+            offset += getTypeByteSize(unit, vecTy->getElementType());
           break;
         }
       case Type::TypeID::IntegerTyID:
@@ -715,14 +766,42 @@ namespace gbe
     return false;
   }
 
-  template <typename U, typename T>
-  static U processConstant(Constant *CPV, T doIt, uint32_t index = 0u)
+  #define GET_EFFECT_DATA(_seq, _index, _tid) \
+    ((_tid == CONST_INT) ? _seq->getElementAsInteger(_index) : \
+    ((_tid == CONST_FLOAT) ? _seq->getElementAsFloat(_index) : \
+    _seq->getElementAsDouble(_index)))
+
+  // typename P is for bool only: C++'s std::vector<bool> is bit-packed, so
+  // &vec[0] yields no usable element pointer; use uint8_t for bool vectors.
+  template <typename T, typename P>
+  ir::ImmediateIndex GenWriter::processSeqConstant(ConstantDataSequential *seq,
+                                                   int index, ConstTypeId tid) {
+    if (index >= 0) {
+      const T data = GET_EFFECT_DATA(seq, index, tid);
+      return ctx.newImmediate(data);
+    } else {
+      vector<P> array;
+      for(int i = 0; i < seq->getNumElements(); i++)
+        array.push_back(GET_EFFECT_DATA(seq, i, tid));
+      return ctx.newImmediate((T*)&array[0], array.size());
+    }
+  }
+
+  ir::ImmediateIndex GenWriter::processConstantVector(ConstantVector *cv, int index) {
+    if (index >= 0) {
+      Constant *c = cv->getOperand(index);
+      return processConstantImmIndex(c, -1);
+    } else {
+      vector<ir::ImmediateIndex> immVector;
+      for (uint32_t i = 0; i < cv->getNumOperands(); i++)
+        immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
+      return ctx.newImmediate(immVector);
+    }
+  }
+
+  ir::ImmediateIndex GenWriter::processConstantImmIndexImpl(Constant *CPV, int32_t index)
   {
-#if GBE_DEBUG
-    GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
-    if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType())
-      GBE_ASSERTM(false, "Unsupported constant expression");
-#endif /* GBE_DEBUG */
+    GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
 
 #if LLVM_VERSION_MINOR > 0
     ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
@@ -730,26 +809,19 @@ namespace gbe
     if (seq) {
       Type *Ty = seq->getElementType();
       if (Ty == Type::getInt1Ty(CPV->getContext())) {
-        const uint64_t u64 = seq->getElementAsInteger(index);
-        return doIt(bool(u64));
+        return processSeqConstant<bool, uint8_t>(seq, index, CONST_INT);
       } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
-        const uint64_t u64 = seq->getElementAsInteger(index);
-        return doIt(uint8_t(u64));
+        return processSeqConstant<uint8_t>(seq, index, CONST_INT);
       } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
-        const uint64_t u64 = seq->getElementAsInteger(index);
-        return doIt(uint16_t(u64));
+        return processSeqConstant<uint16_t>(seq, index, CONST_INT);
       } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
-        const uint64_t u64 = seq->getElementAsInteger(index);
-        return doIt(uint32_t(u64));
+        return processSeqConstant<uint32_t>(seq, index, CONST_INT);
       } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
-        const uint64_t u64 = seq->getElementAsInteger(index);
-        return doIt(u64);
+        return processSeqConstant<uint64_t>(seq, index, CONST_INT);
       } else if (Ty == Type::getFloatTy(CPV->getContext())) {
-        const float f32 = seq->getElementAsFloat(index);
-        return doIt(f32);
+        return processSeqConstant<float>(seq, index, CONST_FLOAT);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
-        const double f64 = seq->getElementAsDouble(index);
-        return doIt(f64);
+        return processSeqConstant<double>(seq, index, CONST_DOUBLE);
       }
     } else
 #endif /* LLVM_VERSION_MINOR > 0 */
@@ -760,32 +832,32 @@ namespace gbe
         Ty = (cast<VectorType>(Ty))->getElementType();
       if (Ty == Type::getInt1Ty(CPV->getContext())) {
         const bool b = 0;
-        return doIt(b);
+        return ctx.newImmediate(b);
       } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
         const uint8_t u8 = 0;
-        return doIt(u8);
+        return ctx.newImmediate(u8);
       } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
         const uint16_t u16 = 0;
-        return doIt(u16);
+        return ctx.newImmediate(u16);
       } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
         const uint32_t u32 = 0;
-        return doIt(u32);
+        return ctx.newImmediate(u32);
       } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
         const uint64_t u64 = 0;
-        return doIt(u64);
+        return ctx.newImmediate(u64);
       } else if (Ty == Type::getFloatTy(CPV->getContext())) {
         const float f32 = 0;
-        return doIt(f32);
+        return ctx.newImmediate(f32);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
         const double f64 = 0;
-        return doIt(f64);
+        return ctx.newImmediate(f64);
       } else {
         GBE_ASSERTM(false, "Unsupporte aggregate zero type.");
-        return doIt(uint32_t(0));
+        return ctx.newImmediate(uint32_t(0));
       }
     } else {
       if (dyn_cast<ConstantVector>(CPV))
-        CPV = extractConstantElem(CPV, index);
+        return processConstantVector(dyn_cast<ConstantVector>(CPV), index);
       GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
 
       // Integers
@@ -793,32 +865,47 @@ namespace gbe
         Type* Ty = CI->getType();
         if (Ty == Type::getInt1Ty(CPV->getContext())) {
           const bool b = CI->getZExtValue();
-          return doIt(b);
+          return ctx.newImmediate(b);
         } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
           const uint8_t u8 = CI->getZExtValue();
-          return doIt(u8);
+          return ctx.newImmediate(u8);
         } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
           const uint16_t u16 = CI->getZExtValue();
-          return doIt(u16);
+          return ctx.newImmediate(u16);
         } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
           const uint32_t u32 = CI->getZExtValue();
-          return doIt(u32);
+          return ctx.newImmediate(u32);
         } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
           const uint64_t u64 = CI->getZExtValue();
-          return doIt(u64);
+          return ctx.newImmediate(u64);
         } else {
-          GBE_ASSERTM(false, "Unsupported integer size");
-          return doIt(uint64_t(0));
+          if (CI->getValue().getActiveBits() > 64) {
+            ctx.getUnit().setValid(false);
+            return ctx.newImmediate(uint64_t(0));
+          }
+          return ctx.newImmediate(uint64_t(CI->getZExtValue()));
         }
       }
 
       // NULL pointers
       if(isa<ConstantPointerNull>(CPV)) {
-        return doIt(uint32_t(0));
+        return ctx.newImmediate(uint32_t(0));
       }
 
-      // Floats and doubles
       const Type::TypeID typeID = CPV->getType()->getTypeID();
+      if (isa<UndefValue>(CPV)) {
+        Type* Ty = CPV->getType();
+        if (Ty == Type::getInt1Ty(CPV->getContext())) return ctx.newImmediate(false);
+        if (Ty == Type::getInt8Ty(CPV->getContext())) return ctx.newImmediate((uint8_t)0);
+        if (Ty == Type::getInt16Ty(CPV->getContext())) return ctx.newImmediate((uint16_t)0);
+        if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
+        if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
+        if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
+        if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
+        GBE_ASSERT(0 && "Unsupported undef value type.\n");
+      }
+
+      // Floats and doubles
       switch (typeID) {
         case Type::FloatTyID:
         case Type::DoubleTyID:
@@ -828,10 +915,10 @@ namespace gbe
 
           if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
             const float f32 = FPC->getValueAPF().convertToFloat();
-            return doIt(f32);
+            return ctx.newImmediate(f32);
           } else {
             const double f64 = FPC->getValueAPF().convertToDouble();
-            return doIt(f64);
+            return ctx.newImmediate(f64);
           }
         }
         break;
@@ -842,24 +929,89 @@ namespace gbe
     }
 
     GBE_ASSERTM(false, "Unsupported constant type");
-    return doIt(uint64_t(0));
+    return ctx.newImmediate(uint64_t(0));
   }
 
-  /*! Pfff. I cannot use a lambda, since it is templated. Congratulation c++ */
-  struct NewImmediateFunctor
-  {
-    NewImmediateFunctor(ir::Context &ctx) : ctx(ctx) {}
-    template <typename T> ir::ImmediateIndex operator() (const T &t) {
-      return ctx.newImmediate(t);
+  ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
+    if (dyn_cast<ConstantExpr>(CPV) == NULL)
+      return processConstantImmIndexImpl(CPV, index);
+
+    if (dyn_cast<ConstantExpr>(CPV)) {
+      ConstantExpr *ce = dyn_cast<ConstantExpr>(CPV);
+      ir::Type type = getType(ctx, ce->getType());
+      switch (ce->getOpcode()) {
+        default:
+          //ce->dump();
+          GBE_ASSERT(0 && "unsupported ce opcode.\n");
+        case Instruction::Trunc:
+        {
+          const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+          return ctx.processImm(ir::IMM_TRUNC, immIndex, type);
+        }
+        case Instruction::BitCast:
+        {
+          const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+          if (type == ir::TYPE_LARGE_INT)
+            return immIndex;
+          return ctx.processImm(ir::IMM_BITCAST, immIndex, type);
+        }
+        case Instruction::Add:
+        case Instruction::Sub:
+        case Instruction::Mul:
+        case Instruction::SDiv:
+        case Instruction::SRem:
+        case Instruction::Shl:
+        case Instruction::AShr:
+        case Instruction::LShr:
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor: {
+          const ir::ImmediateIndex lhs  = processConstantImmIndex(ce->getOperand(0), -1);
+          const ir::ImmediateIndex rhs  = processConstantImmIndex(ce->getOperand(1), -1);
+          switch (ce->getOpcode()) {
+          default:
+            //ce->dump();
+            GBE_ASSERTM(0, "Unsupported constant expression.\n");
+          case Instruction::Add:
+            return ctx.processImm(ir::IMM_ADD, lhs, rhs, type);
+          case Instruction::Sub:
+            return ctx.processImm(ir::IMM_SUB, lhs, rhs, type);
+          case Instruction::Mul:
+            return ctx.processImm(ir::IMM_MUL, lhs, rhs, type);
+          case Instruction::SDiv:
+            return ctx.processImm(ir::IMM_DIV, lhs, rhs, type);
+          case Instruction::SRem:
+            return ctx.processImm(ir::IMM_REM, lhs, rhs, type);
+          case Instruction::Shl:
+            return ctx.processImm(ir::IMM_SHL, lhs, rhs, type);
+          case Instruction::AShr:
+            return ctx.processImm(ir::IMM_ASHR, lhs, rhs, type);
+          case Instruction::LShr:
+            return ctx.processImm(ir::IMM_LSHR, lhs, rhs, type);
+          case Instruction::And:
+            return ctx.processImm(ir::IMM_AND, lhs, rhs, type);
+          case Instruction::Or:
+            return ctx.processImm(ir::IMM_OR, lhs, rhs, type);
+          case Instruction::Xor:
+            return ctx.processImm(ir::IMM_XOR, lhs, rhs, type);
+          }
+        }
+      }
     }
-    ir::Context &ctx;
-  };
+    GBE_ASSERT(0 && "unsupported constant.\n");
+    return ctx.newImmediate((uint32_t)0);
+  }
+
+  const ir::Immediate &GenWriter::processConstantImm(Constant *CPV, int32_t index) {
+    ir::ImmediateIndex immIndex = processConstantImmIndex(CPV, index);
+    return ctx.getFunction().getImmediate(immIndex);
+  }
 
   ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
-    return processConstant<ir::ImmediateIndex>(CPV, NewImmediateFunctor(ctx), index);
+    return processConstantImmIndex(CPV, index);
   }
 
-  void GenWriter::newRegister(Value *value, Value *key) {
+  void GenWriter::newRegister(Value *value, Value *key, bool uniform) {
     auto type = value->getType();
     auto typeID = type->getTypeID();
     switch (typeID) {
@@ -867,23 +1019,96 @@ namespace gbe
       case Type::FloatTyID:
       case Type::DoubleTyID:
       case Type::PointerTyID:
-        regTranslator.newScalar(value, key);
+        regTranslator.newScalar(value, key, 0, uniform);
         break;
       case Type::VectorTyID:
       {
         auto vectorType = cast<VectorType>(type);
         const uint32_t elemNum = vectorType->getNumElements();
         for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-          regTranslator.newScalar(value, key, elemID);
+          regTranslator.newScalar(value, key, elemID, uniform);
         break;
       }
       default: NOT_SUPPORTED;
     };
   }
 
+  ir::Register GenWriter::getConstantPointerRegister(ConstantExpr *expr, uint32_t elemID) {
+    Value* val = expr->getOperand(0);
+
+    if (expr->isCast()) {
+      ir::Register pointer_reg;
+      if(isa<ConstantExpr>(val)) {
+        // try to get the real pointer register, for case like:
+        // store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ...
+        // in which ptrtoint and getelementptr are ConstantExpr.
+        pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(val), elemID);
+      } else {
+        pointer_reg = regTranslator.getScalar(val, elemID);
+      }
+      // if ptrToInt requests a type other than 32bit, convert as requested
+      ir::Type dstType = getType(ctx, expr->getType());
+      ir::Type srcType = getType(ctx, val->getType());
+      if(srcType != dstType && dstType != ir::TYPE_S32) {
+        ir::Register tmp = ctx.reg(getFamily(dstType));
+        ctx.CVT(dstType, srcType, tmp, pointer_reg);
+        return tmp;
+      }
+      return pointer_reg;
+    }
+    else if (expr->getOpcode() == Instruction::GetElementPtr) {
+      uint32_t TypeIndex;
+      uint32_t constantOffset = 0;
+
+      Value *pointer = val;
+      CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+      for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
+        uint32_t offset = 0;
+        ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
+        GBE_ASSERT(ConstOP);
+        TypeIndex = ConstOP->getZExtValue();
+        if (op == 1) {
+          if (TypeIndex != 0) {
+            Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
+            uint32_t elementSize = getTypeByteSize(unit, elementType);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            elementSize += getPadding(elementSize, align);
+            offset += elementSize * TypeIndex;
+          }
+        } else {
+          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+          {
+            Type* elementType = CompTy->getTypeAtIndex(ty_i);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            offset += getPadding(offset, align);
+            offset += getTypeByteSize(unit, elementType);
+          }
+          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+          offset += getPadding(offset, align);
+        }
+
+        constantOffset += offset;
+        CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+      }
+
+      ir::Register pointer_reg;
+      if(isa<ConstantExpr>(pointer))
+        pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(pointer), elemID);
+      else
+        pointer_reg = regTranslator.getScalar(pointer, elemID);
+
+      ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+      ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
+      ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+      ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
+      return reg;
+    }
+    else
+      assert(0);
+  }
+
   ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
     GBE_ASSERT(c != NULL);
-
     if(isa<GlobalValue>(c)) {
       return regTranslator.getScalar(c, elemID);
     }
@@ -905,79 +1130,18 @@ namespace gbe
     }
 
     if(isa<ConstantExpr>(c)) {
-      ConstantExpr * ce = dyn_cast<ConstantExpr>(c);
-
-      if(ce->isCast()) {
-        Value* op = ce->getOperand(0);
-        ir::Register pointer_reg;
-        if(isa<ConstantExpr>(op)) {
-          // try to get the real pointer register, for case like:
-          // store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ...
-          // in which ptrtoint and getelementptr are ConstantExpr.
-          pointer_reg = getConstantRegister(dyn_cast<Constant>(op), elemID);
-        } else {
-          pointer_reg = regTranslator.getScalar(op, elemID);
-        }
-        // if ptrToInt request another type other than 32bit, convert as requested
-        ir::Type dstType = getType(ctx, ce->getType());
-        if(ce->getOpcode() == Instruction::PtrToInt && ir::TYPE_S32 != dstType) {
-          ir::Register tmp = ctx.reg(getFamily(dstType));
-          ctx.CVT(dstType, ir::TYPE_S32, tmp, pointer_reg);
-          return tmp;
-        }
-        return pointer_reg;
-      } else {
-        uint32_t TypeIndex;
-        uint32_t constantOffset = 0;
-
-        // currently only GetElementPtr is handled
-        GBE_ASSERT(ce->getOpcode() == Instruction::GetElementPtr);
-        Value *pointer = ce->getOperand(0);
-        CompositeType* CompTy = cast<CompositeType>(pointer->getType());
-        for(uint32_t op=1; op<ce->getNumOperands(); ++op) {
-          uint32_t offset = 0;
-          ConstantInt* ConstOP = dyn_cast<ConstantInt>(ce->getOperand(op));
-          GBE_ASSERT(ConstOP);
-          TypeIndex = ConstOP->getZExtValue();
-          if (op == 1) {
-            if (TypeIndex != 0) {
-              Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
-              uint32_t elementSize = getTypeByteSize(unit, elementType);
-              uint32_t align = getAlignmentByte(unit, elementType);
-              elementSize += getPadding(elementSize, align);
-              offset += elementSize * TypeIndex;
-            }
-          } else {
-            for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
-            {
-              Type* elementType = CompTy->getTypeAtIndex(ty_i);
-              uint32_t align = getAlignmentByte(unit, elementType);
-              offset += getPadding(offset, align);
-              offset += getTypeByteSize(unit, elementType);
-            }
-
-            const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
-            offset += getPadding(offset, align);
-          }
-
-          constantOffset += offset;
-          CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
-        }
-
-        ir::Register pointer_reg;
-        pointer_reg = regTranslator.getScalar(pointer, elemID);
-        ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
-        ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
-        ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
-        ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
-        return reg;
-      }
+      // Check whether this is a constant derived from a pointer.
+      Constant *itC = c;
+      while(isa<ConstantExpr>(itC))
+        itC = dyn_cast<ConstantExpr>(itC)->getOperand(0);
+      if (itC->getType()->isPointerTy())
+        return getConstantPointerRegister(dyn_cast<ConstantExpr>(c), elemID);
     }
 
     const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
     const ir::Immediate imm = ctx.getImmediate(immIndex);
-    const ir::Register reg = ctx.reg(getFamily(imm.type));
-    ctx.LOADI(imm.type, reg, immIndex);
+    const ir::Register reg = ctx.reg(getFamily(imm.getType()));
+    ctx.LOADI(imm.getType(), reg, immIndex);
     return reg;
   }
 
@@ -1012,7 +1176,7 @@ namespace gbe
       // If the "taken" successor is the next block, we try to invert the
       // branch.
       BasicBlock *succ = I->getSuccessor(0);
-      if (llvm::next(Function::iterator(bb)) != Function::iterator(succ))
+      if (std::next(Function::iterator(bb)) != Function::iterator(succ))
         return;
 
       // More than one use is too complicated: we skip it
@@ -1040,15 +1204,15 @@ namespace gbe
     for (BasicBlock::iterator I = succ->begin(); isa<PHINode>(I); ++I) {
       PHINode *PN = cast<PHINode>(I);
       Value *IV = PN->getIncomingValueForBlock(curr);
+      Type *llvmType = PN->getType();
+      const ir::Type type = getType(ctx, llvmType);
+      Value *PHICopy = this->getPHICopy(PN);
+      const ir::Register dst = this->getRegister(PHICopy);
       if (!isa<UndefValue>(IV)) {
-        Type *llvmType = PN->getType();
-        const ir::Type type = getType(ctx, llvmType);
 
         // Emit the MOV required by the PHI function. We do it simple and do not
         // try to optimize them. A next data flow analysis pass on the Gen IR
         // will remove them
-        Value *PHICopy = this->getPHICopy(PN);
-        const ir::Register dst = this->getRegister(PHICopy);
         Constant *CP = dyn_cast<Constant>(IV);
         if (CP) {
           GBE_ASSERT(isa<GlobalValue>(CP) == false);
@@ -1056,13 +1220,20 @@ namespace gbe
           if (CPV && dyn_cast<ConstantVector>(CPV) &&
               isa<UndefValue>(extractConstantElem(CPV, 0)))
             continue;
-          const ir::ImmediateIndex immIndex = this->newImmediate(CP);
-          const ir::Immediate imm = ctx.getImmediate(immIndex);
-          ctx.LOADI(imm.type, dst, immIndex);
+          ctx.MOV(type, dst, getRegister(CP));
         } else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
           const ir::Register src = this->getRegister(IV);
           ctx.MOV(type, dst, src);
         }
+        assert(!ctx.getBlock()->undefPhiRegs.contains(dst));
+        ctx.getBlock()->definedPhiRegs.insert(dst);
+      } else {
+        // If this is an undefined value, we don't need to emit a phi copy here.
+        // But we need to record it. Later, during the liveness backward analysis,
+        // we don't need to pass the phi value/register to a BB in which the phi
+        // value is undefined. Otherwise, the phi value's liveness will be extended
+        // incorrectly and may reach back to basic block zero, which is really bad.
+        ctx.getBlock()->undefPhiRegs.insert(dst);
       }
     }
   }
@@ -1075,29 +1246,101 @@ namespace gbe
     // Loop over the kernel metadatas to set the required work group size.
     NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
     size_t reqd_wg_sz[3] = {0, 0, 0};
-    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++)
-    {
-      MDNode *node = clKernelMetaDatas->getOperand(i);
-      if (node->getOperand(0) != &F) continue;
-      for(uint j = 0; j < node->getNumOperands() - 1; j++)
-      {
-        MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
-        if (attrNode == NULL) break;
-        MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
-        if (attrName && attrName->getString() == "reqd_work_group_size") {
-          GBE_ASSERT(attrNode->getNumOperands() == 4);
-          ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
-          ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
-          ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
-          GBE_ASSERT(x && y && z);
-          reqd_wg_sz[0] = x->getZExtValue();
-          reqd_wg_sz[1] = y->getZExtValue();
-          reqd_wg_sz[2] = z->getZExtValue();
-          break;
-        }
+    size_t hint_wg_sz[3] = {0, 0, 0};
+    ir::FunctionArgument::InfoFromLLVM llvmInfo;
+    MDNode *node = NULL;
+    MDNode *addrSpaceNode = NULL;
+    MDNode *typeNameNode = NULL;
+    MDNode *accessQualNode = NULL;
+    MDNode *typeQualNode = NULL;
+    MDNode *argNameNode = NULL;
+
+    std::string functionAttributes;
+
+    /* First find the meta data belong to this function. */
+    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
+      node = clKernelMetaDatas->getOperand(i);
+      if (node->getOperand(0) == &F) break;
+      node = NULL;
+    }
+
+    /* Because of "-cl-kernel-arg-info", we should always have metadata. */
+    if (!F.arg_empty())
+      assert(node);
+
+
+    for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+      MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+      if (attrNode == NULL) break;
+      MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+      if (!attrName) continue;
+
+      if (attrName->getString() == "reqd_work_group_size") {
+        GBE_ASSERT(attrNode->getNumOperands() == 4);
+        ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+        ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+        ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+        GBE_ASSERT(x && y && z);
+        reqd_wg_sz[0] = x->getZExtValue();
+        reqd_wg_sz[1] = y->getZExtValue();
+        reqd_wg_sz[2] = z->getZExtValue();
+        functionAttributes += attrName->getString();
+        std::stringstream param;
+        char buffer[100];
+        param <<"(";
+        param << reqd_wg_sz[0];
+        param << ",";
+        param << reqd_wg_sz[1];
+        param << ",";
+        param << reqd_wg_sz[2];
+        param <<")";
+        param >> buffer;
+        functionAttributes += buffer;
+        functionAttributes += " ";
+        break;
+      } else if (attrName->getString() == "kernel_arg_addr_space") {
+        addrSpaceNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_access_qual") {
+        accessQualNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_type") {
+        typeNameNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_type_qual") {
+        typeQualNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_name") {
+        argNameNode = attrNode;
+      } else if (attrName->getString() == "vec_type_hint") {
+        GBE_ASSERT(attrNode->getNumOperands() == 3);
+        functionAttributes += attrName->getString();
+        functionAttributes += " ";
+      } else if (attrName->getString() == "work_group_size_hint") {
+        GBE_ASSERT(attrNode->getNumOperands() == 4);
+        ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+        ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+        ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+        GBE_ASSERT(x && y && z);
+        hint_wg_sz[0] = x->getZExtValue();
+        hint_wg_sz[1] = y->getZExtValue();
+        hint_wg_sz[2] = z->getZExtValue();
+        functionAttributes += attrName->getString();
+        std::stringstream param;
+        char buffer[100];
+        param <<"(";
+        param << hint_wg_sz[0];
+        param << ",";
+        param << hint_wg_sz[1];
+        param << ",";
+        param << hint_wg_sz[2];
+        param <<")";
+        param >> buffer;
+        functionAttributes += buffer;
+        functionAttributes += " ";
       }
     }
+    ctx.appendSurface(1, ir::ocl::stackbuffer);
+
     ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
+
+    ctx.getFunction().setFunctionAttributes(functionAttributes);
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
       uint32_t argID = 0;
@@ -1111,18 +1354,29 @@ namespace gbe
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
 
-        //add support for vector argument
+        llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+        llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+        if (llvmInfo.typeName.find("image") != std::string::npos &&
+            llvmInfo.typeName.find("*") != std::string::npos) {
+          uint32_t start = llvmInfo.typeName.find("image");
+          uint32_t end = llvmInfo.typeName.find("*");
+          llvmInfo.typeName = llvmInfo.typeName.substr(start, end - start);
+        }
+        llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
+        llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+        llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+
+        // function arguments are uniform values.
+        this->newRegister(I, NULL, true);
+        // add support for vector argument.
         if(type->isVectorTy()) {
           VectorType *vectorType = cast<VectorType>(type);
-
-          this->newRegister(I);
           ir::Register reg = getRegister(I, 0);
-
           Type *elemType = vectorType->getElementType();
           const uint32_t elemSize = getTypeByteSize(unit, elemType);
           const uint32_t elemNum = vectorType->getNumElements();
           //vector's elemType always scalar type
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize, getAlignmentByte(unit, type));
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
 
           ir::Function& fn = ctx.getFunction();
           for(uint32_t i=1; i < elemNum; i++) {
@@ -1135,9 +1389,9 @@ namespace gbe
 
         GBE_ASSERTM(isScalarType(type) == true,
                     "vector type in the function argument is not supported yet");
-        const ir::Register reg = regTranslator.newScalar(I);
+        const ir::Register reg = getRegister(I);
         if (type->isPointerTy() == false)
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type), getAlignmentByte(unit, type));
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
         else {
           PointerType *pointerType = dyn_cast<PointerType>(type);
           Type *pointed = pointerType->getElementType();
@@ -1148,7 +1402,7 @@ namespace gbe
           if (I->hasByValAttr()) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
             const size_t structSize = getTypeByteSize(unit, pointed);
-            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, structSize, getAlignmentByte(unit, type));
+            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
           }
           // Regular user provided pointer (global, local or constant)
           else {
@@ -1158,18 +1412,21 @@ namespace gbe
             const uint32_t align = getAlignmentByte(unit, pointed);
               switch (addrSpace) {
               case ir::MEM_GLOBAL:
-                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, ptrSize, align);
+                globalPointer.insert(std::make_pair(I, btiBase));
+                ctx.appendSurface(btiBase, reg);
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
+                incBtiBase();
               break;
               case ir::MEM_LOCAL:
-                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, ptrSize, align);
+                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, 0xfe);
                 ctx.getFunction().setUseSLM(true);
               break;
               case ir::MEM_CONSTANT:
-                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, ptrSize, align);
+                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg,  llvmInfo, ptrSize, align, 0x2);
               break;
               case ir::IMAGE:
-                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize, align);
-                ctx.getFunction().getImageSet()->append(reg, &ctx);
+                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
+                ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
               break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
             }
@@ -1255,6 +1512,77 @@ namespace gbe
     });
   }
 
+  void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn)
+  {
+    // The overall idea behind is we check whether there is any interference
+    // between phi and phiCopy live range. If there is no point that
+    // phi & phiCopy are both alive, then we can optimize off the move
+    // from phiCopy to phi, and use phiCopy directly instead of phi.
+    using namespace ir;
+    ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
+
+    for (auto &it : phiMap) {
+      const Register phi = it.first;
+      const Register phiCopy = it.second;
+
+      const ir::DefSet *phiCopyDef = dag->getRegDef(phiCopy);
+      const ir::UseSet *phiUse = dag->getRegUse(phi);
+      const DefSet *phiDef = dag->getRegDef(phi);
+      bool isOpt = true;
+      for (auto &x : *phiCopyDef) {
+        const ir::Instruction * phiCopyDefInsn = x->getInstruction();
+        const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
+        const Liveness::LiveOut &out = liveness.getLiveOut(bb);
+        // phi & phiCopy are both alive at the endpoint of bb,
+        // thus can not be optimized.
+        if (out.contains(phi)) {
+          isOpt = false;
+          break;
+        }
+        // If phi is used in the same BB that define the phiCopy,
+        // we need carefully check the liveness of phi & phiCopy.
+        // Make sure their live ranges do not interfere.
+        bool phiUsedInSameBB = false;
+        for (auto &y : *phiUse) {
+          const ir::Instruction *phiUseInsn = y->getInstruction();
+          const ir::BasicBlock *bb2 = phiUseInsn->getParent();
+          if (bb2 == bb) {
+            phiUsedInSameBB = true;
+          }
+        }
+        // Check phi is not used between phiCopy def point and bb's end point,
+        // which is often referred as 'phi swap issue', just like below:
+        //   MOV phiCopy_1, x;
+        //   MOV phiCopy_2, phi_1;
+        if (phiUsedInSameBB ) {
+          for (auto it = --bb->end(); it != bb->end() ; --it) {
+            const Instruction &p = *it;
+
+            if (&p == phiCopyDefInsn) break;
+            // we only care MOV here
+            if (p.getSrcNum() == 1 && p.getSrc(0) == phi) {
+              isOpt = false;
+              break;
+            }
+          }
+        }
+      }
+
+      // [MOV phi, phiCopy;] can be removed. So we remove it
+      // and replace phi uses with phiCopy
+      if (isOpt) {
+        for (auto &x : *phiDef) {
+          const_cast<Instruction *>(x->getInstruction())->remove();
+        }
+        for (auto &x : *phiUse) {
+          const Instruction *phiUseInsn = x->getInstruction();
+          replaceSrc(const_cast<Instruction *>(phiUseInsn), phi, phiCopy);
+        }
+      }
+    }
+    delete dag;
+  }
+
   void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
   {
     // We store the last write and last read for each register
@@ -1383,6 +1711,24 @@ namespace gbe
   BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
   BVAR(OCL_OPTIMIZE_LOADI, true);
 
+  static const Instruction *getInstructionUseLocal(const Value *v) {
+    // Local variable can only be used in one kernel function. So, if we find
+    // one instruction that use the local variable, simply return.
+    const Instruction *insn = NULL;
+    for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+    // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+      const User *theUser = *iter;
+#else
+      const User *theUser = iter->getUser();
+#endif
+      if(isa<Instruction>(theUser)) return cast<const Instruction>(theUser);
+      insn = getInstructionUseLocal(theUser);
+      if(insn != NULL) break;
+    }
+    return insn;
+  }
+
   void GenWriter::allocateGlobalVariableRegister(Function &F)
   {
     // Allocate a address register for each global variable
@@ -1394,6 +1740,14 @@ namespace gbe
 
       ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
       if(addrSpace == ir::MEM_LOCAL) {
+        const Value * val = cast<Value>(&v);
+        const Instruction *insn = getInstructionUseLocal(val);
+        GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
+
+        const BasicBlock * bb = insn->getParent();
+        const Function * func = bb->getParent();
+        if(func != &F) continue;
+
         ir::Function &f = ctx.getFunction();
         f.setUseSLM(true);
         const Constant *c = v.getInitializer();
@@ -1403,22 +1757,6 @@ namespace gbe
         uint32_t padding = getPadding(oldSlm*8, align);
 
         f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
-        const Value * val = cast<Value>(&v);
-        // local variable can only be used in one kernel function. so, don't need to check its all uses.
-        // loop through the Constant to find the instruction that use the global variable
-        // FIXME need to find a more grace way to find the function which use this local data.
-        const Instruction * insn = NULL;
-        for( Value::const_use_iterator it = val->use_begin(), prev = val->use_begin();
-             it != prev->use_end() && insn == NULL;
-             prev = it, it = it->use_begin() )
-          for( Value::const_use_iterator innerIt = it;
-               innerIt != val->use_end() && insn == NULL;
-               innerIt++)
-            insn = dyn_cast<Instruction>(*innerIt);
-        GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
-        const BasicBlock * bb = insn->getParent();
-        const Function * func = bb->getParent();
-        if(func != &F) continue;
 
         this->newRegister(const_cast<GlobalVariable*>(&v));
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
@@ -1431,10 +1769,106 @@ namespace gbe
         GBE_ASSERT(con.getName() == v.getName());
         ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
       } else {
-        GBE_ASSERT(0);
+        if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+          ctx.appendSurface(btiBase, ir::ocl::printfbptr);
+          ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
+          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
+        } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+          ctx.appendSurface(btiBase, ir::ocl::printfiptr);
+          ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
+          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
+	} else if(v.getName().str().substr(0, 4) == ".str") {
+          /* When there are multi printf statements in multi kernel fucntions within the same
+             translate unit, if they have the same sting parameter, such as
+             kernel_func1 () {
+               printf("Line is %d\n", line_num1);
+             }
+             kernel_func2 () {
+               printf("Line is %d\n", line_num2);
+             }
+             The Clang will just generate one global string named .strXXX to represent "Line is %d\n"
+             So when translating the kernel_func1, we can not unref that global var, so we will
+             get here. Just ignore it to avoid assert. */
+        } else {
+          GBE_ASSERT(0);
+        }
+      }
+    }
+
+  }
+  static INLINE void findAllLoops(LoopInfo * LI, std::vector<std::pair<Loop*, int>> &lp)
+  {
+      for (Loop::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) {
+        lp.push_back(std::make_pair(*I, -1));
+      }
+      if (lp.size() == 0) return;
+
+      uint32_t i = 0;
+      do {
+        const std::vector<Loop*> subLoops = lp[i].first->getSubLoops();
+        for(auto sub : subLoops)
+          lp.push_back(std::make_pair(sub, i));
+        i++;
+      } while(i < lp.size());
+  }
+
+  void GenWriter::gatherLoopInfo(ir::Function &fn) {
+    vector<ir::LabelIndex> loopBBs;
+    vector<std::pair<ir::LabelIndex, ir::LabelIndex>> loopExits;
+    std::vector<std::pair<Loop*, int>> lp;
+
+    findAllLoops(LI, lp);
+#if GBE_DEBUG
+    // check two loops' interference
+    for(unsigned int i = 0; i < lp.size(); i++) {
+        SmallVector<Loop::Edge, 8> exitBBs;
+        lp[i].first->getExitEdges(exitBBs);
+
+      const std::vector<BasicBlock*> &inBBs = lp[i].first->getBlocks();
+      std::vector<ir::LabelIndex> bbs1;
+      for(auto x : inBBs) {
+        bbs1.push_back(labelMap[x]);
+      }
+      std::sort(bbs1.begin(), bbs1.end());
+      for(unsigned int j = i+1; j < lp.size(); j++) {
+        if(! lp[i].first->contains(lp[j].first)) {
+          const std::vector<BasicBlock*> &inBBs2 = lp[j].first->getBlocks();
+          std::vector<ir::LabelIndex> bbs2;
+          std::vector<ir::LabelIndex> bbs3;
+
+          for(auto x : inBBs2) {
+            bbs2.push_back(labelMap[x]);
+          }
+
+          std::sort(bbs2.begin(), bbs2.end());
+          std::set_intersection(bbs1.begin(), bbs1.end(), bbs2.begin(), bbs2.end(), std::back_inserter(bbs3));
+          GBE_ASSERT(bbs3.size() < 1);
+        }
       }
     }
+#endif
+
+    for (auto loop : lp) {
+      loopBBs.clear();
+      loopExits.clear();
+
+      const std::vector<BasicBlock*> &inBBs = loop.first->getBlocks();
+      for (auto b : inBBs) {
+        GBE_ASSERT(labelMap.find(b) != labelMap.end());
+        loopBBs.push_back(labelMap[b]);
+      }
 
+      SmallVector<Loop::Edge, 8> exitBBs;
+      loop.first->getExitEdges(exitBBs);
+      for(auto b : exitBBs){
+        GBE_ASSERT(labelMap.find(b.first) != labelMap.end());
+        GBE_ASSERT(labelMap.find(b.second) != labelMap.end());
+        loopExits.push_back(std::make_pair(labelMap[b.first], labelMap[b.second]));
+      }
+      fn.addLoop(loopBBs, loopExits);
+    }
   }
 
   void GenWriter::emitFunction(Function &F)
@@ -1452,6 +1886,7 @@ namespace gbe
     }
 
     ctx.startFunction(F.getName());
+    ir::Function &fn = ctx.getFunction();
     this->regTranslator.clear();
     this->labelMap.clear();
     this->emitFunctionPrototype(F);
@@ -1472,17 +1907,20 @@ namespace gbe
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       this->simplifyTerminator(BB);
 
+    // gather loop info, which is useful for liveness analysis
+    gatherLoopInfo(fn);
+
     // ... then, emit the instructions for all basic blocks
     pass = PASS_EMIT_INSTRUCTIONS;
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       emitBasicBlock(BB);
-    ir::Function &fn = ctx.getFunction();
     ctx.endFunction();
 
     // Liveness can be shared when we optimized the immediates and the MOVs
-    const ir::Liveness liveness(fn);
+    ir::Liveness liveness(fn);
 
     if (OCL_OPTIMIZE_LOADI) this->removeLOADIs(liveness, fn);
+    if (OCL_OPTIMIZE_PHI_MOVES) this->optimizePhiCopy(liveness, fn);
     if (OCL_OPTIMIZE_PHI_MOVES) this->removeMOVs(liveness, fn);
   }
 
@@ -1642,10 +2080,26 @@ namespace gbe
       case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
       case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
       case ICmpInst::FCMP_OGT: ctx.GT(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_ORD: ctx.ORD(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ORD:
+        //If there is a constant between src0 and src1, this constant value
+        //must ordered, otherwise, llvm will optimize the instruction to ture.
+        //So discard this constant value, only compare the other src.
+        if(isa<ConstantFP>(I.getOperand(0)))
+          ctx.EQ(type, dst, src1, src1);
+        else if(isa<ConstantFP>(I.getOperand(1)))
+          ctx.EQ(type, dst, src0, src0);
+        else
+          ctx.ORD(type, dst, src0, src1);
+        break;
       case ICmpInst::FCMP_UNO:
-        ctx.ORD(type, tmp, src0, src1);
-        ctx.XOR(insnType, dst, tmp, getRegister(cv));  //TODO: Use NOT directly
+        if(isa<ConstantFP>(I.getOperand(0)))
+          ctx.NE(type, dst, src1, src1);
+        else if(isa<ConstantFP>(I.getOperand(1)))
+          ctx.NE(type, dst, src0, src0);
+        else {
+          ctx.ORD(type, tmp, src0, src1);
+          ctx.XOR(insnType, dst, tmp, getRegister(cv));  //TODO: Use NOT directly
+        }
         break;
       case ICmpInst::FCMP_UEQ:
         ctx.NE(type, tmp, src0, src1);
@@ -1742,7 +2196,7 @@ namespace gbe
           const ir::ImmediateIndex index = ctx.newImmediate(CPV);
           const ir::Immediate imm = ctx.getImmediate(index);
           const ir::Register reg = this->getRegister(dstValue);
-          ctx.LOADI(imm.type, reg, index);
+          ctx.LOADI(imm.getType(), reg, index);
         }
       }
       break;
@@ -1753,6 +2207,11 @@ namespace gbe
         uint32_t srcElemNum = 0, dstElemNum = 0 ;
         ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
         ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+        // As long and double are not compatible in register storage
+        // and we do not support double yet, simply put an assert here
+        GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
+        GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
+
         if(srcElemNum > 1 || dstElemNum > 1) {
           // Build the tuple data in the vector
           vector<ir::Register> srcTupleData;
@@ -1831,15 +2290,6 @@ namespace gbe
     }
   }
 
-  /*! Once again, it is a templated functor. No lambda */
-  struct InsertExtractFunctor {
-    InsertExtractFunctor(ir::Context &ctx) : ctx(ctx) {}
-    template <typename T> ir::Immediate operator() (const T &t) {
-      return ir::Immediate(t);
-    }
-    ir::Context &ctx;
-  };
-
   /*! Because there are still fake insert/extract instruction for
    *  load/store, so keep empty function here */
   void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
@@ -1908,6 +2358,7 @@ namespace gbe
     const ir::Register dst = this->getRegister(&I);
     const ir::Register src = this->getRegister(copy);
     ctx.MOV(type, dst, src);
+    phiMap.insert(std::make_pair(dst, src));
   }
 
   void GenWriter::regAllocateBranchInst(BranchInst &I) {}
@@ -1923,7 +2374,7 @@ namespace gbe
     // successor
     if (I.isConditional() == false) {
       BasicBlock *target = I.getSuccessor(0);
-      if (llvm::next(Function::iterator(bb)) != Function::iterator(target)) {
+      if (std::next(Function::iterator(bb)) != Function::iterator(target)) {
         GBE_ASSERT(labelMap.find(target) != labelMap.end());
         const ir::LabelIndex labelIndex = labelMap[target];
         ctx.BRA(labelIndex);
@@ -1947,7 +2398,7 @@ namespace gbe
 
       // If non-taken target is the next block, there is nothing to do
       BasicBlock *bb = I.getParent();
-      if (llvm::next(Function::iterator(bb)) == Function::iterator(nonTaken))
+      if (std::next(Function::iterator(bb)) == Function::iterator(nonTaken))
         return;
 
       // This is slightly more complicated here. We need to issue one more
@@ -1984,6 +2435,10 @@ namespace gbe
             this->newRegister(&I);
           break;
 #endif /* LLVM_VERSION_MINOR >= 2 */
+          case Intrinsic::debugtrap:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+          break;
           default:
           GBE_ASSERTM(false, "Unsupported intrinsics");
         }
@@ -2034,6 +2489,10 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
       case GEN_OCL_GET_WORK_DIM:
         regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
+      case GEN_OCL_PRINTF_BUF_ADDR:
+        regTranslator.newScalarProxy(ir::ocl::printfbptr, dst); break;
+      case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
+        regTranslator.newScalarProxy(ir::ocl::printfiptr, dst); break;
       case GEN_OCL_FBH:
       case GEN_OCL_FBL:
       case GEN_OCL_COS:
@@ -2055,7 +2514,6 @@ namespace gbe
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
       case GEN_OCL_GET_IMAGE_DEPTH:
-      case GEN_OCL_GET_SAMPLER_INFO:
       case GEN_OCL_ATOMIC_ADD0:
       case GEN_OCL_ATOMIC_ADD1:
       case GEN_OCL_ATOMIC_SUB0:
@@ -2092,31 +2550,35 @@ namespace gbe
       case GEN_OCL_LGBARRIER:
         ctx.getFunction().setUseSLM(true);
         break;
-      case GEN_OCL_WRITE_IMAGE0:
-      case GEN_OCL_WRITE_IMAGE1:
-      case GEN_OCL_WRITE_IMAGE2:
-      case GEN_OCL_WRITE_IMAGE3:
-      case GEN_OCL_WRITE_IMAGE4:
-      case GEN_OCL_WRITE_IMAGE5:
-      case GEN_OCL_WRITE_IMAGE10:
-      case GEN_OCL_WRITE_IMAGE11:
-      case GEN_OCL_WRITE_IMAGE12:
-      case GEN_OCL_WRITE_IMAGE13:
-      case GEN_OCL_WRITE_IMAGE14:
-      case GEN_OCL_WRITE_IMAGE15:
+      case GEN_OCL_WRITE_IMAGE_I_1D:
+      case GEN_OCL_WRITE_IMAGE_UI_1D:
+      case GEN_OCL_WRITE_IMAGE_F_1D:
+      case GEN_OCL_WRITE_IMAGE_I_2D:
+      case GEN_OCL_WRITE_IMAGE_UI_2D:
+      case GEN_OCL_WRITE_IMAGE_F_2D:
+      case GEN_OCL_WRITE_IMAGE_I_3D:
+      case GEN_OCL_WRITE_IMAGE_UI_3D:
+      case GEN_OCL_WRITE_IMAGE_F_3D:
         break;
-      case GEN_OCL_READ_IMAGE0:
-      case GEN_OCL_READ_IMAGE1:
-      case GEN_OCL_READ_IMAGE2:
-      case GEN_OCL_READ_IMAGE3:
-      case GEN_OCL_READ_IMAGE4:
-      case GEN_OCL_READ_IMAGE5:
-      case GEN_OCL_READ_IMAGE10:
-      case GEN_OCL_READ_IMAGE11:
-      case GEN_OCL_READ_IMAGE12:
-      case GEN_OCL_READ_IMAGE13:
-      case GEN_OCL_READ_IMAGE14:
-      case GEN_OCL_READ_IMAGE15:
+      case GEN_OCL_READ_IMAGE_I_1D:
+      case GEN_OCL_READ_IMAGE_UI_1D:
+      case GEN_OCL_READ_IMAGE_F_1D:
+      case GEN_OCL_READ_IMAGE_I_2D:
+      case GEN_OCL_READ_IMAGE_UI_2D:
+      case GEN_OCL_READ_IMAGE_F_2D:
+      case GEN_OCL_READ_IMAGE_I_3D:
+      case GEN_OCL_READ_IMAGE_UI_3D:
+      case GEN_OCL_READ_IMAGE_F_3D:
+
+      case GEN_OCL_READ_IMAGE_I_1D_I:
+      case GEN_OCL_READ_IMAGE_UI_1D_I:
+      case GEN_OCL_READ_IMAGE_F_1D_I:
+      case GEN_OCL_READ_IMAGE_I_2D_I:
+      case GEN_OCL_READ_IMAGE_UI_2D_I:
+      case GEN_OCL_READ_IMAGE_F_2D_I:
+      case GEN_OCL_READ_IMAGE_I_3D_I:
+      case GEN_OCL_READ_IMAGE_UI_3D_I:
+      case GEN_OCL_READ_IMAGE_F_3D_I:
       {
         // dst is a 4 elements vector. We allocate all 4 registers here.
         uint32_t elemNum;
@@ -2133,6 +2595,8 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
       case GEN_OCL_MAD:
+      case GEN_OCL_FMAX:
+      case GEN_OCL_FMIN:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -2181,21 +2645,17 @@ namespace gbe
       case GEN_OCL_SAT_CONV_F32_TO_U32:
       case GEN_OCL_CONV_F16_TO_F32:
       case GEN_OCL_CONV_F32_TO_F16:
+      case GEN_OCL_SIMD_ANY:
+      case GEN_OCL_SIMD_ALL:
         this->newRegister(&I);
         break;
+      case GEN_OCL_PRINTF:
+        break;
       default:
         GBE_ASSERTM(false, "Function call are not supported yet");
     };
   }
 
-  struct U64CPVExtractFunctor {
-    U64CPVExtractFunctor(ir::Context &ctx) : ctx(ctx) {}
-    template <typename T> INLINE uint64_t operator() (const T &t) {
-      return uint64_t(t);
-    }
-    ir::Context &ctx;
-  };
-
   void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
     CallSite::arg_iterator AI = CS.arg_begin();
 #if GBE_DEBUG
@@ -2216,6 +2676,8 @@ namespace gbe
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register dst = this->getRegister(&I);
 
+    ir::BTI bti;
+    gatherBTI(*AI, bti);
     vector<ir::Register> src;
     uint32_t srcNum = 0;
     while(AI != AE) {
@@ -2223,7 +2685,7 @@ namespace gbe
       srcNum++;
     }
     const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-    ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
+    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
   }
 
   /* append a new sampler. should be called before any reference to
@@ -2235,10 +2697,10 @@ namespace gbe
     {
       // This is not a kernel argument sampler, we need to append it to sampler set,
       // and allocate a sampler slot for it.
-      auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-      GBE_ASSERTM(x.type == ir::TYPE_U16 || x.type == ir::TYPE_S16, "Invalid sampler type");
+      const ir::Immediate &x = processConstantImm(CPV);
+      GBE_ASSERTM(x.getType() == ir::TYPE_U16 || x.getType() == ir::TYPE_S16, "Invalid sampler type");
 
-      index = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+      index = ctx.getFunction().getSamplerSet()->append(x.getIntegerValue(), &ctx);
     } else {
       const ir::Register samplerReg = this->getRegister(*AI);
       index = ctx.getFunction().getSamplerSet()->append(samplerReg, &ctx);
@@ -2284,9 +2746,14 @@ namespace gbe
           case Intrinsic::lifetime_end:
           break;
 #endif /* LLVM_VERSION_MINOR >= 2 */
+          case Intrinsic::debugtrap:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+          break;
           default: NOT_IMPLEMENTED;
         }
       } else {
+        int image_dim;
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
         const std::string fnName = Callee->getName();
@@ -2318,6 +2785,20 @@ namespace gbe
             ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
             break;
           }
+          case GEN_OCL_SIMD_ALL:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S16, dst, src);
+            break;
+          }
+          case GEN_OCL_SIMD_ANY:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
+            break;
+          }
           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -2376,26 +2857,31 @@ namespace gbe
             ctx.GET_IMAGE_INFO(infoType, reg, surfaceID, infoReg);
             break;
           }
-          case GEN_OCL_GET_SAMPLER_INFO:
-          {
-            GBE_ASSERT(AI != AE);
-            const uint8_t index = this->appendSampler(AI); ++AI;
-            const ir::Register reg = this->getRegister(&I, 0);
-            ctx.GET_SAMPLER_INFO(reg, ir::ocl::samplerinfo, index);
-            break;
-          }
-          case GEN_OCL_READ_IMAGE0:
-          case GEN_OCL_READ_IMAGE1:
-          case GEN_OCL_READ_IMAGE2:
-          case GEN_OCL_READ_IMAGE3:
-          case GEN_OCL_READ_IMAGE4:
-          case GEN_OCL_READ_IMAGE5:
-          case GEN_OCL_READ_IMAGE10:
-          case GEN_OCL_READ_IMAGE11:
-          case GEN_OCL_READ_IMAGE12:
-          case GEN_OCL_READ_IMAGE13:
-          case GEN_OCL_READ_IMAGE14:
-          case GEN_OCL_READ_IMAGE15:
+
+          case GEN_OCL_READ_IMAGE_I_1D:
+          case GEN_OCL_READ_IMAGE_UI_1D:
+          case GEN_OCL_READ_IMAGE_F_1D:
+          case GEN_OCL_READ_IMAGE_I_1D_I:
+          case GEN_OCL_READ_IMAGE_UI_1D_I:
+          case GEN_OCL_READ_IMAGE_F_1D_I:
+            image_dim = 1;
+            goto handle_read_image;
+          case GEN_OCL_READ_IMAGE_I_2D:
+          case GEN_OCL_READ_IMAGE_UI_2D:
+          case GEN_OCL_READ_IMAGE_F_2D:
+          case GEN_OCL_READ_IMAGE_I_2D_I:
+          case GEN_OCL_READ_IMAGE_UI_2D_I:
+          case GEN_OCL_READ_IMAGE_F_2D_I:
+            image_dim = 2;
+            goto handle_read_image;
+          case GEN_OCL_READ_IMAGE_I_3D:
+          case GEN_OCL_READ_IMAGE_UI_3D:
+          case GEN_OCL_READ_IMAGE_F_3D:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
+            image_dim = 3;
+handle_read_image:
           {
             GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
             const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
@@ -2403,15 +2889,26 @@ namespace gbe
             const uint8_t sampler = this->appendSampler(AI);
             ++AI;
 
-            GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
+            ir::Register ucoord;
+            ir::Register vcoord;
             ir::Register wcoord;
-            bool is3D = false;
-            if (it->second >= GEN_OCL_READ_IMAGE10 && it->second <= GEN_OCL_READ_IMAGE15) {
-              GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
-              is3D = true;
-            } else
-              wcoord = ucoord; // not used, just a padding.
+
+            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+            if (image_dim > 1) {
+              GBE_ASSERT(AI != AE);
+              vcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              vcoord = ir::ocl::invalid;
+            }
+
+            if (image_dim > 2) {
+              GBE_ASSERT(AI != AE);
+              wcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              wcoord = ir::ocl::invalid;
+            }
 
             vector<ir::Register> dstTupleData, srcTupleData;
             const uint32_t elemNum = 4;
@@ -2426,71 +2923,86 @@ namespace gbe
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
             GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
             assert(CPV);
-            auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-            GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
-            samplerOffset = x.data.u32;
+            const ir::Immediate &x = processConstantImm(CPV);
+            GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
+            samplerOffset = x.getIntegerValue();
 #endif
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
             const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
 
-            ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
+            ir::Type dstType = ir::TYPE_U32;
 
             switch(it->second) {
-              case GEN_OCL_READ_IMAGE0:
-              case GEN_OCL_READ_IMAGE2:
-              case GEN_OCL_READ_IMAGE10:
-              case GEN_OCL_READ_IMAGE12:
+              case GEN_OCL_READ_IMAGE_I_1D:
+              case GEN_OCL_READ_IMAGE_UI_1D:
+              case GEN_OCL_READ_IMAGE_I_2D:
+              case GEN_OCL_READ_IMAGE_UI_2D:
+              case GEN_OCL_READ_IMAGE_I_3D:
+              case GEN_OCL_READ_IMAGE_UI_3D:
+              case GEN_OCL_READ_IMAGE_I_1D_I:
+              case GEN_OCL_READ_IMAGE_UI_1D_I:
+              case GEN_OCL_READ_IMAGE_I_2D_I:
+              case GEN_OCL_READ_IMAGE_UI_2D_I:
+              case GEN_OCL_READ_IMAGE_I_3D_I:
+              case GEN_OCL_READ_IMAGE_UI_3D_I:
                 dstType = ir::TYPE_U32;
-                srcType = ir::TYPE_S32;
                 break;
-              case GEN_OCL_READ_IMAGE1:
-              case GEN_OCL_READ_IMAGE3:
-              case GEN_OCL_READ_IMAGE11:
-              case GEN_OCL_READ_IMAGE13:
-                dstType = ir::TYPE_U32;
-                srcType = ir::TYPE_FLOAT;
-                break;
-              case GEN_OCL_READ_IMAGE4:
-              case GEN_OCL_READ_IMAGE14:
+              case GEN_OCL_READ_IMAGE_F_1D:
+              case GEN_OCL_READ_IMAGE_F_2D:
+              case GEN_OCL_READ_IMAGE_F_3D:
+              case GEN_OCL_READ_IMAGE_F_1D_I:
+              case GEN_OCL_READ_IMAGE_F_2D_I:
+              case GEN_OCL_READ_IMAGE_F_3D_I:
                 dstType = ir::TYPE_FLOAT;
-                srcType = ir::TYPE_S32;
-                break;
-              case GEN_OCL_READ_IMAGE5:
-              case GEN_OCL_READ_IMAGE15:
-                srcType = dstType = ir::TYPE_FLOAT;
                 break;
               default:
                 GBE_ASSERT(0); // never been here.
             }
 
+            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
+
             ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
-                       srcType == ir::TYPE_FLOAT, sampler, samplerOffset, is3D);
+                       isFloatCoord, sampler, samplerOffset);
             break;
           }
-          case GEN_OCL_WRITE_IMAGE0:
-          case GEN_OCL_WRITE_IMAGE1:
-          case GEN_OCL_WRITE_IMAGE2:
-          case GEN_OCL_WRITE_IMAGE3:
-          case GEN_OCL_WRITE_IMAGE4:
-          case GEN_OCL_WRITE_IMAGE5:
-          case GEN_OCL_WRITE_IMAGE10:
-          case GEN_OCL_WRITE_IMAGE11:
-          case GEN_OCL_WRITE_IMAGE12:
-          case GEN_OCL_WRITE_IMAGE13:
-          case GEN_OCL_WRITE_IMAGE14:
-          case GEN_OCL_WRITE_IMAGE15:
+
+          case GEN_OCL_WRITE_IMAGE_I_1D:
+          case GEN_OCL_WRITE_IMAGE_UI_1D:
+          case GEN_OCL_WRITE_IMAGE_F_1D:
+            image_dim = 1;
+            goto handle_write_image;
+          case GEN_OCL_WRITE_IMAGE_I_2D:
+          case GEN_OCL_WRITE_IMAGE_UI_2D:
+          case GEN_OCL_WRITE_IMAGE_F_2D:
+            image_dim = 2;
+            goto handle_write_image;
+          case GEN_OCL_WRITE_IMAGE_I_3D:
+          case GEN_OCL_WRITE_IMAGE_UI_3D:
+          case GEN_OCL_WRITE_IMAGE_F_3D:
+            image_dim = 3;
+handle_write_image:
           {
             GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
             const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
-            GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
-            ir::Register wcoord;
-            bool is3D = false;
-            if(it->second >= GEN_OCL_WRITE_IMAGE10 && it->second <= GEN_OCL_WRITE_IMAGE15) {
-              GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
-              is3D = true;
+            ir::Register ucoord, vcoord, wcoord;
+
+            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+
+            if (image_dim > 1) {
+              GBE_ASSERT(AI != AE);
+              vcoord = this->getRegister(*AI);
+              ++AI;
             } else
-              wcoord = ucoord; // not used, just padding.
+              vcoord = ir::ocl::invalid;
+
+            if (image_dim > 2) {
+              GBE_ASSERT(AI != AE);
+              wcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              wcoord = ir::ocl::invalid;
+            }
+
             GBE_ASSERT(AI != AE);
             vector<ir::Register> srcTupleData;
 
@@ -2505,36 +3017,27 @@ namespace gbe
             }
             const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
 
-            ir::Type srcType = ir::TYPE_U32, coordType = ir::TYPE_U32;
+            ir::Type srcType = ir::TYPE_U32;
 
             switch(it->second) {
-              case GEN_OCL_WRITE_IMAGE0:
-              case GEN_OCL_WRITE_IMAGE2:
-              case GEN_OCL_WRITE_IMAGE10:
-              case GEN_OCL_WRITE_IMAGE12:
-                srcType = coordType = ir::TYPE_U32;
-                break;
-              case GEN_OCL_WRITE_IMAGE1:
-              case GEN_OCL_WRITE_IMAGE3:
-              case GEN_OCL_WRITE_IMAGE11:
-              case GEN_OCL_WRITE_IMAGE13:
-                coordType = ir::TYPE_FLOAT;
+              case GEN_OCL_WRITE_IMAGE_I_1D:
+              case GEN_OCL_WRITE_IMAGE_UI_1D:
+              case GEN_OCL_WRITE_IMAGE_I_2D:
+              case GEN_OCL_WRITE_IMAGE_UI_2D:
+              case GEN_OCL_WRITE_IMAGE_I_3D:
+              case GEN_OCL_WRITE_IMAGE_UI_3D:
                 srcType = ir::TYPE_U32;
                 break;
-              case GEN_OCL_WRITE_IMAGE4:
-              case GEN_OCL_WRITE_IMAGE14:
+              case GEN_OCL_WRITE_IMAGE_F_1D:
+              case GEN_OCL_WRITE_IMAGE_F_2D:
+              case GEN_OCL_WRITE_IMAGE_F_3D:
                 srcType = ir::TYPE_FLOAT;
-                coordType = ir::TYPE_U32;
-                break;
-              case GEN_OCL_WRITE_IMAGE5:
-              case GEN_OCL_WRITE_IMAGE15:
-                srcType = coordType = ir::TYPE_FLOAT;
                 break;
               default:
                 GBE_ASSERT(0); // never been here.
             }
 
-            ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, coordType, is3D);
+            ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, ir::TYPE_U32);
             break;
           }
           case GEN_OCL_MUL_HI_INT:
@@ -2663,6 +3166,22 @@ namespace gbe
             ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
             break;
           }
+          case GEN_OCL_FMAX:
+          case GEN_OCL_FMIN:{
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+            //Becasue cmp's sources are same as sel's source, so cmp instruction and sel
+            //instruction will be merged to one sel_cmp instruction in the gen selection
+            //Add two intruction here for simple.
+            if(it->second == GEN_OCL_FMAX)
+              ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
+            else
+              ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
+            ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
+            break;
+          }
           case GEN_OCL_HADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2754,6 +3273,16 @@ namespace gbe
             ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
             break;
 #undef DEF
+
+          case GEN_OCL_PRINTF:
+          {
+            ir::PrintfSet::PrintfFmt* fmt = (ir::PrintfSet::PrintfFmt*)getPrintfInfo(&I);
+            ctx.getFunction().getPrintfSet()->append(fmt, unit);
+            assert(fmt);
+            break;
+          }
+          case GEN_OCL_PRINTF_BUF_ADDR:
+          case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
           default: break;
         }
       }
@@ -2779,11 +3308,13 @@ namespace gbe
     if (I.isArrayAllocation() == true) {
       Constant *CPV = dyn_cast<Constant>(src);
       GBE_ASSERT(CPV);
-      const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
-      ir::Immediate imm = ctx.getImmediate(immIndex);
-      imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
+      const ir::Immediate &imm = processConstantImm(CPV);
+      const uint64_t elemNum = imm.getIntegerValue();
       elementSize *= elemNum;
-      ctx.setImmediate(immIndex, imm);
+      if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+        immIndex = ctx.newImmediate(uint32_t(ALIGN(elementSize, 4)));
+      else
+        immIndex = ctx.newImmediate(uint64_t(ALIGN(elementSize, 4)));
     }
 
     // Now emit the stream of instructions to get the allocated pointer
@@ -2809,10 +3340,10 @@ namespace gbe
       }
     }
     // Set the destination register properly
-    ctx.MOV(imm.type, dst, stack);
+    ctx.MOV(imm.getType(), dst, stack);
 
-    ctx.LOADI(imm.type, reg, immIndex);
-    ctx.ADD(imm.type, stack, stack, reg);
+    ctx.LOADI(imm.getType(), reg, immIndex);
+    ctx.ADD(imm.getType(), stack, stack, reg);
     ctx.getFunction().pushStackSize(elementSize);
   }
 
@@ -2827,6 +3358,156 @@ namespace gbe
   }
   void GenWriter::regAllocateStoreInst(StoreInst &I) {}
 
+  void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                                      Value *llvmValues, const ir::Register ptr,
+                                      const ir::AddressSpace addrSpace,
+                                      Type * elemType, bool isLoad, ir::BTI bti) {
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+    const uint32_t perMsgNum = elemNum / msgNum;
+
+    for (uint32_t msg = 0; msg < msgNum; ++msg) {
+      // Build the tuple data in the vector
+      vector<ir::Register> tupleData; // put registers here
+      for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+        ir::Register reg;
+        if(regTranslator.isUndefConst(llvmValues, elemID)) {
+          Value *v = Constant::getNullValue(elemType);
+          reg = this->getRegister(v);
+        } else
+          reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+
+        tupleData.push_back(reg);
+      }
+      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
+
+      // We may need to update to offset the pointer
+      ir::Register addr;
+      if (msg == 0)
+        addr = ptr;
+      else {
+        const ir::Register offset = ctx.reg(pointerFamily);
+        ir::ImmediateIndex immIndex;
+        ir::Type immType;
+        // each message can read/write 16 byte
+        const int32_t stride = 16;
+        if (pointerFamily == ir::FAMILY_DWORD) {
+          immIndex = ctx.newImmediate(int32_t(msg*stride));
+          immType = ir::TYPE_S32;
+        } else {
+          immIndex = ctx.newImmediate(int64_t(msg*stride));
+          immType = ir::TYPE_S64;
+        }
+
+        addr = ctx.reg(pointerFamily);
+        ctx.LOADI(immType, offset, immIndex);
+        ctx.ADD(immType, addr, ptr, offset);
+      }
+
+      // Emit the instruction
+      if (isLoad)
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+      else
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+    }
+  }
+
+  // The idea behind is to search along the use-def chain, and find out all
+  // possible source of the pointer. Then in later codeGen, we can emit
+  // read/store instructions to these btis gathered.
+  void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
+    typedef map<const Value*, int>::iterator GlobalPtrIter;
+    Value *p;
+    size_t idx = 0;
+    int nBTI = 0;
+    std::vector<Value*> candidates;
+    candidates.push_back(pointer);
+    std::set<Value*> processed;
+
+    while (idx < candidates.size()) {
+      bool isPrivate = false;
+      bool needNewBTI = true;
+      p = candidates[idx];
+
+      while (dyn_cast<User>(p) && !dyn_cast<GlobalVariable>(p)) {
+
+        if (processed.find(p) == processed.end()) {
+          processed.insert(p);
+        } else {
+          // This use-def chain falls into a loop,
+          // it does not introduce a new buffer source.
+          needNewBTI = false;
+          break;
+        }
+
+        if (dyn_cast<SelectInst>(p)) {
+          SelectInst *sel = cast<SelectInst>(p);
+          p = sel->getTrueValue();
+          candidates.push_back(sel->getFalseValue());
+          continue;
+        }
+
+        if (dyn_cast<PHINode>(p)) {
+          PHINode* phi = cast<PHINode>(p);
+          int n = phi->getNumIncomingValues();
+          for (int j = 1; j < n; j++)
+            candidates.push_back(phi->getIncomingValue(j));
+          p = phi->getIncomingValue(0);
+          continue;
+        }
+
+        if (dyn_cast<AllocaInst>(p)) {
+          isPrivate = true;
+          break;
+        }
+        p = cast<User>(p)->getOperand(0);
+      }
+
+      if (needNewBTI == false) {
+        // go to next possible pointer source
+        idx++; continue;
+      }
+
+      uint8_t new_bti = 0;
+      if (isPrivate) {
+        new_bti = BTI_PRIVATE;
+      } else {
+        if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
+          // structure value implementation is not complete now,
+          // they are now treated as push constant, so, the load/store
+          // here is not as meaningful.
+          bti.bti[0] = BTI_PRIVATE;
+          bti.count = 1;
+          break;
+        }
+        Type *ty = p->getType();
+        if(ty->getPointerAddressSpace() == 3) {
+          // __local memory
+          new_bti = 0xfe;
+        } else {
+          // __global memory
+          GlobalPtrIter iter = globalPointer.find(p);
+          GBE_ASSERT(iter != globalPointer.end());
+          new_bti = iter->second;
+        }
+      }
+      // avoid duplicate
+      bool bFound = false;
+      for (int j = 0; j < nBTI; j++) {
+        if (bti.bti[j] == new_bti) {
+          bFound = true; break;
+        }
+      }
+      if (bFound == false) {
+        bti.bti[nBTI++] = new_bti;
+        bti.count = nBTI;
+      }
+      idx++;
+    }
+    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
+  }
+
   extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
   INLINE void GenWriter::emitLoadOrStore(T &I)
@@ -2838,15 +3519,18 @@ namespace gbe
     const bool dwAligned = (I.getAlignment() % 4) == 0;
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register ptr = this->getRegister(llvmPtr);
-
+    ir::BTI binding;
+    if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
+      gatherBTI(llvmPtr, binding);
+    }
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
-        ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
+        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
       else
-        ctx.STORE(type, ptr, addrSpace, dwAligned, values);
+        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
     }
     // A vector type requires to build a tuple
     else {
@@ -2864,12 +3548,14 @@ namespace gbe
       // count here.
       if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
           elemNum = 3;
+
       // The code is going to be fairly different from types to types (based on
       // size of each vector element)
       const ir::Type type = getType(ctx, elemType);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      const ir::RegisterFamily dataFamily = getFamily(type);
 
-      if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
@@ -2888,58 +3574,18 @@ namespace gbe
 
           // Emit the instruction
           if (isLoad)
-            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
           else
-            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned);
+            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
         }
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          // We simply use several uint4 loads
-          const uint32_t msgNum = elemNum / 4;
-          for (uint32_t msg = 0; msg < msgNum; ++msg) {
-            // Build the tuple data in the vector
-            vector<ir::Register> tupleData; // put registers here
-            for (uint32_t elemID = 0; elemID < 4; ++elemID) {
-              ir::Register reg;
-              if(regTranslator.isUndefConst(llvmValues, elemID)) {
-                Value *v = Constant::getNullValue(elemType);
-                reg = this->getRegister(v);
-              } else
-                reg = this->getRegister(llvmValues, 4*msg+elemID);
-
-              tupleData.push_back(reg);
-            }
-            const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], 4);
-
-            // We may need to update to offset the pointer
-            ir::Register addr;
-            if (msg == 0)
-              addr = ptr;
-            else {
-              const ir::Register offset = ctx.reg(pointerFamily);
-              ir::ImmediateIndex immIndex;
-              ir::Type immType;
-              if (pointerFamily == ir::FAMILY_DWORD) {
-                immIndex = ctx.newImmediate(int32_t(msg*sizeof(uint32_t[4])));
-                immType = ir::TYPE_S32;
-              } else {
-                immIndex = ctx.newImmediate(int64_t(msg*sizeof(uint64_t[4])));
-                immType = ir::TYPE_S64;
-              }
-
-              addr = ctx.reg(pointerFamily);
-              ctx.LOADI(immType, offset, immIndex);
-              ctx.ADD(immType, addr, ptr, offset);
-            }
-
-            // Emit the instruction
-            if (isLoad)
-              ctx.LOAD(type, tuple, addr, addrSpace, 4, true);
-            else
-              ctx.STORE(type, tuple, addr, addrSpace, 4, true);
-          }
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
         }
+      }
+      else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
@@ -2959,9 +3605,9 @@ namespace gbe
               ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
           }
           if (isLoad)
-           ctx.LOAD(type, addr, addrSpace, dwAligned, reg);
+           ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
           else
-           ctx.STORE(type, addr, addrSpace, dwAligned, reg);
+           ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
         }
       }
     }
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 389d5f3..cc5cdad 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,12 +84,21 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  /*! Merge load/store if possible */
+  llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
   /*! Scalarize all vector op instructions */
   llvm::FunctionPass* createScalarizePass();
+  /*! Remove/add NoDuplicate function attribute for barrier functions. */
+  llvm::ModulePass* createBarrierNodupPass(bool);
 
   /*! Convert the Intrinsic call to gen function */
   llvm::BasicBlockPass *createIntrinsicLoweringPass();
 
+  /*! Passer the printf function call. */
+  llvm::FunctionPass* createPrintfParserPass();
+
+  void* getPrintfInfo(llvm::CallInst* inst);
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index de2890c..f3ce096 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -33,6 +33,8 @@ DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
+DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
+DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
@@ -44,34 +46,38 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
 
 // To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijtffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijtffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjtiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjtffj)
-
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijtiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijtfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijtiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijtfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjtiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D, _Z21__gen_ocl_read_imageijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D, _Z22__gen_ocl_read_imageuijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D, _Z21__gen_ocl_read_imagefjtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D, _Z21__gen_ocl_read_imageijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D, _Z22__gen_ocl_read_imageuijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D, _Z21__gen_ocl_read_imagefjtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
+// work around read image with the LD message. The coords are integer type.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D_I, _Z21__gen_ocl_read_imageijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D_I, _Z22__gen_ocl_read_imageuijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D_I, _Z21__gen_ocl_read_imagefjtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D_I, _Z21__gen_ocl_read_imageijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D_I, _Z22__gen_ocl_read_imageuijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D_I, _Z21__gen_ocl_read_imagefjtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, _Z21__gen_ocl_read_imageijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, _Z22__gen_ocl_read_imageuijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, _Z21__gen_ocl_read_imagefjtiiij)
 
 // To write_image functions.
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE1, _Z22__gen_ocl_write_imageijffDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE4, _Z22__gen_ocl_write_imagefjiiDv4_f)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE5, _Z22__gen_ocl_write_imagefjffDv4_f)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE2, _Z23__gen_ocl_write_imageuijiiDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE3, _Z23__gen_ocl_write_imageuijffDv4_j)
-
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE10, _Z22__gen_ocl_write_imageijiiiDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE11, _Z22__gen_ocl_write_imageijfffDv4_i)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE12, _Z23__gen_ocl_write_imageuijiiiDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE13, _Z23__gen_ocl_write_imageuijfffDv4_j)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE14, _Z22__gen_ocl_write_imagefjiiiDv4_f)
-DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE15, _Z22__gen_ocl_write_imagefjfffDv4_f)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_1D, _Z22__gen_ocl_write_imageijiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_1D, _Z23__gen_ocl_write_imageuijiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_1D, _Z22__gen_ocl_write_imagefjiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_2D, _Z22__gen_ocl_write_imageijiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_2D, _Z23__gen_ocl_write_imageuijiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_2D, _Z22__gen_ocl_write_imagefjiiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_3D, _Z22__gen_ocl_write_imageijiiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_3D, _Z23__gen_ocl_write_imageuijiiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_3D, _Z22__gen_ocl_write_imagefjiiiDv4_f)
 
 // To get image info function
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
@@ -146,9 +152,6 @@ DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
 
-// get sampler info
-DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
-
 // saturate convert
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8,  _Z16convert_char_sath)
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_I8, _Z16convert_char_sats)
@@ -181,4 +184,13 @@ DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U32, _Z16convert_uint_sati)
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
 
 DECL_LLVM_GEN_FUNCTION(CONV_F16_TO_F32, __gen_ocl_f16to32)
-DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
\ No newline at end of file
+DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
+
+// SIMD level function for internal usage
+DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
+DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
+
+// printf function
+DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+DECL_LLVM_GEN_FUNCTION(PRINTF_BUF_ADDR, __gen_ocl_printf_get_buf_addr)
+DECL_LLVM_GEN_FUNCTION(PRINTF_INDEX_BUF_ADDR, __gen_ocl_printf_get_index_buf_addr)
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index 1942860..7d04318 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -20,7 +20,7 @@
  * \author Yang Rong <rong.r.yang at intel.com>
  */
 
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MINOR <= 2
 #include "llvm/Function.h"
 #include "llvm/InstrTypes.h"
@@ -42,8 +42,6 @@
 #else
 #include "llvm/IR/IRBuilder.h"
 #endif /* LLVM_VERSION_MINOR <= 1 */
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/raw_ostream.h"
 
 #include "llvm/llvm_gen_backend.hpp"
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..4bfc7f6
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * The Idea is that: As GEN support at most 4 successive DWORD load/store,
+ * then merge successive load/store that are compatible is beneficial.
+ * The method of checking whether two load/store is compatible are borrowed
+ * from Vectorize passes in llvm.
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+  class GenLoadStoreOptimization : public BasicBlockPass {
+
+  public:
+    static char ID;
+    ScalarEvolution *SE;
+    const DataLayout *TD;
+    GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.setPreservesCFG();
+    }
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      SE = &getAnalysis<ScalarEvolution>();
+      #if LLVM_VERSION_MINOR >= 5
+        DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+        TD = DLP ? &DLP->getDataLayout() : nullptr;
+      #else
+        TD = getAnalysisIfAvailable<DataLayout>();
+      #endif
+      return optimizeLoadStore(BB);
+    }
+    Type    *getValueType(Value *insn);
+    Value   *getPointerOperand(Value *I);
+    unsigned getAddressSpace(Value *I);
+    bool     isSimpleLoadStore(Value *I);
+    bool     optimizeLoadStore(BasicBlock &BB);
+
+    bool     isLoadStoreCompatible(Value *A, Value *B);
+    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+                                               SmallVector<Instruction*, 4> &merged,
+                                               BasicBlock::iterator &start,
+                                               unsigned maxLimit,
+                                               bool isLoad);
+
+    virtual const char *getPassName() const {
+      return "Merge compatible Load/stores for Gen";
+    }
+  };
+
+  char GenLoadStoreOptimization::ID = 0;
+
+  Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+    return NULL;
+  }
+  unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+    return -1;
+  }
+  bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
+    return false;
+  }
+  Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+    if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+    if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+    return NULL;
+  }
+
+  bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+    Value *ptrA = getPointerOperand(A);
+    Value *ptrB = getPointerOperand(B);
+    unsigned ASA = getAddressSpace(A);
+    unsigned ASB = getAddressSpace(B);
+
+    // Check that the address spaces match and that the pointers are valid.
+    if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+    if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
+    // Check that A and B are of the same type.
+    if (ptrA->getType() != ptrB->getType()) return false;
+
+    // Calculate the distance.
+    const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+    const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+    const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+    const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+    // Non constant distance.
+    if (!constOffSCEV) return false;
+
+    int64_t offset = constOffSCEV->getValue()->getSExtValue();
+    Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+    // The Instructions are connsecutive if the size of the first load/store is
+    // the same as the offset.
+    int64_t sz = TD->getTypeStoreSize(Ty);
+    return ((-offset) == sz);
+  }
+
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(merged[i]);
+    }
+    LoadInst *ld = cast<LoadInst>(merged[0]);
+    unsigned align = ld->getAlignment();
+    unsigned addrSpace = ld->getPointerAddressSpace();
+    // insert before first load
+    Builder.SetInsertPoint(ld);
+    VectorType *vecTy = VectorType::get(ld->getType(), size);
+    Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+                                          PointerType::get(vecTy, addrSpace));
+    LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+    vecValue->setAlignment(align);
+
+    for (unsigned i = 0; i < size; ++i) {
+      Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+      values[i]->replaceAllUsesWith(S);
+    }
+  }
+
+  BasicBlock::iterator
+  GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+                            SmallVector<Instruction*, 4> &merged,
+                            BasicBlock::iterator &start,
+                            unsigned maxLimit,
+                            bool isLoad) {
+
+    BasicBlock::iterator stepForward = start;
+    if(!isSimpleLoadStore(start)) return stepForward;
+
+    merged.push_back(start);
+
+    BasicBlock::iterator E = BB.end();
+    BasicBlock::iterator J = ++start;
+
+    for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+      if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+        if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+          merged.push_back(J);
+          stepForward = ++J;
+        }
+      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+        // simple stop to keep read/write order
+        break;
+      }
+
+      if(merged.size() >= 4) break;
+    }
+    return stepForward;
+  }
+
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+    }
+    StoreInst *st = cast<StoreInst>(merged[0]);
+    unsigned addrSpace = st->getPointerAddressSpace();
+
+    unsigned align = st->getAlignment();
+    // insert before the last store
+    Builder.SetInsertPoint(merged[size-1]);
+
+    Type *dataTy = st->getValueOperand()->getType();
+    VectorType *vecTy = VectorType::get(dataTy, size);
+    Value * parent = UndefValue::get(vecTy);
+    for(unsigned i = 0; i < size; i++) {
+      parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+    }
+
+    Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+    StoreInst *newST = Builder.CreateStore(parent, newPtr);
+    newST->setAlignment(align);
+  }
+
+  bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+    bool changed = false;
+    SmallVector<Instruction*, 4> merged;
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
+      if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
+        bool isLoad = isa<LoadInst>(*BBI) ? true: false;
+        Type *ty = getValueType(BBI);
+        if(ty->isVectorTy()) continue;
+        // we only support DWORD data type merge
+        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+        if(merged.size() > 1) {
+          if(isLoad)
+            mergeLoad(BB, merged);
+          else
+            mergeStore(BB, merged);
+          // remove merged insn
+          int size = merged.size();
+          for(int i = 0; i < size; i++)
+            merged[i]->eraseFromParent();
+          changed = true;
+        }
+        merged.clear();
+      }
+    }
+    return changed;
+  }
+
+  BasicBlockPass *createLoadStoreOptimizationPass() {
+    return new GenLoadStoreOptimization();
+  }
+};
+
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index d30a570..1a38a0c 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -30,7 +30,7 @@
  * Segovia) the right to use another license for it (MIT here)
  */
 
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MINOR <= 2
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
@@ -86,13 +86,12 @@
 #else
 #include "llvm/IR/DataLayout.h"
 #endif
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
 #include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
 #else
 #include "llvm/InstVisitor.h"
 #endif
@@ -101,7 +100,6 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/Config/config.h"
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "ir/unit.hpp"
@@ -171,14 +169,21 @@ namespace gbe
     switch (Ty->getTypeID()) {
       case Type::VoidTyID:    NOT_SUPPORTED;
       case Type::PointerTyID: return unit.getPointerSize();
-      case Type::IntegerTyID: return cast<IntegerType>(Ty)->getBitWidth();
+      case Type::IntegerTyID:
+      {
+        // use S16 to represent SLM bool variables.
+        int bitWidth = cast<IntegerType>(Ty)->getBitWidth();
+        return (bitWidth == 1) ? 16 : bitWidth;
+      }
       case Type::HalfTyID:    return 16;
       case Type::FloatTyID:   return 32;
       case Type::DoubleTyID:  return 64;
       case Type::VectorTyID:
       {
         const VectorType* VecTy = cast<VectorType>(Ty);
-        return VecTy->getNumElements() * getTypeBitSize(unit, VecTy->getElementType());
+        uint32_t numElem = VecTy->getNumElements();
+        if(numElem == 3) numElem = 4; // OCL spec
+        return numElem * getTypeBitSize(unit, VecTy->getElementType());
       }
       case Type::ArrayTyID:
       {
@@ -375,7 +380,7 @@ namespace gbe
     //replace uses of the GEP instruction with the newly calculated pointer
     GEPInst->replaceAllUsesWith(intToPtrInst);
     GEPInst->dropAllReferences();
-    GEPInst->removeFromParent();
+    GEPInst->eraseFromParent();
 
 #if FORMER_VERSION
     //insert new pointer into parent list
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
new file mode 100644
index 0000000..00e1ef8
--- /dev/null
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_printf_parser.cpp
+ *
+ * When there are printf functions existing, we have something to do here.
+ * Because the GPU's feature, it is relatively hard to parse and caculate the
+ * printf's format string. OpenCL 1.2 restrict the format string to be a
+ * constant string and can be decided at compiling time. So we add a pass here
+ * to parse the format string and check whether the parameters is valid.
+ * If all are valid, we will generate the according instruction to store the
+ * parameter content into the printf buffer. And if something is invalid, a
+ * warning is generated and the printf instruction is skipped in order to avoid
+ * GPU error. We also keep the relationship between the printf format and printf
+ * content in GPU's printf buffer here, and use the system's C standard printf to
+ * print the content after kernel executed.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+#include "ir/printf.hpp"
+
+using namespace llvm;
+
+namespace gbe
+{
+  using namespace ir;
+
+  /* Return the conversion_specifier if succeed, -1 if failed. */
+  static char __parse_printf_state(char *begin, char *end, char** rend, PrintfState * state)
+  {
+    const char *fmt;
+    state->left_justified = 0;
+    state->sign_symbol = 0; //0 for nothing, 1 for sign, 2 for space.
+    state->alter_form = 0;
+    state->zero_padding = 0;
+    state->vector_n = 0;
+    state->min_width = -1;
+    state->precision = -1;
+    state->length_modifier = 0;
+    state->conversion_specifier = PRINTF_CONVERSION_INVALID;
+    state->out_buf_sizeof_offset = -1;
+
+    fmt = begin;
+
+    if (*fmt != '%')
+      return -1;
+
+#define FMT_PLUS_PLUS do {                                  \
+      if (fmt + 1 <= end) fmt++;                             \
+      else {                                                \
+        printf("Error, line: %d, fmt > end\n", __LINE__);   \
+        return -1;                                          \
+      }                                                     \
+    }  while(0)
+
+    FMT_PLUS_PLUS;
+
+    // parse the flags.
+    while (*fmt == '-' || *fmt == '+' || *fmt == ' ' || *fmt == '#' || *fmt == '0')
+      switch (*fmt) {
+        case '-':
+          /* The result of the conversion is left-justified within the field. */
+          state->left_justified = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case '+':
+          /* The result of a signed conversion always begins with a plus or minus sign. */
+          state->sign_symbol = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case ' ':
+          /* If the first character of a signed conversion is not a sign, or if a signed
+             conversion results in no characters, a space is prefixed to the result.
+             If the space and + flags both appear,the space flag is ignored. */
+          if (state->sign_symbol == 0) state->sign_symbol = 2;
+          FMT_PLUS_PLUS;
+          break;
+        case '#':
+          /*The result is converted to an alternative form. */
+          state->alter_form = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case '0':
+          if (!state->left_justified) state->zero_padding = 1;
+          FMT_PLUS_PLUS;
+          break;
+        default:
+          break;
+      }
+
+    // The minimum field width
+    while ((*fmt >= '0') && (*fmt <= '9')) {
+      if (state->min_width < 0)
+        state->min_width = 0;
+      state->min_width = state->min_width * 10 + (*fmt - '0');
+      FMT_PLUS_PLUS;
+    }
+
+    // The precision
+    if (*fmt == '.') {
+      FMT_PLUS_PLUS;
+      state->precision = 0;
+      while (*fmt >= '0' && *fmt <= '9') {
+        state->precision = state->precision * 10 + (*fmt - '0');
+        FMT_PLUS_PLUS;
+      }
+    }
+
+    // handle the vector specifier.
+    if (*fmt == 'v') {
+      FMT_PLUS_PLUS;
+      switch (*fmt) {
+        case '2':
+        case '3':
+        case '4':
+        case '8':
+          state->vector_n = *fmt - '0';
+          FMT_PLUS_PLUS;
+          break;
+        case '1':
+          FMT_PLUS_PLUS;
+          if (*fmt == '6') {
+            state->vector_n = 16;
+            FMT_PLUS_PLUS;
+          } else
+            return -1;
+          break;
+        default:
+          //Wrong vector, error.
+          return -1;
+      }
+    }
+
+    // length modifiers
+    if (*fmt == 'h') {
+      FMT_PLUS_PLUS;
+      if (*fmt == 'h') { //hh
+        state->length_modifier = PRINTF_LM_HH;
+        FMT_PLUS_PLUS;
+      } else if (*fmt == 'l') { //hl
+        state->length_modifier = PRINTF_LM_HL;
+        FMT_PLUS_PLUS;
+      } else { //h
+        state->length_modifier = PRINTF_LM_H;
+      }
+    } else if (*fmt == 'l') {
+      state->length_modifier = PRINTF_LM_L;
+      FMT_PLUS_PLUS;
+    }
+
+#define CONVERSION_SPEC_AND_RET(XXX, xxx)                           \
+    case XXX:                                                       \
+      state->conversion_specifier = PRINTF_CONVERSION_##xxx;        \
+      FMT_PLUS_PLUS;                                                \
+      *rend = (char *)fmt;                                          \
+      return XXX;                                                   \
+      break;
+
+    // conversion specifiers
+    switch (*fmt) {
+        CONVERSION_SPEC_AND_RET('d', D)
+        CONVERSION_SPEC_AND_RET('i', I)
+        CONVERSION_SPEC_AND_RET('o', O)
+        CONVERSION_SPEC_AND_RET('u', U)
+        CONVERSION_SPEC_AND_RET('x', x)
+        CONVERSION_SPEC_AND_RET('X', X)
+        CONVERSION_SPEC_AND_RET('f', f)
+        CONVERSION_SPEC_AND_RET('F', F)
+        CONVERSION_SPEC_AND_RET('e', e)
+        CONVERSION_SPEC_AND_RET('E', E)
+        CONVERSION_SPEC_AND_RET('g', g)
+        CONVERSION_SPEC_AND_RET('G', G)
+        CONVERSION_SPEC_AND_RET('a', a)
+        CONVERSION_SPEC_AND_RET('A', A)
+        CONVERSION_SPEC_AND_RET('c', C)
+        CONVERSION_SPEC_AND_RET('s', S)
+        CONVERSION_SPEC_AND_RET('p', P)
+
+      // %% has been handled
+
+      default:
+        return -1;
+    }
+  }
+
+  static PrintfSet::PrintfFmt* parser_printf_fmt(char* format, int& num)
+  {
+    char* begin;
+    char* end;
+    char* p;
+    char ret_char;
+    char* rend;
+    PrintfState state;
+    PrintfSet::PrintfFmt* printf_fmt = new PrintfSet::PrintfFmt();
+
+    p = format;
+    begin = format;
+    end = format + strlen(format);
+
+    /* Now parse it. */
+    while (*begin) {
+      p = begin;
+
+again:
+      while (p < end && *p != '%') {
+        p++;
+      }
+      if (p < end && p + 1 == end) { // String with % at end.
+        printf("string end with %%\n");
+        goto error;
+      }
+      if (*(p + 1) == '%') { // %%
+        p += 2;
+        goto again;
+      }
+
+      if (p != begin) {
+        std::string s = std::string(begin, size_t(p - begin));
+        printf_fmt->push_back(PrintfSlot(s.c_str()));
+      }
+
+      if (p == end) // finish
+        break;
+
+      /* Now parse the % start conversion_specifier. */
+      ret_char = __parse_printf_state(p, end, &rend, &state);
+      if (ret_char < 0)
+        goto error;
+
+      printf_fmt->push_back(&state);
+      num++;
+
+      if (rend == end)
+        break;
+
+      begin = rend;
+    }
+
+#if 0
+    {
+      int j = 0;
+      for (auto &s : *printf_fmt) {
+        j++;
+        if (s.type == PRINTF_SLOT_TYPE_STATE) {
+          fprintf(stderr, "---- %d ---: state : \n", j);
+          fprintf(stderr, "		     left_justified : %d\n", s.state->left_justified);
+          fprintf(stderr, "		     sign_symbol: %d\n", s.state->sign_symbol);
+          fprintf(stderr, "		     alter_form : %d\n", s.state->alter_form);
+          fprintf(stderr, "		     zero_padding : %d\n", s.state->zero_padding);
+          fprintf(stderr, "		     vector_n : %d\n", s.state->vector_n);
+          fprintf(stderr, "		     min_width : %d\n", s.state->min_width);
+          fprintf(stderr, "		     precision : %d\n", s.state->precision);
+          fprintf(stderr, "		     length_modifier : %d\n", s.state->length_modifier);
+          fprintf(stderr, "		     conversion_specifier : %d\n", s.state->conversion_specifier);
+        } else if (s.type == PRINTF_SLOT_TYPE_STRING) {
+          fprintf(stderr, "---- %d ---: string :  %s\n", j, s.str);
+        }
+      }
+    }
+#endif
+
+    return printf_fmt;
+
+error:
+    printf("error format string.\n");
+    delete printf_fmt;
+    return NULL;
+  }
+
+  class PrintfParser : public FunctionPass
+  {
+  public:
+    static char ID;
+    typedef std::pair<Instruction*, bool> PrintfInst;
+    std::vector<PrintfInst> deadprintfs;
+    Module* module;
+    IRBuilder<>* builder;
+    Type* intTy;
+    Value* pbuf_ptr;
+    Value* index_buf_ptr;
+    int out_buf_sizeof_offset;
+    static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
+    int printf_num;
+
+    PrintfParser(void) : FunctionPass(ID)
+    {
+      module = NULL;
+      builder = NULL;
+      intTy = NULL;
+      out_buf_sizeof_offset = 0;
+      printfs.clear();
+      pbuf_ptr = NULL;
+      index_buf_ptr = NULL;
+      printf_num = 0;
+    }
+
+    ~PrintfParser(void)
+    {
+      for (auto &s : printfs) {
+        delete s.second;
+        s.second = NULL;
+      }
+      printfs.clear();
+    }
+
+
+    bool parseOnePrintfInstruction(CallInst *& call);
+    bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
+
+    virtual const char *getPassName() const
+    {
+      return "Printf Parser";
+    }
+
+    virtual bool runOnFunction(llvm::Function &F);
+  };
+
+  bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
+  {
+    CallSite CS(call);
+    CallSite::arg_iterator CI_FMT = CS.arg_begin();
+    int param_num = 0;
+
+    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
+    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+    if (!arg0_ptr) {
+      return false;
+    }
+
+    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+    if (!fmt_arg || !fmt_arg->isCString()) {
+      return false;
+    }
+
+    std::string fmt = fmt_arg->getAsCString();
+
+    PrintfSet::PrintfFmt* printf_fmt = NULL;
+
+    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
+      return false;
+    }
+
+    /* iff parameter more than %, error. */
+    /* str_fmt arg0 arg1 ... NULL */
+    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+      delete printf_fmt;
+      return false;
+    }
+
+    /* FIXME: Because the OpenCL language do not support va macro, and we do not want
+       to introduce the va_list, va_start and va_end into our code, we just simulate
+       the function calls to caculate the offset caculation here. */
+#define BUILD_CALL_INST(name) \
+    CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
+                             "__gen_ocl_get_"#name,                                         \
+                             IntegerType::getInt32Ty(module->getContext()),                 \
+                             NULL)))
+
+    BUILD_CALL_INST(group_id2);
+    BUILD_CALL_INST(group_id1);
+    BUILD_CALL_INST(group_id0);
+    BUILD_CALL_INST(global_size2);
+    BUILD_CALL_INST(global_size1);
+    BUILD_CALL_INST(global_size0);
+    BUILD_CALL_INST(local_id2);
+    BUILD_CALL_INST(local_id1);
+    BUILD_CALL_INST(local_id0);
+    BUILD_CALL_INST(local_size2);
+    BUILD_CALL_INST(local_size1);
+    BUILD_CALL_INST(local_size0);
+
+#undef BUILD_CALL_INST
+
+    Value* op0 = NULL;
+    Value* val = NULL;
+    /* calculate offset for later usage.
+       offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+       + (local_id1 + local_size1 * group_id1) * global_size0
+       + (local_id0 + local_size0 * group_id0)) * sizeof(type)  */
+
+    // local_size2 * group_id2
+    val = builder->CreateMul(local_size2, group_id2);
+    // local_id2 + local_size2 * group_id2
+    val = builder->CreateAdd(local_id2, val);
+    // global_size1 * global_size0
+    op0 = builder->CreateMul(global_size1, global_size0);
+    // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+    Value* offset1 = builder->CreateMul(val, op0);
+    // local_size1 * group_id1
+    val = builder->CreateMul(local_size1, group_id1);
+    // local_id1 + local_size1 * group_id1
+    val = builder->CreateAdd(local_id1, val);
+    // (local_id1 + local_size1 * group_id1) * global_size_0
+    Value* offset2 = builder->CreateMul(val, global_size0);
+    // local_size0 * group_id0
+    val = builder->CreateMul(local_size0, group_id0);
+    // local_id0 + local_size0 * group_id0
+    val = builder->CreateAdd(local_id0, val);
+    // The total sum
+    val = builder->CreateAdd(val, offset1);
+    Value* offset = builder->CreateAdd(val, offset2);
+
+    /////////////////////////////////////////////////////
+    /* calculate index address.
+       index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
+       index_offset = global_size2 * global_size1 * global_size0 * printf_num */
+
+    // global_size2 * global_size1
+    op0 = builder->CreateMul(global_size2, global_size1);
+    // global_size2 * global_size1 * global_size0
+    Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
+    Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
+    // index_offset + offset
+    op0 = builder->CreateAdd(index_offset, offset);
+    // (index_offset + offset)* sizeof(int)
+    op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
+    // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
+    op0 = builder->CreateAdd(index_buf_ptr, op0);
+    Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
+    builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
+
+    int i = 1;
+    Value* data_addr = NULL;
+    for (auto &s : *printf_fmt) {
+      if (s.type == PRINTF_SLOT_TYPE_STRING)
+        continue;
+
+      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+
+      Value *out_arg = call->getOperand(i);
+      Type *dst_type = NULL;
+      int sizeof_size = 0;
+      if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
+        printf("Printf: %d, parameter %d may have no result because some error\n",
+               printf_num, i - 1);
+        i++;
+        continue;
+      }
+
+      s.state->out_buf_sizeof_offset = out_buf_sizeof_offset;
+      if (!sizeof_size) {
+        i++;
+        continue;
+      }
+
+      assert(dst_type);
+
+      /////////////////////////////////////////////////////
+      /* Calculate the data address.
+      data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
+      data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
+
+      //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
+      op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
+      //offset * sizeof(specify)
+      val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
+      //data_offset + pbuf_ptr
+      op0 = builder->CreateAdd(pbuf_ptr, op0);
+      op0 = builder->CreateAdd(op0, val);
+      data_addr = builder->CreateIntToPtr(op0, dst_type);
+      builder->CreateStore(out_arg, data_addr);
+
+      out_buf_sizeof_offset += ((sizeof_size + 3) / 4) * 4;
+      i++;
+    }
+
+    CallInst* printf_inst = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+                              "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
+                              NULL)));
+    assert(printfs[printf_inst] == NULL);
+    printfs[printf_inst] = printf_fmt;
+    printf_num++;
+    return true;
+  }
+
+  /* Entry point of the printf-rewriting pass: scan kernel function F for
+     calls to __gen_ocl_printf_stub, parse each one, wire its output into
+     the printf side buffers, and finally erase the dead calls.
+     Returns true when the function was modified. */
+  bool PrintfParser::runOnFunction(llvm::Function &F)
+  {
+    bool changed = false;
+    switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+      case CallingConv::PTX_Device:
+        return false;
+      case CallingConv::PTX_Kernel:
+#else
+      case CallingConv::C:
+#endif
+        break;
+      default:
+        GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    module = F.getParent();
+    intTy = IntegerType::get(module->getContext(), 32);
+
+    // As all function calls are inlined, skip non-kernel functions.
+    bool bKernel = isKernelFunction(F);
+    if(!bKernel) return false;
+
+    builder = new IRBuilder<>(module->getContext());
+
+    /* Iterate over the function and find every printf call. */
+    for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+      for (BasicBlock::iterator instI = B->begin(),
+           instE = B->end(); instI != instE; ++instI) {
+
+        llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
+        if (!call) {
+          continue;
+        }
+
+        // NOTE(review): getCalledFunction() returns NULL for indirect
+        // calls; confirm such calls cannot reach this pass.
+        if (call->getCalledFunction()->getIntrinsicID() != 0)
+          continue;
+
+        Value *Callee = call->getCalledValue();
+        const std::string fnName = Callee->getName();
+
+        if (fnName != "__gen_ocl_printf_stub")
+          continue;
+
+        changed = true;
+
+        builder->SetInsertPoint(call);
+
+        if (!pbuf_ptr) {
+          /* alloc a new buffer ptr to collect the print output. */
+          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+          pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        }
+        if (!index_buf_ptr) {
+          /* Also materialize the index buffer pointer on first use. */
+          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+          index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        }
+
+        /* Record the call together with whether parsing succeeded, so the
+           cleanup loops below can patch users and erase it. */
+        deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
+      }
+    }
+
+    /* Replace the instruction's operand if using printf's return value. */
+    for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+      for (BasicBlock::iterator instI = B->begin(),
+           instE = B->end(); instI != instE; ++instI) {
+
+        for (unsigned i = 0; i < instI->getNumOperands(); i++) {
+          for (auto &prf : deadprintfs) {
+            if (instI->getOperand(i) == prf.first) {
+
+              /* A successfully parsed printf returns 0, a failed one -1. */
+              if (prf.second == true) {
+                instI->setOperand(i, ConstantInt::get(intTy, 0));
+              } else {
+                instI->setOperand(i, ConstantInt::get(intTy, -1));
+              }
+            }
+          }
+        }
+      }
+    }
+
+    /* Kill the dead printf instructions. */
+    for (auto &prf : deadprintfs) {
+      prf.first->dropAllReferences();
+      if (prf.first->use_empty())
+        prf.first->eraseFromParent();
+    }
+
+    deadprintfs.clear();
+    delete builder;
+
+    return changed;
+  }
+
+  /* Legalize one printf argument against the format specifier in 'slot':
+     insert the needed IR conversions on 'arg' (in/out), and report the
+     address-space-1 pointer type used to store it ('dst_type') and the
+     byte size written ('sizeof_size').  Returns false when the argument
+     cannot be matched to the specifier. */
+  bool PrintfParser::generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size)
+  {
+    assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+    assert(builder);
+
+    /* Check whether the arg match the format specifer. If needed, some
+       conversion need to be applied. */
+    switch (arg->getType()->getTypeID()) {
+      case Type::IntegerTyID: {
+        bool sign = false;
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            sign = true;
+            /* fall through: signed cases share the integer path below. */
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            /* If the bits change, we need to consider the signed. */
+            if (arg->getType() != Type::getInt32Ty(module->getContext())) {
+              arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
+            }
+
+            /* Int to Int, just store. */
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_C:
+            /* Int to Char, add a conversion. */
+            arg = builder->CreateIntCast(arg, Type::getInt8Ty(module->getContext()), false);
+            dst_type = Type::getInt8PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(char);
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            /* Integer argument for a float specifier: warn and convert. */
+            printf("Warning: Have a float paramter for %%d like specifier, take care of it\n");
+            arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
+            dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(float);
+            return true;
+
+          case PRINTF_CONVERSION_S:
+            /* Here, the case is printf("xxx%s", 0); we should output the null. */
+            sizeof_size = 0;
+            slot.state->str = "(null)";
+            return true;
+
+          default:
+            return false;
+        }
+
+        break;
+      }
+
+      case Type::DoubleTyID:
+      case Type::FloatTyID: {
+        /* Because the printf is a variable parameter function, it does not have the
+           function prototype, so the compiler will always promote the arg to the
+           longest precise type for float. So here, we can always find it is double. */
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            /* Float to Int, add a conversion. */
+            printf("Warning: Have a int paramter for %%f like specifier, take care of it\n");
+            arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            /* Float to uint, add a conversion. */
+            printf("Warning: Have a uint paramter for %%f like specifier, take care of it\n");
+            arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            /* Narrow the (promoted) double back to float before storing. */
+            arg = builder->CreateFPCast(arg, Type::getFloatTy(module->getContext()));
+            dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(float);
+            return true;
+
+          default:
+            return false;
+        }
+
+        break;
+      }
+
+      /* %p and %s */
+      case Type::PointerTyID:
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_S: {
+            /* NOTE(review): dyn_cast may return NULL here but arg0 is
+               dereferenced unchecked on the next line — verify %s args
+               are always constant expressions at this point. */
+            llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(arg);
+            llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+            if (!arg0_ptr) {
+              return false;
+            }
+
+            ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+            if (!fmt_arg || !fmt_arg->isCString()) {
+              return false;
+            }
+            /* Literal string: nothing stored at runtime, keep it in the slot. */
+            sizeof_size = 0;
+            slot.state->str = fmt_arg->getAsCString();
+            return true;
+          }
+          case PRINTF_CONVERSION_P: {
+            /* Store the pointer value as a 32-bit integer. */
+            arg = builder->CreatePtrToInt(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = arg->getType()->getPointerTo(1);
+            sizeof_size = sizeof(int);
+            return true;
+          }
+          default:
+            return false;
+        }
+
+        break;
+
+      case Type::VectorTyID: {
+        Type* vect_type = arg->getType();
+        Type* elt_type = vect_type->getVectorElementType();
+        int vec_num = vect_type->getVectorNumElements();
+        bool sign = false;
+
+        /* The vector length must match the vN modifier of the specifier. */
+        if (vec_num != slot.state->vector_n) {
+          return false;
+        }
+
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            sign = true;
+            /* fall through: signed cases share the integer path below. */
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            if (elt_type->getTypeID() != Type::IntegerTyID)
+              return false;
+
+            /* If the bits change, we need to consider the signed. */
+            if (elt_type != Type::getInt32Ty(elt_type->getContext())) {
+              /* Widen/narrow element-by-element into a new i32 vector. */
+              Value *II = NULL;
+              for (int i = 0; i < vec_num; i++) {
+                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getInt32Ty(elt_type->getContext()), vec_num));
+                Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+                Value *org = builder->CreateExtractElement(arg, cv);
+                Value *cvt = builder->CreateIntCast(org, Type::getInt32Ty(module->getContext()), sign);
+                II = builder->CreateInsertElement(vec, cvt, cv);
+              }
+              arg = II;
+            }
+
+            dst_type = arg->getType()->getPointerTo(1);
+            sizeof_size = sizeof(int) * vec_num;
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID)
+              return false;
+
+            if (elt_type->getTypeID() != Type::FloatTyID) {
+              /* Cast each double element down to float, one lane at a time. */
+              Value *II = NULL;
+              for (int i = 0; i < vec_num; i++) {
+                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getFloatTy(elt_type->getContext()), vec_num));
+                Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+                Value *org = builder->CreateExtractElement(arg, cv);
+                Value* cvt  = builder->CreateFPCast(org, Type::getFloatTy(module->getContext()));
+                II = builder->CreateInsertElement(vec, cvt, cv);
+              }
+              arg = II;
+            }
+        }
+        /* NOTE(review): the switch above has no default, so an unmatched
+           specifier (e.g. %c, %s) still reaches the shared store below and
+           returns true — confirm this is intentional. */
+        dst_type = arg->getType()->getPointerTo(1);
+        sizeof_size = sizeof(int) * vec_num;
+        return true;
+      }
+
+      default:
+        return false;
+    }
+
+    return false;
+  }
+
+  /* Map from each rewritten printf call to its parsed format description,
+     shared with the code generator through getPrintfInfo(). */
+  map<CallInst*, PrintfSet::PrintfFmt*> PrintfParser::printfs;
+
+  /* Return the parsed format attached to a rewritten printf call, or NULL
+     when the call is unknown.
+     NOTE(review): map::operator[] inserts a NULL entry on a miss, so each
+     lookup of an unknown call grows the map — consider using find(). */
+  void* getPrintfInfo(CallInst* inst)
+  {
+    if (PrintfParser::printfs[inst])
+      return (void*)PrintfParser::printfs[inst];
+    return NULL;
+  }
+
+  /* Factory used by the pass pipeline to create the printf parser pass;
+     the caller (PassManager) takes ownership of the returned object. */
+  FunctionPass* createPrintfParserPass()
+  {
+    return new PrintfParser();
+  }
+  /* Unique address used by LLVM for pass identification. */
+  char PrintfParser::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index c1790f7..3e48fbf 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -1,47 +1,43 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- */
-
 /**
  * \file llvm_scalarize.cpp
- * \author Yang Rong <rong.r.yang at intel.com>
  *
  * This file is derived from:
- *  https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=605
+ *  https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=903
  */
 
 //===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===//
 //
 // LunarGLASS: An Open Modular Shader Compiler Architecture
-// Copyright (C) 2010-2011 LunarG, Inc.
+// Copyright (C) 2010-2014 LunarG, Inc.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//     Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
 //
-// This program is free software; you can redistribute it and/or
-// modify it under the terms of the GNU General Public License
-// as published by the Free Software Foundation; version 2 of the
-// License.
+//     Redistributions in binary form must reproduce the above
+//     copyright notice, this list of conditions and the following
+//     disclaimer in the documentation and/or other materials provided
+//     with the distribution.
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
+//     Neither the name of LunarG Inc. nor the names of its
+//     contributors may be used to endorse or promote products derived
+//     from this software without specific prior written permission.
 //
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-// 02110-1301, USA.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 //
 //===----------------------------------------------------------------------===//
 //
@@ -63,7 +59,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
@@ -87,8 +83,14 @@
 #else
 #include "llvm/IR/IRBuilder.h"
 #endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
+#endif
 #include "llvm/Support/raw_ostream.h"
 
 #include "llvm/llvm_gen_backend.hpp"
@@ -639,22 +641,29 @@ namespace gbe {
 
         // Get the function arguments
         CallSite CS(call);
-        CallSite::arg_iterator CI = CS.arg_begin() + 3;
+        CallSite::arg_iterator CI = CS.arg_begin() + 2;
 
         switch (it->second) {
           default: break;
-          case GEN_OCL_READ_IMAGE0:
-          case GEN_OCL_READ_IMAGE1:
-          case GEN_OCL_READ_IMAGE2:
-          case GEN_OCL_READ_IMAGE3:
-          case GEN_OCL_READ_IMAGE4:
-          case GEN_OCL_READ_IMAGE5:
-          case GEN_OCL_READ_IMAGE10:
-          case GEN_OCL_READ_IMAGE11:
-          case GEN_OCL_READ_IMAGE12:
-          case GEN_OCL_READ_IMAGE13:
-          case GEN_OCL_READ_IMAGE14:
-          case GEN_OCL_READ_IMAGE15:
+          case GEN_OCL_READ_IMAGE_I_1D:
+          case GEN_OCL_READ_IMAGE_UI_1D:
+          case GEN_OCL_READ_IMAGE_F_1D:
+          case GEN_OCL_READ_IMAGE_I_2D:
+          case GEN_OCL_READ_IMAGE_UI_2D:
+          case GEN_OCL_READ_IMAGE_F_2D:
+          case GEN_OCL_READ_IMAGE_I_3D:
+          case GEN_OCL_READ_IMAGE_UI_3D:
+          case GEN_OCL_READ_IMAGE_F_3D:
+
+	  case GEN_OCL_READ_IMAGE_I_1D_I:
+          case GEN_OCL_READ_IMAGE_UI_1D_I:
+          case GEN_OCL_READ_IMAGE_F_1D_I:
+          case GEN_OCL_READ_IMAGE_I_2D_I:
+          case GEN_OCL_READ_IMAGE_UI_2D_I:
+          case GEN_OCL_READ_IMAGE_F_2D_I:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
           {
@@ -662,19 +671,17 @@ namespace gbe {
             extractFromVector(call);
             break;
           }
-          case GEN_OCL_WRITE_IMAGE10:
-          case GEN_OCL_WRITE_IMAGE11:
-          case GEN_OCL_WRITE_IMAGE12:
-          case GEN_OCL_WRITE_IMAGE13:
-          case GEN_OCL_WRITE_IMAGE14:
-          case GEN_OCL_WRITE_IMAGE15:
+          case GEN_OCL_WRITE_IMAGE_I_3D:
+          case GEN_OCL_WRITE_IMAGE_UI_3D:
+          case GEN_OCL_WRITE_IMAGE_F_3D:
             CI++;
-          case GEN_OCL_WRITE_IMAGE0:
-          case GEN_OCL_WRITE_IMAGE1:
-          case GEN_OCL_WRITE_IMAGE2:
-          case GEN_OCL_WRITE_IMAGE3:
-          case GEN_OCL_WRITE_IMAGE4:
-          case GEN_OCL_WRITE_IMAGE5:
+          case GEN_OCL_WRITE_IMAGE_I_2D:
+          case GEN_OCL_WRITE_IMAGE_UI_2D:
+          case GEN_OCL_WRITE_IMAGE_F_2D:
+            CI++;
+          case GEN_OCL_WRITE_IMAGE_I_1D:
+          case GEN_OCL_WRITE_IMAGE_UI_1D:
+          case GEN_OCL_WRITE_IMAGE_F_1D:
           {
             *CI = InsertToVector(call, *CI);
             break;
@@ -689,8 +696,10 @@ namespace gbe {
   {
     if(bt->getOperand(0)->getType()->isVectorTy())
       bt->setOperand(0, InsertToVector(bt, bt->getOperand(0)));
-    if(bt->getType()->isVectorTy())
+    if(bt->getType()->isVectorTy()) {
+      setAppendPoint(bt);
       extractFromVector(bt);
+    }
     return false;
   }
 
@@ -809,7 +818,6 @@ namespace gbe {
     for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
        phiI != phiE; ++phiI) {
       assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
-
       // Fill in each component of this phi
       VectorValues& vVals = vectorVals[*phiI];
       for (int c = 0; c < GetComponentCount(*phiI); ++c) {
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 8b2ac04..84ba383 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -22,7 +22,7 @@
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
-#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
@@ -55,11 +55,14 @@
 #include "llvm/Assembly/PrintModulePass.h"
 #endif
 
+#include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/llvm_gen_backend.hpp"
 #include "llvm/llvm_to_gen.hpp"
 #include "sys/cvar.hpp"
 #include "sys/platform.hpp"
 
+#include <clang/CodeGen/CodeGenAction.h>
+
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -68,17 +71,28 @@
 namespace gbe
 {
   BVAR(OCL_OUTPUT_LLVM, false);
+  BVAR(OCL_OUTPUT_CFG, false);
+  BVAR(OCL_OUTPUT_CFG_ONLY, false);
   BVAR(OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS, false);
   using namespace llvm;
 
-  void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo)
+  void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
   {
     FunctionPassManager FPM(&mod);
-    FPM.add(new DataLayout(&mod));
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    FPM.add(new DataLayoutPass(DL));
+#else
+    FPM.add(new DataLayout(DL));
+#endif
+
+    // XXX remove the verifier pass to workaround a non-fatal error.
+    // add this pass cause the Clang abort with the following error message:
+    // "Global is external, but doesn't have external or weak linkage"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
-    FPM.add(createVerifierPass(true));
+    //FPM.add(createVerifierPass(true));
 #else
-    FPM.add(createVerifierPass());
+    //FPM.add(createVerifierPass());
 #endif
     FPM.add(new TargetLibraryInfo(*libraryInfo));
     FPM.add(createTypeBasedAliasAnalysisPass());
@@ -96,11 +110,15 @@ namespace gbe
     FPM.doFinalization();
   }
 
-  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, int optLevel)
+  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel)
   {
     llvm::PassManager MPM;
 
-    MPM.add(new DataLayout(&mod));
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    MPM.add(new DataLayoutPass(DL));
+#else
+    MPM.add(new DataLayout(DL));
+#endif
     MPM.add(new TargetLibraryInfo(*libraryInfo));
     MPM.add(createTypeBasedAliasAnalysisPass());
     MPM.add(createBasicAliasAnalysisPass());
@@ -112,11 +130,14 @@ namespace gbe
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
     MPM.add(createPruneEHPass());             // Remove dead EH info
+    MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
     MPM.add(createFunctionInliningPass(200000));
+    MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
     MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
 
     //MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
-    //MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+    if(optLevel > 0)
+      MPM.add(createSROAPass(/*RequiresDomTree*/ false));
     MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
     MPM.add(createJumpThreadingPass());         // Thread jumps.
     MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
@@ -135,7 +156,7 @@ namespace gbe
     MPM.add(createLoopDeletionPass());          // Delete dead loops
     MPM.add(createLoopUnrollPass());          // Unroll small loops
     if(optLevel > 0)
-      MPM.add(createGVNPass(true));                 // Remove redundancies
+      MPM.add(createGVNPass());                 // Remove redundancies
     MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
     MPM.add(createSCCPPass());                  // Constant prop with SCCP
 
@@ -157,10 +178,8 @@ namespace gbe
     MPM.run(mod);
   }
 
-  bool llvmToGen(ir::Unit &unit, const char *fileName, int optLevel)
+  bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel)
   {
-    // Get the global LLVM context
-    llvm::LLVMContext& c = llvm::getGlobalContext();
     std::string errInfo;
     std::unique_ptr<llvm::raw_fd_ostream> o = NULL;
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS || OCL_OUTPUT_LLVM)
@@ -169,19 +188,28 @@ namespace gbe
     // Get the module from its file
     llvm::SMDiagnostic Err;
     std::auto_ptr<Module> M;
-    M.reset(ParseIRFile(fileName, Err, c));
-    if (M.get() == 0) return false;
-    Module &mod = *M.get();
+    if(fileName){
+      // only when module is null, Get the global LLVM context
+      llvm::LLVMContext& c = llvm::getGlobalContext();
+      M.reset(ParseIRFile(fileName, Err, c));
+      if (M.get() == 0) return false;
+    }
+    Module &mod = (module!=NULL)?*(llvm::Module*)module:*M.get();
+    DataLayout DL(&mod);
 
     Triple TargetTriple(mod.getTargetTriple());
     TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
     libraryInfo->disableAllFunctions();
 
-    runFuntionPass(mod, libraryInfo);
-    runModulePass(mod, libraryInfo, optLevel);
+    runFuntionPass(mod, libraryInfo, DL);
+    runModulePass(mod, libraryInfo, DL, optLevel);
 
     llvm::PassManager passes;
-
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    passes.add(new DataLayoutPass(DL));
+#else
+    passes.add(new DataLayout(DL));
+#endif
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
@@ -191,14 +219,24 @@ namespace gbe
 #endif
     passes.add(createIntrinsicLoweringPass());
     passes.add(createFunctionInliningPass(200000));
-    passes.add(createScalarReplAggregatesPass()); // Break up allocas
+    passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
+    passes.add(createLoadStoreOptimizationPass());
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
     passes.add(createLowerSwitchPass());
     passes.add(createPromoteMemoryToRegisterPass());
-    passes.add(createGVNPass());                  // Remove redundancies
+    if(optLevel > 0)
+      passes.add(createGVNPass());                  // Remove redundancies
+    passes.add(createPrintfParserPass());
     passes.add(createScalarizePass());        // Expand all vector ops
     passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
+    passes.add(createCFGSimplificationPass());     // Merge & remove BBs
+    passes.add(createScalarizePass());        // Expand all vector ops
+
+    if(OCL_OUTPUT_CFG)
+      passes.add(createCFGPrinterPass());
+    if(OCL_OUTPUT_CFG_ONLY)
+      passes.add(createCFGOnlyPrinterPass());
     passes.add(createGenPass(unit));
 
     // Print the code extra optimization passes
@@ -209,7 +247,6 @@ namespace gbe
       passes.add(createPrintModulePass(&*o));
 #endif
     passes.run(mod);
-
     return true;
   }
 } /* namespace gbe */
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index 50ea267..41e3477 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -32,7 +32,7 @@ namespace gbe {
 
   /*! Convert the LLVM IR code to a GEN IR code,
 		  optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
-  bool llvmToGen(ir::Unit &unit, const char *fileName, int optLevel);
+  bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel);
 
 } /* namespace gbe */
 
diff --git a/backend/src/ocl_barrier.ll b/backend/src/ocl_barrier.ll
index 9f46347..4e55fcb 100644
--- a/backend/src/ocl_barrier.ll
+++ b/backend/src/ocl_barrier.ll
@@ -6,9 +6,9 @@
 
 declare i32 @_get_local_mem_fence() nounwind alwaysinline
 declare i32 @_get_global_mem_fence() nounwind alwaysinline
-declare void @__gen_ocl_barrier_local() nounwind alwaysinline
-declare void @__gen_ocl_barrier_global() nounwind alwaysinline
-declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
 
 define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
   %1 = icmp eq i32 %flags, 3
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index b736a88..52f5365 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -1,6 +1,7 @@
 // This file includes defines that are common to both kernel code and
 // the NVPTX back-end.
-
+#ifndef __OCL_COMMON_DEFINES__
+#define __OCL_COMMON_DEFINES__
 //
 // Common defines for Image intrinsics
 // Channel order
@@ -121,3 +122,5 @@ typedef enum clk_sampler_type {
 // Memory synchronization
 #define CLK_LOCAL_MEM_FENCE     (1 << 0)
 #define CLK_GLOBAL_MEM_FENCE    (1 << 1)
+
+#endif   /* __OCL_COMMON_DEFINES__ */
\ No newline at end of file
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
index 7ec2aec..8326768 100644
--- a/backend/src/ocl_convert.h
+++ b/backend/src/ocl_convert.h
@@ -2281,7 +2281,7 @@ DEF(uint, float);
 
 #define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x > MAX ? (DSTTYPE)MAX : x < MIN ? (DSTTYPE)MIN : x; \
+    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
   }
 DEF(char, long, -128, 127);
 DEF(uchar, long, 0, 255);
@@ -2295,7 +2295,7 @@ DEF(ulong, float, 0, 1.8446744073709552e+19f);
 
 #define DEF(DSTTYPE, SRCTYPE, MAX) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x > MAX ? (DSTTYPE)MAX : x; \
+    return x >= MAX ? (DSTTYPE)MAX : x; \
   }
 DEF(char, ulong, 127);
 DEF(uchar, ulong, 255);
@@ -2307,12 +2307,12 @@ DEF(uint, ulong, 0xffffffffu);
 
 INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
   ulong MAX = 0x7ffffffffffffffful;
-  return x > MAX ? MAX : x;
+  return x >= MAX ? MAX : x;
 }
 
 #define DEF(DSTTYPE, SRCTYPE) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x < 0 ? 0 : x; \
+    return x <= 0 ? 0 : x; \
   }
 DEF(ushort, char);
 DEF(uint, char);
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index d191b8e..f648a8c 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -83,13 +83,25 @@ DEF(double);
 // This is a transitional hack to bypass the LLVM 3.3 built-in types.
 // See the Khronos SPIR specification for handling of these types.
 #define __texture __attribute__((address_space(4)))
+struct _image1d_t;
+typedef __texture struct _image1d_t* __image1d_t;
+struct _image1d_buffer_t;
+typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
+struct _image1d_array_t;
+typedef __texture struct _image1d_array_t* __image1d_array_t;
 struct _image2d_t;
 typedef __texture struct _image2d_t* __image2d_t;
+struct _image2d_array_t;
+typedef __texture struct _image2d_array_t* __image2d_array_t;
 struct _image3d_t;
 typedef __texture struct _image3d_t* __image3d_t;
 typedef const ushort __sampler_t;
 typedef size_t __event_t;
+#define image1d_t __image1d_t
+#define image1d_buffer_t __image1d_buffer_t
+#define image1d_array_t __image1d_array_t
 #define image2d_t __image2d_t
+#define image2d_array_t __image2d_array_t
 #define image3d_t __image3d_t
 #define sampler_t __sampler_t
 #define event_t __event_t
@@ -108,9 +120,11 @@ typedef size_t __event_t;
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL preprocessor directives & macros
 /////////////////////////////////////////////////////////////////////////////
-#define __OPENCL_VERSION__ 110
+#define __OPENCL_VERSION__ 120
 #define __CL_VERSION_1_0__ 100
 #define __CL_VERSION_1_1__ 110
+#define __CL_VERSION_1_2__ 120
+
 #define __ENDIAN_LITTLE__ 1
 #define __IMAGE_SUPPORT__ 1
 #define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
@@ -239,7 +253,7 @@ OVERLOADABLE int ocl_sadd_sat(int x, int y);
 INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
 OVERLOADABLE int ocl_ssub_sat(int x, int y);
 INLINE_OVERLOADABLE int sub_sat(int x, int y) {
-  return (y == 0x80000000u) ? (x & 0x7FFFFFFF) : ocl_ssub_sat(x, y);
+  return (y == 0x80000000u) ? (ocl_sadd_sat(ocl_sadd_sat(0x7fffffff, x), 1)) : ocl_ssub_sat(x, y);
 }
 OVERLOADABLE long ocl_sadd_sat(long x, long y);
 INLINE_OVERLOADABLE long add_sat(long x, long y) {
@@ -638,6 +652,14 @@ INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
   return y > x ? (y - x) : (x - y);
 }
 
+
+/////////////////////////////////////////////////////////////////////////////
+// SIMD level function
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
@@ -1479,65 +1501,63 @@ INLINE_OVERLOADABLE float tan(float x)
 
 INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
-  return __gen_ocl_cos(x * M_PI_F);
+  int ix;
+  if(isinf(x) || isnan(x)) { return NAN; }
+  if(x < 0.0f) { x = -x; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 1.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return __kernel_cosf(m*M_PI_F, 0.0f);
+   case 1:
+   case 2:
+    return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
+   case 3:
+   case 4:
+    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+   case 5:
+   case 6:
+    return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
+   default:
+    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+   }
 }
 INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  float y, z;
-  int n, ix;
-  ix = *(int *) (&x) & 0x7fffffff;
-  if (ix < 0x3e800000)
-    return __gen_ocl_sin(M_PI_F * x);
-  y = -x;
-  z = __gen_ocl_rndd(y);
-  if (z != y) {
-    y *= 0.5f;
-    y = 2.f * (y - __gen_ocl_rndd(y));
-    n = y * 4.f;
-  } else {
-    if (ix >= 0x4b800000) {
-      y = 0;
-      n = 0;
-    } else {
-      if (ix < 0x4b000000)
-        z = y + 8.3886080000e+06f;
-      int n = *(int *) (&z);
-      n &= 1;
-      y = n;
-      n <<= 2;
-    }
-  }
-  switch (n) {
-  case 0:
-    y = __gen_ocl_sin(M_PI_F * y);
-    break;
-  case 1:
-  case 2:
-    y = __gen_ocl_cos(M_PI_F * (0.5f - y));
-    break;
-  case 3:
-  case 4:
-    y = __gen_ocl_sin(M_PI_F * (1.f - y));
-    break;
-  case 5:
-  case 6:
-    y = -__gen_ocl_cos(M_PI_F * (y - 1.5f));
-    break;
-  default:
-    y = __gen_ocl_sin(M_PI_F * (y - 2.f));
-    break;
-  }
-  return -y;
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
+   case 1:
+   case 2:
+    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+   case 3:
+   case 4:
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
+   case 5:
+   case 6:
+    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+   default:
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
+   }
+
 }
 INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
 INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
@@ -2246,10 +2266,36 @@ INLINE_OVERLOADABLE float native_tan(float x) {
   return native_sin(x) / native_cos(x);
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  return native_tan(x * M_PI_F);
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  int n = __gen_ocl_internal_floor(m*4.0f);
+  if(m == 0.5f) {
+    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+  }
+  if(m == 0.0f) {
+    return (ix&0x1) == 0 ? 0.0f : -0.0f;
+  }
+
+  switch(n) {
+    case 0:
+      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+    case 1:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    case 2:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    default:
+      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+  }
 }
-INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(x); }
-INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
 INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   /* copied from fdlibm */
@@ -2380,20 +2426,21 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
 INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
   return __gen_ocl_internal_acos(x) / M_PI_F;
 }
+// fdlibm atan breakpoint tables: atanhi[i] + atanlo[i] together hold
+// atan at the breakpoints {0.5, 1.0, 1.5, +inf}, split into a head and
+// a tail float so the pair carries more than single precision.
+// Placed in the __constant address space so they are true compile-time
+// tables instead of being rebuilt on the stack at every call (the old
+// local-array copies inside __gen_ocl_internal_atan are removed below).
+__constant float atanhi[4] = {
+  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+__constant float atanlo[4] = {
+  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
 INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
   /* copied from fdlibm */
-  float atanhi[4];
-  atanhi[0] = 4.6364760399e-01; /* atan(0.5)hi 0x3eed6338 */
-  atanhi[1] = 7.8539812565e-01; /* atan(1.0)hi 0x3f490fda */
-  atanhi[2] = 9.8279368877e-01; /* atan(1.5)hi 0x3f7b985e */
-  atanhi[3] = 1.5707962513e+00; /* atan(inf)hi 0x3fc90fda */
-
-  float atanlo[4];
-  atanlo[0] = 5.0121582440e-09; /* atan(0.5)lo 0x31ac3769 */
-  atanlo[1] =  3.7748947079e-08; /* atan(1.0)lo 0x33222168 */
-  atanlo[2] =  3.4473217170e-08; /* atan(1.5)lo 0x33140fb4 */
-  atanlo[3] =  7.5497894159e-08; /* atan(inf)lo 0x33a22168 */
-
   float aT[11];
   aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
   aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
@@ -2621,7 +2668,7 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 
 INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
   //use native instruction when it has enough precision
-  if (x > 128 || x < -128)
+  if (x > -0x1.6p1 && x < 0x1.6p1)
   {
     return native_exp(x);
   }
@@ -2629,24 +2676,16 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
   float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
   u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
   twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one = 1.0,
-  huge = 1.0e+30,
-  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
-  P2 = -2.7777778450e-03, /* 0xbb360b61 */
-  P3 = 6.6137559770e-05, /* 0x388ab355 */
-  P4 = -1.6533901999e-06, /* 0xb5ddea0e */
-  P5 =	4.1381369442e-08; /* 0x3331bb4c */
-  float ln2HI[2],ln2LO[2],halF[2];
-  float y,hi=0.0,lo=0.0,c,t;
+  ivln2	 =	1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
+  float y,hi=0.0,lo=0.0,t;
   int k=0,xsb;
   unsigned hx;
-  ln2HI[0] = 6.9313812256e-01;	/* 0x3f317180 */
-  ln2HI[1] = -6.9313812256e-01;	/* 0xbf317180 */
-  ln2LO[0] = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  ln2LO[1] = -9.0580006145e-06; /* 0xb717f7d1 */
-  halF[0] = 0.5;
-  halF[1] =	-0.5;
+  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
+  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
+  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+  float half_0 = 0.5;
+  float half_1 =	-0.5;
 
   GEN_OCL_GET_FLOAT_WORD(hx,x);
   xsb = (hx>>31)&1;		/* sign bit of x */
@@ -2654,37 +2693,27 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
 
   /* filter out non-finite argument */
   if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    if(hx>0x7f800000)
-      return x+x;			/* NaN */
-    if(hx==0x7f800000)
-      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
-    if(x > o_threshold) return huge*huge; /* overflow */
-    if(x < u_threshold) return twom100*twom100; /* underflow */
+    // native_exp already handled this
+    return native_exp(x);
   }
+
   /* argument reduction */
   if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
     if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
+      lo= xsb == 1? ln2LO_1 : ln2LO_0;
+      k = 1-xsb-xsb;
     } else {
-      k  = ivln2*x+halF[xsb];
+      float tmp = xsb == 1 ? half_1 : half_0;
+      k  = ivln2*x+tmp;
       t  = k;
-      hi = x - t*ln2HI[0];	/* t*ln2HI is exact here */
-      lo = t*ln2LO[0];
+      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
+      lo = t*ln2LO_0;
     }
     x  = hi - lo;
   }
-  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
-    if(huge+x>one) return one+x;/* trigger inexact */
-  }
-  else k = 0;
 
-  /* x is now in primary range */
-  t  = x*x;
-  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
-  if(k==0)
-    return one-((x*c)/(c-(float)2.0)-x);
-  else
-    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  y = native_exp(x);
   if(k >= -125) {
     unsigned hy;
     GEN_OCL_GET_FLOAT_WORD(hy,y);
@@ -3171,6 +3200,8 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
 #define remainder __gen_ocl_internal_remainder
 #define ldexp __gen_ocl_internal_ldexp
 PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
 INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return __gen_ocl_mad(a, b, c);
 }
@@ -3226,14 +3257,10 @@ DECL_MIN_MAX_CLAMP(long)
 DECL_MIN_MAX_CLAMP(ulong)
 #undef DECL_MIN_MAX_CLAMP
 INLINE_OVERLOADABLE float max(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a > b ? a : b;
+  return __gen_ocl_fmax(a, b);
 }
 INLINE_OVERLOADABLE float min(float a, float b) {
-  if(isnan(b))
-    return a;
-  return a < b ? a : b;
+  return __gen_ocl_fmin(a, b);
 }
 INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
   return max(min(v, u), l);
@@ -3351,6 +3378,232 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
     return y;
   return x > y ? (x - y) : +0.f;
 }
+
+// Single-precision pow(x, y), ported from Sun's fdlibm e_powf.c.
+// Strategy: evaluate |x|**y as 2**(y*log2(|x|)), carrying log2(|x|)
+// and the product y*log2(|x|) as split high/low float pairs (the
+// *_h / *_l variables) so intermediates keep more than 24 bits.
+// Deviations from stock fdlibm: subnormal inputs are flushed to zero
+// and the subnormal pre-scaling branch is commented out, because Gen
+// hardware does not support denormals (see the inline notes below).
+INLINE_OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,sn,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,hy,ix,iy,is;
+  float bp[2],dp_h[2],dp_l[2],
+  zero    =  0.0,
+  one	=  1.0,
+  two	=  2.0,
+  two24	=  16777216.0,	/* 0x4b800000 */
+  huge	=  1.0e30,
+  tiny    =  1.0e-30,
+  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3) */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  L3  =  3.3333334327e-01, /* 0x3eaaaaab */
+  L4  =  2.7272811532e-01, /* 0x3e8ba305 */
+  L5  =  2.3066075146e-01, /* 0x3e6c3255 */
+  L6  =  2.0697501302e-01, /* 0x3e53f142 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  P3   =  6.6137559770e-05, /* 0x388ab355 */
+  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5   =  4.1381369442e-08, /* 0x3331bb4c */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
+  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  bp[0] = 1.0,bp[1] = 1.5,
+  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+  if (ix < 0x00800000) {	   /* x < 2**-126  */
+    ix = 0;/* Gen does not support subnormal number now */
+  }
+  if (iy < 0x00800000) {	  /* y < 2**-126  */
+    iy = 0;/* Gen does not support subnormal number now */
+  }
+   /* y==zero: x**0 = 1 */
+  if(iy==0) return one;
+  /* x == +1 returns 1 before the NaN check, so pow(1, NaN) == 1 */
+  if(hx==0x3f800000) return one;
+  /* +-NaN return x+y */
+  if(ix > 0x7f800000 || iy > 0x7f800000)
+    return (x+0.0f)+y+(0.0f);
+  /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+  yisint  = 0;
+  if(hx<0) {
+    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+    else if(iy>=0x3f800000) {
+      k = (iy>>23)-0x7f;	   /* exponent */
+      j = iy>>(23-k);
+      if((j<<(23-k))==iy) yisint = 2-(j&1);
+    }
+  }
+  /* special value of y */
+  if (iy==0x7f800000) {	/* y is +-inf */
+    if (ix==0x3f800000)
+      //return  y - y;	/* inf**+-1 is NaN */
+      return one;
+    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+      return (hy>=0)? y: zero;
+    else			/* (|x|<1)**-,+inf = inf,0 */
+      return (hy<0)?-y: zero;
+  }
+  if(iy==0x3f800000) {	/* y is  +-1 */
+    if(hy<0) return one/x; else return x;
+  }
+  if(hy==0x40000000) return x*x; /* y is  2 */
+  if(hy==0x3f000000) {	/* y is  0.5 */
+    if(hx>=0)return __gen_ocl_sqrt(x);
+  }
+
+  ax   = __gen_ocl_fabs(x);
+    /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+    z = ax;			/*x is +-0,+-inf,+-1*/
+    if(hy<0) z = one/z;	/* z = (1/|x|) */
+    if(hx<0) {
+      if(((ix-0x3f800000)|yisint)==0) {
+        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+      } else if(yisint==1)
+        z = -z;		/* (x<0)**odd = -(|x|**odd) */
+    }
+    return z;
+  }
+  /* n == 0 iff the sign bit of x is set (x < 0); -1 otherwise */
+  n = ((uint)hx>>31)-1;
+
+  /* (x<0)**(non-int) is NaN */
+  if((n|yisint)==0) return (x-x)/(x-x);
+
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+  /* |y| is huge */
+  if(iy>0x4d000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+	/* take care subnormal number */
+    //if(ix<0x00800000)
+      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+	/* determine interval */
+    ix = j|0x3f800000;		/* normalize ix */
+    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    is = ((ix>>1)&0xfffff000)|0x20000000;
+    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = 3.0f+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+    t_l = r-((t_h-3.0f)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+  GEN_OCL_GET_FLOAT_WORD(is,y);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  p_l = (y-y1)*t1+y*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)				/* if z > 128 */
+    return sn*huge*huge;			/* overflow */
+  else if (j==0x43000000) {			/* if z == 128 */
+    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
+    return sn*tiny*tiny;			/* underflow */
+  else if (j==0xc3160000){			/* z == -150 */
+    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
+  }
+
+  /*
+    * compute 2**(p_h+p_l)
+    */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
+    n = j+(0x00800000>>(k+1));
+    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
+    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+    n = ((n&0x007fffff)|0x00800000)>>(23-k);
+    if(j<0) n = -n;
+    p_h -= t;
+  }
+  t = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
+
+
 INLINE_OVERLOADABLE float hypot(float x, float y) {
   //return __gen_ocl_sqrt(x*x + y*y);
   float a,b,an,bn,cn;
@@ -3480,8 +3733,46 @@ INLINE_OVERLOADABLE float pown(float x, int n) {
     return 1;
   return powr(x, n);
 }
+
+// Shared implementation of rootn(x, n) = x**(1/n).  isFastpath selects
+// the hardware pow (__gen_ocl_pow) over the precise software port
+// (__gen_ocl_internal_pow).  Handles the special cases up front:
+// rootn(x, 0) is NaN, rootn(x < 0, even n) is NaN, and +-0 maps to
+// {+0, +-0, +inf, +-inf} depending on the sign and parity of n; the
+// general case is computed on |x| with the sign restored for odd n.
+INLINE_OVERLOADABLE float internal_rootn(float x, int n, const bool isFastpath)
+{
+  float ax,re;
+  int sign = 0;
+  if( n == 0 )return NAN;
+  //rootn ( x, n )  returns a NaN for x < 0 and n is even.
+  if( x < 0 && 0 == (n&1) )
+    return NAN;
+  if( x == 0.0 ){
+    // n & 0x80000001 keeps only the sign bit and the parity bit of n,
+    // giving one case label per (sign of n, odd/even n) combination.
+    switch( n & 0x80000001 ){
+      //rootn ( +-0,  n ) is +0 for even n > 0.
+      case 0:
+        return 0.0f;
+      //rootn ( +-0,  n ) is +-0 for odd n > 0.
+      case 1:
+        return x;
+      //rootn ( +-0,  n ) is +inf for even n < 0.
+      case 0x80000000:
+        return INFINITY;
+
+      //rootn ( +-0,  n ) is +-inf for odd n < 0.
+      case 0x80000001:
+        return __gen_ocl_internal_copysign(INFINITY, x);
+    }
+  }
+  ax = __gen_ocl_fabs(x);
+  // Odd root of a negative value is negative: compute on |x| and
+  // negate the result afterwards.
+  if(x <0.0f && (n&1))
+    sign = 1;
+  if (isFastpath)
+    re = __gen_ocl_pow(ax,1.f/n);
+  else
+    re = __gen_ocl_internal_pow(ax,1.f/n);
+  if(sign)
+    re = -re;
+  return re;
+}
+
 INLINE_OVERLOADABLE float rootn(float x, int n) {
-  return powr(x, 1.f / n);
+  return internal_rootn(x, n, 0);
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -3601,12 +3892,12 @@ INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
   *(p + 3 * offset + 2) = v.s2; \
 } \
 INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
-  return *(SPACE TYPE##3 *) (p + 3 * offset); \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
 }
 
 #define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
 INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
-  return *(SPACE TYPE##3 *) (p + 3 * offset); \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
 }
 
 #define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
@@ -3629,10 +3920,59 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
   DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
   DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
 
-DECL_UNTYPED_RW_ALL(char)
-DECL_UNTYPED_RW_ALL(uchar)
-DECL_UNTYPED_RW_ALL(short)
-DECL_UNTYPED_RW_ALL(ushort)
+// vloadN overloads for byte/short element types, built purely from
+// scalar element loads: vload2/vload3 read the elements one by one,
+// and the wider forms recurse (vload4 = two vload2 at p and p+2,
+// vload8 = two vload4, vload16 = two vload8).  NOTE(review): these
+// replace the generic pointer-cast DECL_UNTYPED forms for char/uchar/
+// short/ushort — presumably to avoid untyped wide loads on sub-dword
+// elements; confirm against the backend's load lowering.
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+// vstoreN counterparts to DECL_BYTE_RD_SPACE: element-wise scalar
+// stores for vstore2/vstore3, with vstore4/8/16 recursing through the
+// vector's .lo/.hi halves at doubled offsets.
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 2 * offset) = v.s0; \
+  *(p + 2 * offset + 1) = v.s1; \
+} \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+  vstore2(v.lo, 2*offset, p); \
+  vstore2(v.hi, 2*offset, p+2); \
+} \
+INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+  vstore4(v.lo, 2*offset, p); \
+  vstore4(v.hi, 2*offset, p+4); \
+} \
+INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+  vstore8(v.lo, 2*offset, p); \
+  vstore8(v.hi, 2*offset, p+8); \
+}
+
+// Instantiate the byte-wise load/store overloads for every address
+// space; __constant gets only the read variants (it is read-only).
+#define DECL_BYTE_RW_ALL(TYPE) \
+  DECL_BYTE_RD_SPACE(TYPE, __global) \
+  DECL_BYTE_RD_SPACE(TYPE, __local) \
+  DECL_BYTE_RD_SPACE(TYPE, __private) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __global) \
+  DECL_BYTE_WR_SPACE(TYPE, __local) \
+  DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
 DECL_UNTYPED_RW_ALL(int)
 DECL_UNTYPED_RW_ALL(uint)
 DECL_UNTYPED_RW_ALL(long)
@@ -3647,6 +3987,9 @@ DECL_UNTYPED_RW_ALL(double)
 #undef DECL_UNTYPED_RD_SPACE_N
 #undef DECL_UNTYPED_V3_SPACE
 #undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
 
 PURE CONST float __gen_ocl_f16to32(short h);
 PURE CONST short __gen_ocl_f32to16(float f);
@@ -4234,96 +4577,155 @@ int __gen_ocl_force_simd16(void);
 // Image access functions
 /////////////////////////////////////////////////////////////////////////////
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+// 1D read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+
+// 2D & 1D Array read
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+// 3D & 2D Array read
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+
+// 1D write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
 
+// 2D & 1D Array write
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
 OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
 
+// 3D & 2D Array write
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
 OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+
 int __gen_ocl_get_image_width(uint surface_id);
 int __gen_ocl_get_image_height(uint surface_id);
 int __gen_ocl_get_image_channel_data_type(uint surface_id);
 int __gen_ocl_get_image_channel_order(uint surface_id);
 int __gen_ocl_get_image_depth(uint surface_id);
-ushort __gen_ocl_get_sampler_info(sampler_t sampler);
-
-#define GET_IMAGE(cl_image, surface_id) \
-    uint surface_id = (uint)cl_image
+/* The printf function. */
+/* From LLVM 3.4, c string are all in constant address space */
+#if 100*__clang_major__ + __clang_minor__ < 304
+int __gen_ocl_printf_stub(const char * format, ...);
+#else
+int __gen_ocl_printf_stub(constant char * format, ...);
+#endif
+#define printf __gen_ocl_printf_stub
 
+// 2D 3D Image Common Macro
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
 #define GEN_FIX_1 1
 #else
 #define GEN_FIX_1 0
 #endif
 
-#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          \
-                        image_type, type, suffix, coord_type)                \
+#define GET_IMAGE(cl_image, surface_id) \
+    uint surface_id = (uint)cl_image
+// Clamp an image-array layer index into [0, array_size - 1], where
+// array_size is read from the surface's depth field.  The float
+// overloads round with rint() before clamping — NOTE(review): this
+// looks like the round-to-nearest-even layer selection the OpenCL
+// spec mandates for float array coordinates; confirm against spec.
+INLINE_OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  float array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+INLINE_OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  float array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+// Integer variants: already integral, so only the clamp is needed.
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  int array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(index, 0, array_size - 1);
+}
+
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  int array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(index, 0, array_size - 1);
+}
+
+#define DECL_READ_IMAGE0(int_clamping_fix,                                   \
+                        image_type, type, suffix, coord_type, n)             \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai);                          \
+    if (int_clamping_fix &&                                                  \
+        ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&             \
+        ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST))               \
+            return   __gen_ocl_read_image ##suffix(                          \
+                        EXPEND_READ_COORD(surface_id, sampler, coord));      \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                    EXPEND_READ_COORDF(surface_id, sampler, coord), 0);      \
+  }
+
+#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         \
+                        image_type, type, suffix, coord_type, n)             \
   INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               sampler_t sampler,            \
+                                               const sampler_t sampler,      \
                                                coord_type coord)             \
   {                                                                          \
     GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai)                         \
     coord_type tmpCoord = coord;                                             \
-    ushort samplerValue;                                                     \
     if (float_coord_rounding_fix | int_clamping_fix) {                       \
-      samplerValue = __gen_ocl_get_sampler_info(sampler);                    \
-      if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
-          && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
+      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
+          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
         if (float_coord_rounding_fix                                         \
-            && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
+            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
           FIXUP_FLOAT_COORD(tmpCoord);                                       \
         }                                                                    \
         if (int_clamping_fix) {                                              \
-           if (OUT_OF_BOX(tmpCoord, surface_id,                              \
-                          (samplerValue & CLK_NORMALIZED_COORDS_TRUE))) {    \
-            unsigned int border_alpha;                                       \
-            int order = __gen_ocl_get_image_channel_order(surface_id);       \
-            if (!CLK_HAS_ALPHA(order)) {                                     \
-              border_alpha = 1;                                              \
+            coord_type intCoord;                                             \
+            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
+              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
             } else                                                           \
-              border_alpha = 0;                                              \
-              return (type)(0, 0, 0, border_alpha);                          \
-          } else                                                             \
+              intCoord = tmpCoord;                                           \
             return   __gen_ocl_read_image ##suffix(                          \
-                        EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);\
+                       EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
        }                                                                     \
       }                                                                      \
     }                                                                        \
     return  __gen_ocl_read_image ##suffix(                                   \
-                        EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 0);\
+                        EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
   }
 
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type)      \
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
   INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
                                                coord_type coord)             \
   {                                                                          \
     GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai)                           \
     return __gen_ocl_read_image ##suffix(                                    \
-           EXPEND_READ_COORD(surface_id,                                     \
+           EXPEND_READ_COORDF(surface_id,                                    \
                              CLK_NORMALIZED_COORDS_FALSE                     \
                              | CLK_ADDRESS_NONE                              \
-                             | CLK_FILTER_NEAREST, coord), 0);               \
+                             | CLK_FILTER_NEAREST, (float)coord), 0);        \
   }
 
 #define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
@@ -4333,15 +4735,78 @@ ushort __gen_ocl_get_sampler_info(sampler_t sampler);
     __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
   }
 
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+#define DECL_IMAGE_INFO_COMMON(image_type)    \
+  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_data_type(surface_id); \
+  }\
+  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_order(surface_id); \
+  } \
+  INLINE_OVERLOADABLE int get_image_width(image_type image) \
+  { \
+    GET_IMAGE(image, surface_id); \
+    return __gen_ocl_get_image_width(surface_id);  \
+  }
 
-#define OUT_OF_BOX(coord, surface, normalized)                   \
-  (coord.s0 < 0 || coord.s1 < 0 ||                               \
-   ((normalized == 0)                                            \
-     && (coord.s0 >= __gen_ocl_get_image_width(surface)          \
-         || coord.s1 >= __gen_ocl_get_image_height(surface)))    \
-   || ((normalized != 0) && (coord.s0 > 0x1p0 || coord.s1 > 0x1p0)))
+// 1D
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix)                       \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1)               \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1)  \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1)                        \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int)                                    \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float)
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  {                                                            \
+    if (tmpCoord < 0 && tmpCoord > -0x1p-20f)                  \
+      tmpCoord += -0x1p-9;                                     \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
+DECL_IMAGE(0, image1d_t, float4, f)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
+DECL_IMAGE(0, image1d_buffer_t, float4, f)
+
+// 1D Info
+DECL_IMAGE_INFO_COMMON(image1d_t)
+DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef DECL_IMAGE
+// End of 1D
+
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                       \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n)              \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                       \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                                  \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+// 2D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
 
 #define FIXUP_FLOAT_COORD(tmpCoord)                            \
   {                                                            \
@@ -4351,32 +4816,75 @@ ushort __gen_ocl_get_sampler_info(sampler_t sampler);
       tmpCoord.s1 += -0x1p-9f;                                 \
   }
 
-#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                   \
-  DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n)           \
-  DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n)                      \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                              \
-  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
-
 DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
 DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
 DECL_IMAGE(0, image2d_t, float4, f, 2)
 
+// 1D Array
+#undef GET_IMAGE_ARRAY_SIZE
 #undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
 #undef EXPEND_WRITE_COORD
-#undef OUT_OF_BOX
 #undef FIXUP_FLOAT_COORD
 
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+  coord_type ai = __gen_compute_array_index(coord.s1, image);
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  {                                                            \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9;                                  \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
+DECL_IMAGE(0, image1d_array_t, float4, f, 2)
+
+// 2D Info
+DECL_IMAGE_INFO_COMMON(image2d_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+
+// 1D Array info
+DECL_IMAGE_INFO_COMMON(image1d_array_t)
+INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDI
+#undef EXPEND_READ_COORDF
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 2D and 1D Array
+
+// 3D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
+                                                  dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
 #define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
-#define OUT_OF_BOX(coord, surface, normalized)                  \
-  (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0 ||              \
-   ((normalized == 0)                                           \
-     && (coord.s0 >= __gen_ocl_get_image_width(surface)         \
-         || coord.s1 >= __gen_ocl_get_image_height(surface)     \
-         || coord.s2 >= __gen_ocl_get_image_depth(surface)))    \
-   || ((normalized != 0)                                        \
-        &&(coord.s0 > 1 || coord.s1 > 1 || coord.s2 > 1)))
 
 #define FIXUP_FLOAT_COORD(tmpCoord)                             \
   {                                                             \
@@ -4387,6 +4895,7 @@ DECL_IMAGE(0, image2d_t, float4, f, 2)
     if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20)              \
       tmpCoord.s2 += -0x1p-9;                                   \
   }
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
 
 DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
 DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
@@ -4395,72 +4904,241 @@ DECL_IMAGE(0, image3d_t, float4, f, 4)
 DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
 DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
 DECL_IMAGE(0, image3d_t, float4, f, 3)
+
 #undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
 #undef EXPEND_WRITE_COORD
-#undef OUT_OF_BOX
 #undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                             \
+  {                                                             \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20)              \
+      tmpCoord.s0 += -0x1p-9;                                   \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20)              \
+      tmpCoord.s1 += -0x1p-9;                                   \
+  }
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+  coord_type ai = __gen_compute_array_index(coord.s2, image);
+
+// 2D Array
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
+DECL_IMAGE(0, image2d_array_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
+DECL_IMAGE(0, image2d_array_t, float4, f, 3)
+
+// 3D Info
+DECL_IMAGE_INFO_COMMON(image3d_t)
+INLINE_OVERLOADABLE int get_image_height(image3d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int get_image_depth(image3d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+INLINE_OVERLOADABLE int4 get_image_dim(image3d_t image)
+{
+  return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
+}
+
+// 2D Array Info
+DECL_IMAGE_INFO_COMMON(image2d_array_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_array_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+INLINE_OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 3D and 2D Array
 
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
 #undef DECL_READ_IMAGE_NOSAMPLER
 #undef DECL_WRITE_IMAGE
 #undef GEN_FIX_1
+// End of Image
 
-#define DECL_IMAGE_INFO(image_type)    \
-  INLINE_OVERLOADABLE  int get_image_width(image_type image) \
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_width(surface_id);\
-  } \
-  INLINE_OVERLOADABLE  int get_image_height(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_height(surface_id); \
-  } \
-  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_data_type(surface_id); \
-  }\
-  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_order(surface_id); \
-  }
 
-DECL_IMAGE_INFO(image2d_t)
-DECL_IMAGE_INFO(image3d_t)
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x)
+{
+    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
 
-INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
-  {
-   GET_IMAGE(image, surface_id);
-   return __gen_ocl_get_image_depth(surface_id);
-  }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x)
+{
+    return native_log(x + native_sqrt(x * x + 1));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x)
+{
+    return 0.5f * native_log((1 + x) / (1 - x));
+}
 
-INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
-  { return (int2){get_image_width(image), get_image_height(image)}; }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x)
+{
+    return __gen_ocl_pow(x, 0.3333333333f);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x)
+{
+    return native_cos(x);
+}
 
-INLINE_OVERLOADABLE  int4 get_image_dim(image3d_t image)
-  { return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0}; }
-#if 0
-/* The following functions are not implemented yet. */
-DECL_IMAGE_INFO(image1d_t)
-DECL_IMAGE_INFO(image1d_buffer_t)
-DECL_IMAGE_INFO(image1d_array_t)
-DECL_IMAGE_INFO(image2d_array_t)
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x)
+{
+    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
 
-INLINE_OVERLOADABLE  int2 get_image_dim(image2d_array_t image)
-  { return __gen_ocl_get_image_dim(image); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x)
+{
+    return __gen_ocl_cos(x * M_PI_F);
+}
 
-INLINE_OVERLOADABLE  int4 get_image_dim(image2d_array_t image)
-  { return __gen_ocl_get_image_dim(image); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x)
+{
+    return native_exp(x);
+}
 
-INLINE_OVERLOADABLE  size_t get_image_array_size(image2d_array_t image)
-  { return __gen_ocl_get_image_array_size(image); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x)
+{
+    return native_exp10(x);
+}
 
-INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
-  { return __gen_ocl_get_image_array_size(image); }
-#endif
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x)
+{
+    return __gen_ocl_pow(M_E_F, x) - 1;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y)
+{
+    return x-y*__gen_ocl_rndz(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y)
+{
+    return __gen_ocl_sqrt(x*x + y*y);
+}
+
+INLINE_OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x)
+{
+    return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n)
+{
+    return __gen_ocl_pow(2, n) * x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x)
+{
+    return native_log(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x)
+{
+    return native_log2(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x)
+{
+    return native_log10(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x)
+{
+    return native_log(x + 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x)
+{
+    return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y)
+{
+    return x-y*__gen_ocl_rnde(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n)
+{
+  return internal_rootn(x, n, 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x)
+{
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __local float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x)
+{
+    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x)
+{
+    return __gen_ocl_sin(x * M_PI_F);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x)
+{
+    return native_tan(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x)
+{
+    float y = native_exp(-2 * x);
+    return (1 - y) / (1 + y);
+}
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 
@@ -4478,4 +5156,5 @@ INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
 #undef CONST
 #undef OVERLOADABLE
 #undef INLINE
+
 #endif /* __GEN_OCL_STDLIB_H__ */
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
index 7d6907a..50f2501 100755
--- a/backend/src/update_blob_ocl_header.py
+++ b/backend/src/update_blob_ocl_header.py
@@ -36,7 +36,7 @@ blobFileName = sys.argv[2]
 blobTempName = sys.argv[2] + '.tmp'
 safeUnlink(blobFileName)
 tmplFile = open(sys.argv[1], 'r')
-blob = open(sys.argv[2] + '.tmp', 'w')
+blob = open(blobTempName, 'w')
 path = os.path.dirname(sys.argv[1])
 if path == '':
     path = '.'
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..d96a2e0
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,21 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+set (benchmark_sources
+  ../utests/utest_error.c
+  ../utests/utest_assert.cpp
+  ../utests/utest.cpp
+  ../utests/utest_file_map.cpp
+  ../utests/utest_helper.cpp
+  enqueue_copy_buf.cpp)
+
+ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
+
+#TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+TARGET_LINK_LIBRARIES(benchmarks cl m)
+
+ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
+TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
diff --git a/benchmark/benchmark_run.cpp b/benchmark/benchmark_run.cpp
new file mode 100644
index 0000000..b29ccc3
--- /dev/null
+++ b/benchmark/benchmark_run.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_run.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Just run the unit tests. The user can possibly provides the subset of it
+ */
+#include "utest_helper.hpp"
+#include "utest_exception.hpp"
+#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+    std::cout << "\
+Usage:\n\
+  ./utest_run <option>\n\
+\n\
+  option:\n\
+    -c <casename>: run sub-case named 'casename'\n\
+    -l           : list all the available case name\n\
+    -a           : run all test cases\n\
+    -n           : run all test cases without known issue (default option)\n\
+    -h           : display this usage\n\
+\
+    "<< std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+
+  int c = 0;
+  cl_ocl_init();
+
+  c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+  if (argc == 1)
+    c = 'n';
+  if (argc == 2 && c < 1 ){
+    c = 'c';
+    optarg = argv[1];
+  }
+
+  do {
+    switch (c)
+    {
+      case 'c':
+        try {
+          UTest::run(optarg);
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'l':
+        UTest::listAllCases();
+        break;
+
+      case 'a':
+        try {
+          UTest::runAll();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'n':
+        try {
+          UTest::runAllNoIssue();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'h':
+      default:
+        usage();
+        exit(1);
+    }
+  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+  cl_ocl_destroy();
+}
diff --git a/benchmark/enqueue_copy_buf.cpp b/benchmark/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..0d0d4df
--- /dev/null
+++ b/benchmark/enqueue_copy_buf.cpp
@@ -0,0 +1,69 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+  unsigned int i;
+  cl_char* buf0;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+  buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sizeof(char), 0, NULL, NULL, NULL);
+
+  for (i=0; i < sz; i++) {
+    buf0[i]=(rand() & 0xFF);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf0, 0, NULL, NULL);
+
+  if (src_off + cb > sz || dst_off + cb > sz) {
+  /* Expect Error. */
+    OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                 src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+    return;
+  }
+
+  OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
+    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+}
+
+int tim_subtract(struct timeval *y, struct timeval *x, struct timeval *result){
+  if ( x->tv_sec > y->tv_sec )
+    return   -1;
+
+  if ((x->tv_sec == y->tv_sec) && (x->tv_usec > y->tv_usec))
+    return   -1;
+
+  if ( result != NULL){
+    result->tv_sec = ( y->tv_sec - x->tv_sec );
+    result->tv_usec = ( y->tv_usec - x->tv_usec );
+
+    if (result->tv_usec < 0){
+      result->tv_sec --;
+      result->tv_usec += 1000000;
+    }
+  }
+
+  int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
+  return msec;
+}
+
+
+int enqueue_copy_buf(void)
+{
+  size_t i;
+  const size_t sz = 127 *1023 * 1023;
+  struct timeval start,stop;
+
+  gettimeofday(&start,0);
+
+  for (i=0; i<10; i++) {
+    test_copy_buf(sz, 0, 0, sz);
+  }
+
+  gettimeofday(&stop,0);
+  return tim_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 7870c12..7e5b730 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -9,16 +9,18 @@ the programs and run them on the GPU. The code base also contains the compiler
 part of the stack which is included in `backend/`. For more specific information
 about the compiler, please refer to `backend/README.md`
 
+News
+----
+[[Beignet project news|Beignet/NEWS]]
+
 Prerequisite
 ------------
 
 The project depends on the following external libaries:
 
-- Several X components (XLib, Xfixes, Xext)
 - libdrm libraries (libdrm and libdrm\_intel)
 - Various LLVM components
-- The compiler backend itself (libgbe)
-- Mesa git master version built with gbm enabled to support extension cl\_khr\_gl\_sharing.
+- If run with X server, beignet needs XLib, Xfixes and Xext installed. Otherwise, no X11 dependency.
 
 And if you want to work with the standard ICD libOpenCL.so, then you need
 two more packages (the following package name is for Ubuntu):
@@ -36,41 +38,16 @@ with anything older.
 
 [http://llvm.org/releases/](http://llvm.org/releases/)
 
-LLVM 3.3 , 3.4 and 3.5 are supported. Till now, the recommended LLVM version is 3.3.
-There are some severe OpenCL related regression in current clang 3.4/3.5 version.
+LLVM 3.3 , 3.4 and 3.5 are supported. Till now, the recommended LLVM/CLANG version is 3.5.
+There are some severe OpenCL related regression in clang 3.4 version.
 
 **Note about LLVM 3.4**
 
 * If you want to try Clang/LLVM 3.4, you need to disable terminfo:
 --disable-terminfo. It's a llvm 3.4 bug.
 
-**Note about LLVM 3.5**
-
-* If you want to try Clang/LLVM 3.5, you need to build the clang/llvm with cxx11 enabled:
---enable-cxx11.
-
-**Note about OpenCV support**
-
-* We only fully tested the OpenCV 2.4 branch with beignet. And the pass rate is about 99%
-  for beignet 0.8.0. The preferred LLVM/Clang version is 3.3. One OpenCV patch is needed
-  to work with LLVM/clang, the patch is already submitted to the OpenCV upstream 2.4 repo
-  and is waiting for review: [pull request](https://github.com/Itseez/opencv/pull/2318).
-  Before it is merged, you need to apply that patch manually to OpenCV 2.4 branch.
-* As some OpenCL kerne (in OpenCV 2.4 OCL test suite) runs more than 10 seconds, it may
-  be reset by the kernel as the kernel has a GPU hangcheck mechanism. You can disable the
-  hangcheck by invoke the following command on Ubuntu system:
-
-  `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
-
-  But this command is a little bit dangerous, as if your kernel hang, then the gpu will hang
-  forever.
-* For the OpenCV 3.0 branch, the pass rate may a little bit lower than the 2.4 branch.
-
-Also note that the code was compiled on GCC 4.6, GCC 4.7 and GCC 4.8. Since the code uses
-really recent C++11 features, you may expect problems with older compilers. Last
-time I tried, the code breaks ICC 12 and Clang with internal compiler errors
-while compiling anonymous nested lambda functions.
-
+Please note that the code was compiled on GCC 4.6, GCC 4.7 and GCC 4.8. Since the code
+uses really recent C++11 features, you may expect problems with older compilers.
 
 How to build and install
 ------------------------
@@ -103,9 +80,11 @@ be found in `utests/`.
 Simply invoke:
 `> make install`
 
-It installs the following three files to the beignet/ directory relatively to
+It installs the following six files to the beignet/ directory relatively to
 your library installation directory.
 - libcl.so
+- libgbeinterp.so
+- libgbe.so
 - ocl\_stdlib.h, ocl\_stdlib.h.pch
 - beignet.bc
 
@@ -115,8 +94,8 @@ It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system suppo
 How to run
 ----------
 
-Apart from the OpenCL library itself that can be used by any OpenCL application,
-this code also produces various tests to ensure the compiler and the run-time
+After build and install of beignet, you may need to check whether it works on your
+platform. Beignet also produces various tests to ensure the compiler and the run-time
 consistency. This small test framework uses a simple c++ registration system to
 register all the unit tests.
 
@@ -135,34 +114,78 @@ will run all the unit tests one after the others
 
 will only run `some_unit_test0` and `some_unit_test1` tests
 
-Supported Hardware
-------------------
+On all supported target platform, the pass rate should be 100%. If it is not, you may
+need to refer to the "Known Issues" section.
+
+Supported Targets
+-----------------
+
+ * 3rd Generation Intel Core Processors
+ * Intel “Bay Trail” platforms with Intel HD Graphics
+ * 4th Generation Intel Core Processors, need kernel patch currently, see below
+   for details:
+
+Known Issues
+------------
+
+* GPU hang issues.
+  To check whether GPU hang, you could execute dmesg and check whether it has the following message:
+  `[17909.175965] [drm:i915_hangcheck_hung] *ERROR* Hangcheck timer elapsed...`
+  If it does, there was a GPU hang. Usually, this means something wrong in the kernel, as it indicates
+  the OCL kernel hasn't finished for about 6 seconds or even more. If you think the OCL kernel does need
+  to run that long and have confidence in the kernel, you could disable the Linux kernel driver's
+  hang check feature to fix this hang issue. Just invoke the following command on Ubuntu system:
 
-The code was tested on IVB GT2 with ubuntu and fedora core distribution. The recommended
-kernel version is equal or newer than 3.11. Currently Only IVB is supported right now.
-Actually, the code was run on IVB GT2/GT1, and both system are well supported now.
+  `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
+
+  But this command is a little bit dangerous: if your kernel really hangs, the GPU will lock up
+  forever until a reboot.
+
+* Almost all unit tests fail on Linux kernel 3.15/3.16.
+  There is a known issue in some versions of linux kernel which enable register whitelist feature
+  but miss some necessary registers which are required for beignet. The problematic version are
+  around 3.15 and 3.16, which have commit f0a346b... but lack commit c9224f... If that is the case,
+  you can apply c9224f... manually and rebuild the kernel or just disable the parse command by
+  invoke the following command (use Ubuntu as an example):
+  `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
+
+* Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platform.
+  The 4th Generation Intel Core Processors's support requires some Linux kernel
+  modification. You need to apply the patch at:
+  [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
+
+* Precision issue.
+  Currently Gen does not provide native support of high precision math functions
+  required by OpenCL. We provide a software version to achieve high precision,
+  which you can turn on through `export OCL_STRICT_CONFORMANCE=1`.
+  But be careful, this would make your CL kernel run a little longer.
+
+* cl\_khr\_gl\_sharing.
+  This extension highly depends on mesa support. It seems that mesa would not provide
+  such types of extensions; we may have to hack the mesa source code to support this
+  extension. This feature used to work with a previous mesa git version. But now, it's
+  simply broken.
 
 TODO
 ----
 
-Interns of the OpenCL 1.1 spec, beignet is quite complete now. We can pass almost
+In terms of the OpenCL 1.2 spec, beignet is quite complete now. We can pass almost
 all the piglit OpenCL test cases now. And the pass rate for the OpenCV test suite
-is also good. There are still some remains work items listed as below, most of them
-are extension support and performance related.
-
-- Performance tuning. Till now, the focus of beignet project is to implement all
-  the mandatory functions/features specified by the OpenCL spec. There are plenty
-  of things need to do for performance tuning. For example, the extreme slow software
-  based sin/cos/... math functions due to the native math instruction lack of necessary
-  precision. And all the code is inlined which will increase the icache miss rate
+is also good, at about 99%. There are still some remaining work items listed below,
+most of them are extension support and performance related.
+
+- Performance tuning. There are some major optimizations need to be done,
+  Peephole optimization, convert to structured BBs and leverage Gen's structured
+  instructions, and optimize the extreme slow software based sin/cos/... math
+  functions due to the native math instruction lack of necessary precision.
+  And all the code is inlined which will increase the icache miss rate
   significantly. And many other things which are specified partially in
-  [[here|Beignet/Backend/TODO]]. We will focus on performance tuning after the version 0.8.
+  [[here|Beignet/Backend/TODO]].
 
 - Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
   as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
-  the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D. This work
-  highly depends on mesa support. It seems that mesa would not provide such type
-  of extensions, we may have to hack with mesa source code to support this extension.
+  the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D. We may need to
+  find a graceful way to co-work with mesa.
 
 - Check that NDRangeKernels can be pushed into _different_ queues from several
   threads.
@@ -198,5 +221,10 @@ to the beignet mail list and send patches to it for review.
 The official mail list is as below:
 [http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)
 
-The wiki url is as below:
+Documents for OpenCL application developers
+-------------------------------------------
+- [[Cross compile|Beignet/howto/cross-compiler-howto]]
+- [[Kernel Optimization Guide|Beignet/optimization-guide]]
+
+The wiki URL is as below:
 [http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
index 99d678e..319ce81 100644
--- a/docs/Beignet/Backend.mdwn
+++ b/docs/Beignet/Backend.mdwn
@@ -30,7 +30,17 @@ Various environment variables
 
 Environment variables are used all over the code. Most important ones are:
 
-- `OCL_SIMD_WIDTH` `(8 or 16)`. Change the number of lanes per hardware thread
+- `OCL_STRICT_CONFORMANCE` `(0 or 1)`. Gen does not provide native high
+  precision math instructions compliant with OpenCL Spec. So we provide a
+  software version to meet the high precision requirement. Obviously the
+  software version's performance is not as good as native version supported by
+  GEN hardware. What's more, most graphics applications don't need this high
+  precision, so we choose 0 as the default value so that OpenCL apps do not suffer
+  the performance penalty of using high precision math functions.
+
+- `OCL_SIMD_WIDTH` `(8 or 16)`. Select the number of lanes per hardware thread.
+  Normally you don't need to set it; we will select a suitable SIMD width for
+  a given kernel. The default value is 16.
 
 - `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
   representation) code
@@ -42,7 +52,35 @@ Environment variables are used all over the code. Most important ones are:
 
 - `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
 
-- `OCL_OUTPUT_REG_ALLOC` `(0 or 1)`. Output Gen register allocations
+- `OCL_OUTPUT_REG_ALLOC` `(0 or 1)`. Output Gen register allocations, including
+  virtual register to physical register mapping, live ranges.
+
+- `OCL_OUTPUT_BUILD_LOG` `(0 or 1)`. Output error messages, if there are any,
+  during CL kernel compiling and linking.
+
+- `OCL_OUTPUT_CFG` `(0 or 1)`. Output control flow graph in .dot file.
+
+- `OCL_OUTPUT_CFG_ONLY` `(0 or 1)`. Output control flow graph in .dot file,
+  but without instructions in each BasicBlock.
+
+- `OCL_PRE_ALLOC_INSN_SCHEDULE` `(0 or 1)`. The instruction scheduler in
+  beignet is currently split into two passes: before and after register
+  allocation. The pre-alloc scheduler tends to decrease register pressure.
+  This variable is used to enable/disable the pre-alloc scheduler. This pass
+  is currently disabled due to some bugs.
+
+- `OCL_POST_ALLOC_INSN_SCHEDULE` `(0 or 1)`. Disable/enable the post-alloc
+  instruction scheduler. The post-alloc scheduler tends to reduce instruction
+  latency. It is enabled by default.
+
+- `OCL_SIMD16_SPILL_THRESHOLD` `(0 to 256)`. Tune how many registers can be
+  spilled under SIMD16. The default value is 16. We find that spilling too many
+  registers under SIMD16 is not as good as falling back to SIMD8 mode, so we
+  use this variable to control the number of spilled registers under SIMD16.
+
+- `OCL_USE_PCH` `(0 or 1)`. The default value is 1. If it is enabled, we use
+  a precompiled header file which includes all basic OpenCL headers. This
+  reduces the compile time.
 
 Implementation details
 ----------------------
@@ -50,7 +88,7 @@ Implementation details
 Several key decisions may use the hardware in an unusual way. See the following
 documents for the technical details about the compiler implementation:
 
-- [[Flat address space|flat_address_space]]
+- [[Mixed buffer pointer|mixed_buffer_pointer]]
 - [[Unstructured branches|unstructured_branches]]
 - [[Scalar intermediate representation|gen_ir]]
 - [[Clean backend implementation|compiler_backend]]
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
index 7728d6a..501c508 100644
--- a/docs/Beignet/Backend/TODO.mdwn
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -28,17 +28,17 @@ many things must be implemented:
   instructions at the end of each basic block . They can be easily optimized.
 
 - From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
-  represent sampler_t/image2d_t/image1d_t/....
+  represent sampler\_t/image2d\_t/image1d\_t/....
 
 - Considering to use libclc in our project and avoid to use the PCH which is not
   compatible for different clang versions. And may contribute what we have done in
-  the ocl_stdlib.h to libclc if possible.
+  the ocl\_stdlib.h to libclc if possible.
 
 - Optimize math functions. If the native math instructions don't comply with the
   OCL spec, we use pure software style to implement those math instructions which
   is extremely slow, for example. The cos and sin for HD4000 platform are very slow.
   For some applications which may not need such a high accurate results. We may
-  provide a mechanism to use native_xxx functions instead of the extremely slow
+  provide a mechanism to use native\_xxx functions instead of the extremely slow
   version.
 
 Gen IR
@@ -46,6 +46,10 @@ Gen IR
 
 The code is defined in `src/ir`. Main things to do are:
 
+- Convert unstructured BBs to structured format, and leverage Gen's structured
+  instructions such as if/else/endif to encode those BBs. Then we can save many
+  instructions which are used to maintain software pcips and predications.
+
 - Implement those llvm.memset/llvm.memcpy more efficiently. Currently, we lower
   them as normal memcpy at llvm module level and not considering the intrinsics
   all have a constant data length.
@@ -53,15 +57,6 @@ The code is defined in `src/ir`. Main things to do are:
 - Finishing the handling of function arguments (see the [[IR
   description|gen_ir]] for more details)
 
-- Adding support for linking IR units together. OpenCL indeed allows to create
-  programs from several sources
-
-- Uniform analysys. This is a major performance improvement. A "uniform" value
-  is basically a value where regardless the control flow, all the activated
-  lanes will be identical. Trivial examples are immediate values, function
-  arguments. Also, operations on uniform will produce uniform values and so
-  on...
-
 - Merging of independent uniform loads (and samples). This is a major
   performance improvement once the uniform analysis is done. Basically, several
   uniform loads may be collapsed into one load if no writes happens in-between.
@@ -83,14 +78,15 @@ The code is defined in `src/backend`. Main things to do are:
 - Implementing proper instruction selection. A "simple" tree matching algorithm
   should provide good results for Gen
 
-- Improving the instruction scheduling pass. The current scheduling code has some bugs,
-  we disable it by default currently. We need to fix them in the future.
+- Improving the instruction scheduling pass. Need to implement proper pre register
+  allocation scheduling to lower register pressure.
+
+- Reduce the macro instructions in gen\_context. The macro instructions added in
+  gen\_context will not get a chance to do post register allocation scheduling.
 
-- Some instructions are introduced in the last code generation stage. We need to
-  introduce a pass after that to eliminate dead instruction or duplicate MOVs and
-  some instructions with zero operands.
+- leverage the structured if/endif for branching processing.
 
-- leverage the structured if/endif for branching processing ?
+- Peephole optimization. There are many chances to do further peephole optimization.
 
 General plumbing
 ----------------
@@ -110,5 +106,5 @@ All of those code should be improved and cleaned up are tracked with "XXX"
 comments in the code.
 
 Parts of the code leaks memory when exceptions are used. There are some pointers
-to track and replace with std::unique_ptr. Note that we also add a custom memory
+to track and replace with std::unique\_ptr. Note that we also add a custom memory
 debugger that nicely complements (i.e. it is fast) Valgrind.
diff --git a/docs/Beignet/Backend/flat_address_space.mdwn b/docs/Beignet/Backend/flat_address_space.mdwn
deleted file mode 100644
index 3018a29..0000000
--- a/docs/Beignet/Backend/flat_address_space.mdwn
+++ /dev/null
@@ -1,98 +0,0 @@
-Flat Address Space
-==================
-
-Segmented address space...
---------------------------
-
-The first challenge with OpenCL is its very liberal use of pointers. The memory
-is segment into several address spaces:
-
-- private. This is the memory for each work item
-
-- global. These are buffers in memory shared by all work items and work groups
-
-- constant. These are constant buffers in memory shared by all work items and
-work groups as well
-
-- local. These is a memory shared by all work items in the *same* work group
-
-... But with no restriction inside each address space
------------------------------------------------------
-
-The challenge is that there is no restriction in OpenCL inside each address
-space i.e. the full C semantic applies in particular regarding pointer
-arithmetic.
-
-Therefore the following code is valid:
-
-<code>
-\_\_kernel void example(\_\_global int *dst, \_\_global int *src0, \_\_global int *src1)<br/>
-{<br/>
-  \_\_global int *from;<br/>
-  if (get\_global\_id(0) % 2)<br/>
-    from = src0;<br/>
-  else<br/>
-    from = src1;<br/>
-  dst[get\_global\_id(0)] = from[get\_global\_id(0)];<br/>
-}
-</code>
-
-As one may see, the load done in the last line actually mixes pointers from both
-source src0 and src1. This typically makes the use of binding table indices
-pretty hard. In we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
-example), we are not able to express the load in the last line with one send
-only.
-
-No support for stateless in required messages
----------------------------------------------
-
-Furthermore, in IVB, we are going four types of messages to implement the loads
-and the stores
-
-- Byte scattered reads. They are used to read bytes/shorts/integers that are not
-aligned on 4 bytes. This is a gather message i.e. the user provides up to 16
-addresses
-
-- Byte scattered writes. They are used to write bytes/shorts/integers that are not
-aligned on 4 bytes. This is a scatter message i.e. the user provides up to 16
-addresses
-
-- Untyped reads. They allow to read from 1 to 4 double words (i.e 4 bytes) per
-lane. This is also a gather message i.e. up to 16 address are provided per
-message.
-
-- Untyped writes. They are the counter part of the untyped reads
-
-Problem is that IVB does not support stateless accesses for these messages. So
-surfaces are required. Secondly, stateless messages are not that interesting
-since all of them require a header which is still slow to assemble.
-
-Implemented solution
---------------------
-
-The solution is actually quite simple. Even with no stateless support, it is
-actually possible to simulate it with a surface. As one may see in the run-time
-code in `intel/intel_gpgpu.c`, we simply create a surface:
-
-- 2GB big
-
-- Which starts at offset 0
-
-Surprisingly, this surface can actually map the complete GTT address space which
-is 2GB big. One may look at `flat_address_space` unit test in the run-time code
-that creates and copies buffers in such a way that the complete GTT address
-space is traversed.
-
-This solution brings a pretty simple implementation in the compiler side.
-Basically, there is nothing to do when translating from LLVM to Gen ISA. A
-pointer to `__global` or `__constant` memory is simply a 32 bits offset in that
-surface.
-
-Related problems
-----------------
-
-There is one drawback for this approach. Since we use a 2GB surface that maps
-the complete GTT space, there is no protection at all. Each write can therefore
-potentially modify any buffer including the command buffer, the frame buffer or
-the kernel code. There is *no* protection at all in the hardware to prevent
-that.
diff --git a/docs/Beignet/Backend/gen_ir.mdwn b/docs/Beignet/Backend/gen_ir.mdwn
index 424e596..635cbb4 100644
--- a/docs/Beignet/Backend/gen_ir.mdwn
+++ b/docs/Beignet/Backend/gen_ir.mdwn
@@ -22,7 +22,7 @@ One the HW side, the situation is completely different:
   for the EU. This is a SIMD scalar mode.
 
 - The only source of vectors we are going to have is on the sends instructions
-  (and marginally for some other instructions like the div_rem math instruction)
+  (and marginally for some other instructions like the div\_rem math instruction)
 
 One may therefore argue that we need vector instructions to handle the sends.
 Send will indeed require both vector destinations and sources. This may be a
@@ -33,7 +33,7 @@ Indeed, if we look carefully at the send instructions we see that they will
 require vectors that are *not* vectors in LLVM IR. This code for example:
 
 <code>
-__global uint4 *src;<br/>
+\_\_global uint4 \*src;<br/>
 uint4 x = src[get\_global\_id(0)];<br/>
 </code>
 
@@ -190,7 +190,7 @@ Look at these three examples:
 
 <code>
 struct foo { int x; int y; }; </br>
-\_\_kernel void case1(\_\_global int *dst, struct foo bar) </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
 {<br/>
   dst[get\_global\_id(0)] = bar.x + bar.y;<br/>
 }
@@ -203,7 +203,7 @@ pushed into registers and we can replace the loads by register reads.
 
 <code>
 struct foo { int x[16]; }; </br>
-\_\_kernel void case1(\_\_global int *dst, struct foo bar) </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
 {<br/>
   dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
 }
@@ -217,7 +217,7 @@ not supported yet).
 
 <code>
 struct foo { int x[16]; }; </br>
-\_\_kernel void case1(\_\_global int *dst, struct foo bar) </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
 {<br/>
 bar.x[0] = get\_global\_id(1);<br/>
   dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
diff --git a/docs/Beignet/Backend/mixed_buffer_pointer.mdwn b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
new file mode 100644
index 0000000..f43ab7e
--- /dev/null
+++ b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
@@ -0,0 +1,46 @@
+Mixed Buffer Pointer
+--------------------
+
+Segmented address space...
+--------------------------
+
+The first challenge with OpenCL is its very liberal use of pointers. The memory
+is segmented into several address spaces:
+
+- private. This is the memory for each work item
+
+- global. These are buffers in memory shared by all work items and work groups
+
+- constant. These are constant buffers in memory shared by all work items and
+work groups as well
+
+- local. This is memory shared by all work items in the *same* work group
+
+... But with no restriction inside each address space
+-----------------------------------------------------
+
+The challenge is that there is no restriction in OpenCL inside each address
+space i.e. the full C semantic applies in particular regarding pointer
+arithmetic.
+
+Therefore the following code is valid:
+
+<code>
+\_\_kernel void example(\_\_global int *dst, \_\_global int *src0, \_\_global int *src1)<br/>
+{<br/>
+  \_\_global int *from;<br/>
+  if (get\_global\_id(0) % 2)<br/>
+    from = src0;<br/>
+  else<br/>
+    from = src1;<br/>
+  dst[get\_global\_id(0)] = from[get\_global\_id(0)];<br/>
+}
+</code>
+
+As one may see, the load done in the last line actually mixes pointers from both
+source src0 and src1. This typically makes the use of binding table indices
+pretty hard. If we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
+example), we are not able to express the load in the last line with one send
+only. The pointer "from" in the last line is a so-called mixed buffer pointer.
+
+(To be updated)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
new file mode 100644
index 0000000..1adb48a
--- /dev/null
+++ b/docs/NEWS.mdwn
@@ -0,0 +1,16 @@
+# News
+
+## Sep 15, 2014
+[Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15) is released. This is a bug-fix release.
+
+## July 17, 2014
+[Beignet 0.9.2](https://01.org/zh/beignet/downloads/beignet-0.9.2-2014-07-17) is released. This is a bug-fix release.
+
+## July 4, 2014
+[Beignet 0.9.1](https://01.org/zh/beignet/downloads/beignet-0.9.1-2014-07-04) is released. This is a bug-fix release.
+
+## June 26, 2014
+[Beignet 0.9.0](https://01.org/zh/beignet/downloads/beignet-0.9-2014-06-26) is released. This is a major release. Please see the release notes for more information.
+
+## Feb 12, 2014
+[Beignet 0.8.0](https://01.org/zh/beignet/downloads/2014/beignet-0.8.0-2014-02-12) is released. This is a major release. Please see the release notes for more information.
diff --git a/docs/howto/cross-compiler-howto.mdwn b/docs/howto/cross-compiler-howto.mdwn
new file mode 100644
index 0000000..535cd9a
--- /dev/null
+++ b/docs/howto/cross-compiler-howto.mdwn
@@ -0,0 +1,60 @@
+Cross Compiler HowTo
+====================
+
+Beignet supports both PC devices with full profile and embedded/handheld
+devices with embedded profile. This document describes how to build Beignet
+and OpenCL kernels for a target machine (embedded/handheld devices) in a
+host machine with the help of cross compiler, and also the large-size-reduced
+Beignet driver package for the target machine.
+
+Build Beignet with a cross compiler
+-----------------------------------
+
+Besides the general cross compile methods, reference the following options when
+configuring Beignet with cmake.
+
+- LLVM_INSTALL_DIR
+  Beignet depends on llvm+clang, this option refers to the path of llvm-config,
+  llvm-as, llvm-link and clang in the cross compiler environment.
+
+- CMAKE_SKIP_RPATH
+  Some cross compiler systems forbid the usage of rpath in binaries/libraries,
+  set this option to be TRUE.
+
+- GEN_PCI_ID
+  It is the GPU pci_id of the target machine, for example, 0x0162 is the pciid
+  of Intel Ivybridge GPU, and 0x0f31 is Intel Baytrail GPU. The information can
+  be queried with command 'lspci -n'.
+
+- CMAKE_INSTALL_PREFIX
+  This option controls the prefix of installation path.
+
+Distribution of large-size-reduced Beignet driver package
+---------------------------------------------------------
+
+On embedded/handheld devices, storage and memory are scarce, it is necessary to
+provide only the OpenCL runtime library without OpenCL compiler, and only the
+executable binary kernel is supported on such devices.
+
+It means that distributing just libcl.so and libgbeinterp.so (~320k in total after strip)
+is enough for the OpenCL embedded profile on the target machine.
+
+Build OpenCL kernels with OpenCL offline compiler
+-------------------------------------------------
+
+Since the target machine does not contain the OpenCL compiler, the OpenCL source
+kernel needs to be compiled with an OpenCL offline compiler (gbe_bin_generater)
+into binary kernel in the host machine, and the OpenCL application can load the
+binary kernel with function clCreateProgramWithBinary.
+
+The OpenCL offline compiler gbe_bin_generater is produced by the Beignet build and
+is located at .../your_path_to_build/backend/src/gbe_bin_generater; see below for the
+command options.
+
+gbe_bin_generater INFILE [-pbuild_parameter] -oOUTFILE -tGEN_PCI_ID
+
+For example, the following command builds OpenCL source kernel from file 'mykernel.cl'
+for Ivybridge with pci_id 0x0162, and write the result (executable binary kernel)
+into file 'mykernel.bin'.
+
+gbe_bin_generater mykernel.cl -omykernel.bin -t0x0162
diff --git a/docs/optimization-guide.mdwn b/docs/optimization-guide.mdwn
new file mode 100644
index 0000000..8fb29a6
--- /dev/null
+++ b/docs/optimization-guide.mdwn
@@ -0,0 +1,28 @@
+Optimization Guide
+====================
+
+All the SIMD optimization principle also apply to Beignet optimization.  
+Furthermore, there are some special tips for Beignet optimization.
+
+1. It is recommended to choose a work group size that is a multiple of 16. Too much SLM usage may reduce parallelism at the group level. 
+   If the kernel uses a large amount of SLM, it's better to choose a large work group size. Please refer to the following table for
+   recommendations at various SLM usage levels.  
+| Amount of SLM | 0  | 4K | 8K  | 16K | 32K |  
+| WorkGroup size| 16 | 64 | 128 | 256 | 512 |
+
+2. GEN7's read/write on global memory with DWORD and DWORD4 are significantly faster than read/write on BYTE/WORD.  
+   Use DWORD or DWORD4 to access data in global memory if possible. If you cannot avoid the byte/word access, try to do it on SLM.
+
+3. Use float data type as much as possible.
+
+4. Avoid using long. GEN7's performance for long integer is poor.
+
+5. If there is a small constant buffer, define it in the kernel instead of using the constant buffer argument if possible.  
+   The compiler may optimize it if the buffer is defined inside kernel.
+
+6. Avoid unnecessary synchronizations, both in the runtime and in the kernel. Examples are clFinish and clWaitForEvents in the runtime  
+   and barrier() in the kernel.
+
+7. Consider native version of math built-ins, such as native\_sin, native\_cos, if your kernel is not precision sensitive.
+
+8. Try to eliminate branching as much as possible. For example using min, max, clamp or select built-ins instead of if/else if possible.
diff --git a/include/CL/cl.h b/include/CL/cl.h
index 4355e74..316565d 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -21,8 +21,6 @@
  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
  ******************************************************************************/
 
-/* $Revision: 11985 $ on $Date: 2010-07-15 11:16:06 -0700 (Thu, 15 Jul 2010) $ */
-
 #ifndef __OPENCL_CL_H
 #define __OPENCL_CL_H
 
@@ -58,8 +56,10 @@ typedef cl_uint             cl_device_mem_cache_type;
 typedef cl_uint             cl_device_local_mem_type;
 typedef cl_bitfield         cl_device_exec_capabilities;
 typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
 
-typedef intptr_t			cl_context_properties;
+typedef intptr_t            cl_context_properties;
 typedef cl_uint             cl_context_info;
 typedef cl_uint             cl_command_queue_info;
 typedef cl_uint             cl_channel_order;
@@ -67,6 +67,7 @@ typedef cl_uint             cl_channel_type;
 typedef cl_bitfield         cl_mem_flags;
 typedef cl_uint             cl_mem_object_type;
 typedef cl_uint             cl_mem_info;
+typedef cl_bitfield         cl_mem_migration_flags;
 typedef cl_uint             cl_image_info;
 typedef cl_uint             cl_buffer_create_type;
 typedef cl_uint             cl_addressing_mode;
@@ -75,24 +76,43 @@ typedef cl_uint             cl_sampler_info;
 typedef cl_bitfield         cl_map_flags;
 typedef cl_uint             cl_program_info;
 typedef cl_uint             cl_program_build_info;
+typedef cl_uint             cl_program_binary_type;
 typedef cl_int              cl_build_status;
 typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
 typedef cl_uint             cl_kernel_work_group_info;
 typedef cl_uint             cl_event_info;
 typedef cl_uint             cl_command_type;
 typedef cl_uint             cl_profiling_info;
 
+
 typedef struct _cl_image_format {
     cl_channel_order        image_channel_order;
     cl_channel_type         image_channel_data_type;
 } cl_image_format;
 
+typedef struct _cl_image_desc {
+    cl_mem_object_type      image_type;
+    size_t                  image_width;
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;
+    size_t                  image_row_pitch;
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+    cl_mem                  buffer;
+} cl_image_desc;
 
 typedef struct _cl_buffer_region {
     size_t                  origin;
     size_t                  size;
 } cl_buffer_region;
 
+
 /******************************************************************************/
 
 /* Error Codes */
@@ -111,6 +131,11 @@ typedef struct _cl_buffer_region {
 #define CL_MAP_FAILURE                              -12
 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
 
 #define CL_INVALID_VALUE                            -30
 #define CL_INVALID_DEVICE_TYPE                      -31
@@ -147,14 +172,21 @@ typedef struct _cl_buffer_region {
 #define CL_INVALID_MIP_LEVEL                        -62
 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
 #define CL_INVALID_PROPERTY                         -64
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
 
 /* OpenCL Version */
 #define CL_VERSION_1_0                              1
 #define CL_VERSION_1_1                              1
+#define CL_VERSION_1_2                              1
 
 /* cl_bool */
 #define CL_FALSE                                    0
 #define CL_TRUE                                     1
+#define CL_BLOCKING                                 CL_TRUE
+#define CL_NON_BLOCKING                             CL_FALSE
 
 /* cl_platform_info */
 #define CL_PLATFORM_PROFILE                         0x0900
@@ -168,6 +200,7 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
 
 /* cl_device_info */
@@ -221,7 +254,7 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_VERSION                           0x102F
 #define CL_DEVICE_EXTENSIONS                        0x1030
 #define CL_DEVICE_PLATFORM                          0x1031
-/* 0x1032 reserved for CL_DEVICE_DOUBLE_FP_CONFIG */
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
 /* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
@@ -233,6 +266,20 @@ typedef struct _cl_buffer_region {
 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
+#define CL_DEVICE_PARENT_DEVICE                     0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
+#define CL_DEVICE_PARTITION_TYPE                    0x1046
+#define CL_DEVICE_REFERENCE_COUNT                   0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
 
 /* cl_device_fp_config - bitfield */
 #define CL_FP_DENORM                                (1 << 0)
@@ -242,6 +289,7 @@ typedef struct _cl_buffer_region {
 #define CL_FP_ROUND_TO_INF                          (1 << 4)
 #define CL_FP_FMA                                   (1 << 5)
 #define CL_FP_SOFT_FLOAT                            (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
 
 /* cl_device_mem_cache_type */
 #define CL_NONE                                     0x0
@@ -266,8 +314,23 @@ typedef struct _cl_buffer_region {
 #define CL_CONTEXT_PROPERTIES                       0x1082
 #define CL_CONTEXT_NUM_DEVICES                      0x1083
 
-/* cl_context_info + cl_context_properties */
+/* cl_context_properties */
 #define CL_CONTEXT_PLATFORM                         0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+    
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
+    
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
 
 /* cl_command_queue_info */
 #define CL_QUEUE_CONTEXT                            0x1090
@@ -282,6 +345,14 @@ typedef struct _cl_buffer_region {
 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* reserved                                         (1 << 6)    */
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
 
 /* cl_channel_order */
 #define CL_R                                        0x10B0
@@ -297,6 +368,8 @@ typedef struct _cl_buffer_region {
 #define CL_Rx                                       0x10BA
 #define CL_RGx                                      0x10BB
 #define CL_RGBx                                     0x10BC
+#define CL_DEPTH                                    0x10BD
+#define CL_DEPTH_STENCIL                            0x10BE
 
 /* cl_channel_type */
 #define CL_SNORM_INT8                               0x10D0
@@ -314,11 +387,16 @@ typedef struct _cl_buffer_region {
 #define CL_UNSIGNED_INT32                           0x10DC
 #define CL_HALF_FLOAT                               0x10DD
 #define CL_FLOAT                                    0x10DE
+#define CL_UNORM_INT24                              0x10DF
 
 /* cl_mem_object_type */
 #define CL_MEM_OBJECT_BUFFER                        0x10F0
 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
 
 /* cl_mem_info */
 #define CL_MEM_TYPE                                 0x1100
@@ -339,6 +417,10 @@ typedef struct _cl_buffer_region {
 #define CL_IMAGE_WIDTH                              0x1114
 #define CL_IMAGE_HEIGHT                             0x1115
 #define CL_IMAGE_DEPTH                              0x1116
+#define CL_IMAGE_ARRAY_SIZE                         0x1117
+#define CL_IMAGE_BUFFER                             0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
+#define CL_IMAGE_NUM_SAMPLES                        0x111A
 
 /* cl_addressing_mode */
 #define CL_ADDRESS_NONE                             0x1130
@@ -361,6 +443,7 @@ typedef struct _cl_buffer_region {
 /* cl_map_flags - bitfield */
 #define CL_MAP_READ                                 (1 << 0)
 #define CL_MAP_WRITE                                (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
 
 /* cl_program_info */
 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
@@ -370,11 +453,20 @@ typedef struct _cl_buffer_region {
 #define CL_PROGRAM_SOURCE                           0x1164
 #define CL_PROGRAM_BINARY_SIZES                     0x1165
 #define CL_PROGRAM_BINARIES                         0x1166
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
 
 /* cl_program_build_info */
 #define CL_PROGRAM_BUILD_STATUS                     0x1181
 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
 #define CL_PROGRAM_BUILD_LOG                        0x1183
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+    
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
 
 /* cl_build_status */
 #define CL_BUILD_SUCCESS                            0
@@ -388,6 +480,32 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
 #define CL_KERNEL_CONTEXT                           0x1193
 #define CL_KERNEL_PROGRAM                           0x1194
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
+#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
+#define CL_KERNEL_ARG_NAME                          0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
+    
+/* cl_kernel_arg_type_qualifer */
+#define CL_KERNEL_ARG_TYPE_NONE                     0
+#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
 
 /* cl_kernel_work_group_info */
 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
@@ -395,6 +513,7 @@ typedef struct _cl_buffer_region {
 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
 
 /* cl_event_info  */
 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
@@ -425,13 +544,17 @@ typedef struct _cl_buffer_region {
 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
 #define CL_COMMAND_USER                             0x1204
+#define CL_COMMAND_BARRIER                          0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
+#define CL_COMMAND_FILL_BUFFER                      0x1207
+#define CL_COMMAND_FILL_IMAGE                       0x1208
 
 /* command execution status */
 #define CL_COMPLETE                                 0x0
 #define CL_RUNNING                                  0x1
 #define CL_SUBMITTED                                0x2
 #define CL_QUEUED                                   0x3
-  
+
 /* cl_buffer_create_type  */
 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
 
@@ -470,22 +593,35 @@ clGetDeviceInfo(cl_device_id    /* device */,
                 size_t          /* param_value_size */, 
                 void *          /* param_value */,
                 size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id                         /* in_device */,
+                   const cl_device_partition_property * /* properties */,
+                   cl_uint                              /* num_devices */,
+                   cl_device_id *                       /* out_devices */,
+                   cl_uint *                            /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
 
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+    
 /* Context APIs  */
 extern CL_API_ENTRY cl_context CL_API_CALL
 clCreateContext(const cl_context_properties * /* properties */,
-                cl_uint                       /* num_devices */,
-                const cl_device_id *          /* devices */,
+                cl_uint                 /* num_devices */,
+                const cl_device_id *    /* devices */,
                 void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
-                void *                        /* user_data */,
-                cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+                void *                  /* user_data */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_context CL_API_CALL
 clCreateContextFromType(const cl_context_properties * /* properties */,
-                        cl_device_type                /* device_type */,
+                        cl_device_type          /* device_type */,
                         void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
-                        void *                        /* user_data */,
-                        cl_int *                      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+                        void *                  /* user_data */,
+                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
@@ -520,25 +656,6 @@ clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
                       void *                /* param_value */,
                       size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
-#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
-#warning CL_USE_DEPRECATED_OPENCL_1_0_APIS is defined. These APIs are unsupported and untested in OpenCL 1.1!
-/* 
- *  WARNING:
- *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED
- *  to better facilitate thread safety.  The 1.0 API is not thread safe. It is not tested by the
- *  OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
- *  It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
- *
- *  Software developers previously relying on this API are instructed to set the command queue 
- *  properties when creating the queue, instead. 
- */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clSetCommandQueueProperty(cl_command_queue              /* command_queue */,
-                          cl_command_queue_properties   /* properties */, 
-                          cl_bool                        /* enable */,
-                          cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
-#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
-
 /* Memory Object APIs */
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateBuffer(cl_context   /* context */,
@@ -555,26 +672,12 @@ clCreateSubBuffer(cl_mem                   /* buffer */,
                   cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
 
 extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateImage2D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */,
-                size_t                  /* image_height */,
-                size_t                  /* image_row_pitch */, 
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-                        
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateImage3D(cl_context              /* context */,
-                cl_mem_flags            /* flags */,
-                const cl_image_format * /* image_format */,
-                size_t                  /* image_width */, 
-                size_t                  /* image_height */,
-                size_t                  /* image_depth */, 
-                size_t                  /* image_row_pitch */, 
-                size_t                  /* image_slice_pitch */, 
-                void *                  /* host_ptr */,
-                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+clCreateImage(cl_context              /* context */,
+              cl_mem_flags            /* flags */,
+              const cl_image_format * /* image_format */,
+              const cl_image_desc *   /* image_desc */, 
+              void *                  /* host_ptr */,
+              cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
                         
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
@@ -609,7 +712,7 @@ clSetMemObjectDestructorCallback(  cl_mem /* memobj */,
                                     void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
                                     void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
 
-/* Sampler APIs  */
+/* Sampler APIs */
 extern CL_API_ENTRY cl_sampler CL_API_CALL
 clCreateSampler(cl_context          /* context */,
                 cl_bool             /* normalized_coords */, 
@@ -647,6 +750,13 @@ clCreateProgramWithBinary(cl_context                     /* context */,
                           cl_int *                       /* binary_status */,
                           cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context            /* context */,
+                                  cl_uint               /* num_devices */,
+                                  const cl_device_id *  /* device_list */,
+                                  const char *          /* kernel_names */,
+                                  cl_int *              /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
 extern CL_API_ENTRY cl_int CL_API_CALL
 clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
 
@@ -662,7 +772,30 @@ clBuildProgram(cl_program           /* program */,
                void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clUnloadCompiler(void) CL_API_SUFFIX__VERSION_1_0;
+clCompileProgram(cl_program           /* program */,
+                 cl_uint              /* num_devices */,
+                 const cl_device_id * /* device_list */,
+                 const char *         /* options */, 
+                 cl_uint              /* num_input_headers */,
+                 const cl_program *   /* input_headers */,
+                 const char **        /* header_include_names */,
+                 void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+                 void *               /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context           /* context */,
+              cl_uint              /* num_devices */,
+              const cl_device_id * /* device_list */,
+              const char *         /* options */, 
+              cl_uint              /* num_input_programs */,
+              const cl_program *   /* input_programs */,
+              void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+              void *               /* user_data */,
+              cl_int *             /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetProgramInfo(cl_program         /* program */,
@@ -711,6 +844,14 @@ clGetKernelInfo(cl_kernel       /* kernel */,
                 size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel       /* kernel */,
+                   cl_uint         /* arg_indx */,
+                   cl_kernel_arg_info  /* param_name */,
+                   size_t          /* param_value_size */,
+                   void *          /* param_value */,
+                   size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
 clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
                          cl_device_id               /* device */,
                          cl_kernel_work_group_info  /* param_name */,
@@ -718,7 +859,7 @@ clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
                          void *                     /* param_value */,
                          size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
 
-/* Event Object APIs  */
+/* Event Object APIs */
 extern CL_API_ENTRY cl_int CL_API_CALL
 clWaitForEvents(cl_uint             /* num_events */,
                 const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
@@ -750,7 +891,7 @@ clSetEventCallback( cl_event    /* event */,
                     void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
                     void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
 
-/* Profiling APIs  */
+/* Profiling APIs */
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetEventProfilingInfo(cl_event            /* event */,
                         cl_profiling_info   /* param_name */,
@@ -771,7 +912,7 @@ clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
                     cl_mem              /* buffer */,
                     cl_bool             /* blocking_read */,
                     size_t              /* offset */,
-                    size_t              /* cb */, 
+                    size_t              /* size */, 
                     void *              /* ptr */,
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
@@ -781,8 +922,8 @@ extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
                         cl_mem              /* buffer */,
                         cl_bool             /* blocking_read */,
-                        const size_t *      /* buffer_origin */,
-                        const size_t *      /* host_origin */, 
+                        const size_t *      /* buffer_offset */,
+                        const size_t *      /* host_offset */, 
                         const size_t *      /* region */,
                         size_t              /* buffer_row_pitch */,
                         size_t              /* buffer_slice_pitch */,
@@ -798,7 +939,7 @@ clEnqueueWriteBuffer(cl_command_queue   /* command_queue */,
                      cl_mem             /* buffer */, 
                      cl_bool            /* blocking_write */, 
                      size_t             /* offset */, 
-                     size_t             /* cb */, 
+                     size_t             /* size */, 
                      const void *       /* ptr */, 
                      cl_uint            /* num_events_in_wait_list */, 
                      const cl_event *   /* event_wait_list */, 
@@ -808,8 +949,8 @@ extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
                          cl_mem              /* buffer */,
                          cl_bool             /* blocking_write */,
-                         const size_t *      /* buffer_origin */,
-                         const size_t *      /* host_origin */, 
+                         const size_t *      /* buffer_offset */,
+                         const size_t *      /* host_offset */, 
                          const size_t *      /* region */,
                          size_t              /* buffer_row_pitch */,
                          size_t              /* buffer_slice_pitch */,
@@ -821,12 +962,23 @@ clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
                          cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
                             
 extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
+                    cl_mem             /* buffer */, 
+                    const void *       /* pattern */, 
+                    size_t             /* pattern_size */, 
+                    size_t             /* offset */, 
+                    size_t             /* size */, 
+                    cl_uint            /* num_events_in_wait_list */, 
+                    const cl_event *   /* event_wait_list */, 
+                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
                     cl_mem              /* src_buffer */,
                     cl_mem              /* dst_buffer */, 
                     size_t              /* src_offset */,
                     size_t              /* dst_offset */,
-                    size_t              /* cb */, 
+                    size_t              /* size */, 
                     cl_uint             /* num_events_in_wait_list */,
                     const cl_event *    /* event_wait_list */,
                     cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
@@ -873,6 +1025,16 @@ clEnqueueWriteImage(cl_command_queue    /* command_queue */,
                     cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue   /* command_queue */,
+                   cl_mem             /* image */, 
+                   const void *       /* fill_color */, 
+                   const size_t *     /* origin[3] */, 
+                   const size_t *     /* region[3] */, 
+                   cl_uint            /* num_events_in_wait_list */, 
+                   const cl_event *   /* event_wait_list */, 
+                   cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueCopyImage(cl_command_queue     /* command_queue */,
                    cl_mem               /* src_image */,
                    cl_mem               /* dst_image */, 
@@ -911,7 +1073,7 @@ clEnqueueMapBuffer(cl_command_queue /* command_queue */,
                    cl_bool          /* blocking_map */, 
                    cl_map_flags     /* map_flags */,
                    size_t           /* offset */,
-                   size_t           /* cb */,
+                   size_t           /* size */,
                    cl_uint          /* num_events_in_wait_list */,
                    const cl_event * /* event_wait_list */,
                    cl_event *       /* event */,
@@ -940,6 +1102,15 @@ clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
                         cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue       /* command_queue */,
+                           cl_uint                /* num_mem_objects */,
+                           const cl_mem *         /* mem_objects */,
+                           cl_mem_migration_flags /* flags */,
+                           cl_uint                /* num_events_in_wait_list */,
+                           const cl_event *       /* event_wait_list */,
+                           cl_event *             /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
                        cl_kernel        /* kernel */,
                        cl_uint          /* work_dim */,
@@ -959,7 +1130,7 @@ clEnqueueTask(cl_command_queue  /* command_queue */,
 
 extern CL_API_ENTRY cl_int CL_API_CALL
 clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
-					  void (CL_CALLBACK *user_func)(void *), 
+					  void (CL_CALLBACK * /*user_func*/)(void *), 
                       void *            /* args */,
                       size_t            /* cb_args */, 
                       cl_uint           /* num_mem_objects */,
@@ -970,16 +1141,17 @@ clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
                       cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueMarker(cl_command_queue    /* command_queue */,
-                cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+                            cl_uint           /* num_events_in_wait_list */,
+                            const cl_event *  /* event_wait_list */,
+                            cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
 extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
-                       cl_uint          /* num_events */,
-                       const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+                             cl_uint           /* num_events_in_wait_list */,
+                             const cl_event *  /* event_wait_list */,
+                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
 
-extern CL_API_ENTRY cl_int CL_API_CALL
-clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
 
 /* Extension function access
  *
@@ -988,7 +1160,51 @@ clEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_
  * check to make sure the address is not NULL, before using or 
  * calling the returned function address.
  */
-extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
+extern CL_API_ENTRY void * CL_API_CALL 
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+                                         const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
+    
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */, 
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */, 
+                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_slice_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+                        cl_uint          /* num_events */,
+                        const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
 
 #ifdef __cplusplus
 }
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
index 99b86a6..38fac19 100644
--- a/include/CL/cl.hpp
+++ b/include/CL/cl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -23,13 +23,18 @@
 
 /*! \file
  *
- *   \brief C++ bindings for OpenCL 1.0 (rev 48) and OpenCL 1.1 (rev 33)    
- *   \author Benedict R. Gaster and Laurent Morichetti
+ *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and 
+ *       OpenCL 1.2 (rev 15)    
+ *   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
  *   
- *   Additions and fixes from Brian Cole, March 3rd 2010.
+ *   Additions and fixes from:
+ *       Brian Cole, March 3rd 2010 and April 2012 
+ *       Matt Gruenke, April 2012.
+ *       Bruce Merry, February 2013.
+ *       Tom Deakin and Simon McIntosh-Smith, July 2013
  *   
- *   \version 1.1
- *   \date June 2010
+ *   \version 1.2.6
+ *   \date August 2013
  *
  *   Optional extension support
  *
@@ -55,8 +60,8 @@
  *
  * For detail documentation on the bindings see:
  *
- * The OpenCL C++ Wrapper API 1.1 (revision 04)
- *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.1.pdf
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
  *
  * \section example Example
  *
@@ -141,10 +146,21 @@
 #define CL_HPP_
 
 #ifdef _WIN32
+
 #include <windows.h>
 #include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
 #if defined(USE_DX_INTEROP)
 #include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
 #endif
 #endif // _WIN32
 
@@ -156,16 +172,36 @@
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenGL/OpenGL.h>
 #include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
 #else
 #include <GL/gl.h>
 #include <CL/opencl.h>
 #endif // !__APPLE__
 
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS 
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED  
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+
 #if !defined(CL_CALLBACK)
 #define CL_CALLBACK
 #endif //CL_CALLBACK
 
 #include <utility>
+#include <limits>
 
 #if !defined(__NO_STD_VECTOR)
 #include <vector>
@@ -176,11 +212,15 @@
 #endif 
 
 #if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
-# include <alloca.h>
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
 #endif // linux
 
 #include <cstring>
 
+
 /*! \namespace cl
  *
  * \brief The OpenCL C++ bindings are defined within this namespace.
@@ -188,6 +228,12 @@
  */
 namespace cl {
 
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
 #define __INIT_CL_EXT_FCN_PTR(name) \
     if(!pfn_##name) { \
         pfn_##name = (PFN_##name) \
@@ -195,17 +241,29 @@ namespace cl {
         if(!pfn_##name) { \
         } \
     }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
 
 class Program;
 class Device;
 class Context;
 class CommandQueue;
 class Memory;
+class Buffer;
 
 #if defined(__CL_ENABLE_EXCEPTIONS)
-#include <exception>
-/*! \class Error
- * \brief Exception class
+/*! \brief Exception class 
+ * 
+ *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
  */
 class Error : public std::exception
 {
@@ -213,8 +271,14 @@ private:
     cl_int err_;
     const char * errStr_;
 public:
-    /*! Create a new CL error exception for a given error code
+    /*! \brief Create a new CL error exception for a given error code
      *  and corresponding message.
+     * 
+     *  \param err error code value.
+     *
+     *  \param errStr a descriptive string that must remain in scope until
+     *                handling of the exception has concluded.  If set, it
+     *                will be returned by what().
      */
     Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
     {}
@@ -239,7 +303,7 @@ public:
      *
      *  \return The error code.
      */
-    const cl_int err(void) const { return err_; }
+    cl_int err(void) const { return err_; }
 };
 
 #define __ERR_STR(x) #x
@@ -247,9 +311,33 @@ public:
 #define __ERR_STR(x) NULL
 #endif // __CL_ENABLE_EXCEPTIONS
 
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL)
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+    (void) errStr; // suppress unused variable warning
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
+
+
+
 //! \cond DOXYGEN_DETAIL
 #if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
-#define __GET_DEVICE_INFO_ERR               __ERR_STR(clgetDeviceInfo)
+#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
 #define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
 #define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
 #define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
@@ -260,20 +348,29 @@ public:
 #define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
 #define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
 #define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR               __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
 #define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
 #define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
 #define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
 #define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
 
+#define __CREATE_CONTEXT_ERR                __ERR_STR(clCreateContext)
 #define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
 #define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
 
 #define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
+#define __COPY_ERR                          __ERR_STR(cl::copy)
 #define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
 #define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLBuffer)
 #define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
-#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
-#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR               __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
 #define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
 #define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
 
@@ -286,7 +383,14 @@ public:
 #define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
 #define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
 #define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
 #define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
 #define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
 
 #define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
@@ -297,9 +401,11 @@ public:
 #define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
 #define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
 #define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR           __ERR_STR(clEnqueueFillBuffer)
 #define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
 #define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
 #define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR           __ERR_STR(clEnqueueFillImage)
 #define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
 #define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
 #define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
@@ -308,76 +414,193 @@ public:
 #define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
 #define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
 #define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
-#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
-#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
-#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
 
 #define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
 #define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
 
-#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
 
+#define __RETAIN_ERR                        __ERR_STR(Retain Object)
+#define __RELEASE_ERR                       __ERR_STR(Release Object)
 #define __FLUSH_ERR                         __ERR_STR(clFlush)
 #define __FINISH_ERR                        __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
 
+/**
+ * CL 1.2 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
+#else
 #define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR          __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR          __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
 #endif // __CL_USER_OVERRIDE_ERROR_STRINGS
 //! \endcond
 
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING) 
+
 /*! \class string
  * \brief Simple string class, that provides a limited subset of std::string
  * functionality but avoids many of the issues that come with that class.
+ 
+ *  \note Deprecated. Please use std::string as default or
+ *  re-define the string class to match the std::string
+ *  interface by defining STRING_CLASS
  */
-class string
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
 {
 private:
     ::size_t size_;
     char * str_;
 public:
+    //! \brief Constructs an empty string, allocating no memory.
     string(void) : size_(0), str_(NULL)
     {
     }
 
-    string(char * str, ::size_t size) :
+    /*! \brief Constructs a string populated from an arbitrary value of
+     *  specified size.
+     * 
+     *  An extra '\0' is added, in case none was contained in str.
+     *
+     *  \param str the initial value of the string instance.  Note that '\0'     
+     *             characters receive no special treatment.  If NULL,
+     *             the string is left empty, with a size of 0.
+     *
+     *  \param size the number of characters to copy from str.
+     */
+    string(const char * str, ::size_t size) :
         size_(size),
         str_(NULL)
     {
-        str_ = new char[size_+1];
-        if (str_ != NULL) {
-            memcpy(str_, str, size_  * sizeof(char));
-            str_[size_] = '\0';
-        }
-        else {
-            size_ = 0;
+        if( size > 0 ) {
+            str_ = new char[size_+1];
+            if (str_ != NULL) {
+                memcpy(str_, str, size_  * sizeof(char));
+                str_[size_] = '\0';
+            }
+            else {
+                size_ = 0;
+            }
         }
     }
 
-    string(char * str) :
+    /*! \brief Constructs a string populated from a null-terminated value.
+     *
+     *  \param str the null-terminated initial value of the string instance.
+     *             If NULL, the string is left empty, with a size of 0.
+     */
+    string(const char * str) :
+        size_(0),
         str_(NULL)
     {
-        size_= ::strlen(str);
-        str_ = new char[size_ + 1];
-        if (str_ != NULL) {
-            memcpy(str_, str, (size_ + 1) * sizeof(char));
+        if( str ) {
+            size_= ::strlen(str);
         }
-        else {
+        if( size_ > 0 ) {
+            str_ = new char[size_ + 1];
+            if (str_ != NULL) {
+                memcpy(str_, str, (size_ + 1) * sizeof(char));
+            }
+        }
+    }
+
+    void resize( ::size_t n )
+    {
+        if( size_ == n ) {
+            return;
+        }
+        if (n == 0) {
+            if( str_ ) {
+                delete [] str_;
+            }
+            str_ = NULL;
             size_ = 0;
+        } 
+        else {
+            char *newString = new char[n + 1];
+            int copySize = n;
+            if( size_ < n ) {
+                copySize = size_;
+            }
+            size_ = n;
+            
+            if(str_) {
+                memcpy(newString, str_, (copySize + 1) * sizeof(char));
+            }
+            if( copySize < size_ ) {
+                memset(newString + copySize, 0, size_ - copySize);
+            }
+            newString[size_] = '\0';
+
+            delete [] str_;
+            str_ = newString;
         }
     }
 
+    const char& operator[] ( ::size_t pos ) const
+    {
+        return str_[pos];
+    }
+
+    char& operator[] ( ::size_t pos )
+    {
+        return str_[pos];
+    }
+
+    /*! \brief Copies the value of another string to this one.
+     *
+     *  \param rhs the string to copy.
+     *
+     *  \returns a reference to the modified instance.
+     */
     string& operator=(const string& rhs)
     {
         if (this == &rhs) {
             return *this;
         }
 
+        if( str_ != NULL ) {
+            delete [] str_;
+            str_ = NULL;
+            size_ = 0;
+        }
+
         if (rhs.size_ == 0 || rhs.str_ == NULL) {
+            str_ = NULL;
             size_ = 0;
-            str_  = NULL;
         } 
         else {
+            str_ = new char[rhs.size_ + 1];
             size_ = rhs.size_;
-            str_ = new char[size_ + 1];
+            
             if (str_ != NULL) {
                 memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
             }
@@ -389,37 +612,42 @@ public:
         return *this;
     }
 
-    string(const string& rhs)
+    /*! \brief Constructs a string by copying the value of another instance.
+     *
+     *  \param rhs the string to copy.
+     */
+    string(const string& rhs) :
+        size_(0),
+        str_(NULL)
     {
         *this = rhs;
     }
 
+    //! \brief Destructor - frees memory used to hold the current value.
     ~string()
     {
-        if (str_ != NULL) {
-            delete[] str_;
-        }
+        delete[] str_;
+        str_ = NULL;
     }
-
+    
+    //! \brief Queries the length of the string, excluding any added '\0's.
     ::size_t size(void) const   { return size_; }
+
+    //! \brief Queries the length of the string, excluding any added '\0's.
     ::size_t length(void) const { return size(); }
 
+    /*! \brief Returns a pointer to the private copy held by this instance,
+     *  or "" if empty/unset.
+     */
     const char * c_str(void) const { return (str_) ? str_ : "";}
 };
-
-#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
-#include <string>
-typedef std::string STRING_CLASS;
-#elif !defined(__USE_DEV_STRING) 
 typedef cl::string STRING_CLASS;
-#endif
+#endif // #elif !defined(__USE_DEV_STRING) 
 
 #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
-#include <vector>
 #define VECTOR_CLASS std::vector
 #elif !defined(__USE_DEV_VECTOR) 
 #define VECTOR_CLASS cl::vector 
-#endif
 
 #if !defined(__MAX_DEFAULT_VECTOR_SIZE)
 #define __MAX_DEFAULT_VECTOR_SIZE 10
@@ -427,189 +655,279 @@ typedef cl::string STRING_CLASS;
 
 /*! \class vector
  * \brief Fixed sized vector implementation that mirroring 
+ *
+ *  \note Deprecated. Please use std::vector as default or
+ *  re-define the vector class to match the std::vector
+ *  interface by defining VECTOR_CLASS
+
+ *  \note Not recommended for use with custom objects as
+ *  current implementation will construct N elements
+ *
  * std::vector functionality.
+ *  \brief Fixed sized vector compatible with std::vector.
+ *
+ *  \note
+ *  This differs from std::vector<> not just in memory allocation,
+ *  but also in terms of when members are constructed, destroyed,
+ *  and assigned instead of being copy constructed.
+ *
+ *  \param T type of element contained in the vector.
+ *
+ *  \param N maximum size of the vector.
  */
 template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
-class vector
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
 {
 private:
     T data_[N];
     unsigned int size_;
-    bool empty_;
+
 public:
-    vector() : 
-        size_(-1),
-        empty_(true)
+    //! \brief Constructs an empty vector with no memory allocated.
+    vector() :  
+        size_(static_cast<unsigned int>(0))
     {}
 
-    ~vector() {}
+    //! \brief Deallocates the vector's memory and destroys all of its elements.
+    ~vector() 
+    {
+        clear();
+    }
 
+    //! \brief Returns the number of elements currently contained.
     unsigned int size(void) const
     {
-        return size_ + 1;
+        return size_;
     }
-
+    
+    /*! \brief Empties the vector of all elements.
+     *  \note
+     *  This does not deallocate memory but will invoke destructors
+     *  on contained elements.
+     */
     void clear()
     {
-        size_ = -1;
-        empty_ = true;
+        while(!empty()) {
+            pop_back();
+        }
     }
 
+    /*! \brief Appends an element after the last valid element.
+     * Calling this on a vector that has reached capacity will throw an 
+     * exception if exceptions are enabled.
+     */
     void push_back (const T& x)
     { 
-        if (size() < N) {
-            size_++;  
-            data_[size_] = x;
-            empty_ = false;
+        if (size() < N) {    
+            new (&data_[size_]) T(x);
+            size_++;
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
         }
     }
 
+    /*! \brief Removes the last valid element from the vector.
+     * Calling this on an empty vector will throw an exception
+     * if exceptions are enabled.
+     */
     void pop_back(void)
     {
-        if (!empty_) {
+        if (size_ != 0) {
+            --size_;
             data_[size_].~T();
-            size_--;
-            if (size_ == -1) {
-                empty_ = true;
-            }
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
         }
     }
   
+    /*! \brief Constructs with a value copied from another.
+     *
+     *  \param vec the vector to copy.
+     */
     vector(const vector<T, N>& vec) : 
-        size_(vec.size_),
-        empty_(vec.empty_)
+        size_(vec.size_)
     {
-        if (!empty_) {
-            memcpy(&data_[0], &vec.data_[0], size() * sizeof(T));
+        if (size_ != 0) {	
+            assign(vec.begin(), vec.end());
         }
     } 
 
+    /*! \brief Constructs with a specified number of initial elements.
+     *
+     *  \param size number of initial elements.
+     *
+     *  \param val value of initial elements.
+     */
     vector(unsigned int size, const T& val = T()) :
-        size_(-1),
-        empty_(true)
+        size_(0)
     {
         for (unsigned int i = 0; i < size; i++) {
             push_back(val);
         }
     }
 
+    /*! \brief Overwrites the current content with that copied from another
+     *         instance.
+     *
+     *  \param rhs vector to copy.
+     *
+     *  \returns a reference to this.
+     */
     vector<T, N>& operator=(const vector<T, N>& rhs)
     {
         if (this == &rhs) {
             return *this;
         }
 
-        size_  = rhs.size_;
-        empty_ = rhs.empty_;
-
-        if (!empty_) {	
-            memcpy(&data_[0], &rhs.data_[0], size() * sizeof(T));
+        if (rhs.size_ != 0) {	
+            assign(rhs.begin(), rhs.end());
+        } else {
+            clear();
         }
     
         return *this;
     }
 
+    /*! \brief Tests equality against another instance.
+     *
+     *  \param vec the vector against which to compare.
+     */
     bool operator==(vector<T,N> &vec)
     {
-        if (empty_ && vec.empty_) {
-            return true;
-        }
-
         if (size() != vec.size()) {
             return false;
         }
 
-        return memcmp(&data_[0], &vec.data_[0], size() * sizeof(T)) == 0 ? true : false;
+        for( unsigned int i = 0; i < size(); ++i ) {
+            if( operator[](i) != vec[i] ) {
+                return false;
+            }
+        }
+        return true;
     }
   
+    //! \brief Conversion operator to T*.
     operator T* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
     operator const T* () const { return data_; }
    
+    //! \brief Tests whether this instance has any elements.
     bool empty (void) const
     {
-        return empty_;
+        return size_==0;
     }
   
+    //! \brief Returns the maximum number of elements this instance can hold.
     unsigned int max_size (void) const
     {
         return N;
     }
 
+    //! \brief Returns the maximum number of elements this instance can hold.
     unsigned int capacity () const
     {
-        return sizeof(T) * N;
+        return N;
     }
 
+    /*! \brief Returns a reference to a given element.
+     *
+     *  \param index which element to access.     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
     T& operator[](int index)
     {
         return data_[index];
     }
   
-    T operator[](int index) const
+    /*! \brief Returns a const reference to a given element.
+     *
+     *  \param index which element to access.
+     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    const T& operator[](int index) const
     {
         return data_[index];
     }
   
+    /*! \brief Assigns elements of the vector based on a source iterator range.
+     *
+     *  \param start Beginning iterator of source range
+     *  \param end Enditerator of source range
+     *
+     *  \note
+     *  Will throw an exception if exceptions are enabled and size exceeded.
+     */
     template<class I>
     void assign(I start, I end)
     {
         clear();   
-        while(start < end) {
+        while(start != end) {
             push_back(*start);
             start++;
         }
     }
 
     /*! \class iterator
-     * \brief Iterator class for vectors
+     * \brief Const iterator class for vectors
      */
     class iterator
     {
     private:
-        vector<T,N> vec_;
+        const vector<T,N> *vec_;
         int index_;
-        bool initialized_;
+
+        /**
+         * Internal iterator constructor to capture reference
+         * to the vector it iterates over rather than taking 
+         * the vector by copy.
+         */
+        iterator (const vector<T,N> &vec, int index) :
+            vec_(&vec)
+        {            
+            if( !vec.empty() ) {
+                index_ = index;
+            } else {
+                index_ = -1;
+            }
+        }
+
     public:
         iterator(void) : 
             index_(-1),
-            initialized_(false)
+            vec_(NULL)
+        {
+        }
+
+        iterator(const iterator& rhs) :
+            vec_(rhs.vec_),
+            index_(rhs.index_)
         {
-            index_ = -1;
-            initialized_ = false;
         }
 
         ~iterator(void) {}
 
-        static iterator begin(vector<T,N> &vec)
+        static iterator begin(const cl::vector<T,N> &vec)
         {
-            iterator i;
-
-            if (!vec.empty()) {
-                i.index_ = 0;
-            }
+            iterator i(vec, 0);
 
-            i.vec_ = vec;
-            i.initialized_ = true;
             return i;
         }
 
-        static iterator end(vector<T,N> &vec)
+        static iterator end(const cl::vector<T,N> &vec)
         {
-            iterator i;
+            iterator i(vec, vec.size());
 
-            if (!vec.empty()) {
-                i.index_ = vec.size();
-            }
-            i.vec_ = vec;
-            i.initialized_ = true;
             return i;
         }
     
         bool operator==(iterator i)
         {
             return ((vec_ == i.vec_) && 
-                    (index_ == i.index_) && 
-                    (initialized_ == i.initialized_));
+                    (index_ == i.index_));
         }
 
         bool operator!=(iterator i)
@@ -617,29 +935,35 @@ public:
             return (!(*this==i));
         }
 
-        void operator++()
+        iterator& operator++()
         {
-            index_++;
+            ++index_;
+            return *this;
         }
 
-        void operator++(int x)
+        iterator operator++(int)
         {
-            index_ += x;
+            iterator retVal(*this);
+            ++index_;
+            return retVal;
         }
 
-        void operator--()
+        iterator& operator--()
         {
-            index_--;
+            --index_;
+            return *this;
         }
 
-        void operator--(int x)
+        iterator operator--(int)
         {
-            index_ -= x;
+            iterator retVal(*this);
+            --index_;
+            return retVal;
         }
 
-        T operator *()
+        const T& operator *() const
         {
-            return vec_[index_];
+            return (*vec_)[index_];
         }
     };
 
@@ -648,11 +972,21 @@ public:
         return iterator::begin(*this);
     }
 
+    iterator begin(void) const
+    {
+        return iterator::begin(*this);
+    }
+
     iterator end(void)
     {
         return iterator::end(*this);
     }
 
+    iterator end(void) const
+    {
+        return iterator::end(*this);
+    }
+
     T& front(void)
     {
         return data_[0];
@@ -670,110 +1004,230 @@ public:
 
     const T& back(void) const
     {
-        return data_[size_];
+        return data_[size_-1];
     }
 };  
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1 
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+    /*
+     * Compare and exchange primitives are needed for handling of defaults
+    */
+    inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+    {
+#ifdef _WIN32
+        return (int)(InterlockedCompareExchange(
+           (volatile long*)dest, 
+           (long)exchange, 
+           (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+		return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+        return (__sync_val_compare_and_swap(
+            dest, 
+            comparand, 
+            exchange));
+#endif // !_WIN32
+    }
+
+    inline void fence() { _mm_mfence(); }
+}; // namespace detail
+
     
-/*!
- * \brief size_t class used to interface between C++ and
- * OpenCL C calls that require arrays of size_t values, who's
- * size is known statically.
+/*! \brief class used to interface between C++ and
+ *  OpenCL C calls that require arrays of size_t values, whose
+ *  size is known statically.
  */
 template <int N>
-struct size_t : public cl::vector< ::size_t, N> { };
+class size_t
+{ 
+private:
+    ::size_t data_[N];
 
-namespace detail {
+public:
+    //! \brief Initialize size_t to all 0s
+    size_t()
+    {
+        for( int i = 0; i < N; ++i ) {
+            data_[i] = 0;
+        }
+    }
 
-// GetInfo help struct
-template <typename Functor, typename T>
-struct GetInfoHelper
-{
-    static cl_int
-    get(Functor f, cl_uint name, T* param)
+    ::size_t& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    const ::size_t& operator[](int index) const
     {
-        return f(name, sizeof(T), param, NULL);
+        return data_[index];
     }
+
+    //! \brief Conversion operator to T*.
+    operator ::size_t* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
+    operator const ::size_t* () const { return data_; }
 };
 
-// Specialized GetInfoHelper for VECTOR_CLASS params
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual parameter passed is an int, which makes this
+// a worse conversion sequence than a specialization that declares the
+// parameter as an int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+    return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized getInfoHelper for VECTOR_CLASS params
 template <typename Func, typename T>
-struct GetInfoHelper<Func, VECTOR_CLASS<T> >
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
 {
-    static cl_int get(Func f, cl_uint name, VECTOR_CLASS<T>* param)
-    {
-        ::size_t required;
-        cl_int err = f(name, 0, NULL, &required);
-        if (err != CL_SUCCESS) {
-            return err;
-        }
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
 
-        T* value = (T*) alloca(required);
-        err = f(name, required, value, NULL);
-        if (err != CL_SUCCESS) {
-            return err;
-        }
+    T* value = (T*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
 
-        param->assign(&value[0], &value[required/sizeof(T)]);
-        return CL_SUCCESS;
+    param->assign(&value[0], &value[required/sizeof(T)]);
+    return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simplify specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
     }
-};
+
+    typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t elements = required / sizeof(typename T::cl_type);
+    param->assign(&value[0], &value[elements]);
+    for (::size_t i = 0; i < elements; i++)
+    {
+        if (value[i] != NULL)
+        {
+            err = (*param)[i].retain();
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+        }
+    }
+    return CL_SUCCESS;
+}
 
 // Specialized for getInfo<CL_PROGRAM_BINARIES>
 template <typename Func>
-struct GetInfoHelper<Func, VECTOR_CLASS<char *> >
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
 {
-    static cl_int
-    get(Func f, cl_uint name, VECTOR_CLASS<char *>* param)
-    {
-      cl_uint err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
-      if (err != CL_SUCCESS) {
+    cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+    if (err != CL_SUCCESS) {
         return err;
-      }
-      
-      return CL_SUCCESS;
     }
-};
+
+    return CL_SUCCESS;
+}
 
 // Specialized GetInfoHelper for STRING_CLASS params
 template <typename Func>
-struct GetInfoHelper<Func, STRING_CLASS>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
 {
-    static cl_int get(Func f, cl_uint name, STRING_CLASS* param)
-    {
-        ::size_t required;
-        cl_int err = f(name, 0, NULL, &required);
-        if (err != CL_SUCCESS) {
-            return err;
-        }
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
 
-        char* value = (char*) alloca(required);
-        err = f(name, required, value, NULL);
-        if (err != CL_SUCCESS) {
-            return err;
-        }
+    char* value = (char*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
 
-        *param = value;
-        return CL_SUCCESS;
+    *param = value;
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
     }
-};
 
-#define __GET_INFO_HELPER_WITH_RETAIN(CPP_TYPE) \
-namespace detail { \
-template <typename Func> \
-struct GetInfoHelper<Func, CPP_TYPE> \
-{ \
-    static cl_int get(Func f, cl_uint name, CPP_TYPE* param) \
-    { \
-      cl_uint err = f(name, sizeof(CPP_TYPE), param, NULL); \
-      if (err != CL_SUCCESS) { \
-        return err; \
-      } \
-      \
-      return ReferenceHandler<CPP_TYPE::cl_type>::retain((*param)()); \
-    } \
-}; \
-} 
+    ::size_t* value = (::size_t*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    for(int i = 0; i < N; ++i) {
+        (*param)[i] = value[i];
+    }
+
+    return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
 
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simplify specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+    typename T::cl_type value;
+    cl_int err = f(name, sizeof(value), &value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    *param = value;
+    if (value != NULL)
+    {
+        err = param->retain();
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+    }
+    return CL_SUCCESS;
+}
 
 #define __PARAM_NAME_INFO_1_0(F) \
     F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
@@ -795,7 +1249,7 @@ struct GetInfoHelper<Func, CPP_TYPE> \
     F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
     F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
     F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
-    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_bitfield) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
     F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
     F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
     F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
@@ -804,7 +1258,7 @@ struct GetInfoHelper<Func, CPP_TYPE> \
     F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
     F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
     F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
-    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
     F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
     F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
     F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
@@ -872,7 +1326,7 @@ struct GetInfoHelper<Func, CPP_TYPE> \
     F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
     F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
     F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
-    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<cl_device_id>) \
+    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
     F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
     F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
     F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
@@ -910,6 +1364,7 @@ struct GetInfoHelper<Func, CPP_TYPE> \
     F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
     F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
     F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
     \
     F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
     F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
@@ -920,19 +1375,45 @@ struct GetInfoHelper<Func, CPP_TYPE> \
     F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
 #endif // CL_VERSION_1_1
 
+    
+#if defined(CL_VERSION_1_2)
+#define __PARAM_NAME_INFO_1_2(F) \
+    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
+    \
+    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
+    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+    \
+    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
+    \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>)  \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \
+    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
+#endif // #if defined(CL_VERSION_1_2)
+
 #if defined(USE_CL_DEVICE_FISSION)
 #define __PARAM_NAME_DEVICE_FISSION(F) \
     F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
-	F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-	F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
-	F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
-	F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
 #endif // USE_CL_DEVICE_FISSION
 
 template <typename enum_type, cl_int Name>
 struct param_traits {};
 
-#define __DECLARE_PARAM_TRAITS(token, param_name, T) \
+#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
 struct token;                                        \
 template<>                                           \
 struct param_traits<detail:: token,param_name>       \
@@ -941,16 +1422,78 @@ struct param_traits<detail:: token,param_name>       \
     typedef T param_type;                            \
 };
 
-__PARAM_NAME_INFO_1_0(__DECLARE_PARAM_TRAITS);
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
 #if defined(CL_VERSION_1_1)
-__PARAM_NAME_INFO_1_1(__DECLARE_PARAM_TRAITS);
+__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+#if defined(CL_VERSION_1_2)
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
 #endif // CL_VERSION_1_1
 
 #if defined(USE_CL_DEVICE_FISSION)
-__PARAM_NAME_DEVICE_FISSION(__DECLARE_PARAM_TRAITS);
+__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
 #endif // USE_CL_DEVICE_FISSION
 
-#undef __DECLARE_PARAM_TRAITS
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
 
 // Convenience functions
 
@@ -958,7 +1501,7 @@ template <typename Func, typename T>
 inline cl_int
 getInfo(Func f, cl_uint name, T* param)
 {
-    return GetInfoHelper<Func, T>::get(f, name, param);
+    return getInfoHelper(f, name, param, 0);
 }
 
 template <typename Func, typename Arg0>
@@ -984,8 +1527,7 @@ inline cl_int
 getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
 {
     GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
-    return GetInfoHelper<GetInfoFunctor0<Func, Arg0>, T>
-        ::get(f0, name, param);
+    return getInfoHelper(f0, name, param, 0);
 }
 
 template <typename Func, typename Arg0, typename Arg1, typename T>
@@ -993,34 +1535,68 @@ inline cl_int
 getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
 {
     GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
-    return GetInfoHelper<GetInfoFunctor1<Func, Arg0, Arg1>, T>
-        ::get(f0, name, param);
+    return getInfoHelper(f0, name, param, 0);
 }
 
 template<typename T>
 struct ReferenceHandler
 { };
 
+#if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    /**
+     * Retain the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int retain(cl_device_id device)
+    { return ::clRetainDevice(device); }
+    /**
+     * Retain the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int release(cl_device_id device)
+    { return ::clReleaseDevice(device); }
+};
+#else // #if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
 template <>
 struct ReferenceHandler<cl_device_id>
 {
     // cl_device_id does not have retain().
     static cl_int retain(cl_device_id)
-    { return CL_INVALID_DEVICE; }
+    { return CL_SUCCESS; }
     // cl_device_id does not have release().
     static cl_int release(cl_device_id)
-    { return CL_INVALID_DEVICE; }
+    { return CL_SUCCESS; }
 };
+#endif // #if defined(CL_VERSION_1_2)
 
 template <>
 struct ReferenceHandler<cl_platform_id>
 {
     // cl_platform_id does not have retain().
     static cl_int retain(cl_platform_id)
-    { return CL_INVALID_PLATFORM; }
+    { return CL_SUCCESS; }
     // cl_platform_id does not have release().
     static cl_int release(cl_platform_id)
-    { return CL_INVALID_PLATFORM; }
+    { return CL_SUCCESS; }
 };
 
 template <>
@@ -1086,6 +1662,58 @@ struct ReferenceHandler<cl_event>
     { return ::clReleaseEvent(event); }
 };
 
+
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const char *versionInfo)
+{
+    int highVersion = 0;
+    int lowVersion = 0;
+    int index = 7;
+    while(versionInfo[index] != '.' ) {
+        highVersion *= 10;
+        highVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    ++index;
+    while(versionInfo[index] != ' ' ) {
+        lowVersion *= 10;
+        lowVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    return (highVersion << 16) | lowVersion;
+}
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+    ::size_t size = 0;
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+    char *versionInfo = (char *) alloca(size);
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
+    return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+    cl_platform_id platform;
+    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    return getPlatformVersion(platform);
+}
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+    // The platform cannot be queried directly, so we first have to grab a
+    // device and obtain its context
+    ::size_t size = 0;
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+    if (size == 0)
+        return 0;
+    cl_device_id *devices = (cl_device_id *) alloca(size);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
+    return getDevicePlatformVersion(devices[0]);
+}
+#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
 template <typename T>
 class Wrapper
 {
@@ -1098,6 +1726,8 @@ protected:
 public:
     Wrapper() : object_(NULL) { }
 
+    Wrapper(const cl_type &obj) : object_(obj) { }
+
     ~Wrapper()
     {
         if (object_ != NULL) { release(); }
@@ -1106,14 +1736,21 @@ public:
     Wrapper(const Wrapper<cl_type>& rhs)
     {
         object_ = rhs.object_;
-        if (object_ != NULL) { retain(); }
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
     }
 
     Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
     {
-        if (object_ != NULL) { release(); }
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
         object_ = rhs.object_;
-        if (object_ != NULL) { retain(); }
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
         return *this;
     }
 
@@ -1122,6 +1759,8 @@ public:
     cl_type& operator ()() { return object_; }
 
 protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
 
     cl_int retain() const
     {
@@ -1134,39 +1773,120 @@ protected:
     }
 };
 
-#if defined(__CL_ENABLE_EXCEPTIONS)
-static inline cl_int errHandler (
-    cl_int err,
-    const char * errStr = NULL) throw(Error)
+template <>
+class Wrapper<cl_device_id>
 {
-    if (err != CL_SUCCESS) {
-        throw Error(err, errStr);
+public:
+    typedef cl_device_id cl_type;
+
+protected:
+    cl_type object_;
+    bool referenceCountable_;
+
+    static bool isReferenceCountable(cl_device_id device)
+    {
+        bool retVal = false;
+        if (device != NULL) {
+            int version = getDevicePlatformVersion(device);
+            if(version > ((1 << 16) + 1)) {
+                retVal = true;
+            }
+        }
+        return retVal;
     }
-    return err;
-}
-#else
-static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
-{
-    return err;
-}
-#endif // __CL_ENABLE_EXCEPTIONS
+
+public:
+    Wrapper() : object_(NULL), referenceCountable_(false) 
+    { 
+    }
+    
+    Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) 
+    {
+        referenceCountable_ = isReferenceCountable(obj); 
+    }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+    
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        referenceCountable_ = isReferenceCountable(object_); 
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs.object_;
+        referenceCountable_ = rhs.referenceCountable_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
+        referenceCountable_ = isReferenceCountable(object_); 
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::retain(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+
+    cl_int release() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::release(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+};
 
 } // namespace detail
 //! \endcond
 
 /*! \stuct ImageFormat
- * \brief ImageFormat interface fro cl_image_format.
+ *  \brief Adds constructors and member functions for cl_image_format.
+ *
+ *  \see cl_image_format
  */
 struct ImageFormat : public cl_image_format
 {
+    //! \brief Default constructor - performs no initialization.
     ImageFormat(){}
 
+    //! \brief Initializing constructor.
     ImageFormat(cl_channel_order order, cl_channel_type type)
     {
         image_channel_order = order;
         image_channel_data_type = type;
     }
 
+    //! \brief Assignment operator.
     ImageFormat& operator = (const ImageFormat& rhs)
     {
         if (this != &rhs) {
@@ -1177,18 +1897,41 @@ struct ImageFormat : public cl_image_format
     }
 };
 
-/*! \class Device
- * \brief Device interface for cl_device_id.
+/*! \brief Class interface for cl_device_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_device_id
  */
 class Device : public detail::Wrapper<cl_device_id>
 {
 public:
-    Device(cl_device_id device) { object_ = device; }
-
+    //! \brief Default constructor - initializes to NULL.
     Device() : detail::Wrapper<cl_type>() { }
 
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
     Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
 
+    /*! \brief Constructor from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
+
+    /*! \brief Returns the first device on the default context.
+     *
+     *  \see Context::getDefault()
+     */
+    static Device getDefault(cl_int * err = NULL);
+
+    /*! \brief Assignment operator from Device.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
     Device& operator = (const Device& rhs)
     {
         if (this != &rhs) {
@@ -1197,6 +1940,17 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device& operator = (const cl_device_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetDeviceInfo().
     template <typename T>
     cl_int getInfo(cl_device_info name, T* param) const
     {
@@ -1205,6 +1959,7 @@ public:
             __GET_DEVICE_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetDeviceInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_device_info, name>::param_type
     getInfo(cl_int* err = NULL) const
@@ -1218,23 +1973,53 @@ public:
         return param;
     }
 
+    /**
+     * CL 1.2 version
+     */
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clCreateSubDevicesEXT().
+    cl_int createSubDevices(
+        const cl_device_partition_property * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        cl_uint n = 0;
+        cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = clCreateSubDevices(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * CL 1.1 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_1)
 #if defined(USE_CL_DEVICE_FISSION)
-	cl_int createSubDevices(
-		const cl_device_partition_property_ext * properties,
-		VECTOR_CLASS<Device>* devices)
-	{
-		typedef CL_API_ENTRY cl_int 
-			( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
-				cl_device_id /*in_device*/,
+    cl_int createSubDevices(
+        const cl_device_partition_property_ext * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        typedef CL_API_ENTRY cl_int 
+            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+                cl_device_id /*in_device*/,
                 const cl_device_partition_property_ext * /* properties */,
                 cl_uint /*num_entries*/,
                 cl_device_id * /*out_devices*/,
                 cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
 
-		static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
-		__INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
 
-		cl_uint n = 0;
+        cl_uint n = 0;
         cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __CREATE_SUB_DEVICES);
@@ -1248,24 +2033,40 @@ public:
 
         devices->assign(&ids[0], &ids[n]);
         return CL_SUCCESS;
- 	}
-#endif
+    }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
 };
 
-/*! \class Platform
- *  \brief Platform interface.
+/*! \brief Class interface for cl_platform_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_platform_id
  */
 class Platform : public detail::Wrapper<cl_platform_id>
 {
 public:
-    static const Platform null();
-
-    Platform(cl_platform_id platform) { object_ = platform; }
-
+    //! \brief Default constructor - initializes to NULL.
     Platform() : detail::Wrapper<cl_type>()  { }
 
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
     Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
 
+    /*! \brief Constructor from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+    /*! \brief Assignment operator from Platform.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
     Platform& operator = (const Platform& rhs)
     {
         if (this != &rhs) {
@@ -1274,6 +2075,17 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform& operator = (const cl_platform_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetPlatformInfo().
     cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
     {
         return detail::errHandler(
@@ -1281,6 +2093,7 @@ public:
             __GET_PLATFORM_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetPlatformInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_platform_info, name>::param_type
     getInfo(cl_int* err = NULL) const
@@ -1294,11 +2107,18 @@ public:
         return param;
     }
 
+    /*! \brief Gets a list of devices for this platform.
+     * 
+     *  Wraps clGetDeviceIDs().
+     */
     cl_int getDevices(
         cl_device_type type,
         VECTOR_CLASS<Device>* devices) const
     {
         cl_uint n = 0;
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
         cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
@@ -1353,8 +2173,12 @@ public:
             cl_device_id * devices,
             cl_uint* num_devices);
 
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+
         static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
-        __INIT_CL_EXT_FCN_PTR(clGetDeviceIDsFromD3D10KHR);
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
 
         cl_uint n = 0;
         cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
@@ -1387,10 +2211,19 @@ public:
     }
 #endif
 
+    /*! \brief Gets a list of available platforms.
+     * 
+     *  Wraps clGetPlatformIDs().
+     */
     static cl_int get(
         VECTOR_CLASS<Platform>* platforms)
     {
         cl_uint n = 0;
+
+        if( platforms == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
         cl_int err = ::clGetPlatformIDs(0, NULL, &n);
         if (err != CL_SUCCESS) {
             return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
@@ -1406,41 +2239,189 @@ public:
         platforms->assign(&ids[0], &ids[n]);
         return CL_SUCCESS;
     }
-};
-
-static inline cl_int
-UnloadCompiler()
-{
-    return ::clUnloadCompiler();
-}
 
-class Context : public detail::Wrapper<cl_context>
-{
-public:
-    Context(
-        const VECTOR_CLASS<Device>& devices,
-        cl_context_properties* properties = NULL,
-        void (CL_CALLBACK * notifyFptr)(
-            const char *,
-            const void *,
-            ::size_t,
-            void *) = NULL,
-        void* data = NULL,
-        cl_int* err = NULL)
+    /*! \brief Gets the first available platform.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static cl_int get(
+        Platform * platform)
     {
-        cl_int error;
-        object_ = ::clCreateContext(
-            properties, (cl_uint) devices.size(),
-            (cl_device_id*) &devices.front(),
-            notifyFptr, data, &error);
+        cl_uint n = 0;
 
-        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
-        if (err != NULL) {
-            *err = error;
+        if( platform == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
         }
-    }
 
-    Context(
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        *platform = ids[0];
+        return CL_SUCCESS;
+    }
+
+    /*! \brief Gets the first available platform, returning it by value.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static Platform get(
+        cl_int * errResult = NULL)
+    {
+        Platform platform;
+        cl_uint n = 0;
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+            if (errResult != NULL) {
+                *errResult = err;
+            }
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        if (errResult != NULL) {
+            *errResult = err;
+        }
+        
+        return ids[0];
+    }
+
+    static Platform getDefault( 
+        cl_int *errResult = NULL )
+    {
+        return get(errResult);
+    }
+
+    
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clUnloadCompiler().
+    cl_int
+    unloadCompiler()
+    {
+        return ::clUnloadPlatformCompiler(object_);
+    }
+#endif // #if defined(CL_VERSION_1_2)
+}; // class Platform
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+    return ::clUnloadCompiler();
+}
+#endif // #if defined(CL_VERSION_1_1)
+
+/*! \brief Class interface for cl_context.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_context as the original.  For details, see
+ *        clRetainContext() and clReleaseContext().
+ *
+ *  \see cl_context
+ */
+class Context 
+    : public detail::Wrapper<cl_context>
+{
+private:
+    static volatile int default_initialized_;
+    static Context default_;
+    static volatile cl_int default_error_;
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseContext() on the value held by this instance.
+     */
+    ~Context() { }
+
+    /*! \brief Constructs a context including a list of specified devices.
+     *
+     *  Wraps clCreateContext().
+     */
+    Context(
+        const VECTOR_CLASS<Device>& devices,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        object_ = ::clCreateContext(
+            properties, (cl_uint) numDevices,
+            deviceIDs,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context(
+        const Device& device,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        cl_device_id deviceID = device();
+
+        object_ = ::clCreateContext(
+            properties, 1,
+            &deviceID,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Constructs a context including all or a subset of devices of a specified type.
+     *
+     *  Wraps clCreateContextFromType().
+     */
+    Context(
         cl_device_type type,
         cl_context_properties* properties = NULL,
         void (CL_CALLBACK * notifyFptr)(
@@ -1452,6 +2433,66 @@ public:
         cl_int* err = NULL)
     {
         cl_int error;
+
+#if !defined(__APPLE__) || !defined(__MACOS)
+        cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
+
+        if (properties == NULL) {
+            // Get a valid platform ID as we cannot send in a blank one
+            VECTOR_CLASS<Platform> platforms;
+            error = Platform::get(&platforms);
+            if (error != CL_SUCCESS) {
+                detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                if (err != NULL) {
+                    *err = error;
+                }
+                return;
+            }
+
+            // Check the platforms we found for a device of our specified type
+            cl_context_properties platform_id = 0;
+            for (unsigned int i = 0; i < platforms.size(); i++) {
+
+                VECTOR_CLASS<Device> devices;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+                try {
+#endif
+
+                    error = platforms[i].getDevices(type, &devices);
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+                } catch (Error) {}
+    // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type
+    // We do error checking next anyway, and can throw there if needed
+#endif
+
+                // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
+                if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
+                    detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                    if (err != NULL) {
+                        *err = error;
+                    }
+                }
+
+                if (devices.size() > 0) {
+                    platform_id = (cl_context_properties)platforms[i]();
+                    break;
+                }
+            }
+
+            if (platform_id == 0) {
+                detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                if (err != NULL) {
+                    *err = CL_DEVICE_NOT_FOUND;
+                }
+                return;
+            }
+
+            prop[1] = platform_id;
+            properties = &prop[0];
+        }
+#endif
         object_ = ::clCreateContextFromType(
             properties, type, notifyFptr, data, &error);
 
@@ -1461,10 +2502,79 @@ public:
         }
     }
 
+    /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+     *
+     *  \note All calls to this function return the same cl_context as the first.
+     */
+    static Context getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+        default_ = Context(
+            CL_DEVICE_TYPE_DEFAULT,
+            NULL,
+            NULL,
+            NULL,
+            &error);
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    //! \brief Default constructor - initializes to NULL.
     Context() : detail::Wrapper<cl_type>() { }
 
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainContext() on the parameter's cl_context.
+     */
     Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
 
+    /*! \brief Constructor from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_context
+     *  into the new Context object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Assignment operator from Context.
+     * 
+     *  This calls clRetainContext() on the parameter and clReleaseContext() on
+     *  the previous value held by this instance.
+     */
     Context& operator = (const Context& rhs)
     {
         if (this != &rhs) {
@@ -1473,6 +2583,18 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseContext() on the value previously held by this instance.
+     */
+    Context& operator = (const cl_context& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetContextInfo().
     template <typename T>
     cl_int getInfo(cl_context_info name, T* param) const
     {
@@ -1481,6 +2603,7 @@ public:
             __GET_CONTEXT_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetContextInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_context_info, name>::param_type
     getInfo(cl_int* err = NULL) const
@@ -1494,6 +2617,10 @@ public:
         return param;
     }
 
+    /*! \brief Gets a list of supported image formats.
+     *  
+     *  Wraps clGetSupportedImageFormats().
+     */
     cl_int getSupportedImageFormats(
         cl_mem_flags flags,
         cl_mem_object_type type,
@@ -1529,18 +2656,78 @@ public:
     }
 };
 
-__GET_INFO_HELPER_WITH_RETAIN(cl::Context)
+inline Device Device::getDefault(cl_int * err)
+{
+    cl_int error;
+    Device device;
+
+    Context context = Context::getDefault(&error);
+    detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+    if (error != CL_SUCCESS) {
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+    else {
+        device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+        if (err != NULL) {
+            *err = CL_SUCCESS;
+        }
+    }
+
+    return device;
+}
+
+
+#ifdef _WIN32
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif
 
-/*! \class Event
- * \brief Event interface for cl_event.
+/*! \brief Class interface for cl_event.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_event as the original.  For details, see
+ *        clRetainEvent() and clReleaseEvent().
+ *
+ *  \see cl_event
  */
 class Event : public detail::Wrapper<cl_event>
 {
 public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseEvent() on the value held by this instance.
+     */
+    ~Event() { }
+ 
+    //! \brief Default constructor - initializes to NULL.
     Event() : detail::Wrapper<cl_type>() { }
 
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainEvent() on the parameter's cl_event.
+     */
     Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
 
+    /*! \brief Constructor from cl_event - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_event
+     *  into the new Event object.
+     */
+    Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Assignment operator from cl_event - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseEvent() on the value previously held by this instance.
+     */
     Event& operator = (const Event& rhs)
     {
         if (this != &rhs) {
@@ -1549,6 +2736,18 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_event.
+     * 
+     *  This calls clRetainEvent() on the parameter and clReleaseEvent() on
+     *  the previous value held by this instance.
+     */
+    Event& operator = (const cl_event& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetEventInfo().
     template <typename T>
     cl_int getInfo(cl_event_info name, T* param) const
     {
@@ -1557,6 +2756,7 @@ public:
             __GET_EVENT_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetEventInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_event_info, name>::param_type
     getInfo(cl_int* err = NULL) const
@@ -1570,6 +2770,7 @@ public:
         return param;
     }
 
+    //! \brief Wrapper for clGetEventProfilingInfo().
     template <typename T>
     cl_int getProfilingInfo(cl_profiling_info name, T* param) const
     {
@@ -1578,6 +2779,7 @@ public:
             __GET_EVENT_PROFILE_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_profiling_info, name>::param_type
     getProfilingInfo(cl_int* err = NULL) const
@@ -1591,6 +2793,10 @@ public:
         return param;
     }
 
+    /*! \brief Blocks the calling thread until this event completes.
+     * 
+     *  Wraps clWaitForEvents().
+     */
     cl_int wait() const
     {
         return detail::errHandler(
@@ -1599,6 +2805,10 @@ public:
     }
 
 #if defined(CL_VERSION_1_1)
+    /*! \brief Registers a user callback function for a specific command execution status.
+     *
+     *  Wraps clSetEventCallback().
+     */
     cl_int setCallback(
         cl_int type,
         void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),		
@@ -1614,6 +2824,10 @@ public:
     }
 #endif
 
+    /*! \brief Blocks the calling thread until every event specified is complete.
+     * 
+     *  Wraps clWaitForEvents().
+     */
     static cl_int
     waitForEvents(const VECTOR_CLASS<Event>& events)
     {
@@ -1624,15 +2838,18 @@ public:
     }
 };
 
-__GET_INFO_HELPER_WITH_RETAIN(cl::Event)
-
 #if defined(CL_VERSION_1_1)
-/*! \class UserEvent
- * \brief User event interface for cl_event.
+/*! \brief Class interface for user events (a subset of cl_event's).
+ * 
+ *  See Event for details about copy semantics, etc.
  */
 class UserEvent : public Event
 {
 public:
+    /*! \brief Constructs a user event on a given context.
+     *
+     *  Wraps clCreateUserEvent().
+     */
     UserEvent(
         const Context& context,
         cl_int * err = NULL)
@@ -1648,10 +2865,13 @@ public:
         }
     }
 
+    //! \brief Default constructor - initializes to NULL.
     UserEvent() : Event() { }
 
+    //! \brief Copy constructor - performs shallow copy.
     UserEvent(const UserEvent& event) : Event(event) { }
 
+    //! \brief Assignment Operator - performs shallow copy.
     UserEvent& operator = (const UserEvent& rhs)
     {
         if (this != &rhs) {
@@ -1660,6 +2880,10 @@ public:
         return *this;
     }
 
+    /*! \brief Sets the execution status of a user event object.
+     *
+     *  Wraps clSetUserEventStatus().
+     */
     cl_int setStatus(cl_int status)
     {
         return detail::errHandler(
@@ -1669,6 +2893,10 @@ public:
 };
 #endif
 
+/*! \brief Blocks the calling thread until every event specified is complete.
+ * 
+ *  Wraps clWaitForEvents().
+ */
 inline static cl_int
 WaitForEvents(const VECTOR_CLASS<Event>& events)
 {
@@ -1678,16 +2906,45 @@ WaitForEvents(const VECTOR_CLASS<Event>& events)
         __WAIT_FOR_EVENTS_ERR);
 }
 
-/*! \class Memory
- * \brief Memory interface for cl_mem.
+/*! \brief Class interface for cl_mem.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_mem as the original.  For details, see
+ *        clRetainMemObject() and clReleaseMemObject().
+ *
+ *  \see cl_mem
  */
 class Memory : public detail::Wrapper<cl_mem>
 {
 public:
+ 
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseMemObject() on the value held by this instance.
+     */
+    ~Memory() {}
+
+    //! \brief Default constructor - initializes to NULL.
     Memory() : detail::Wrapper<cl_type>() { }
 
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainMemObject() on the parameter's cl_mem.
+     */
     Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
 
+    /*! \brief Constructor from cl_mem - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_mem
+     *  into the new Memory object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    /*! \brief Assignment operator from Memory.
+     * 
+     *  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
+     *  on the previous value held by this instance.
+     */
     Memory& operator = (const Memory& rhs)
     {
         if (this != &rhs) {
@@ -1696,6 +2953,18 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_mem - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseMemObject() on the value previously held by this instance.
+     */
+    Memory& operator = (const cl_mem& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetMemObjectInfo().
     template <typename T>
     cl_int getInfo(cl_mem_info name, T* param) const
     {
@@ -1704,6 +2973,7 @@ public:
             __GET_MEM_OBJECT_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_mem_info, name>::param_type
     getInfo(cl_int* err = NULL) const
@@ -1718,6 +2988,19 @@ public:
     }
 
 #if defined(CL_VERSION_1_1)
+    /*! \brief Registers a callback function to be called when the memory object
+     *         is no longer needed.
+     *
+     *  Wraps clSetMemObjectDestructorCallback().
+     *
+     *  Repeated calls to this function, for a given cl_mem value, will append
+     *  to the list of functions called (in reverse order) when memory object's
+     *  resources are freed and the memory object is deleted.
+     *
+     *  \note
+     *  The registered callbacks are associated with the underlying cl_mem
+     *  value - not the Memory class instance.
+     */
     cl_int setDestructorCallback(
         void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
         void * user_data = NULL)
@@ -1733,14 +3016,35 @@ public:
 
 };
 
-__GET_INFO_HELPER_WITH_RETAIN(cl::Memory)
+// Pre-declare copy functions
+class Buffer;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
 
-/*! \class Buffer
- * \brief Memory buffer interface.
+
+/*! \brief Class interface for Buffer Memory Objects.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
  */
 class Buffer : public Memory
 {
 public:
+
+    /*! \brief Constructs a Buffer in a specified context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     */
     Buffer(
         const Context& context,
         cl_mem_flags flags,
@@ -1757,24 +3061,140 @@ public:
         }
     }
 
-    Buffer() : Memory() { }
+    /*! \brief Constructs a Buffer in the default context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     *
+     *  \see Context::getDefault()
+     */
+    Buffer(
+         cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
 
-    Buffer(const Buffer& buffer) : Memory(buffer) { }
+        Context context = Context::getDefault(err);
 
-    Buffer& operator = (const Buffer& rhs)
-    {
-        if (this != &rhs) {
-            Memory::operator=(rhs);
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
         }
-        return *this;
     }
 
-#if defined(CL_VERSION_1_1)
-    Buffer createSubBuffer(
-        cl_mem_flags flags,
-        cl_buffer_create_type buffer_create_type,
-        const void * buffer_create_info,
-        cl_int * err = NULL)
+    /*!
+     * \brief Construct a Buffer from a host container via iterators.
+     * IteratorType must be random access.
+     * If useHostPtr is specified iterators must represent contiguous data.
+     */
+    template< typename IteratorType >
+    Buffer(
+        IteratorType startIterator,
+        IteratorType endIterator,
+        bool readOnly,
+        bool useHostPtr = false,
+        cl_int* err = NULL)
+    {
+        typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+        cl_int error;
+
+        cl_mem_flags flags = 0;
+        if( readOnly ) {
+            flags |= CL_MEM_READ_ONLY;
+        }
+        else {
+            flags |= CL_MEM_READ_WRITE;
+        }
+        if( useHostPtr ) {
+            flags |= CL_MEM_USE_HOST_PTR;
+        }
+        
+        ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+        Context context = Context::getDefault(err);
+
+        if( useHostPtr ) {
+            object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+        } else {
+            object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+        }
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        if( !useHostPtr ) {
+            error = cl::copy(startIterator, endIterator, *this);
+            detail::errHandler(error, __CREATE_BUFFER_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators using a specified context.
+     * IteratorType must be random access.
+     * If useHostPtr is specified iterators must represent contiguous data.
+     */
+    template< typename IteratorType >
+    Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
+        bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+    //! \brief Default constructor - initializes to NULL.
+    Buffer() : Memory() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
+
+    /*! \brief Assignment from Buffer - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const Buffer& rhs)
+    {
+        if (this != &rhs) {
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Creates a new buffer object from this.
+     *
+     *  Wraps clCreateSubBuffer().
+     */
+    Buffer createSubBuffer(
+        cl_mem_flags flags,
+        cl_buffer_create_type buffer_create_type,
+        const void * buffer_create_info,
+        cl_int * err = NULL)
     {
         Buffer result;
         cl_int error;
@@ -1791,11 +3211,19 @@ public:
         }
 
         return result;
-	}		
+    }		
 #endif
 };
 
 #if defined (USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ *  This is provided to facilitate interoperability with Direct3D.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
 class BufferD3D10 : public Buffer
 {
 public:
@@ -1803,6 +3231,11 @@ public:
     cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
     cl_int* errcode_ret);
 
+    /*! \brief Constructs a BufferD3D10, in a specified context, from a
+     *         given ID3D10Buffer.
+     *
+     *  Wraps clCreateFromD3D10BufferKHR().
+     */
     BufferD3D10(
         const Context& context,
         cl_mem_flags flags,
@@ -1810,7 +3243,20 @@ public:
         cl_int * err = NULL)
     {
         static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+
+#if defined(CL_VERSION_1_2)
+        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+        cl_platform platform = -1;
+        for( int i = 0; i < props.size(); ++i ) {
+            if( props[i] == CL_CONTEXT_PLATFORM ) {
+                platform = props[i+1];
+            }
+        }
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
+#endif
+#if defined(CL_VERSION_1_1)
         __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+#endif
 
         cl_int error;
         object_ = pfn_clCreateFromD3D10BufferKHR(
@@ -1825,10 +3271,25 @@ public:
         }
     }
 
+    //! \brief Default constructor - initializes to NULL.
     BufferD3D10() : Buffer() { }
 
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
 
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferD3D10& operator = (const BufferD3D10& rhs)
     {
         if (this != &rhs) {
@@ -1836,15 +3297,35 @@ public:
         }
         return *this;
     }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
 };
 #endif
 
-/*! \class BufferGL
- * \brief Memory buffer interface for GL interop.
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
 class BufferGL : public Buffer
 {
 public:
+    /*! \brief Constructs a BufferGL in a specified context, from a given
+     *         GL buffer.
+     *
+     *  Wraps clCreateFromGLBuffer().
+     */
     BufferGL(
         const Context& context,
         cl_mem_flags flags,
@@ -1864,10 +3345,25 @@ public:
         }
     }
 
+    //! \brief Default constructor - initializes to NULL.
     BufferGL() : Buffer() { }
 
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
 
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferGL& operator = (const BufferGL& rhs)
     {
         if (this != &rhs) {
@@ -1876,6 +3372,17 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
     cl_int getObjectInfo(
         cl_gl_object_type *type,
         GLuint * gl_object_name)
@@ -1886,12 +3393,22 @@ public:
     }
 };
 
-/*! \class BufferRenderGL
- * \brief Memory buffer interface for GL interop with renderbuffer.
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
 class BufferRenderGL : public Buffer
 {
 public:
+    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+     *         GL Renderbuffer.
+     *
+     *  Wraps clCreateFromGLRenderbuffer().
+     */
     BufferRenderGL(
         const Context& context,
         cl_mem_flags flags,
@@ -1905,16 +3422,31 @@ public:
             bufobj,
             &error);
 
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
         if (err != NULL) {
             *err = error;
         }
     }
 
+    //! \brief Default constructor - initializes to NULL.
     BufferRenderGL() : Buffer() { }
 
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
 
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     BufferRenderGL& operator = (const BufferRenderGL& rhs)
     {
         if (this != &rhs) {
@@ -1923,6 +3455,17 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
     cl_int getObjectInfo(
         cl_gl_object_type *type,
         GLuint * gl_object_name)
@@ -1933,16 +3476,34 @@ public:
     }
 };
 
-/*! \class Image
- * \brief Base class  interface for all images.
+/*! \brief C++ base class for Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
 class Image : public Memory
 {
 protected:
+    //! \brief Default constructor - initializes to NULL.
     Image() : Memory() { }
 
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     Image(const Image& image) : Memory(image) { }
 
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
+
+    /*! \brief Assignment from Image - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
     Image& operator = (const Image& rhs)
     {
         if (this != &rhs) {
@@ -1950,7 +3511,19 @@ protected:
         }
         return *this;
     }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
 public:
+    //! \brief Wrapper for clGetImageInfo().
     template <typename T>
     cl_int getImageInfo(cl_image_info name, T* param) const
     {
@@ -1958,7 +3531,8 @@ public:
             detail::getInfo(&::clGetImageInfo, object_, name, param),
             __GET_IMAGE_INFO_ERR);
     }
-
+    
+    //! \brief Wrapper for clGetImageInfo() that returns by value.
     template <cl_int name> typename
     detail::param_traits<detail::cl_image_info, name>::param_type
     getImageInfo(cl_int* err = NULL) const
@@ -1973,482 +3547,791 @@ public:
     }
 };
 
-/*! \class Image2D
- * \brief Image interface for 2D images.
+#if defined(CL_VERSION_1_2)
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
-class Image2D : public Image
+class Image1D : public Image
 {
 public:
-    Image2D(
+    /*! \brief Constructs a 1D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image1D(
         const Context& context,
         cl_mem_flags flags,
         ImageFormat format,
         ::size_t width,
-        ::size_t height,
-        ::size_t row_pitch = 0,
         void* host_ptr = NULL,
         cl_int* err = NULL)
     {
         cl_int error;
-        object_ = ::clCreateImage2D(
-            context(), flags,&format, width, height, row_pitch, host_ptr, &error);
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D,
+            width,
+            0, 0, 0, 0, 0, 0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
 
-        detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
         if (err != NULL) {
             *err = error;
         }
     }
 
-    Image2D() { }
+    //! \brief Default constructor - initializes to NULL.
+    Image1D() { }
 
-    Image2D(const Image2D& image2D) : Image(image2D) { }
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D(const Image1D& image1D) : Image(image1D) { }
 
-    Image2D& operator = (const Image2D& rhs)
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
+
+    /*! \brief Assignment from Image1D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const Image1D& rhs)
     {
         if (this != &rhs) {
             Image::operator=(rhs);
         }
         return *this;
     }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
 };
 
-/*! \class Image2DGL
- * \brief 2D image interface for GL interop.
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
  */
-class Image2DGL : public Image2D
+class Image1DBuffer : public Image
 {
 public:
-    Image2DGL(
+    Image1DBuffer(
         const Context& context,
         cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
+        ImageFormat format,
+        ::size_t width,
+        const Buffer &buffer,
+        cl_int* err = NULL)
     {
         cl_int error;
-        object_ = ::clCreateFromGLTexture2D(
-            context(),
-            flags,
-            target,
-            miplevel,
-            texobj,
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D_BUFFER,
+            width,
+            0, 0, 0, 0, 0, 0, 0,
+            buffer()
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            NULL, 
             &error);
 
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
         if (err != NULL) {
             *err = error;
         }
     }
 
-    Image2DGL() : Image2D() { }
+    Image1DBuffer() { }
 
-    Image2DGL(const Image2DGL& image) : Image2D(image) { }
+    Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
 
-    Image2DGL& operator = (const Image2DGL& rhs)
+    __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
+
+    Image1DBuffer& operator = (const Image1DBuffer& rhs)
     {
         if (this != &rhs) {
-            Image2D::operator=(rhs);
+            Image::operator=(rhs);
         }
         return *this;
     }
+
+    Image1DBuffer& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
 };
 
-/*! \class Image3D
- * \brief Image interface for 3D images.
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
  */
-class Image3D : public Image
+class Image1DArray : public Image
 {
 public:
-    Image3D(
+    Image1DArray(
         const Context& context,
         cl_mem_flags flags,
         ImageFormat format,
+        ::size_t arraySize,
         ::size_t width,
-        ::size_t height,
-        ::size_t depth,
-        ::size_t row_pitch = 0,
-        ::size_t slice_pitch = 0,
+        ::size_t rowPitch,
         void* host_ptr = NULL,
         cl_int* err = NULL)
     {
         cl_int error;
-        object_ = ::clCreateImage3D(
-            context(), flags, &format, width, height, depth, row_pitch,
-            slice_pitch, host_ptr, &error);
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D_ARRAY,
+            width,
+            0, 0,  // height, depth (unused)
+            arraySize,
+            rowPitch,
+            0, 0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
 
-        detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
         if (err != NULL) {
             *err = error;
         }
     }
 
-    Image3D() { }
+    Image1DArray() { }
 
-    Image3D(const Image3D& image3D) : Image(image3D) { }
+    Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
 
-    Image3D& operator = (const Image3D& rhs)
+    __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+    Image1DArray& operator = (const Image1DArray& rhs)
     {
         if (this != &rhs) {
             Image::operator=(rhs);
         }
         return *this;
     }
+
+    Image1DArray& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
 };
+#endif // #if defined(CL_VERSION_1_2)
 
-/*! \class Image2DGL
- * \brief 2D image interface for GL interop.
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
-class Image3DGL : public Image3D
+class Image2D : public Image
 {
 public:
-    Image3DGL(
+    /*! \brief Constructs a 2D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image2D(
         const Context& context,
         cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t row_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
     {
         cl_int error;
-        object_ = ::clCreateFromGLTexture3D(
-            context(),
-            flags,
-            target,
-            miplevel,
-            texobj,
-            &error);
+        bool useCreateImage;
 
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
         }
-    }
-
-    Image3DGL() : Image3D() { }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
 
-    Image3DGL(const Image3DGL& image) : Image3D(image) { }
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc =
+            {
+                CL_MEM_OBJECT_IMAGE2D,
+                width,
+                height,
+                0, 0, // depth, array size (unused)
+                row_pitch,
+                0, 0, 0, 0
+            };
+            object_ = ::clCreateImage(
+                context(),
+                flags,
+                &format,
+                &desc,
+                host_ptr,
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage2D(
+                context(), flags,&format, width, height, row_pitch, host_ptr, &error);
 
-    Image3DGL& operator = (const Image3DGL& rhs)
-    {
-        if (this != &rhs) {
-            Image3D::operator=(rhs);
+            detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
         }
-        return *this;
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image2D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D(const Image2D& image2D) : Image(image2D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
+
+    /*! \brief Assignment from Image2D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const Image2D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
     }
 };
 
-/*! \class Sampler
- * \brief Sampler interface for cl_sampler.
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
  */
-class Sampler : public detail::Wrapper<cl_sampler>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
 {
 public:
-    Sampler() { }
-
-    Sampler(
+    /*! \brief Constructs an Image2DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture2D().
+     */
+    Image2DGL(
         const Context& context,
-        cl_bool normalized_coords,
-        cl_addressing_mode addressing_mode,
-        cl_filter_mode filter_mode,
-        cl_int* err = NULL)
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
     {
         cl_int error;
-        object_ = ::clCreateSampler(
-            context(), 
-            normalized_coords,
-            addressing_mode,
-            filter_mode,
+        object_ = ::clCreateFromGLTexture2D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
             &error);
 
-        detail::errHandler(error, __CREATE_SAMPLER_ERR);
+        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
         if (err != NULL) {
             *err = error;
         }
+
     }
+    
+    //! \brief Default constructor - initializes to NULL.
+    Image2DGL() : Image2D() { }
 
-    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL(const Image2DGL& image) : Image2D(image) { }
 
-    Sampler& operator = (const Sampler& rhs)
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
+
+    /*! \brief Assignment from Image2DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const Image2DGL& rhs)
     {
         if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
+            Image2D::operator=(rhs);
         }
         return *this;
     }
 
-    template <typename T>
-    cl_int getInfo(cl_sampler_info name, T* param) const
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const cl_mem& rhs)
     {
-        return detail::errHandler(
-            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
-            __GET_SAMPLER_INFO_ERR);
+        Image2D::operator=(rhs);
+        return *this;
     }
+};
+#endif // #if !defined(CL_VERSION_1_2)
 
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_sampler_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+#if defined(CL_VERSION_1_2)
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+    Image2DArray(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t arraySize,
+        ::size_t width,
+        ::size_t height,
+        ::size_t rowPitch,
+        ::size_t slicePitch,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
     {
-        typename detail::param_traits<
-            detail::cl_sampler_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
+        cl_int error;
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE2D_ARRAY,
+            width,
+            height,
+            0,       // depth (unused)
+            arraySize,
+            rowPitch,
+            slicePitch,
+            0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
         if (err != NULL) {
-            *err = result;
+            *err = error;
         }
-        return param;
     }
-};
-
-__GET_INFO_HELPER_WITH_RETAIN(cl::Sampler)
-
-class Program;
-class CommandQueue;
-class Kernel;
 
-/*! \class NDRange
- * \brief NDRange interface
- */
-class NDRange
-{
-private:
-    size_t<3> sizes_;
-    cl_uint dimensions_;
+    Image2DArray() { }
 
-public:
-    NDRange()
-        : dimensions_(0)
-    { }
+    Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
 
-    NDRange(::size_t size0)
-        : dimensions_(1)
-    {
-        sizes_.push_back(size0);
-    }
+    __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
 
-    NDRange(::size_t size0, ::size_t size1)
-        : dimensions_(2)
+    Image2DArray& operator = (const Image2DArray& rhs)
     {
-        sizes_.push_back(size0);
-        sizes_.push_back(size1);
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
     }
 
-    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
-        : dimensions_(3)
+    Image2DArray& operator = (const cl_mem& rhs)
     {
-        sizes_.push_back(size0);
-        sizes_.push_back(size1);
-        sizes_.push_back(size2);
+        Image::operator=(rhs);
+        return *this;
     }
-
-    operator const ::size_t*() const { return (const ::size_t*) sizes_; }
-    ::size_t dimensions() const { return dimensions_; }
 };
+#endif // #if defined(CL_VERSION_1_2)
 
-static const NDRange NullRange;
-
-/*!
- * \struct LocalSpaceArg
- * \brief Local address raper for use with Kernel::setArg
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
  */
-struct LocalSpaceArg
-{
-    ::size_t size_;
-};
-
-namespace detail {
-
-template <typename T>
-struct KernelArgumentHandler
-{
-    static ::size_t size(const T&) { return sizeof(T); }
-    static T* ptr(T& value) { return &value; }
-};
-
-template <>
-struct KernelArgumentHandler<LocalSpaceArg>
+class Image3D : public Image
 {
-    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
-    static void* ptr(LocalSpaceArg&) { return NULL; }
-};
+public:
+    /*! \brief Constructs a 3D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image3D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t depth,
+        ::size_t row_pitch = 0,
+        ::size_t slice_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        bool useCreateImage;
 
-} 
-//! \endcond
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+        }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
 
-inline LocalSpaceArg
-__local(::size_t size)
-{
-    LocalSpaceArg ret = { size };
-    return ret;
-}
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc =
+            {
+                CL_MEM_OBJECT_IMAGE3D,
+                width,
+                height,
+                depth,
+                0,      // array size (unused)
+                row_pitch,
+                slice_pitch,
+                0, 0, 0
+            };
+            object_ = ::clCreateImage(
+                context(), 
+                flags, 
+                &format, 
+                &desc, 
+                host_ptr, 
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif  // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage3D(
+                context(), flags, &format, width, height, depth, row_pitch,
+                slice_pitch, host_ptr, &error);
 
-class KernelFunctor;
+            detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
 
-/*! \class Kernel
- * \brief Kernel interface that implements cl_kernel
- */
-class Kernel : public detail::Wrapper<cl_kernel>
-{
-public:
-    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+    //! \brief Default constructor - initializes to NULL.
+    Image3D() { }
 
-    Kernel() { }
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D(const Image3D& image3D) : Image(image3D) { }
 
-    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
 
-    Kernel& operator = (const Kernel& rhs)
+    /*! \brief Assignment from Image3D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const Image3D& rhs)
     {
         if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
+            Image::operator=(rhs);
         }
         return *this;
     }
 
-    template <typename T>
-    cl_int getInfo(cl_kernel_info name, T* param) const
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const cl_mem& rhs)
     {
-        return detail::errHandler(
-            detail::getInfo(&::clGetKernelInfo, object_, name, param),
-            __GET_KERNEL_INFO_ERR);
+        Image::operator=(rhs);
+        return *this;
     }
+};
 
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_kernel_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+    /*! \brief Constructs an Image3DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture3D().
+     */
+    Image3DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
     {
-        typename detail::param_traits<
-            detail::cl_kernel_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
+        cl_int error;
+        object_ = ::clCreateFromGLTexture3D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
         if (err != NULL) {
-            *err = result;
+            *err = error;
         }
-        return param;
     }
 
-    template <typename T>
-    cl_int getWorkGroupInfo(
-        const Device& device, cl_kernel_work_group_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(
-                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
-                __GET_KERNEL_WORK_GROUP_INFO_ERR);
-    }
+    //! \brief Default constructor - initializes to NULL.
+    Image3DGL() : Image3D() { }
 
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
-        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-        detail::cl_kernel_work_group_info, name>::param_type param;
-        cl_int result = getWorkGroupInfo(device, name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL(const Image3DGL& image) : Image3D(image) { }
 
-    template <typename T>
-    cl_int setArg(cl_uint index, T value)
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
+
+    /*! \brief Assignment from Image3DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const Image3DGL& rhs)
     {
-        return detail::errHandler(
-            ::clSetKernelArg(
-                object_,
-                index,
-                detail::KernelArgumentHandler<T>::size(value),
-                detail::KernelArgumentHandler<T>::ptr(value)),
-            __SET_KERNEL_ARGS_ERR);
+        if (this != &rhs) {
+            Image3D::operator=(rhs);
+        }
+        return *this;
     }
 
-    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const cl_mem& rhs)
     {
-        return detail::errHandler(
-            ::clSetKernelArg(object_, index, size, argPtr),
-            __SET_KERNEL_ARGS_ERR);
+        Image3D::operator=(rhs);
+        return *this;
     }
-
-    KernelFunctor bind(
-        const CommandQueue& queue,
-        const NDRange& offset,
-        const NDRange& global,
-        const NDRange& local);
-
-    KernelFunctor bind(
-        const CommandQueue& queue,
-        const NDRange& global,
-        const NDRange& local);
 };
-
-__GET_INFO_HELPER_WITH_RETAIN(cl::Kernel)
-
-/*! \class Program
- * \brief Program interface that implements cl_program.
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class ImageGL
+ * \brief general image interface for GL interop.
+ * We abstract the 2D and 3D GL images into a single instance here
+ * that wraps all GL sourced images on the grounds that setup information
+ * was performed by OpenCL anyway.
  */
-class Program : public detail::Wrapper<cl_program>
+class ImageGL : public Image
 {
 public:
-    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
-    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
-
-    Program(
+    ImageGL(
         const Context& context,
-        const Sources& sources,
-        cl_int* err = NULL)
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
     {
         cl_int error;
+        object_ = ::clCreateFromGLTexture(
+            context(), 
+            flags, 
+            target,
+            miplevel,
+            texobj,
+            &error);
 
-        const ::size_t n = (::size_t)sources.size();
-        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
-        const char** strings = (const char**) alloca(n * sizeof(const char*));
-
-        for (::size_t i = 0; i < n; ++i) {
-            strings[i] = sources[(int)i].first;
-            lengths[i] = sources[(int)i].second;
+        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+        if (err != NULL) {
+            *err = error;
         }
+    }
 
-        object_ = ::clCreateProgramWithSource(
-            context(), (cl_uint)n, strings, lengths, &error);
+    ImageGL() : Image() { }
 
-        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
-        if (err != NULL) {
-            *err = error;
+    ImageGL(const ImageGL& image) : Image(image) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
+
+    ImageGL& operator = (const ImageGL& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
         }
+        return *this;
     }
 
-    Program(
+    ImageGL& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_sampler.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_sampler as the original.  For details, see
+ *        clRetainSampler() and clReleaseSampler().
+ *
+ *  \see cl_sampler 
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseSampler() on the value held by this instance.
+     */
+    ~Sampler() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Sampler() { }
+
+    /*! \brief Constructs a Sampler in a specified context.
+     *
+     *  Wraps clCreateSampler().
+     */
+    Sampler(
         const Context& context,
-        const VECTOR_CLASS<Device>& devices,
-        const Binaries& binaries,
-        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
         cl_int* err = NULL)
     {
         cl_int error;
-        const ::size_t n = binaries.size();
-        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
-        const unsigned char** images = (const unsigned char**) alloca(n * sizeof(const void*));
-
-        for (::size_t i = 0; i < n; ++i) {
-            images[i] = (const unsigned char*)binaries[(int)i].first;
-            lengths[i] = binaries[(int)i].second;
-        }
-
-        object_ = ::clCreateProgramWithBinary(
-            context(), (cl_uint) devices.size(),
-            (cl_device_id*)&devices.front(),
-            lengths, images, binaryStatus != NULL
-               ? (cl_int*) &binaryStatus->front()
-               : NULL, &error);
+        object_ = ::clCreateSampler(
+            context(), 
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
 
-        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        detail::errHandler(error, __CREATE_SAMPLER_ERR);
         if (err != NULL) {
             *err = error;
         }
     }
 
-    Program() { }
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainSampler() on the parameter's cl_sampler.
+     */
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
 
-    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+    /*! \brief Constructor from cl_sampler - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_sampler
+     *  into the new Sampler object.
+     */
+    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
 
-    Program& operator = (const Program& rhs)
+    /*! \brief Assignment operator from Sampler.
+     * 
+     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
+     *  on the previous value held by this instance.
+     */
+    Sampler& operator = (const Sampler& rhs)
     {
         if (this != &rhs) {
             detail::Wrapper<cl_type>::operator=(rhs);
@@ -2456,130 +4339,188 @@ public:
         return *this;
     }
 
-    cl_int build(
-        const VECTOR_CLASS<Device>& devices,
-        const char* options = NULL,
-        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
-        void* data = NULL) const
+    /*! \brief Assignment operator from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseSampler() on the value previously held by this instance.
+     */
+    Sampler& operator = (const cl_sampler& rhs)
     {
-        return detail::errHandler(
-            ::clBuildProgram(
-                object_,
-                (cl_uint)
-                devices.size(),
-                (cl_device_id*)&devices.front(),
-                options,
-                notifyFptr,
-                data),
-                __BUILD_PROGRAM_ERR);
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
     }
 
+    //! \brief Wrapper for clGetSamplerInfo().
     template <typename T>
-    cl_int getInfo(cl_program_info name, T* param) const
+    cl_int getInfo(cl_sampler_info name, T* param) const
     {
         return detail::errHandler(
-            detail::getInfo(&::clGetProgramInfo, object_, name, param),
-            __GET_PROGRAM_INFO_ERR);
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
     }
 
+    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
     template <cl_int name> typename
-    detail::param_traits<detail::cl_program_info, name>::param_type
+    detail::param_traits<detail::cl_sampler_info, name>::param_type
     getInfo(cl_int* err = NULL) const
     {
         typename detail::param_traits<
-            detail::cl_program_info, name>::param_type param;
+            detail::cl_sampler_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
         if (err != NULL) {
             *err = result;
         }
         return param;
     }
+};
 
-    template <typename T>
-    cl_int getBuildInfo(
-        const Device& device, cl_program_build_info name, T* param) const
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! \brief Default constructor - resulting range has zero dimensions.
+    NDRange()
+        : dimensions_(0)
+    { }
+
+    //! \brief Constructs one-dimensional range.
+    NDRange(::size_t size0)
+        : dimensions_(1)
     {
-        return detail::errHandler(
-            detail::getInfo(
-                &::clGetProgramBuildInfo, object_, device(), name, param),
-                __GET_PROGRAM_BUILD_INFO_ERR);
+        sizes_[0] = size0;
     }
 
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_program_build_info, name>::param_type
-    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    //! \brief Constructs two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1)
+        : dimensions_(2)
     {
-        typename detail::param_traits<
-            detail::cl_program_build_info, name>::param_type param;
-        cl_int result = getBuildInfo(device, name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
+        sizes_[0] = size0;
+        sizes_[1] = size1;
     }
 
-    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    //! \brief Constructs three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+        : dimensions_(3)
     {
-        cl_uint numKernels;
-        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-        }
-
-        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
-        err = ::clCreateKernelsInProgram(
-            object_, numKernels, (cl_kernel*) value, NULL);
-        if (err != CL_SUCCESS) {
-            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
-        }
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+        sizes_[2] = size2;
+    }
 
-        kernels->assign(&value[0], &value[numKernels]);
-        return CL_SUCCESS;
+    /*! \brief Conversion operator to const ::size_t *.
+     *  
+     *  \returns a pointer to the size of the first dimension.
+     */
+    operator const ::size_t*() const { 
+        return (const ::size_t*) sizes_; 
     }
+
+    //! \brief Queries the number of dimensions in the range.
+    ::size_t dimensions() const { return dimensions_; }
 };
 
-__GET_INFO_HELPER_WITH_RETAIN(cl::Program)
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
 
-inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
 {
-    cl_int error;
-
-    object_ = ::clCreateKernel(program(), name, &error);
-    detail::errHandler(error, __CREATE_KERNEL_ERR);
-
-    if (err != NULL) {
-        *err = error;
-    }
+    ::size_t size_;
+};
 
-}
+namespace detail {
 
-/*! \class CommandQueue
- * \brief CommandQueue interface for cl_command_queue.
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+} 
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * Deprecated. Replaced with Local.
  */
-class CommandQueue : public detail::Wrapper<cl_command_queue>
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_kernel as the original.  For details, see
+ *        clRetainKernel() and clReleaseKernel().
+ *
+ *  \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
 {
 public:
-    CommandQueue(
-        const Context& context,
-        const Device& device,
-        cl_command_queue_properties properties = 0,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateCommandQueue(
-            context(), device(), properties, &error);
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
 
-        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseKernel() on the value held by this instance.
+     */
+    ~Kernel() { }
 
-    CommandQueue() { }
+    //! \brief Default constructor - initializes to NULL.
+    Kernel() { }
 
-    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainKernel() on the parameter's cl_kernel.
+     */
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
 
-    CommandQueue& operator = (const CommandQueue& rhs)
+    /*! \brief Constructor from cl_kernel - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_kernel
+     *  into the new Kernel object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Assignment operator from Kernel.
+     * 
+     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
+     *  on the previous value held by this instance.
+     */
+    Kernel& operator = (const Kernel& rhs)
     {
         if (this != &rhs) {
             detail::Wrapper<cl_type>::operator=(rhs);
@@ -2587,21 +4528,31 @@ public:
         return *this;
     }
 
+    /*! \brief Assignment operator from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseKernel() on the value previously held by this instance.
+     */
+    Kernel& operator = (const cl_kernel& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
     template <typename T>
-    cl_int getInfo(cl_command_queue_info name, T* param) const
+    cl_int getInfo(cl_kernel_info name, T* param) const
     {
         return detail::errHandler(
-            detail::getInfo(
-                &::clGetCommandQueueInfo, object_, name, param),
-                __GET_COMMAND_QUEUE_INFO_ERR);
+            detail::getInfo(&::clGetKernelInfo, object_, name, param),
+            __GET_KERNEL_INFO_ERR);
     }
 
     template <cl_int name> typename
-    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    detail::param_traits<detail::cl_kernel_info, name>::param_type
     getInfo(cl_int* err = NULL) const
     {
         typename detail::param_traits<
-            detail::cl_command_queue_info, name>::param_type param;
+            detail::cl_kernel_info, name>::param_type param;
         cl_int result = getInfo(name, &param);
         if (err != NULL) {
             *err = result;
@@ -2609,1329 +4560,7800 @@ public:
         return param;
     }
 
-    cl_int enqueueReadBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+#if defined(CL_VERSION_1_2)
+    template <typename T>
+    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
     {
         return detail::errHandler(
-            ::clEnqueueReadBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_READ_BUFFER_ERR);
+            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+            __GET_KERNEL_ARG_INFO_ERR);
     }
 
-    cl_int enqueueWriteBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        const void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+    getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
     {
-        return detail::errHandler(
-            ::clEnqueueWriteBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-                __ENQUEUE_WRITE_BUFFER_ERR);
+        typename detail::param_traits<
+            detail::cl_kernel_arg_info, name>::param_type param;
+        cl_int result = getArgInfo(argIndex, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
     }
+#endif // #if defined(CL_VERSION_1_2)
 
-    cl_int enqueueCopyBuffer(
-        const Buffer& src,
-        const Buffer& dst,
-        ::size_t src_offset,
-        ::size_t dst_offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
     {
         return detail::errHandler(
-            ::clEnqueueCopyBuffer(
-                object_, src(), dst(), src_offset, dst_offset, size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQEUE_COPY_BUFFER_ERR);
+            detail::getInfo(
+                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+                __GET_KERNEL_WORK_GROUP_INFO_ERR);
     }
 
-#if defined(CL_VERSION_1_1)
-    cl_int enqueueReadBufferRect(
-        const Buffer& buffer,
-        cl_bool blocking,
-        const size_t<3>& buffer_offset,
-        const size_t<3>& host_offset,
-        const size_t<3>& region,
-        ::size_t buffer_row_pitch,
-        ::size_t buffer_slice_pitch,
-        ::size_t host_row_pitch,
-        ::size_t host_slice_pitch,
-        void *ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
     {
-        return detail::errHandler(
-            ::clEnqueueReadBufferRect(
-                object_, 
-                buffer(), 
-                blocking, 
-                (const ::size_t *)buffer_offset,
-                (const ::size_t *)host_offset,
-                (const ::size_t *)region,
-                buffer_row_pitch,
-                buffer_slice_pitch,
-                host_row_pitch,
-                host_slice_pitch,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-                __ENQUEUE_READ_BUFFER_RECT_ERR);
+        typename detail::param_traits<
+        detail::cl_kernel_work_group_info, name>::param_type param;
+        cl_int result = getWorkGroupInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
     }
 
-
-    cl_int enqueueWriteBufferRect(
-        const Buffer& buffer,
-        cl_bool blocking,
-        const size_t<3>& buffer_offset,
-        const size_t<3>& host_offset,
-        const size_t<3>& region,
-        ::size_t buffer_row_pitch,
-        ::size_t buffer_slice_pitch,
-        ::size_t host_row_pitch,
-        ::size_t host_slice_pitch,
-        void *ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
     {
         return detail::errHandler(
-            ::clEnqueueWriteBufferRect(
-                object_, 
-                buffer(), 
-                blocking, 
-                (const ::size_t *)buffer_offset,
-                (const ::size_t *)host_offset,
-                (const ::size_t *)region,
-                buffer_row_pitch,
-                buffer_slice_pitch,
-                host_row_pitch,
-                host_slice_pitch,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+            ::clSetKernelArg(
+                object_,
+                index,
+                detail::KernelArgumentHandler<T>::size(value),
+                detail::KernelArgumentHandler<T>::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
     }
 
-    cl_int enqueueCopyBufferRect(
-        const Buffer& src,
-        const Buffer& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        ::size_t src_row_pitch,
-        ::size_t src_slice_pitch,
-        ::size_t dst_row_pitch,
-        ::size_t dst_slice_pitch,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
     {
         return detail::errHandler(
-            ::clEnqueueCopyBufferRect(
-                object_, 
-                src(), 
-                dst(), 
-                (const ::size_t *)src_origin, 
-                (const ::size_t *)dst_origin, 
-                (const ::size_t *)region,
-                src_row_pitch,
-                src_slice_pitch,
-                dst_row_pitch,
-                dst_slice_pitch,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQEUE_COPY_BUFFER_RECT_ERR);
+            ::clSetKernelArg(object_, index, size, argPtr),
+            __SET_KERNEL_ARGS_ERR);
     }
-#endif
+};
 
-    cl_int enqueueReadImage(
-        const Image& image,
-        cl_bool blocking,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t row_pitch,
-        ::size_t slice_pitch,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        return detail::errHandler(
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    Program(
+        const STRING_CLASS& source,
+		bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Construct a program object from a list of devices and a per-device list of binaries.
+     * \param context A valid OpenCL context in which to construct the program.
+     * \param devices A vector of OpenCL device objects for which the program will be created.
+     * \param binaries A vector of pairs of a pointer to a binary object and its length.
+     * \param binaryStatus An optional vector that on completion will be resized to
+     *   match the size of binaries and filled with values to specify if each binary
+     *   was successfully loaded.
+     *   Set to CL_SUCCESS if the binary was successfully loaded.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     *   CL_INVALID_CONTEXT if context is not a valid context.
+     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; 
+     *     or if any entry in binaries is NULL or has length 0.
+     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        
+        const ::size_t numDevices = devices.size();
+        
+        // Catch size mismatch early and return
+        if(binaries.size() != numDevices) {
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**));
+
+        for (::size_t i = 0; i < numDevices; ++i) {
+            images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[(int)i].second;
+        }
+
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        if(binaryStatus) {
+            binaryStatus->resize(numDevices);
+        }
+        
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) devices.size(),
+            deviceIDs,
+            lengths, images, binaryStatus != NULL
+               ? &binaryStatus->front()
+               : NULL, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    
+#if defined(CL_VERSION_1_2)
+    /**
+     * Create program using builtin kernels.
+     * \param kernelNames Semi-colon separated list of builtin kernel names
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const STRING_CLASS& kernelNames,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+        
+        object_ = ::clCreateProgramWithBuiltInKernels(
+            context(), 
+            (cl_uint) devices.size(),
+            deviceIDs,
+            kernelNames.c_str(), 
+            &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Program& operator = (const cl_program& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint)
+                devices.size(),
+                deviceIDs,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+    cl_int build(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+#if defined(CL_VERSION_1_2)
+	cl_int compile(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clCompileProgram(
+                object_,
+                0,
+                NULL,
+                options,
+				0,
+				NULL,
+				NULL,
+                notifyFptr,
+                data),
+                __COMPILE_PROGRAM_ERR);
+    }
+#endif
+
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+                __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_build_info, name>::param_type
+    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+    Program input1,
+    Program input2,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program programs[2] = { input1(), input2() };
+
+    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+    cl_program prog = ::clLinkProgram(
+        ctx(),
+        0,
+        NULL,
+        options,
+        2,
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    VECTOR_CLASS<Program> inputPrograms,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+    if (programs != NULL) {
+        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+          programs[i] = inputPrograms[i]();
+        }
+    } 
+
+    cl_program prog = ::clLinkProgram(
+        Context::getDefault()(),
+        0,
+        NULL,
+        options,
+        (cl_uint)inputPrograms.size(),
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+    VECTOR_CLASS<char *> binaries;
+    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) 
+    {
+        char *ptr = NULL;
+        if (*s != 0) 
+            ptr = new char[*s];
+        binaries.push_back(ptr);
+    }
+    
+    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+    if (err != NULL) {
+        *err = result;
+    }
+    return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name, &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != NULL) {
+        *err = error;
+    }
+
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+    static volatile int default_initialized_;
+    static CommandQueue default_;
+    static volatile cl_int default_error_;
+public:
+   CommandQueue(
+        cl_command_queue_properties properties,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            object_ = ::clCreateCommandQueue(
+                context(), device(), properties, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+    /*!
+    * \brief Constructs a CommandQueue for an implementation defined device in the given context
+    */
+    explicit CommandQueue(
+        const Context& context,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        VECTOR_CLASS<cl::Device> devices;
+        error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS)
+        {
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (err != NULL) {
+            *err = error;
+        }
+
+    }
+
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateCommandQueue(
+            context(), device(), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    static CommandQueue getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            default_ = CommandQueue(context, device, 0, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    CommandQueue() { }
+
+    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue& operator = (const CommandQueue& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    CommandQueue& operator = (const cl_command_queue& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_command_queue_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetCommandQueueInfo, object_, name, param),
+                __GET_COMMAND_QUEUE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_command_queue_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int enqueueReadBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBuffer(
+                object_, src(), dst(), src_offset, dst_offset, size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        ::size_t src_row_pitch,
+        ::size_t src_slice_pitch,
+        ::size_t dst_row_pitch,
+        ::size_t dst_slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferRect(
+                object_, 
+                src(), 
+                dst(), 
+                (const ::size_t *)src_origin, 
+                (const ::size_t *)dst_origin, 
+                (const ::size_t *)region,
+                src_row_pitch,
+                src_slice_pitch,
+                dst_row_pitch,
+                dst_slice_pitch,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill a buffer object with a pattern
+     * of a given size. The pattern is specified a as vector.
+     * \tparam PatternType The datatype of the pattern field. 
+     *     The pattern type must be an accepted OpenCL data type.
+     */
+    template<typename PatternType>
+    cl_int enqueueFillBuffer(
+        const Buffer& buffer,
+        PatternType pattern,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillBuffer(
+                object_, 
+                buffer(),
+                static_cast<void*>(&pattern),
+                sizeof(PatternType), 
+                offset, 
+                size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
             ::clEnqueueReadImage(
                 object_, image(), blocking, (const ::size_t *) origin,
                 (const ::size_t *) region, row_pitch, slice_pitch, ptr,
                 (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_READ_IMAGE_ERR);
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_WRITE_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImage(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *)dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA floating-point color value if
+     *     the image channel data type is not an unnormalized signed or
+     *     unsigned data type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_float4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA signed integer color value if
+     *     the image channel data type is an unnormalized signed integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_int4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA unsigned integer color value if
+     *     the image channel data type is an unnormalized unsigned integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_uint4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueCopyImageToBuffer(
+        const Image& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& region,
+        ::size_t dst_offset,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImageToBuffer(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *) region, dst_offset,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferToImage(
+        const Buffer& src,
+        const Image& dst,
+        ::size_t src_offset,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferToImage(
+                object_, src(), dst(), src_offset,
+                (const ::size_t *) dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapBuffer(
+            object_, buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+        return result;
+    }
+
+    void* enqueueMapImage(
+        const Image& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t * row_pitch,
+        ::size_t * slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapImage(
+            object_, buffer(), blocking, flags,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+        if (err != NULL) {
+              *err = error;
+        }
+        return result;
+    }
+
+    cl_int enqueueUnmapMemObject(
+        const Memory& memory,
+        void* mapped_ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueUnmapMemObject(
+                object_, memory(), mapped_ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueues a marker command which waits for either a list of events to complete, 
+     * or all previously enqueued commands to complete.
+     *
+     * Enqueues a marker command which waits for either a list of events to complete, 
+     * or if the list is empty it waits for all commands previously enqueued in command_queue 
+     * to complete before it completes. This command returns an event which can be waited on, 
+     * i.e. this event can be waited on to insure that all events either in the event_wait_list 
+     * or all previously enqueued commands, queued before this command to command_queue, 
+     * have completed.
+     */
+    cl_int enqueueMarkerWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueMarkerWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * A synchronization point that enqueues a barrier operation.
+     *
+     * Enqueues a barrier command which waits for either a list of events to complete, 
+     * or if the list is empty it waits for all commands previously enqueued in command_queue 
+     * to complete before it completes. This command blocks command execution, that is, any 
+     * following commands enqueued after it do not execute until it completes. This command 
+     * returns an event which can be waited on, i.e. this event can be waited on to insure that 
+     * all events either in the event_wait_list or all previously enqueued commands, queued 
+     * before this command to command_queue, have completed.
+     */
+    cl_int enqueueBarrierWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueBarrierWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+    
+    /**
+     * Enqueues a command to indicate with which device a set of memory objects
+     * should be associated.
+     */
+    cl_int enqueueMigrateMemObjects(
+        const VECTOR_CLASS<Memory> &memObjects,
+        cl_mem_migration_flags flags,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL
+        )
+    {
+        cl_event tmp;
+        
+        cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
+        for( int i = 0; i < (int)memObjects.size(); ++i ) {
+            localMemObjects[i] = memObjects[i]();
+        }
+
+
+        cl_int err = detail::errHandler(
+            ::clEnqueueMigrateMemObjects(
+                object_, 
+                (cl_uint)memObjects.size(), 
+                static_cast<const cl_mem*>(localMemObjects),
+                flags,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint) global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueNativeKernel(
+        void (CL_CALLBACK *userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
+            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                mems,
+                (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NATIVE_KERNEL);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED 
+    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*) event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint) events.size(),
+                (const cl_event*) &events.front()),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int enqueueAcquireGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueAcquireGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueReleaseGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+
+    cl_int enqueueAcquireD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+        
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
     }
+#endif
 
-    cl_int enqueueWriteImage(
-        const Image& image,
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS || (CL_VERSION_1_1 && !CL_VERSION_1_2)
+
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
+
+template< typename IteratorType >
+Buffer::Buffer(
+    const Context &context,
+    IteratorType startIterator,
+    IteratorType endIterator,
+    bool readOnly,
+    bool useHostPtr,
+    cl_int* err)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    cl_mem_flags flags = 0;
+    if( readOnly ) {
+        flags |= CL_MEM_READ_ONLY;
+    }
+    else {
+        flags |= CL_MEM_READ_WRITE;
+    }
+    if( useHostPtr ) {
+        flags |= CL_MEM_USE_HOST_PTR;
+    }
+    
+    ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+    if( useHostPtr ) {
+        object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+    } else {
+        object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+    }
+
+    detail::errHandler(error, __CREATE_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    if( !useHostPtr ) {
+        CommandQueue queue(context, 0, &error);
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        error = cl::copy(queue, startIterator, endIterator, *this);
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+}
+
+inline cl_int enqueueReadBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
         cl_bool blocking,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t row_pitch,
-        ::size_t slice_pitch,
-        void* ptr,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
         const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    void * result = ::clEnqueueMapBuffer(
+            queue(), buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+    return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+    const Memory& memory,
+    void* mapped_ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    cl_event tmp;
+    cl_int err = detail::errHandler(
+        ::clEnqueueUnmapMemObject(
+            queue(), memory(), mapped_ptr,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (event != NULL) ? &tmp : NULL),
+        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+    if (event != NULL && err == CL_SUCCESS)
+        *event = tmp;
+
+    return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    if (error != CL_SUCCESS)
+        return error;
+
+    return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    if (error != CL_SUCCESS)
+        return error;
+
+    return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+    
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+#if defined(_MSC_VER)
+    std::copy(
+        startIterator, 
+        endIterator, 
+        stdext::checked_array_iterator<DataType*>(
+            pointer, length));
+#else
+    std::copy(startIterator, endIterator, pointer);
+#endif
+    Event endEvent;
+    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+        
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+    std::copy(pointer, pointer + length, startIterator);
+    Event endEvent;
+    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    ::size_t src_row_pitch,
+    ::size_t src_slice_pitch,
+    ::size_t dst_row_pitch,
+    ::size_t dst_slice_pitch,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferRect(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        src_row_pitch,
+        src_slice_pitch,
+        dst_row_pitch,
+        dst_slice_pitch,
+        events, 
+        event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL) 
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyImage(
+    const Image& src,
+    const Image& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImage(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& region,
+    ::size_t dst_offset,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImageToBuffer(
+        src,
+        dst,
+        src_origin,
+        region,
+        dst_offset,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src,
+    const Image& dst,
+    ::size_t src_offset,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferToImage(
+        src,
+        dst,
+        src_offset,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+
+inline cl_int flush(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    } 
+
+
+    return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires C++11 std::function (the TR1 std::tr1::function is not supported)
+// Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+    CommandQueue queue_;
+    const NDRange offset_;
+    const NDRange global_;
+    const NDRange local_;
+    VECTOR_CLASS<Event> events_;
+
+    EnqueueArgs(NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
     {
-        return detail::errHandler(
-            ::clEnqueueWriteImage(
-                object_, image(), blocking, (const ::size_t *) origin,
-                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_WRITE_IMAGE_ERR);
+
+    }
+
+    EnqueueArgs(NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(Event e, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
+    {
+
     }
 
-    cl_int enqueueCopyImage(
-        const Image& src,
-        const Image& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
     {
-        return detail::errHandler(
-            ::clEnqueueCopyImage(
-                object_, src(), dst(), (const ::size_t *) src_origin,
-                (const ::size_t *)dst_origin, (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_COPY_IMAGE_ERR);
+
     }
 
-    cl_int enqueueCopyImageToBuffer(
-        const Image& src,
-        const Buffer& dst,
-        const size_t<3>& src_origin,
-        const size_t<3>& region,
-        ::size_t dst_offset,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
     {
-        return detail::errHandler(
-            ::clEnqueueCopyImageToBuffer(
-                object_, src(), dst(), (const ::size_t *) src_origin,
-                (const ::size_t *) region, dst_offset,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
     }
 
-    cl_int enqueueCopyBufferToImage(
-        const Buffer& src,
-        const Image& dst,
-        ::size_t src_offset,
-        const size_t<3>& dst_origin,
-        const size_t<3>& region,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
     {
-        return detail::errHandler(
-            ::clEnqueueCopyBufferToImage(
-                object_, src(), dst(), src_offset,
-                (const ::size_t *) dst_origin, (const ::size_t *) region,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
     }
 
-    void* enqueueMapBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        cl_map_flags flags,
-        ::size_t offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
+    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
     {
-        cl_int error;
-        void * result = ::clEnqueueMapBuffer(
-            object_, buffer(), blocking, flags, offset, size,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (cl_event*) event,
-            &error);
 
-        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-        return result;
     }
 
-    void* enqueueMapImage(
-        const Image& buffer,
-        cl_bool blocking,
-        cl_map_flags flags,
-        const size_t<3>& origin,
-        const size_t<3>& region,
-        ::size_t * row_pitch,
-        ::size_t * slice_pitch,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL,
-        cl_int* err = NULL) const
+    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
     {
-        cl_int error;
-        void * result = ::clEnqueueMapImage(
-            object_, buffer(), blocking, flags,
-            (const ::size_t *) origin, (const ::size_t *) region,
-            row_pitch, slice_pitch,
-            (events != NULL) ? (cl_uint) events->size() : 0,
-            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-            (cl_event*) event,
-            &error);
 
-        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
-        if (err != NULL) {
-              *err = error;
-        }
-        return result;
     }
 
-    cl_int enqueueUnmapMemObject(
-        const Memory& memory,
-        void* mapped_ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
     {
-        return detail::errHandler(
-            ::clEnqueueUnmapMemObject(
-                object_, memory(), mapped_ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+        events_.push_back(e);
     }
 
-    cl_int enqueueNDRangeKernel(
-        const Kernel& kernel,
-        const NDRange& offset,
-        const NDRange& global,
-        const NDRange& local,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
     {
-        return detail::errHandler(
-            ::clEnqueueNDRangeKernel(
-                object_, kernel(), (cl_uint) global.dimensions(),
-                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
-                (const ::size_t*) global,
-                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_NDRANGE_KERNEL_ERR);
+        events_.push_back(e);
     }
 
-    cl_int enqueueTask(
-        const Kernel& kernel,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
     {
-        return detail::errHandler(
-            ::clEnqueueTask(
-                object_, kernel(),
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_TASK_ERR);
+        events_.push_back(e);
     }
 
-    cl_int enqueueNativeKernel(
-        void (*userFptr)(void *),
-        std::pair<void*, ::size_t> args,
-        const VECTOR_CLASS<Memory>* mem_objects = NULL,
-        const VECTOR_CLASS<const void*>* mem_locs = NULL,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
     {
-        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
-            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
-            : NULL;
 
-        if (mems != NULL) {
-            for (unsigned int i = 0; i < mem_objects->size(); i++) {
-                mems[i] = ((*mem_objects)[i])();
-            }
-        }
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
 
-        return detail::errHandler(
-            ::clEnqueueNativeKernel(
-                object_, userFptr, args.first, args.second,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                mems,
-                (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_NATIVE_KERNEL);
     }
 
-    cl_int enqueueMarker(Event* event = NULL) const
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
     {
-        return detail::errHandler(
-            ::clEnqueueMarker(object_, (cl_event*) event),
-            __ENQUEUE_MARKER_ERR);
+
     }
+};
+
+namespace detail {
+
+class NullType {};
 
-    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const
+template<int index, typename T0>
+struct SetArg
+{
+    static void set (Kernel kernel, T0 arg)
     {
-        return detail::errHandler(
-            ::clEnqueueWaitForEvents(
-                object_,
-                (cl_uint) events.size(),
-                (const cl_event*) &events.front()),
-            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+        kernel.setArg(index, arg);
     }
+};  
 
-    cl_int enqueueAcquireGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-         return detail::errHandler(
-             ::clEnqueueAcquireGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (cl_event*) event),
-             __ENQUEUE_ACQUIRE_GL_ERR);
-     }
+template<int index>
+struct SetArg<index, NullType>
+{
+    static void set (Kernel, NullType)
+    { 
+    }
+};
 
-    cl_int enqueueReleaseGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-         return detail::errHandler(
-             ::clEnqueueReleaseGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (cl_event*) event),
-             __ENQUEUE_RELEASE_GL_ERR);
-     }
+template <
+   typename T0,   typename T1,   typename T2,   typename T3,
+   typename T4,   typename T5,   typename T6,   typename T7,
+   typename T8,   typename T9,   typename T10,   typename T11,
+   typename T12,   typename T13,   typename T14,   typename T15,
+   typename T16,   typename T17,   typename T18,   typename T19,
+   typename T20,   typename T21,   typename T22,   typename T23,
+   typename T24,   typename T25,   typename T26,   typename T27,
+   typename T28,   typename T29,   typename T30,   typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+    Kernel kernel_;
 
-#if defined (USE_DX_INTEROP)
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
+public:
+   KernelFunctorGlobal(
+        Kernel kernel) :
+            kernel_(kernel)
+    {}
 
-    cl_int enqueueAcquireD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-         static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
-         __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
-		
-         return detail::errHandler(
-             pfn_clEnqueueAcquireD3D10ObjectsKHR(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL) ? (cl_event*) &events->front() : NULL,
-                 (cl_event*) event),
-             __ENQUEUE_ACQUIRE_GL_ERR);
-     }
+   KernelFunctorGlobal(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+            kernel_(program, name.c_str(), err)
+    {}
+
+    Event operator() (
+        const EnqueueArgs& args,
+        T0 t0,
+        T1 t1 = NullType(),
+        T2 t2 = NullType(),
+        T3 t3 = NullType(),
+        T4 t4 = NullType(),
+        T5 t5 = NullType(),
+        T6 t6 = NullType(),
+        T7 t7 = NullType(),
+        T8 t8 = NullType(),
+        T9 t9 = NullType(),
+        T10 t10 = NullType(),
+        T11 t11 = NullType(),
+        T12 t12 = NullType(),
+        T13 t13 = NullType(),
+        T14 t14 = NullType(),
+        T15 t15 = NullType(),
+        T16 t16 = NullType(),
+        T17 t17 = NullType(),
+        T18 t18 = NullType(),
+        T19 t19 = NullType(),
+        T20 t20 = NullType(),
+        T21 t21 = NullType(),
+        T22 t22 = NullType(),
+        T23 t23 = NullType(),
+        T24 t24 = NullType(),
+        T25 t25 = NullType(),
+        T26 t26 = NullType(),
+        T27 t27 = NullType(),
+        T28 t28 = NullType(),
+        T29 t29 = NullType(),
+        T30 t30 = NullType(),
+        T31 t31 = NullType()
+        )
+    {
+        Event event;
+        SetArg<0, T0>::set(kernel_, t0);
+        SetArg<1, T1>::set(kernel_, t1);
+        SetArg<2, T2>::set(kernel_, t2);
+        SetArg<3, T3>::set(kernel_, t3);
+        SetArg<4, T4>::set(kernel_, t4);
+        SetArg<5, T5>::set(kernel_, t5);
+        SetArg<6, T6>::set(kernel_, t6);
+        SetArg<7, T7>::set(kernel_, t7);
+        SetArg<8, T8>::set(kernel_, t8);
+        SetArg<9, T9>::set(kernel_, t9);
+        SetArg<10, T10>::set(kernel_, t10);
+        SetArg<11, T11>::set(kernel_, t11);
+        SetArg<12, T12>::set(kernel_, t12);
+        SetArg<13, T13>::set(kernel_, t13);
+        SetArg<14, T14>::set(kernel_, t14);
+        SetArg<15, T15>::set(kernel_, t15);
+        SetArg<16, T16>::set(kernel_, t16);
+        SetArg<17, T17>::set(kernel_, t17);
+        SetArg<18, T18>::set(kernel_, t18);
+        SetArg<19, T19>::set(kernel_, t19);
+        SetArg<20, T20>::set(kernel_, t20);
+        SetArg<21, T21>::set(kernel_, t21);
+        SetArg<22, T22>::set(kernel_, t22);
+        SetArg<23, T23>::set(kernel_, t23);
+        SetArg<24, T24>::set(kernel_, t24);
+        SetArg<25, T25>::set(kernel_, t25);
+        SetArg<26, T26>::set(kernel_, t26);
+        SetArg<27, T27>::set(kernel_, t27);
+        SetArg<28, T28>::set(kernel_, t28);
+        SetArg<29, T29>::set(kernel_, t29);
+        SetArg<30, T30>::set(kernel_, t30);
+        SetArg<31, T31>::set(kernel_, t31);
+        
+        args.queue_.enqueueNDRangeKernel(
+            kernel_,
+            args.offset_,
+            args.global_,
+            args.local_,
+            &args.events_,
+            &event);
+        
+        return event;
+    }
+
+};
+
+//------------------------------------------------------------------------------------------------------
+
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30,
+	typename T31>
+struct functionImplementation_
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30,
+		T31 arg31)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30,
+			arg31);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	T30,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29);
+	}
 
-    cl_int enqueueReleaseD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
     {
-        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
-        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28);
+	}
 
-        return detail::errHandler(
-            pfn_clEnqueueReleaseD3D10ObjectsKHR(
-                object_,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL) ? (cl_event*) &events->front() : NULL,
-                (cl_event*) event),
-            __ENQUEUE_RELEASE_GL_ERR);
-    }
-#endif
 
-    cl_int enqueueBarrier() const
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
     {
-        return detail::errHandler(
-            ::clEnqueueBarrier(object_),
-            __ENQUEUE_BARRIER_ERR);
-    }
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27);
+	}
 
-    cl_int flush() const
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
     {
-        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
-    }
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26);
+	}
 
-    cl_int finish() const
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
     {
-        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
-    }
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25);
+	}
+
+
 };
 
-__GET_INFO_HELPER_WITH_RETAIN(cl::CommandQueue)
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24);
+	}
+
 
-/*! \class KernelFunctor
- * \brief Kernel functor interface
- *
- * \note Currently only functors of zero to ten arguments are supported. It
- * is straightforward to add more and a more general solution, similar to
- * Boost.Lambda could be followed if required in the future.
- */
-class KernelFunctor
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-private:
-    Kernel kernel_;
-    CommandQueue queue_;
-    NDRange offset_;
-    NDRange global_;
-    NDRange local_;
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23);
+	}
 
-    cl_int err_;
-public:
-    KernelFunctor() { }
 
-    KernelFunctor(
-        const Kernel& kernel,
-        const CommandQueue& queue,
-        const NDRange& offset,
-        const NDRange& global,
-        const NDRange& local) :
-            kernel_(kernel),
-            queue_(queue),
-            offset_(offset),
-            global_(global),
-            local_(local),
-            err_(CL_SUCCESS)
-    {}
+};
 
-    KernelFunctor& operator=(const KernelFunctor& rhs);
-
-    KernelFunctor(const KernelFunctor& rhs);
-
-    cl_int getError() { return err_; }
-
-    inline Event operator()(const VECTOR_CLASS<Event>* events = NULL);
-
-    template<typename A1>
-    inline Event operator()(
-        const A1& a1, 
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4, class A5>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4, class A5, class A6>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4,
-             class A5, class A6, class A7>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6, 
-        const A7& a7,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6, 
-        const A7& a7, 
-        const A8& a8,
-        const VECTOR_CLASS<Event>* events = NULL);
-
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6, 
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9,
-        const VECTOR_CLASS<Event>* events = NULL);
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10,
-        const VECTOR_CLASS<Event>* events = NULL);
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10,
-             class A11>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10, 
-        const A11& a11,
-        const VECTOR_CLASS<Event>* events = NULL);
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10,
-             class A11, class A12>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10, 
-        const A11& a11, 
-        const A12& a12,
-        const VECTOR_CLASS<Event>* events = NULL);
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10,
-             class A11, class A12, class A13>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10, 
-        const A11& a11, 
-        const A12& a12, 
-        const A13& a13,
-        const VECTOR_CLASS<Event>* events = NULL);
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10,
-             class A11, class A12, class A13, class A14>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10, 
-        const A11& a11,
-        const A12& a12, 
-        const A13& a13, 
-        const A14& a14,
-        const VECTOR_CLASS<Event>* events = NULL);
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    template<class A1, class A2, class A3, class A4, class A5,
-             class A6, class A7, class A8, class A9, class A10,
-             class A11, class A12, class A13, class A14, class A15>
-    inline Event operator()(
-        const A1& a1, 
-        const A2& a2, 
-        const A3& a3, 
-        const A4& a4, 
-        const A5& a5, 
-        const A6& a6,
-        const A7& a7, 
-        const A8& a8, 
-        const A9& a9, 
-        const A10& a10, 
-        const A11& a11,
-        const A12& a12, 
-        const A13& a13, 
-        const A14& a14, 
-        const A15& a15,
-        const VECTOR_CLASS<Event>* events = NULL);
-};
-
-inline KernelFunctor Kernel::bind(
-    const CommandQueue& queue,
-    const NDRange& offset,
-    const NDRange& global,
-    const NDRange& local)
-{
-    return KernelFunctor(*this,queue,offset,global,local);
-}
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17);
+	}
+
+
+};
 
-inline KernelFunctor Kernel::bind(
-    const CommandQueue& queue,
-    const NDRange& global,
-    const NDRange& local)
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    return KernelFunctor(*this,queue,NullRange,global,local);
-}
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16);
+	}
 
-inline KernelFunctor& KernelFunctor::operator=(const KernelFunctor& rhs)
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    if (this == &rhs) {
-        return *this;
-    }
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    kernel_ = rhs.kernel_;
-    queue_  = rhs.queue_;
-    offset_ = rhs.offset_;
-    global_ = rhs.global_;
-    local_  = rhs.local_;
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    return *this;
-}
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14);
+	}
+
 
-inline KernelFunctor::KernelFunctor(const KernelFunctor& rhs) :
-    kernel_(rhs.kernel_),
-    queue_(rhs.queue_),
-    offset_(rhs.offset_),
-    global_(rhs.global_),
-    local_(rhs.local_)
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-}
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13);
+	}
+
+
+};
 
-Event KernelFunctor::operator()(const VECTOR_CLASS<Event>* events)
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    Event event;
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12);
+	}
 
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
 
-    return event;
-}
+};
 
-template<typename A1>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const VECTOR_CLASS<Event>* events)
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    Event event;
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11);
+	}
 
-    kernel_.setArg(0,a1);
 
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10);
+	}
 
-    return event;
-}
 
-template<typename A1, typename A2>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2,
-    const VECTOR_CLASS<Event>* events)
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    Event event;
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9);
+	}
 
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
 
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8);
+	}
 
-    return event;
-}
 
-template<typename A1, typename A2, typename A3>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3,
-    const VECTOR_CLASS<Event>* events)
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
 {
-    Event event;
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7);
+	}
 
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
 
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
+};
 
-    return event;
-}
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6);
+	}
 
-template<typename A1, typename A2, typename A3, typename A4>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
 
-template<typename A1, typename A2, typename A3, typename A4, typename A5>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+};
 
-template<typename A1, typename A2, typename A3, typename A4, typename A5,
-         typename A6>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5);
+	}
 
-template<typename A1, typename A2, typename A3, typename A4,
-         typename A5, typename A6, typename A7>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6, 
-    const A7& a7,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
 
-template<typename A1, typename A2, typename A3, typename A4, typename A5,
-         typename A6, typename A7, typename A8>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6, 
-    const A7& a7, 
-    const A8& a8,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+};
 
-template<typename A1, typename A2, typename A3, typename A4, typename A5,
-         typename A6, typename A7, typename A8, typename A9>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5,
-    const A6& a6, 
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4);
+	}
 
-template<typename A1, typename A2, typename A3, typename A4, typename A5,
-         typename A6, typename A7, typename A8, typename A9, typename A10>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
 
-template<class A1, class A2, class A3, class A4, class A5,
-         class A6, class A7, class A8, class A9, class A10,
-         class A11>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10, 
-    const A11& a11,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-    kernel_.setArg(10,a11);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3);
+	}
 
-template<class A1, class A2, class A3, class A4, class A5,
-         class A6, class A7, class A8, class A9, class A10,
-         class A11, class A12>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10, 
-    const A11& a11, 
-    const A12& a12,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
-
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-    kernel_.setArg(10,a11);
-    kernel_.setArg(11,a12);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
 
-template<class A1, class A2, class A3, class A4, class A5,
-         class A6, class A7, class A8, class A9, class A10,
-         class A11, class A12, class A13>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10, 
-    const A11& a11, 
-    const A12& a12, 
-    const A13& a13,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-    kernel_.setArg(10,a11);
-    kernel_.setArg(11,a12);
-    kernel_.setArg(12,a13);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2);
+	}
+
+
+};
 
-template<class A1, class A2, class A3, class A4, class A5,
-         class A6, class A7, class A8, class A9, class A10,
-         class A11, class A12, class A13, class A14>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5, 
-    const A6& a6,
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10, 
-    const A11& a11,
-    const A12& a12, 
-    const A13& a13, 
-    const A14& a14,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
+template<
+	typename T0,
+	typename T1>
+struct functionImplementation_
+<	T0,
+	T1,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-    kernel_.setArg(10,a11);
-    kernel_.setArg(11,a12);
-    kernel_.setArg(12,a13);
-    kernel_.setArg(13,a14);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1);
+	}
+
+
+};
 
-template<class A1, class A2, class A3, class A4, class A5,
-         class A6, class A7, class A8, class A9, class A10,
-         class A11, class A12, class A13, class A14, class A15>
-Event KernelFunctor::operator()(
-    const A1& a1, 
-    const A2& a2, 
-    const A3& a3, 
-    const A4& a4, 
-    const A5& a5,
-    const A6& a6, 
-    const A7& a7, 
-    const A8& a8, 
-    const A9& a9, 
-    const A10& a10, 
-    const A11& a11,
-    const A12& a12, 
-    const A13& a13, 
-    const A14& a14, 
-    const A15& a15,
-    const VECTOR_CLASS<Event>* events)
-{
-    Event event;
+template<
+	typename T0>
+struct functionImplementation_
+<	T0,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
     
-    kernel_.setArg(0,a1);
-    kernel_.setArg(1,a2);
-    kernel_.setArg(2,a3);
-    kernel_.setArg(3,a4);
-    kernel_.setArg(4,a5);
-    kernel_.setArg(5,a6);
-    kernel_.setArg(6,a7);
-    kernel_.setArg(7,a8);
-    kernel_.setArg(8,a9);
-    kernel_.setArg(9,a10);
-    kernel_.setArg(10,a11);
-    kernel_.setArg(11,a12);
-    kernel_.setArg(12,a13);
-    kernel_.setArg(13,a14);
-    kernel_.setArg(14,a15);
-
-    err_ = queue_.enqueueNDRangeKernel(
-        kernel_,
-        offset_,
-        global_,
-        local_,
-        NULL,    // bgaster_fixme - do we want to allow wait event lists?
-        &event);
-
-    return event;
-}
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0);
+	}
+
+
+};
+
+
+
+
+
+} // namespace detail
+
+//----------------------------------------------------------------------------------------------
+
+template <
+   typename T0,   typename T1 = detail::NullType,   typename T2 = detail::NullType,
+   typename T3 = detail::NullType,   typename T4 = detail::NullType,
+   typename T5 = detail::NullType,   typename T6 = detail::NullType,
+   typename T7 = detail::NullType,   typename T8 = detail::NullType,
+   typename T9 = detail::NullType,   typename T10 = detail::NullType,
+   typename T11 = detail::NullType,   typename T12 = detail::NullType,
+   typename T13 = detail::NullType,   typename T14 = detail::NullType,
+   typename T15 = detail::NullType,   typename T16 = detail::NullType,
+   typename T17 = detail::NullType,   typename T18 = detail::NullType,
+   typename T19 = detail::NullType,   typename T20 = detail::NullType,
+   typename T21 = detail::NullType,   typename T22 = detail::NullType,
+   typename T23 = detail::NullType,   typename T24 = detail::NullType,
+   typename T25 = detail::NullType,   typename T26 = detail::NullType,
+   typename T27 = detail::NullType,   typename T28 = detail::NullType,
+   typename T29 = detail::NullType,   typename T30 = detail::NullType,
+   typename T31 = detail::NullType
+>
+struct make_kernel :
+    public detail::functionImplementation_<
+               T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    >
+{
+public:
+	typedef detail::KernelFunctorGlobal<             
+		       T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    > FunctorType;
+
+    make_kernel(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(program, name, err)) 
+    {}
+
+    make_kernel(
+        const Kernel kernel) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(kernel)) 
+    {}    
+};
+
+
+//----------------------------------------------------------------------------------------------------------------------
 
 #undef __ERR_STR
 #if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
@@ -3945,11 +12367,13 @@ Event KernelFunctor::operator()(
 #undef __GET_IMAGE_INFO_ERR
 #undef __GET_SAMPLER_INFO_ERR
 #undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
 #undef __GET_KERNEL_WORK_GROUP_INFO_ERR
 #undef __GET_PROGRAM_INFO_ERR
 #undef __GET_PROGRAM_BUILD_INFO_ERR
 #undef __GET_COMMAND_QUEUE_INFO_ERR
 
+#undef __CREATE_CONTEXT_ERR
 #undef __CREATE_CONTEXT_FROM_TYPE_ERR
 #undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
 
@@ -3963,6 +12387,7 @@ Event KernelFunctor::operator()(
 #undef __CREATE_USER_EVENT_ERR
 #undef __SET_USER_EVENT_STATUS_ERR
 #undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
 
 #undef __WAIT_FOR_EVENTS_ERR
 
@@ -3970,6 +12395,7 @@ Event KernelFunctor::operator()(
 #undef __SET_KERNEL_ARGS_ERR
 #undef __CREATE_PROGRAM_WITH_SOURCE_ERR
 #undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
 #undef __BUILD_PROGRAM_ERR
 #undef __CREATE_KERNELS_IN_PROGRAM_ERR
 
@@ -3993,19 +12419,34 @@ Event KernelFunctor::operator()(
 #undef __ENQUEUE_TASK_ERR
 #undef __ENQUEUE_NATIVE_KERNEL
 
+#undef __CL_EXPLICIT_CONSTRUCTORS
+
 #undef __UNLOAD_COMPILER_ERR
 #endif //__CL_USER_OVERRIDE_ERROR_STRINGS
 
-#undef __GET_INFO_HELPER_WITH_RETAIN
+#undef __CL_FUNCTION_TYPE
 
 // Extensions
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_VERSION_1_1)
 #undef __INIT_CL_EXT_FCN_PTR
+#endif // #if defined(CL_VERSION_1_1)
 #undef __CREATE_SUB_DEVICES
 
 #if defined(USE_CL_DEVICE_FISSION)
 #undef __PARAM_NAME_DEVICE_FISSION
 #endif // USE_CL_DEVICE_FISSION
 
+#undef __DEFAULT_NOT_INITIALIZED 
+#undef __DEFAULT_BEING_INITIALIZED 
+#undef __DEFAULT_INITIALIZED
+
 } // namespace cl
 
+#ifdef _WIN32
+#pragma pop_macro("max")
+#endif // _WIN32
+
 #endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
index ea9ab99..b6c90b3 100644
--- a/include/CL/cl_d3d10.h
+++ b/include/CL/cl_d3d10.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -43,31 +43,31 @@ typedef cl_uint cl_d3d10_device_set_khr;
 
 /******************************************************************************/
 
-// Error Codes
+/* Error Codes */
 #define CL_INVALID_D3D10_DEVICE_KHR                  -1002
 #define CL_INVALID_D3D10_RESOURCE_KHR                -1003
 #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
 #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
 
-// cl_d3d10_device_source_nv
+/* cl_d3d10_device_source_nv */
 #define CL_D3D10_DEVICE_KHR                          0x4010
 #define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
 
-// cl_d3d10_device_set_nv
+/* cl_d3d10_device_set_nv */
 #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
 #define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
 
-// cl_context_info
+/* cl_context_info */
 #define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
 #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
 
-// cl_mem_info
+/* cl_mem_info */
 #define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
 
-// cl_image_info
+/* cl_image_info */
 #define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
 
-// cl_command_type
+/* cl_command_type */
 #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
 #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
 
@@ -113,7 +113,7 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
     cl_command_queue command_queue,
     cl_uint          num_objects,
-    cl_mem *         mem_objects,
+    const cl_mem *   mem_objects,
     cl_uint          num_events_in_wait_list,
     const cl_event * event_wait_list,
     cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
@@ -122,5 +122,5 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
 }
 #endif
 
-#endif  // __OPENCL_CL_D3D10_H
+#endif  /* __OPENCL_CL_D3D10_H */
 
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d11.h
similarity index 55%
copy from include/CL/cl_d3d10.h
copy to include/CL/cl_d3d11.h
index ea9ab99..2e0a63f 100644
--- a/include/CL/cl_d3d10.h
+++ b/include/CL/cl_d3d11.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -23,10 +23,10 @@
 
 /* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
 
-#ifndef __OPENCL_CL_D3D10_H
-#define __OPENCL_CL_D3D10_H
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
 
-#include <d3d10.h>
+#include <d3d11.h>
 #include <CL/cl.h>
 #include <CL/cl_platform.h>
 
@@ -35,92 +35,92 @@ extern "C" {
 #endif
 
 /******************************************************************************
- * cl_khr_d3d10_sharing                                                       */
-#define cl_khr_d3d10_sharing 1
+ * cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
 
-typedef cl_uint cl_d3d10_device_source_khr;
-typedef cl_uint cl_d3d10_device_set_khr;
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
 
 /******************************************************************************/
 
-// Error Codes
-#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
-#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
-#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
-#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
 
-// cl_d3d10_device_source_nv
-#define CL_D3D10_DEVICE_KHR                          0x4010
-#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+/* cl_d3d11_device_source */
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
 
-// cl_d3d10_device_set_nv
-#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
-#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+/* cl_d3d11_device_set */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
 
-// cl_context_info
-#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
-#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
 
-// cl_mem_info
-#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
 
-// cl_image_info
-#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
 
-// cl_command_type
-#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
-#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
 
 /******************************************************************************/
 
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
     cl_platform_id             platform,
-    cl_d3d10_device_source_khr d3d_device_source,
+    cl_d3d11_device_source_khr d3d_device_source,
     void *                     d3d_object,
-    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_d3d11_device_set_khr    d3d_device_set,
     cl_uint                    num_entries,
     cl_device_id *             devices,
-    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
 
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
     cl_context     context,
     cl_mem_flags   flags,
-    ID3D10Buffer * resource,
-    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
     cl_context        context,
     cl_mem_flags      flags,
-    ID3D10Texture2D * resource,
+    ID3D11Texture2D * resource,
     UINT              subresource,
-    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
     cl_context        context,
     cl_mem_flags      flags,
-    ID3D10Texture3D * resource,
+    ID3D11Texture3D * resource,
     UINT              subresource,
-    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
 
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
     cl_command_queue command_queue,
     cl_uint          num_objects,
     const cl_mem *   mem_objects,
     cl_uint          num_events_in_wait_list,
     const cl_event * event_wait_list,
-    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
     cl_command_queue command_queue,
     cl_uint          num_objects,
-    cl_mem *         mem_objects,
+    const cl_mem *   mem_objects,
     cl_uint          num_events_in_wait_list,
     const cl_event * event_wait_list,
-    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // __OPENCL_CL_D3D10_H
+#endif  /* __OPENCL_CL_D3D11_H */
 
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000..23f1631
--- /dev/null
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+    
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+
+/* cl_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+
+/* cl_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
new file mode 100644
index 0000000..93e6c9c
--- /dev/null
+++ b/include/CL/cl_egl.h
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#endif  
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  /* context */,
+                        CLeglDisplayKHR             /* egldisplay */,
+                        CLeglImageKHR               /* eglimage */,
+                        cl_mem_flags                /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+	cl_context                  context,
+	CLeglDisplayKHR             egldisplay,
+	CLeglImageKHR               eglimage,
+	cl_mem_flags                flags,
+	const cl_egl_image_properties_khr * properties,
+	cl_int *                    errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context      /* context */,
+                            CLeglSyncKHR    /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+	cl_context      context,
+	CLeglSyncKHR    sync,
+	CLeglDisplayKHR display,
+	cl_int *        errcode_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 4e92c7e..710bea8 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -34,15 +34,12 @@ extern "C" {
 #endif
 
 #ifdef __APPLE__
-	#include <OpenCL/cl.h>
+        #include <OpenCL/cl.h>
     #include <AvailabilityMacros.h>
 #else
-	#include <CL/cl.h>
+        #include <CL/cl.h>
 #endif
 
-/* cl_khr_fp64 extension - no extension #define since it has no functions  */
-#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
-
 /* cl_khr_fp16 extension - no extension #define since it has no functions  */
 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
 
@@ -64,7 +61,7 @@ extern "C" {
  * before using.
  */
 #define cl_APPLE_SetMemObjectDestructor 1
-cl_int	CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
                                         void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
                                         void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
 
@@ -118,6 +115,52 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
     cl_uint *        /* num_platforms */);
 
 
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
+    
+/*************************************
+ * cl_khr_initalize_memory extension *
+ *************************************/
+    
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+    
+    
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+    
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
+#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+    
+    
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a 
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
 /******************************************
 * cl_nv_device_attribute_query extension *
 ******************************************/
@@ -130,12 +173,16 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
 
-
 /*********************************
 * cl_amd_device_attribute_query *
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
 
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
 
 #ifdef CL_VERSION_1_1
    /***********************************
@@ -201,7 +248,63 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
     #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
     #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
 
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+            
+	/* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* ION file descriptor */
+    int                  ion_filedesc;
+            
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
 
+} cl_mem_ion_host_ptr;
 
 #endif /* CL_VERSION_1_1 */
 
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
index 3b4fe06..e52c1b6 100644
--- a/include/CL/cl_gl.h
+++ b/include/CL/cl_gl.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -21,20 +21,11 @@
  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
  **********************************************************************************/
 
-/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
-
-/*
- * cl_gl.h contains Khronos-approved (KHR) OpenCL extensions which have
- * OpenGL dependencies. The application is responsible for #including
- * OpenGL or OpenGL ES headers before #including cl_gl.h.
- */
-
 #ifndef __OPENCL_CL_GL_H
 #define __OPENCL_CL_GL_H
 
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
-#include <OpenGL/CGLDevice.h>
 #else
 #include <CL/cl.h>
 #endif	
@@ -48,15 +39,21 @@ typedef cl_uint     cl_gl_texture_info;
 typedef cl_uint     cl_gl_platform_info;
 typedef struct __GLsync *cl_GLsync;
 
-/* cl_gl_object_type */
-#define CL_GL_OBJECT_BUFFER             0x2000
-#define CL_GL_OBJECT_TEXTURE2D          0x2001
-#define CL_GL_OBJECT_TEXTURE3D          0x2002
-#define CL_GL_OBJECT_RENDERBUFFER       0x2003
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
 
-/* cl_gl_texture_info */
-#define CL_GL_TEXTURE_TARGET            0x2004
-#define CL_GL_MIPMAP_LEVEL              0x2005
 
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromGLBuffer(cl_context     /* context */,
@@ -65,21 +62,13 @@ clCreateFromGLBuffer(cl_context     /* context */,
                      int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
 
 extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLTexture2D(cl_context      /* context */,
-                        cl_mem_flags    /* flags */,
-                        cl_GLenum       /* target */,
-                        cl_GLint        /* miplevel */,
-                        cl_GLuint       /* texture */,
-                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
-extern CL_API_ENTRY cl_mem CL_API_CALL
-clCreateFromGLTexture3D(cl_context      /* context */,
-                        cl_mem_flags    /* flags */,
-                        cl_GLenum       /* target */,
-                        cl_GLint        /* miplevel */,
-                        cl_GLuint       /* texture */,
-                        cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
-
+clCreateFromGLTexture(cl_context      /* context */,
+                      cl_mem_flags    /* flags */,
+                      cl_GLenum       /* target */,
+                      cl_GLint        /* miplevel */,
+                      cl_GLuint       /* texture */,
+                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+    
 extern CL_API_ENTRY cl_mem CL_API_CALL
 clCreateFromGLRenderbuffer(cl_context   /* context */,
                            cl_mem_flags /* flags */,
@@ -89,7 +78,7 @@ clCreateFromGLRenderbuffer(cl_context   /* context */,
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLObjectInfo(cl_mem                /* memobj */,
                   cl_gl_object_type *   /* gl_object_type */,
-                  cl_GLuint *              /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
                   
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLTextureInfo(cl_mem               /* memobj */,
@@ -114,33 +103,51 @@ clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
                           const cl_event *      /* event_wait_list */,
                           cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
 
-/* cl_khr_gl_sharing extension  */
 
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+/* cl_khr_gl_sharing extension  */
+    
 #define cl_khr_gl_sharing 1
-
+    
 typedef cl_uint     cl_gl_context_info;
-
+    
 /* Additional Error Codes  */
 #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
-
+    
 /* cl_gl_context_info  */
 #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
 #define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
-
+    
 /* Additional cl_context_properties  */
 #define CL_GL_CONTEXT_KHR                       0x2008
 #define CL_EGL_DISPLAY_KHR                      0x2009
 #define CL_GLX_DISPLAY_KHR                      0x200A
 #define CL_WGL_HDC_KHR                          0x200B
 #define CL_CGL_SHAREGROUP_KHR                   0x200C
-
+    
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
                       cl_gl_context_info            /* param_name */,
                       size_t                        /* param_value_size */,
                       void *                        /* param_value */,
                       size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
+    
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
     const cl_context_properties * properties,
     cl_gl_context_info            param_name,
@@ -152,4 +159,4 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
 }
 #endif
 
-#endif  /* __OPENCL_CL_GL_H  */
+#endif  /* __OPENCL_CL_GL_H */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
index 26e4782..77d5353 100644
--- a/include/CL/cl_gl_ext.h
+++ b/include/CL/cl_gl_ext.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -41,7 +41,7 @@ extern "C" {
 
 /*
  * For each extension, follow this template
- * /* cl_VEN_extname extension  */
+ *  cl_VEN_extname extension  */
 /* #define cl_VEN_extname 1
  * ... define new types, if any
  * ... define new tokens, if any
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 3fd73da..f2fe9d4 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -122,6 +122,17 @@ typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromLibvaIntel_fn)(
                              const cl_libva_image * /* info */,
                              cl_int *               /* errcode_ret */);
 
+/* Create buffer from libva's buffer object */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context   /* context */,
+                      cl_mem       /* Memory Obejct */,
+                      int*         /* returned fd */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
+                             cl_context   /* context */,
+                             cl_mem       /* Memory Obejct */,
+                             int*         /* returned fd */);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
index 043b048..7f6f5e8 100644
--- a/include/CL/cl_platform.h
+++ b/include/CL/cl_platform.h
@@ -1,5 +1,5 @@
 /**********************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
@@ -46,19 +46,75 @@ extern "C" {
 #endif
 
 #ifdef __APPLE__
-    #define CL_EXTENSION_WEAK_LINK                  __attribute__((weak_import))       
-    #define CL_API_SUFFIX__VERSION_1_0              AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
-    #define CL_EXT_SUFFIX__VERSION_1_0              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
-    #define CL_API_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
-    #define CL_EXT_SUFFIX__VERSION_1_1              CL_EXTENSION_WEAK_LINK
-    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
+    #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1                  AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_1                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED       CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+    
+    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+    #else
+        #warning  This path should never happen outside of internal operating system development.  AvailabilityMacros do not function correctly here!
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #endif
 #else
-    #define CL_EXTENSION_WEAK_LINK                         
+    #define CL_EXTENSION_WEAK_LINK  
     #define CL_API_SUFFIX__VERSION_1_0
     #define CL_EXT_SUFFIX__VERSION_1_0
     #define CL_API_SUFFIX__VERSION_1_1
     #define CL_EXT_SUFFIX__VERSION_1_1
-    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+    #define CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_1_2
+    
+    #ifdef __GNUC__
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #endif
+    
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #endif
+    #elif _WIN32
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)     
+        #endif
+    
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
+        #endif
+    #else
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+    
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    #endif
 #endif
 
 #if (defined (_WIN32) && defined(_MSC_VER))
@@ -252,7 +308,7 @@ typedef double          cl_double   __attribute__((aligned(8)));
 
 #include <stddef.h>
 
-/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */
 typedef unsigned int cl_GLuint;
 typedef int          cl_GLint;
 typedef unsigned int cl_GLenum;
@@ -395,6 +451,24 @@ typedef unsigned int cl_GLenum;
     #define __CL_DOUBLE4__  1
 #endif
 
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+#endif
+
 /* Define alignment keys */
 #if defined( __GNUC__ )
     #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
@@ -410,7 +484,7 @@ typedef unsigned int cl_GLenum;
 #endif
 
 /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#if __CL_HAS_ANON_STRUCT__
     /* .xyzw and .s0123...{f|F} are supported */
     #define CL_HAS_NAMED_VECTOR_FIELDS 1
     /* .hi and .lo are supported */
@@ -423,10 +497,10 @@ typedef unsigned int cl_GLenum;
 typedef union
 {
     cl_char  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y; };
-   __extension__ struct{ cl_char  s0, s1; };
-   __extension__ struct{ cl_char  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2;
@@ -436,10 +510,10 @@ typedef union
 typedef union
 {
     cl_char  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3; };
-   __extension__ struct{ cl_char2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[2];
@@ -455,10 +529,10 @@ typedef  cl_char4  cl_char3;
 typedef union
 {
     cl_char   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_char4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[4];
@@ -474,10 +548,10 @@ typedef union
 typedef union
 {
     cl_char  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_char8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
 #endif
 #if defined( __CL_CHAR2__) 
     __cl_char2     v2[8];
@@ -498,10 +572,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(2) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y; };
-   __extension__ struct{ cl_uchar  s0, s1; };
-   __extension__ struct{ cl_uchar  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
 #endif
 #if defined( __cl_uchar2__) 
     __cl_uchar2     v2;
@@ -511,10 +585,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(4) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uchar2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[2];
@@ -530,10 +604,10 @@ typedef  cl_uchar4  cl_uchar3;
 typedef union
 {
     cl_uchar   CL_ALIGNED(8) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uchar4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[4];
@@ -549,10 +623,10 @@ typedef union
 typedef union
 {
     cl_uchar  CL_ALIGNED(16) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uchar8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
 #endif
 #if defined( __CL_UCHAR2__) 
     __cl_uchar2     v2[8];
@@ -573,10 +647,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y; };
-   __extension__ struct{ cl_short  s0, s1; };
-   __extension__ struct{ cl_short  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2;
@@ -586,10 +660,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3; };
-   __extension__ struct{ cl_short2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[2];
@@ -605,10 +679,10 @@ typedef  cl_short4  cl_short3;
 typedef union
 {
     cl_short   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_short4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[4];
@@ -624,10 +698,10 @@ typedef union
 typedef union
 {
     cl_short  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_short8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
 #endif
 #if defined( __CL_SHORT2__) 
     __cl_short2     v2[8];
@@ -648,10 +722,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(4) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y; };
-   __extension__ struct{ cl_ushort  s0, s1; };
-   __extension__ struct{ cl_ushort  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2;
@@ -661,10 +735,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(8) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ushort2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[2];
@@ -680,10 +754,10 @@ typedef  cl_ushort4  cl_ushort3;
 typedef union
 {
     cl_ushort   CL_ALIGNED(16) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ushort4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[4];
@@ -699,10 +773,10 @@ typedef union
 typedef union
 {
     cl_ushort  CL_ALIGNED(32) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ushort8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
 #endif
 #if defined( __CL_USHORT2__) 
     __cl_ushort2     v2[8];
@@ -722,10 +796,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y; };
-   __extension__ struct{ cl_int  s0, s1; };
-   __extension__ struct{ cl_int  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2;
@@ -735,10 +809,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3; };
-   __extension__ struct{ cl_int2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[2];
@@ -754,10 +828,10 @@ typedef  cl_int4  cl_int3;
 typedef union
 {
     cl_int   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_int4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[4];
@@ -773,10 +847,10 @@ typedef union
 typedef union
 {
     cl_int  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_int8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
 #endif
 #if defined( __CL_INT2__) 
     __cl_int2     v2[8];
@@ -797,10 +871,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y; };
-   __extension__ struct{ cl_uint  s0, s1; };
-   __extension__ struct{ cl_uint  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2;
@@ -810,10 +884,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3; };
-   __extension__ struct{ cl_uint2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[2];
@@ -829,10 +903,10 @@ typedef  cl_uint4  cl_uint3;
 typedef union
 {
     cl_uint   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_uint4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[4];
@@ -848,10 +922,10 @@ typedef union
 typedef union
 {
     cl_uint  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_uint8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
 #endif
 #if defined( __CL_UINT2__) 
     __cl_uint2     v2[8];
@@ -871,10 +945,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y; };
-   __extension__ struct{ cl_long  s0, s1; };
-   __extension__ struct{ cl_long  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2;
@@ -884,10 +958,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3; };
-   __extension__ struct{ cl_long2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[2];
@@ -903,10 +977,10 @@ typedef  cl_long4  cl_long3;
 typedef union
 {
     cl_long   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_long4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[4];
@@ -922,10 +996,10 @@ typedef union
 typedef union
 {
     cl_long  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_long8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
 #endif
 #if defined( __CL_LONG2__) 
     __cl_long2     v2[8];
@@ -946,10 +1020,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y; };
-   __extension__ struct{ cl_ulong  s0, s1; };
-   __extension__ struct{ cl_ulong  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2;
@@ -959,10 +1033,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3; };
-   __extension__ struct{ cl_ulong2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[2];
@@ -978,10 +1052,10 @@ typedef  cl_ulong4  cl_ulong3;
 typedef union
 {
     cl_ulong   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_ulong4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[4];
@@ -997,10 +1071,10 @@ typedef union
 typedef union
 {
     cl_ulong  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_ulong8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
 #endif
 #if defined( __CL_ULONG2__) 
     __cl_ulong2     v2[8];
@@ -1022,10 +1096,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(8) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y; };
-   __extension__ struct{ cl_float  s0, s1; };
-   __extension__ struct{ cl_float  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2;
@@ -1035,10 +1109,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(16) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3; };
-   __extension__ struct{ cl_float2  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[2];
@@ -1054,10 +1128,10 @@ typedef  cl_float4  cl_float3;
 typedef union
 {
     cl_float   CL_ALIGNED(32) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float   x, y, z, w; };
-   __extension__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_float4  lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[4];
@@ -1073,10 +1147,10 @@ typedef union
 typedef union
 {
     cl_float  CL_ALIGNED(64) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_float8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
 #endif
 #if defined( __CL_FLOAT2__) 
     __cl_float2     v2[8];
@@ -1097,10 +1171,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(16) s[2];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y; };
-   __extension__ struct{ cl_double s0, s1; };
-   __extension__ struct{ cl_double lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2;
@@ -1110,10 +1184,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(32) s[4];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3; };
-   __extension__ struct{ cl_double2 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[2];
@@ -1129,10 +1203,10 @@ typedef  cl_double4  cl_double3;
 typedef union
 {
     cl_double   CL_ALIGNED(64) s[8];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
-   __extension__ struct{ cl_double4 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[4];
@@ -1148,10 +1222,10 @@ typedef union
 typedef union
 {
     cl_double  CL_ALIGNED(128) s[16];
-#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
-   __extension__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
-   __extension__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
-   __extension__ struct{ cl_double8 lo, hi; };
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
 #endif
 #if defined( __CL_DOUBLE2__) 
     __cl_double2     v2[8];
@@ -1170,13 +1244,13 @@ typedef union
 /* Macro to facilitate debugging 
  * Usage:
  *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
- *   The first line ends with:   CL_PROGRAM_STRING_BEGIN \"
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
  *   Each line thereafter of OpenCL C source must end with: \n\
  *   The last line ends in ";
  *
  *   Example:
  *
- *   const char *my_program = CL_PROGRAM_STRING_BEGIN "\
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
  *   kernel void foo( int a, float * b )             \n\
  *   {                                               \n\
  *      // my comment                                \n\
@@ -1195,4 +1269,10 @@ typedef union
 }
 #endif
 
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
 #endif  /* __CL_PLATFORM_H  */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
index 26a6389..3f00524 100644
--- a/include/CL/opencl.h
+++ b/include/CL/opencl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2008-2010 The Khronos Group Inc.
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and/or associated documentation files (the
diff --git a/kernels/compare_image_2d_and_1d_array.cl b/kernels/compare_image_2d_and_1d_array.cl
new file mode 100644
index 0000000..6aabb43
--- /dev/null
+++ b/kernels/compare_image_2d_and_1d_array.cl
@@ -0,0 +1,13 @@
+__kernel void
+compare_image_2d_and_1d_array(image2d_t a1, image1d_array_t a2, sampler_t sampler)
+{
+  float2 coord;
+  int4 color1;
+  int4 color2;
+  coord.x = (float)get_global_id(0) + 0.3f;
+  coord.y = (float)get_global_id(1) + 0.3f;
+  color1 = read_imagei(a1, sampler, coord);
+  color2 = read_imagei(a2, sampler, coord);
+//  printf("########## x y is (%f, %f), color1 is (%d %d %d %d), color2 is (%d %d %d %d)\n",
+//	  coord.x, coord.y, color1.x, color1.y, color1.z, color1.w, color2.x, color2.y, color2.z, color2.w);
+}
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
index 06ec8e7..dddde44 100644
--- a/kernels/compiler_async_copy.cl
+++ b/kernels/compiler_async_copy.cl
@@ -21,4 +21,4 @@ DEF(uint2);
 DEF(long2);
 DEF(ulong2);
 DEF(float2);
-DEF(double2);
+//DEF(double2);
diff --git a/kernels/compiler_constant_expr.cl b/kernels/compiler_constant_expr.cl
new file mode 100644
index 0000000..d40cead
--- /dev/null
+++ b/kernels/compiler_constant_expr.cl
@@ -0,0 +1,23 @@
+float3 foo_pow3(float3 src0, float3 src1)
+{
+  union {
+    float3 f3;
+    float   farray[4];
+  } s0, s1, dst;
+  s0.f3 = src0;
+  s1.f3 = src1;
+  int i;
+  for(i = 0; i < 3; i++)
+    dst.farray[i] = pow(s0.farray[i], s1.farray[i]);
+  return dst.f3;
+}
+
+__kernel void
+compiler_constant_expr(__global float* src, __global float *dst)
+{
+  int gid = get_global_id(0);
+  float3 f3 = vload3(gid, src);
+  float3 cf3 = (float3)(1.f, 2.f, 3.f);
+  float3 result = foo_pow3(f3, cf3);
+  vstore3(result, gid, dst); 
+} 
diff --git a/kernels/compiler_getelementptr_bitcast.cl b/kernels/compiler_getelementptr_bitcast.cl
new file mode 100644
index 0000000..0320abf
--- /dev/null
+++ b/kernels/compiler_getelementptr_bitcast.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_getelementptr_bitcast(global float *src, global float *dst)
+{
+  int i = get_global_id(0);
+
+  __local  float ldata[256];
+  ldata[get_local_id(0)] = src[i];
+
+  //if use get_local_id(0) to index ldata, the issue is not reproduced
+  //so, just set the work group as 1 in the application
+  __local uchar *  pldata = (__local uchar *)&ldata[0];
+  uchar data;
+  for(int k = 0; k < 3; k++){
+    data = *pldata;
+    pldata++;
+  }
+
+  dst[i] = data;
+}
diff --git a/kernels/compiler_mixed_pointer.cl b/kernels/compiler_mixed_pointer.cl
new file mode 100644
index 0000000..78c5783
--- /dev/null
+++ b/kernels/compiler_mixed_pointer.cl
@@ -0,0 +1,23 @@
+
+kernel void compiler_mixed_pointer(__global uint* src1, __global uint *src2, __global uint *dst) {
+  int x = get_global_id(0);
+  global uint * tmp = NULL;
+
+  switch(x) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[x] = tmp[x];
+}
+
+kernel void compiler_mixed_pointer1(__global uint* src, __global uint *dst1, __global uint *dst2) {
+  int x = get_global_id(0);
+  global uint * tmp = x < 5 ? dst1 : dst2;
+  tmp[x] = src[x];
+}
diff --git a/kernels/compiler_simd_all.cl b/kernels/compiler_simd_all.cl
new file mode 100644
index 0000000..504710b
--- /dev/null
+++ b/kernels/compiler_simd_all.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_simd_all(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+  if (i % 2 == 1) {
+    if (__gen_ocl_simd_all((src[i] < 12) && (src[i] > 0)))
+      dst[i] = 1;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_simd_any.cl b/kernels/compiler_simd_any.cl
new file mode 100644
index 0000000..3b04f82
--- /dev/null
+++ b/kernels/compiler_simd_any.cl
@@ -0,0 +1,15 @@
+__kernel void compiler_simd_any(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+
+  if (i % 2 == 1) {
+    if (__gen_ocl_simd_any(src[i] == 5) || __gen_ocl_simd_any(src[i] == 9))
+      dst[i] = 1;
+    else if (__gen_ocl_simd_any(src[i] == 6))
+      dst[i] = 0;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 964f5e7..aec38b1 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -25,9 +25,9 @@ __kernel void test_##type ##n(__global type *pin, \
   TEST_TYPE(int,n)   \
   TEST_TYPE(uint,n)  \
   TEST_TYPE(float,n) \
-  TEST_TYPE(double,n)\
   TEST_TYPE(long,n)  \
   TEST_TYPE(ulong,n)
+//  TEST_TYPE(double,n)
 
 #if 0
   TEST_TYPE(half,n)
diff --git a/kernels/double_precision_check.cl b/kernels/double_precision_check.cl
new file mode 100644
index 0000000..e55cafa
--- /dev/null
+++ b/kernels/double_precision_check.cl
@@ -0,0 +1,11 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void
+double_precision_check(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  double d0 = 0.12345678912345678 + src[1];
+  double d1 = 0.12355678922345678 + src[0];
+  float rem = d1 - d0;
+  dst[id] = rem;
+}
diff --git a/kernels/image_1D_buffer.cl b/kernels/image_1D_buffer.cl
new file mode 100644
index 0000000..e8e0a86
--- /dev/null
+++ b/kernels/image_1D_buffer.cl
@@ -0,0 +1,13 @@
+__kernel void image_1D_buffer(image1d_buffer_t image1, image1d_t image2, sampler_t sampler, __global int *results)
+{
+   int x = get_global_id(0);
+   int offset = x;
+
+   int4 col = read_imagei(image1, x);
+   int4 test = (col != read_imagei(image2, sampler, x));
+
+   if (test.x || test.y || test.z || test.w)
+      results[offset] = 0;
+   else
+      results[offset] = 1;
+}
diff --git a/kernels/include/runtime_compile_link_inc.h b/kernels/include/runtime_compile_link_inc.h
new file mode 100644
index 0000000..9b66850
--- /dev/null
+++ b/kernels/include/runtime_compile_link_inc.h
@@ -0,0 +1,4 @@
+int greater(long x, long y)
+{
+  return x > y ;
+}
diff --git a/kernels/runtime_compile_link.h b/kernels/runtime_compile_link.h
new file mode 100644
index 0000000..ae2c56e
--- /dev/null
+++ b/kernels/runtime_compile_link.h
@@ -0,0 +1 @@
+int comp_long(long x, long y);
diff --git a/kernels/runtime_compile_link_a.cl b/kernels/runtime_compile_link_a.cl
new file mode 100644
index 0000000..b17861f
--- /dev/null
+++ b/kernels/runtime_compile_link_a.cl
@@ -0,0 +1,13 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+int comp_long(long x, long y)
+{
+  return x < y ;
+}
+
+kernel void runtime_compile_link_a(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  int j = comp_long(src1[i], src2[i]);
+  dst[i] = j ? 3 : 4;
+}
diff --git a/kernels/runtime_compile_link_b.cl b/kernels/runtime_compile_link_b.cl
new file mode 100644
index 0000000..89b5a2d
--- /dev/null
+++ b/kernels/runtime_compile_link_b.cl
@@ -0,0 +1,9 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+kernel void runtime_compile_link_b(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  int j = comp_long(src1[i], src2[i]);
+  dst[i] = j ? 3 : 4;
+  int k = greater(src1[i], src2[i]);
+}
diff --git a/kernels/test_copy_image_1d.cl b/kernels/test_copy_image_1d.cl
new file mode 100644
index 0000000..88428bb
--- /dev/null
+++ b/kernels/test_copy_image_1d.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_copy_image_1d(__read_only image1d_t src, __write_only image1d_t dst, sampler_t sampler)
+{
+  int coord;
+  int4 color;
+  coord = (int)get_global_id(0);
+  color = read_imagei(src, sampler, coord);
+  write_imagei(dst, coord, color);
+}
diff --git a/kernels/test_fill_image_1d.cl b/kernels/test_fill_image_1d.cl
new file mode 100644
index 0000000..db922af
--- /dev/null
+++ b/kernels/test_fill_image_1d.cl
@@ -0,0 +1,8 @@
+__kernel void
+test_fill_image_1d(__write_only image1d_t dst)
+{
+  int coord;
+  coord = (int)get_global_id(0);
+  uint4 color4 = {0, 1, 2 ,3};
+  write_imageui(dst, coord, color4);
+}
diff --git a/kernels/test_get_arg_info.cl b/kernels/test_get_arg_info.cl
new file mode 100644
index 0000000..43a804b
--- /dev/null
+++ b/kernels/test_get_arg_info.cl
@@ -0,0 +1,8 @@
+typedef struct _test_arg_struct {
+    int a;
+    int b;
+}test_arg_struct;
+
+kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int read_only *dst, test_arg_struct extra) {
+
+}
diff --git a/kernels/test_get_image_info_array.cl b/kernels/test_get_image_info_array.cl
new file mode 100644
index 0000000..333da77
--- /dev/null
+++ b/kernels/test_get_image_info_array.cl
@@ -0,0 +1,25 @@
+__kernel void
+test_get_image_info_array(__write_only image1d_array_t a1, __write_only image2d_array_t a2, __global int *result)
+{
+  int w, h, array_sz;
+
+  w = get_image_width(a1);
+  array_sz = (int)get_image_array_size(a1);
+  int channel_data_type = get_image_channel_data_type(a1);
+  int channel_order = get_image_channel_order(a1);
+  result[0] = w;
+  result[1] = array_sz;
+  result[2] = channel_data_type;
+  result[3] = channel_order;
+
+  w = get_image_width(a2);
+  h = get_image_height(a2);
+  array_sz = (int)get_image_array_size(a2);
+  channel_data_type = get_image_channel_data_type(a2);
+  channel_order = get_image_channel_order(a2);
+  result[4] = w;
+  result[5] = h;
+  result[6] = array_sz;
+  result[7] = channel_data_type;
+  result[8] = channel_order;
+}
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
new file mode 100644
index 0000000..84bb478
--- /dev/null
+++ b/kernels/test_printf.cl
@@ -0,0 +1,38 @@
+__kernel void
+test_printf(void)
+{
+  int x = (int)get_global_id(0);
+  int y = (int)get_global_id(1);
+  int z = (int)get_global_id(2);
+  uint a = 'x';
+  float f = 5.0f;
+  int3 vec;
+  vec.x = x;
+  vec.y = y;
+  vec.z = z;
+
+  if (x == 0 && y == 0 && z == 0) {
+    printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
+
+    printf("### output a char is %c\n", a);
+  }
+
+  if (x % 15 == 0)
+    if (y % 3 == 0)
+      if (z % 7 == 0)
+        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
+                vec, get_global_size(0), get_global_size(1), get_global_size(2));
+
+  if (x == 1)
+    if (y == 0) {
+      if (z % 2 == 0)
+          printf("#### output a float is %f\n", f);
+      else
+          printf("#### output a float to int is %d\n", f);
+    }
+
+  if (x == 0 && y == 0 && z == 0) {
+    printf("--- End to the printf test ---\n");
+  }
+
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 95ff56f..ce16a8c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,25 +1,60 @@
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}
-                    ${DRM_INCLUDE_PATH}
-                    ${DRM_INCLUDE_PATH}/../
+                    ${DRM_INCLUDE_DIRS}
+                    ${DRM_INCLUDE_DIRS}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
                     ${MESA_SOURCE_INCLUDES})
+
 macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
 foreach (KF ${KERNEL_FILES})
   set (input_file ${KERNEL_PATH}/${KF}.cl)
   set (output_file ${KERNEL_PATH}/${KF}_str.c)
   list (APPEND KERNEL_STR_FILES ${output_file})
-  add_custom_command(
-    OUTPUT ${output_file}
-    COMMAND rm -rf ${output_file}
-    COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
-    DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+  if(GEN_PCI_ID)
+    add_custom_command(
+      OUTPUT ${output_file}
+      COMMAND rm -rf ${output_file}
+      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID}
+      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+  else(GEN_PCI_ID)
+    add_custom_command(
+      OUTPUT ${output_file}
+      COMMAND rm -rf ${output_file}
+      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
+      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+  endif(GEN_PCI_ID)
 endforeach (KF)
 endmacro (MakeKernelBinStr)
 
+macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
+  set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
+  set (file_content)
+  file (REMOVE ${output_file})
+  foreach (KF ${KERNEL_NAMES})
+    set (input_file ${KERNEL_PATH}/${KF}.cl)
+    file(READ ${input_file} file_content )
+    STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}")
+    file(APPEND ${output_file} ${file_content})
+  endforeach (KF)
+endmacro (MakeBuiltInKernelStr)
+
 set (KERNEL_STR_FILES)
-set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16)
+set (KERNEL_NAMES cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
+cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
+cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
+cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
+cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
+cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
+cl_internal_fill_buf_align128 cl_internal_fill_image_1d
+cl_internal_fill_image_1d_array cl_internal_fill_image_2d
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+set (BUILT_IN_NAME  cl_internal_built_in_kernel)
+MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
 
 set(OPENCL_SRC
     ${KERNEL_STR_FILES}
@@ -27,6 +62,7 @@ set(OPENCL_SRC
     cl_alloc.c
     cl_kernel.c
     cl_program.c
+    cl_gbe_loader.cpp
     cl_sampler.c
     cl_event.c
     cl_enqueue.c
@@ -46,8 +82,16 @@ set(OPENCL_SRC
     intel/intel_gpgpu.c
     intel/intel_batchbuffer.c
     intel/intel_driver.c
-    x11/dricommon.c 
-    x11/va_dri2.c)
+    performance.c)
+
+if (X11_FOUND)
+  set(CMAKE_CXX_FLAGS "-DHAS_X11 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_C_FLAGS "-DHAS_X11 ${CMAKE_C_FLAGS}")
+  set(OPENCL_SRC
+      ${OPENCL_SRC}
+      x11/dricommon.c
+      x11/va_dri2.c)
+endif (X11_FOUND)
 
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
@@ -66,16 +110,17 @@ endif (OCLIcd_FOUND)
 
 SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
 
-link_directories (${LLVM_LIBRARY_DIR})
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
 add_library(cl SHARED ${OPENCL_SRC})
 target_link_libraries(
                       cl
-                      gbe
-                      ${XLIB_LIBRARY}
-                      ${XEXT_LIBRARY}
-                      ${XFIXES_LIBRARY}
-                      ${DRM_INTEL_LIBRARY}
-                      ${DRM_LIBRARY}
+                      ${X11_LIBRARIES}
+                      ${XEXT_LIBRARIES}
+                      ${XFIXES_LIBRARIES}
+                      ${DRM_INTEL_LIBRARIES}
+                      ${DRM_LIBRARIES}
+                      ${CMAKE_THREAD_LIBS_INIT}
+                      ${CMAKE_DL_LIBS}
                       ${OPENGL_LIBRARIES}
                       ${OPTIONAL_EGL_LIBRARY})
-install (TARGETS cl LIBRARY DESTINATION ${LIB_INSTALL_DIR}/beignet)
+install (TARGETS cl LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
diff --git a/src/cl_alloc.c b/src/cl_alloc.c
index 20d5578..93d2e6a 100644
--- a/src/cl_alloc.c
+++ b/src/cl_alloc.c
@@ -71,6 +71,7 @@ cl_free(void *ptr)
     return;
   atomic_dec(&cl_alloc_n);
   free(ptr);
+  ptr = NULL;
 }
 
 LOCAL size_t
diff --git a/src/cl_api.c b/src/cl_api.c
index 2a6f8ce..630511f 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -40,6 +40,8 @@
 #include <assert.h>
 #include <unistd.h>
 
+#include "performance.h"
+
 #ifndef CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
@@ -67,7 +69,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
               cl_event* event, enqueue_data* data, cl_command_type type)
 {
   cl_int status = cl_event_wait_events(num, wait_list, queue);
-  cl_event e;
+  cl_event e = NULL;
   if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
     e = cl_event_new(queue->ctx, queue, type, event!=NULL);
 
@@ -83,6 +85,7 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
       cl_event_new_enqueue_callback(e, data, num, wait_list);
     }
   }
+  queue->current_event = e;
   return status;
 }
 
@@ -167,6 +170,7 @@ cl_check_device_type(cl_device_type device_type)
 static cl_int
 cl_device_id_is_ok(const cl_device_id device)
 {
+  if(UNLIKELY(device == NULL)) return CL_FALSE;
   return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
 }
 
@@ -250,8 +254,14 @@ clCreateSubDevices(cl_device_id                         in_device,
                    cl_device_id *                       out_devices,
                    cl_uint *                            num_devices_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  /* Check parameter consistency */
+  if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(in_device == NULL && properties == NULL))
+    return CL_INVALID_VALUE;
+
+  *num_devices_ret = 0;
+  return CL_INVALID_DEVICE_PARTITION_COUNT;
 }
 
 cl_int
@@ -293,6 +303,7 @@ clCreateContext(const cl_context_properties *  properties,
                            pfn_notify,
                            user_data,
                            &err);
+  initialize_env_var();
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -513,7 +524,43 @@ clCreateImage(cl_context context,
   cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
+  if (image_format == NULL) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+  if (image_format->image_channel_order < CL_R ||
+          image_format->image_channel_order > CL_RGBx) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+  if (image_format->image_channel_data_type < CL_SNORM_INT8 ||
+          image_format->image_channel_data_type > CL_FLOAT) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
 
+  if (image_desc == NULL) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  if (image_desc->image_type <= CL_MEM_OBJECT_BUFFER ||
+          image_desc->image_type > CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  /* buffer refers to a valid buffer memory object if image_type is
+     CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+  if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+         image_desc->buffer) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  if (image_desc->num_mip_levels || image_desc->num_samples) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  /* Other details check for image_desc will leave to image create. */
   mem = cl_mem_new_image(context,
                          flags,
                          image_format,
@@ -629,7 +676,10 @@ clGetSupportedImageFormats(cl_context         ctx,
     err = CL_INVALID_VALUE;
     goto error;
   }
-  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE2D &&
+  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
+               image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE2D &&
                image_type != CL_MEM_OBJECT_IMAGE3D)) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -823,6 +873,30 @@ error:
     *errcode_ret = err;
   return program;
 }
+
+cl_program
+clCreateProgramWithBuiltInKernels(cl_context           context,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * device_list,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (kernel_names == NULL);
+  program = cl_program_create_with_built_in_kernles(context,
+                                                    num_devices,
+                                                    device_list,
+                                                    kernel_names,
+                                                    &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
 cl_int
 clRetainProgram(cl_program program)
 {
@@ -880,12 +954,92 @@ error:
 }
 
 cl_int
+clCompileProgram(cl_program            program ,
+                 cl_uint               num_devices ,
+                 const cl_device_id *  device_list ,
+                 const char *          options ,
+                 cl_uint               num_input_headers ,
+                 const cl_program *    input_headers ,
+                 const char **         header_include_names ,
+                 void (CL_CALLBACK *   pfn_notify )(cl_program, void *),
+                 void *                user_data )
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_PROGRAM(program);
+  INVALID_VALUE_IF (num_devices > 1);
+  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
+  INVALID_VALUE_IF (num_input_headers == 0 && input_headers != NULL);
+  INVALID_VALUE_IF (num_input_headers != 0 && input_headers == NULL);
+
+  /* Everything is easy. We only support one device anyway */
+  if (num_devices != 0) {
+    assert(program->ctx);
+    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+  }
+
+  /* TODO support create program from binary */
+  assert(program->source_type == FROM_LLVM ||
+      program->source_type == FROM_SOURCE ||
+      program->source_type == FROM_BINARY);
+  if((err = cl_program_compile(program, num_input_headers, input_headers, header_include_names, options)) != CL_SUCCESS) {
+    goto error;
+  }
+  program->is_built = CL_TRUE;
+
+  if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+  return err;
+}
+
+cl_program
+clLinkProgram(cl_context            context,
+              cl_uint               num_devices,
+              const cl_device_id *  device_list,
+              const char *          options,
+              cl_uint               num_input_programs,
+              const cl_program *    input_programs,
+              void (CL_CALLBACK *   pfn_notify)(cl_program  program, void * user_data),
+              void *                user_data,
+              cl_int *              errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_program program = NULL;
+  CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (num_devices > 1);
+  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
+  INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
+  INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
+
+  program = cl_program_link(context, num_input_programs, input_programs, options, &err);
+
+  program->is_built = CL_TRUE;
+
+  if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+cl_int
 clUnloadCompiler(void)
 {
   return CL_SUCCESS;
 }
 
 cl_int
+clUnloadPlatformCompiler(cl_platform_id platform)
+{
+  return CL_SUCCESS;
+}
+
+cl_int
 clGetProgramInfo(cl_program       program,
                  cl_program_info  param_name,
                  size_t           param_value_size,
@@ -909,15 +1063,28 @@ clGetProgramInfo(cl_program       program,
   } else if (param_name == CL_PROGRAM_DEVICES) {
     cl_device_id dev_id = program->ctx->device;
     FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
+    cl_uint kernels_num = program->ker_n;
+    FILL_GETINFO_RET (cl_uint, 1, &kernels_num, CL_SUCCESS);
   } else if (param_name == CL_PROGRAM_SOURCE) {
 
     if (!program->source)
       FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
     FILL_GETINFO_RET (char, (strlen(program->source) + 1),
                    program->source, CL_SUCCESS);
+  } else if(param_name == CL_PROGRAM_KERNEL_NAMES) {
+    cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
   } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
-    if (program->binary == NULL) {
-      program->binary_sz = gbe_program_serialize_to_binary(program->opaque, &program->binary);
+    if (program->binary == NULL){
+      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+      }else{
+        return CL_INVALID_BINARY;
+      }
     }
 
     if (program->binary == NULL || program->binary_sz == 0) {
@@ -933,7 +1100,15 @@ clGetProgramInfo(cl_program       program,
     /* param_value points to an array of n
        pointers allocated by the caller */
     if (program->binary == NULL) {
-      program->binary_sz = gbe_program_serialize_to_binary(program->opaque, &program->binary);
+      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+      }else{
+        return CL_INVALID_BINARY;
+      }
     }
 
     if (program->binary == NULL || program->binary_sz == 0) {
@@ -965,17 +1140,7 @@ clGetProgramBuildInfo(cl_program             program,
   INVALID_DEVICE_IF (device != program->ctx->device);
 
   if (param_name == CL_PROGRAM_BUILD_STATUS) {
-    cl_build_status status;
-
-    if (!program->is_built)
-      status = CL_BUILD_NONE;
-    else if (program->ker_n > 0)
-      status = CL_BUILD_SUCCESS;
-    else
-      status = CL_BUILD_ERROR;
-    // TODO: Support CL_BUILD_IN_PROGRESS ?
-
-    FILL_GETINFO_RET (cl_build_status, 1, &status, CL_SUCCESS);
+    FILL_GETINFO_RET (cl_build_status, 1, &program->build_status, CL_SUCCESS);
   } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
     if (program->is_built && program->build_opts)
       ret_str = program->build_opts;
@@ -985,6 +1150,9 @@ clGetProgramBuildInfo(cl_program             program,
     FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
     if (param_value_size_ret)
       *param_value_size_ret = program->build_log_sz + 1;
+  }else if (param_name == CL_PROGRAM_BINARY_TYPE){
+
+    FILL_GETINFO_RET (cl_uint, 1, &program->binary_type, CL_SUCCESS);
   } else {
     return CL_INVALID_VALUE;
   }
@@ -1076,6 +1244,33 @@ error:
   return err;
 }
 
+cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
+        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+
+  if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
+          && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
+          && param_name != CL_KERNEL_ARG_TYPE_NAME
+          && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
+          && param_name != CL_KERNEL_ARG_NAME) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (arg_index >= kernel->arg_n) {
+    err = CL_INVALID_ARG_INDEX;
+    goto error;
+  }
+
+  err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
+          param_value, param_value_size_ret);
+
+error:
+  return err;
+}
+
 cl_int
 clGetKernelInfo(cl_kernel        kernel,
                 cl_kernel_info   param_name,
@@ -1100,6 +1295,9 @@ clGetKernelInfo(cl_kernel        kernel,
   } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
     const char * n = cl_kernel_get_name(kernel);
     FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_ATTRIBUTES) {
+    const char * n = cl_kernel_get_attributes(kernel);
+    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
   } else {
     return CL_INVALID_VALUE;
   }
@@ -1161,7 +1359,7 @@ clGetEventInfo(cl_event      event,
   } else if (param_name == CL_EVENT_COMMAND_TYPE) {
     FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
   } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
-    cl_event_update_status(event);
+    cl_event_update_status(event, 0);
     FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
   } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
     cl_uint ref = event->ref_n;
@@ -1405,6 +1603,11 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
     goto error;
   }
 
+  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
   if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -1540,6 +1743,11 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
     goto error;
   }
 
+  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
   if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -1599,6 +1807,157 @@ error:
 }
 
 cl_int
+clEnqueueFillImage(cl_command_queue   command_queue,
+                   cl_mem             image,
+                   const void *       fill_color,
+                   const size_t *     porigin,
+                   const size_t *     pregion,
+                   cl_uint            num_events_in_wait_list,
+                   const cl_event *   event_wait_list,
+                   cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(image, src_image);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
+
+  if (command_queue->ctx != image->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (fill_color == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
+
+  return 0;
+
+ error:
+  return err;
+}
+
+cl_int
+clEnqueueFillBuffer(cl_command_queue   command_queue,
+                    cl_mem             buffer,
+                    const void *       pattern,
+                    size_t             pattern_size,
+                    size_t             offset,
+                    size_t             size,
+                    cl_uint            num_events_in_wait_list,
+                    const cl_event *   event_wait_list,
+                    cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int i = 0;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (offset < 0 || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (pattern == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
+    if (valid_sz[i] == pattern_size)
+      break;
+  }
+  if (i == sizeof(valid_sz) / sizeof(size_t)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (offset % pattern_size || size % pattern_size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
+
+  return 0;
+
+ error:
+  return err;
+}
+
+cl_int
 clEnqueueCopyBuffer(cl_command_queue     command_queue,
                     cl_mem               src_buffer,
                     cl_mem               dst_buffer,
@@ -1676,6 +2035,10 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
 
     err = cl_command_queue_flush(command_queue);
   }
+
+  if(b_output_kernel_perf)
+	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
+
   return 0;
 
 error:
@@ -1777,6 +2140,9 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
+
 error:
   return err;
 }
@@ -1785,8 +2151,8 @@ cl_int
 clEnqueueReadImage(cl_command_queue      command_queue,
                    cl_mem                mem,
                    cl_bool               blocking_read,
-                   const size_t *        origin,
-                   const size_t *        region,
+                   const size_t *        porigin,
+                   const size_t *        pregion,
                    size_t                row_pitch,
                    size_t                slice_pitch,
                    void *                ptr,
@@ -1799,6 +2165,8 @@ clEnqueueReadImage(cl_command_queue      command_queue,
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
   if (command_queue->ctx != mem->ctx) {
      err = CL_INVALID_CONTEXT;
      goto error;
@@ -1864,8 +2232,8 @@ cl_int
 clEnqueueWriteImage(cl_command_queue     command_queue,
                     cl_mem               mem,
                     cl_bool              blocking_write,
-                    const size_t *       origin,
-                    const size_t *       region,
+                    const size_t *       porigin,
+                    const size_t *       pregion,
                     size_t               row_pitch,
                     size_t               slice_pitch,
                     const void *         ptr,
@@ -1878,6 +2246,8 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
   if (command_queue->ctx != mem->ctx) {
     err = CL_INVALID_CONTEXT;
     goto error;
@@ -1943,9 +2313,9 @@ cl_int
 clEnqueueCopyImage(cl_command_queue      command_queue,
                    cl_mem                src_mem,
                    cl_mem                dst_mem,
-                   const size_t *        src_origin,
-                   const size_t *        dst_origin,
-                   const size_t *        region,
+                   const size_t *        psrc_origin,
+                   const size_t *        pdst_origin,
+                   const size_t *        pregion,
                    cl_uint               num_events_in_wait_list,
                    const cl_event *      event_wait_list,
                    cl_event *            event)
@@ -1958,6 +2328,9 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(src_mem, src_image);
   CHECK_IMAGE(dst_mem, dst_image);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
+  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
   if (command_queue->ctx != src_mem->ctx ||
       command_queue->ctx != dst_mem->ctx) {
     err = CL_INVALID_CONTEXT;
@@ -2016,6 +2389,9 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
+
 error:
   return err;
 }
@@ -2024,8 +2400,8 @@ cl_int
 clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
                            cl_mem            src_mem,
                            cl_mem            dst_buffer,
-                           const size_t *    src_origin,
-                           const size_t *    region,
+                           const size_t *    psrc_origin,
+                           const size_t *    pregion,
                            size_t            dst_offset,
                            cl_uint           num_events_in_wait_list,
                            const cl_event *  event_wait_list,
@@ -2037,6 +2413,8 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(src_mem, src_image);
   CHECK_MEM(dst_buffer);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
   if (command_queue->ctx != src_mem->ctx ||
       command_queue->ctx != dst_buffer->ctx) {
     err = CL_INVALID_CONTEXT;
@@ -2077,6 +2455,9 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
+
 error:
   return err;
 }
@@ -2086,8 +2467,8 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
                            cl_mem            src_buffer,
                            cl_mem            dst_mem,
                            size_t            src_offset,
-                           const size_t *    dst_origin,
-                           const size_t *    region,
+                           const size_t *    pdst_origin,
+                           const size_t *    pregion,
                            cl_uint           num_events_in_wait_list,
                            const cl_event *  event_wait_list,
                            cl_event *        event)
@@ -2098,6 +2479,8 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
   CHECK_QUEUE(command_queue);
   CHECK_MEM(src_buffer);
   CHECK_IMAGE(dst_mem, dst_image);
+  FIXUP_IMAGE_REGION(dst_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
   if (command_queue->ctx != src_buffer->ctx ||
       command_queue->ctx != dst_mem->ctx) {
     err = CL_INVALID_CONTEXT;
@@ -2138,11 +2521,16 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
+
 error:
   return err;
 }
 
-static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset, size_t size)
+static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
+                          size_t offset, size_t size,
+                          const size_t *origin, const size_t *region)
 {
   cl_int slot = -1;
   int err = CL_SUCCESS;
@@ -2153,17 +2541,13 @@ static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset,
     sub_offset = buffer->sub_offset;
   }
 
-  if (!(*ptr = cl_mem_map_gtt_unsync(mem))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-  *ptr = (char*)(*ptr) + offset + sub_offset;
+  ptr = (char*)ptr + offset + sub_offset;
   if(mem->flags & CL_MEM_USE_HOST_PTR) {
     assert(mem->host_ptr);
     //only calc ptr here, will do memcpy in enqueue
-    *mem_ptr = mem->host_ptr + offset + sub_offset;
+    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
   } else {
-    *mem_ptr = *ptr;
+    *mem_ptr = ptr;
   }
   /* Record the mapped address. */
   if (!mem->mapped_ptr_sz) {
@@ -2171,7 +2555,7 @@ static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset,
     mem->mapped_ptr = (cl_mapped_ptr *)malloc(
           sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
     if (!mem->mapped_ptr) {
-      cl_mem_unmap_gtt(mem);
+      cl_mem_unmap_auto(mem);
       err = CL_OUT_OF_HOST_MEMORY;
       goto error;
     }
@@ -2189,7 +2573,7 @@ static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset,
       cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
           sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
       if (!new_ptr) {
-        cl_mem_unmap_gtt (mem);
+        cl_mem_unmap_auto(mem);
         err = CL_OUT_OF_HOST_MEMORY;
         goto error;
       }
@@ -2204,8 +2588,17 @@ static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset,
   }
   assert(slot != -1);
   mem->mapped_ptr[slot].ptr = *mem_ptr;
-  mem->mapped_ptr[slot].v_ptr = *ptr;
+  mem->mapped_ptr[slot].v_ptr = ptr;
   mem->mapped_ptr[slot].size = size;
+  if(origin) {
+    assert(region);
+    mem->mapped_ptr[slot].origin[0] = origin[0];
+    mem->mapped_ptr[slot].origin[1] = origin[1];
+    mem->mapped_ptr[slot].origin[2] = origin[2];
+    mem->mapped_ptr[slot].region[0] = region[0];
+    mem->mapped_ptr[slot].region[1] = region[1];
+    mem->mapped_ptr[slot].region[2] = region[2];
+  }
   mem->map_ref++;
 error:
   if (err != CL_SUCCESS)
@@ -2251,10 +2644,6 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     goto error;
   }
 
-  err = _cl_map_mem(buffer, &ptr, &mem_ptr, offset, size);
-  if (err != CL_SUCCESS)
-    goto error;
-
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
   data = &no_wait_data;
@@ -2263,12 +2652,25 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
   data->offset      = offset;
   data->size        = size;
   data->ptr         = ptr;
+  data->unsync_map  = 1;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    data->unsync_map = 0;
     err = cl_enqueue_handle(event ? *event : NULL, data);
+    if (err != CL_SUCCESS)
+      goto error;
+    ptr = data->ptr;
     if(event) cl_event_set_status(*event, CL_COMPLETE);
+  } else {
+    if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
   }
+  err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
+  if (err != CL_SUCCESS)
+    goto error;
 
 error:
   if (errcode_ret)
@@ -2281,8 +2683,8 @@ clEnqueueMapImage(cl_command_queue   command_queue,
                   cl_mem             mem,
                   cl_bool            blocking_map,
                   cl_map_flags       map_flags,
-                  const size_t *     origin,
-                  const size_t *     region,
+                  const size_t *     porigin,
+                  const size_t *     pregion,
                   size_t *           image_row_pitch,
                   size_t *           image_slice_pitch,
                   cl_uint            num_events_in_wait_list,
@@ -2293,10 +2695,13 @@ clEnqueueMapImage(cl_command_queue   command_queue,
   cl_int err = CL_SUCCESS;
   void *ptr  = NULL;
   void *mem_ptr = NULL;
+  size_t offset = 0;
   enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
   if (command_queue->ctx != mem->ctx) {
     err = CL_INVALID_CONTEXT;
     goto error;
@@ -2312,10 +2717,6 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     goto error;
   }
 
-  *image_row_pitch = image->row_pitch;
-  if (image_slice_pitch)
-    *image_slice_pitch = image->slice_pitch;
-
   if ((map_flags & CL_MAP_READ &&
        mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
       (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
@@ -2325,28 +2726,6 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     goto error;
   }
 
-  if (!(ptr = cl_mem_map_gtt_unsync(mem))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  size_t size;
-  if(region[2] == 1) {
-    if(region[1] == 1)
-      size = image->bpp * region[0];
-    else
-      size = image->row_pitch * (region[1] - 1) + (image->bpp * (origin[0] + region[0]));
-  } else {
-    size = image->slice_pitch * (region[2] - 1);
-    size += image->row_pitch * (origin[1] + region[1]);
-    size += image->bpp * (origin[0] + region[0]);
-  }
-
-  err = _cl_map_mem(mem, &ptr, &mem_ptr, offset, size);
-  if (err != CL_SUCCESS)
-    goto error;
-
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
 
   data = &no_wait_data;
@@ -2354,18 +2733,42 @@ clEnqueueMapImage(cl_command_queue   command_queue,
   data->mem_obj     = mem;
   data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
   data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
-  data->row_pitch   = *image_row_pitch;
-  if (image_slice_pitch)
-    data->slice_pitch = *image_slice_pitch;
   data->ptr         = ptr;
-  data->offset      = offset;
+  data->unsync_map  = 1;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    data->unsync_map = 0;
     err = cl_enqueue_handle(event ? *event : NULL, data);
+    if (err != CL_SUCCESS)
+      goto error;
+    ptr = data->ptr;
     if(event) cl_event_set_status(*event, CL_COMPLETE);
+  } else {
+    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
   }
 
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    if (image_slice_pitch)
+      *image_slice_pitch = image->host_slice_pitch;
+    *image_row_pitch = image->host_row_pitch;
+
+    offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
+  } else {
+    if (image_slice_pitch)
+      *image_slice_pitch = image->slice_pitch;
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      *image_row_pitch = image->slice_pitch;
+    else
+      *image_row_pitch = image->row_pitch;
+
+    offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  }
+  err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -2408,6 +2811,58 @@ error:
 }
 
 cl_int
+clEnqueueMigrateMemObjects(cl_command_queue        command_queue,
+                           cl_uint                 num_mem_objects,
+                           const cl_mem *          mem_objects,
+                           cl_mem_migration_flags  flags,
+                           cl_uint                 num_events_in_wait_list,
+                           const cl_event *        event_wait_list,
+                           cl_event *              event)
+{
+  /* So far, we only support one device and no sub-devices, so all command queues
+     belong to the same context. There is no need to migrate the mem objects for now. */
+  cl_int err = CL_SUCCESS;
+  cl_uint i = 0;
+  enqueue_data *data, defer_enqueue_data = { 0 };
+
+  if (!flags & CL_MIGRATE_MEM_OBJECT_HOST)
+    CHECK_QUEUE(command_queue);
+
+  if (num_mem_objects == 0 || mem_objects == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
+                         CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  for (i = 0; i < num_mem_objects; i++) {
+    CHECK_MEM(mem_objects[i]);
+    if (mem_objects[i]->ctx != command_queue->ctx) {
+      err = CL_INVALID_CONTEXT;
+      goto error;
+    }
+  }
+
+  /* really nothing to do, fill the event. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+  data = &defer_enqueue_data;
+  data->type = EnqueueMigrateMemObj;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
 clEnqueueNDRangeKernel(cl_command_queue  command_queue,
                        cl_kernel         kernel,
                        cl_uint           work_dim,
@@ -2527,6 +2982,13 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     err = cl_command_queue_flush(command_queue);
   }
 
+  if(b_output_kernel_perf)
+  {
+    if(kernel->program->build_opts != NULL)
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
+    else
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
+  }
 error:
   return err;
 }
@@ -2612,8 +3074,8 @@ error:
 }
 
 cl_int
-clEnqueueMarker(cl_command_queue     command_queue,
-                cl_event *           event)
+clEnqueueMarker(cl_command_queue command_queue,
+    cl_event *event)
 {
   cl_int err = CL_SUCCESS;
   CHECK_QUEUE(command_queue);
@@ -2622,7 +3084,23 @@ clEnqueueMarker(cl_command_queue     command_queue,
     goto error;
   }
 
-  cl_event_marker(command_queue, event);
+  cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
+error:
+  return err;
+}
+
+cl_int
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
 error:
   return err;
 }
@@ -2645,18 +3123,35 @@ clEnqueueBarrier(cl_command_queue  command_queue)
 {
   cl_int err = CL_SUCCESS;
   CHECK_QUEUE(command_queue);
-  cl_command_queue_set_barrier(command_queue);
+
+  cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
 
 error:
   return err;
 }
 
+cl_int
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
+error:
+  return err;
+}
+
 #define EXTFUNC(x)                      \
   if (strcmp(#x, func_name) == 0)       \
     return (void *)x;
 
-void*
-clGetExtensionFunctionAddress(const char *func_name)
+static void*
+internal_clGetExtensionFunctionAddress(const char *func_name)
 {
   if (func_name == NULL)
     return NULL;
@@ -2675,9 +3170,25 @@ clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clReportUnfreedIntel)
   EXTFUNC(clCreateBufferFromLibvaIntel)
   EXTFUNC(clCreateImageFromLibvaIntel)
+  EXTFUNC(clGetMemObjectFdIntel)
   return NULL;
 }
 
+void*
+clGetExtensionFunctionAddress(const char *func_name)
+{
+  return internal_clGetExtensionFunctionAddress(func_name);
+}
+
+void*
+clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
+                              const char *func_name)
+{
+  if (UNLIKELY(platform != NULL && platform != intel_platform))
+    return NULL;
+  return internal_clGetExtensionFunctionAddress(func_name);
+}
+
 #undef EXTFUNC
 
 cl_int
@@ -2814,3 +3325,17 @@ error:
   return mem;
 }
 
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context context,
+                      cl_mem memobj,
+                      int* fd)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_MEM (memobj);
+
+  err = cl_mem_get_fd(memobj, fd);
+
+error:
+  return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4ac2e11..0be37a7 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -17,6 +17,7 @@
  * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
+#include "program.h" // for BTI_MAX_IMAGE_NUM
 #include "cl_command_queue.h"
 #include "cl_context.h"
 #include "cl_program.h"
@@ -28,6 +29,8 @@
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "cl_event.h"
+#include "performance.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -73,6 +76,10 @@ cl_command_queue_delete(cl_command_queue queue)
   assert(queue);
   if (atomic_dec(&queue->ref_n) != 1) return;
 
+  // If there is a valid last event, we need to give it a chance to
+  // call the call-back function.
+  if (queue->last_event && queue->last_event->user_cb)
+    cl_event_update_status(queue->last_event, 1);
   /* Remove it from the list */
   assert(queue->ctx);
   pthread_mutex_lock(&queue->ctx->queue_lock);
@@ -88,7 +95,7 @@ cl_command_queue_delete(cl_command_queue queue)
     queue->fulsim_out = NULL;
   }
 
-  cl_thread_data_destroy(queue->thread_data);
+  cl_thread_data_destroy(queue);
   queue->thread_data = NULL;
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
@@ -129,13 +136,20 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
     struct _cl_mem_image *image;
-    assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
+    assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
     cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
                         image->intel_fmt, image->image_type,
                         image->w, image->h, image->depth,
-                        image->row_pitch, image->tiling);
+                        image->row_pitch, (cl_gpgpu_tiling)image->tiling);
+    // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
+    // on demand.
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->tiling);
   }
   return CL_SUCCESS;
 }
@@ -150,15 +164,15 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   enum gbe_arg_type arg_type; /* kind of argument */
   for (i = 0; i < k->arg_n; ++i) {
     uint32_t offset; // location of the address in the curbe
-    arg_type = gbe_kernel_get_arg_type(k->opaque, i);
+    arg_type = interp_kernel_get_arg_type(k->opaque, i);
     if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
       continue;
-    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
     if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
       struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
     } else {
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
     }
   }
 
@@ -207,7 +221,7 @@ cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
   /* Bind user defined surface */
   for (i = 0; i < k->arg_n; ++i) {
     size_t chunk_n, chunk_remainder;
-    if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
       continue;
     mem = (cl_mem) k->args[i].mem;
     CHECK_MEM(mem);
@@ -315,7 +329,7 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
   int i, curr = 0;
   /* Bind user defined surface */
   for (i = 0; i < k->arg_n; ++i) {
-    if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
       continue;
     mem = (cl_mem) k->args[i].mem;
     CHECK_MEM(mem);
@@ -376,6 +390,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
 {
+  if(b_output_kernel_perf)
+    time_start(queue->ctx, cl_kernel_get_name(k), queue);
   const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
   cl_int err = CL_SUCCESS;
 
@@ -413,13 +429,49 @@ error:
   return err;
 }
 
-LOCAL cl_int
-cl_command_queue_flush(cl_command_queue queue)
+LOCAL void
+cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
 {
-  GET_QUEUE_THREAD_GPGPU(queue);
+  size_t global_wk_sz[3];
+  void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
 
   cl_gpgpu_flush(gpgpu);
 
+  if (printf_info && interp_get_printf_num(printf_info)) {
+    void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
+    void *buf_addr = NULL;
+    if (interp_get_printf_sizeof_size(printf_info))
+      buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
+
+    interp_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
+                      global_wk_sz[1], global_wk_sz[2]);
+
+    cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
+    if (interp_get_printf_sizeof_size(printf_info))
+      cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
+  }
+
+  if (printf_info) {
+    interp_release_printf_info(printf_info);
+    global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
+    cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
+  }
+}
+
+LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+  GET_QUEUE_THREAD_GPGPU(queue);
+  cl_command_queue_flush_gpgpu(queue, gpgpu);
+  // As we don't have a dedicated timer thread to take care of a possible
+  // event which has a callback function registered and which will be
+  // released inside that callback function, no other function will access
+  // the event any more. If we don't do this here, we will leak that event
+  // and all the corresponding buffers, which is really bad.
+  if (queue->last_event && queue->last_event->user_cb)
+    cl_event_update_status(queue->last_event, 1);
+  if (queue->current_event)
+    cl_event_flush(queue->current_event);
   cl_invalid_thread_gpgpu(queue);
   return CL_SUCCESS;
 }
@@ -427,7 +479,7 @@ cl_command_queue_flush(cl_command_queue queue)
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  cl_gpgpu_sync(cl_get_thread_batch_buf());
+  cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
   return CL_SUCCESS;
 }
 
@@ -489,18 +541,82 @@ cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
   if(i == queue->wait_events_num)
     return;
 
-  if(queue->barrier_index >= i)
-    queue->barrier_index -= 1;
-
-  for(; i<queue->wait_events_num-1; i++) {
-    queue->wait_events[i] = queue->wait_events[i+1];
+  if(i == queue->wait_events_num - 1) {
+    queue->wait_events[i] = NULL;
+  } else {
+    for(; i<queue->wait_events_num-1; i++) {
+      queue->wait_events[i] = queue->wait_events[i+1];
+    }
   }
   queue->wait_events_num -= 1;
 }
 
+#define DEFAULT_WAIT_EVENTS_SIZE  16
 LOCAL void
-cl_command_queue_set_barrier(cl_command_queue queue)
+cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
 {
-    queue->barrier_index = queue->wait_events_num;
+  cl_int i=0;
+  cl_event *new_list;
+
+  assert(queue != NULL);
+  if(queue->barrier_events == NULL) {
+    queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+    TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+  }
+
+  for(i=0; i<queue->barrier_events_num; i++) {
+    if(queue->barrier_events[i] == event)
+      return;   //already in barrier_events, no need to insert
+  }
+
+  if(queue->barrier_events_num < queue->barrier_events_size) {
+    queue->barrier_events[queue->barrier_events_num++] = event;
+    return;
+  }
+
+  //barrier_events_num == barrier_events_size, array is full
+  queue->barrier_events_size *= 2;
+  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+  memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
+  cl_free(queue->barrier_events);
+  queue->barrier_events = new_list;
+  queue->barrier_events[queue->barrier_events_num++] = event;
+  return;
+
+exit:
+  return;
+error:
+  if(queue->barrier_events)
+    cl_free(queue->barrier_events);
+  queue->barrier_events = NULL;
+  queue->barrier_events_size = 0;
+  queue->barrier_events_num = 0;
+  goto exit;
+
 }
 
+LOCAL void
+cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+
+  if(queue->barrier_events_num == 0)
+    return;
+
+  for(i=0; i<queue->barrier_events_num; i++) {
+    if(queue->barrier_events[i] == event)
+      break;
+  }
+
+  if(i == queue->barrier_events_num)
+    return;
+
+  if(i == queue->barrier_events_num - 1) {
+    queue->barrier_events[i] = NULL;
+  } else {
+    for(; i<queue->barrier_events_num-1; i++) {
+      queue->barrier_events[i] = queue->barrier_events[i+1];
+    }
+  }
+  queue->barrier_events_num -= 1;
+}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 40c272c..bd70f25 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -34,11 +34,14 @@ struct _cl_command_queue {
   uint64_t magic;                      /* To identify it as a command queue */
   volatile int ref_n;                  /* We reference count this object */
   cl_context ctx;                      /* Its parent context */
+  cl_event* barrier_events;               /* Point to array of non-complete user events that block this command queue */
+  cl_int    barrier_events_num;           /* Number of Non-complete user events */
+  cl_int    barrier_events_size;          /* The size of array that wait_events point to */
   cl_event* wait_events;               /* Point to array of non-complete user events that block this command queue */
   cl_int    wait_events_num;           /* Number of Non-complete user events */
   cl_int    wait_events_size;          /* The size of array that wait_events point to */
-  cl_int    barrier_index;             /* Indicate event count in wait_events as barrier events */
   cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
+  cl_event  current_event;             /* Current event. */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
   void *thread_data;                   /* Used to store thread context data */
@@ -80,6 +83,9 @@ cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
 /* Flush for the command queue */
 extern cl_int cl_command_queue_flush(cl_command_queue);
 
+/* Flush for the specified gpgpu */
+extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
+
 /* Wait for the completion of the command queue */
 extern cl_int cl_command_queue_finish(cl_command_queue);
 
@@ -95,8 +101,9 @@ extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
 /* Remove a user event from command's wait_events */
 extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
 
-/* Set the barrier index */
-extern void cl_command_queue_set_barrier(cl_command_queue);
+extern void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event);
+
+extern void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event);
 
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index ba69589..330f0f9 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -30,7 +30,8 @@
 #include <stdio.h>
 #include <string.h>
 
-static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; }
+#define MAX_GROUP_SIZE_IN_HALFSLICE   512
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
 
 /* "Varing" payload is the part of the curbe that changes accross threads in the
  *  same work group. Right now, it consists in local IDs and block IPs
@@ -49,10 +50,10 @@ cl_set_varying_payload(const cl_kernel ker,
   int32_t id_offset[3], ip_offset;
   cl_int err = CL_SUCCESS;
 
-  id_offset[0] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
-  id_offset[1] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
-  id_offset[2] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
-  ip_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+  id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
+  id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
+  id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
+  ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
   assert(id_offset[0] >= 0 &&
          id_offset[1] >= 0 &&
          id_offset[2] >= 0 &&
@@ -95,7 +96,7 @@ error:
   return err;
 }
 
-static void
+static int
 cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
 {
   /* calculate constant buffer size
@@ -106,16 +107,16 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
   size_t offset = 0;
   uint32_t raw_size = 0, aligned_size =0;
   gbe_program prog = ker->program->opaque;
-  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
-  size_t global_const_size = gbe_program_get_global_constant_size(prog);
+  const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+  size_t global_const_size = interp_program_get_global_constant_size(prog);
   aligned_size = raw_size = global_const_size;
   /* Reserve 8 bytes to get rid of 0 address */
   if(global_const_size == 0) aligned_size = 8;
 
   for (arg = 0; arg < arg_n; ++arg) {
-    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
     if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
-      uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+      uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
       assert(alignment != 0);
       cl_mem mem = ker->args[arg].mem;
       raw_size += mem->size;
@@ -124,15 +125,19 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
     }
   }
   if(raw_size == 0)
-     return;
+     return 0;
 
-  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size);
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
+  if (bo == NULL)
+    return -1;
   cl_buffer_map(bo, 1);
   char * cst_addr = cl_buffer_get_virtual(bo);
+  if (cst_addr == NULL)
+    return -1;
 
   /* upload the global constant data */
   if (global_const_size > 0) {
-    gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+    interp_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
     offset += global_const_size;
   }
 
@@ -144,12 +149,12 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
   /* upload constant buffer argument */
   int32_t curbe_offset = 0;
   for (arg = 0; arg < arg_n; ++arg) {
-    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
     if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
       cl_mem mem = ker->args[arg].mem;
-      uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+      uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
       offset = ALIGN(offset, alignment);
-      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+      curbe_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
       assert(curbe_offset >= 0);
       *(uint32_t *) (ker->curbe + curbe_offset) = offset;
 
@@ -161,6 +166,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
     }
   }
   cl_buffer_unmap(bo);
+  return 0;
 }
 
 /* Will return the total amount of slm used */
@@ -174,7 +180,7 @@ cl_curbe_fill(cl_kernel ker,
 {
   int32_t offset;
 #define UPLOAD(ENUM, VALUE) \
-  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
     *((uint32_t *) (ker->curbe + offset)) = VALUE;
   UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
   UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
@@ -192,36 +198,27 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
-  /* Upload sampler information. */
-  offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
-  if (offset >= 0) {
-    uint32_t i;
-    for(i = 0; i < ker->sampler_sz; i++, offset += 2) {
-      *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
-    }
-  }
-
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
-  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
-    const uint32_t simd_sz = gbe_kernel_get_simd_width(ker->opaque);
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
     uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
     int32_t i;
     for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
   }
   /* Handle the various offsets to SLM */
-  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
-  int32_t arg, slm_offset = gbe_kernel_get_slm_size(ker->opaque);
+  const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+  int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
   ker->local_mem_sz = 0;
   for (arg = 0; arg < arg_n; ++arg) {
-    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
     if (type != GBE_ARG_LOCAL_PTR)
       continue;
-    uint32_t align = gbe_kernel_get_arg_align(ker->opaque, arg);
+    uint32_t align = interp_kernel_get_arg_align(ker->opaque, arg);
     assert(align != 0);
     slm_offset = ALIGN(slm_offset, align);
-    offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+    offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
     assert(offset >= 0);
     uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
     *slmptr = slm_offset;
@@ -239,7 +236,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   const int32_t per_lane_stack_sz = ker->stack_size;
   const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
   const int32_t sub_value = GBE_STACK_BUFFER;
-  const int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+  const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
   int32_t stack_sz = per_lane_stack_sz;
 
   /* No stack required for this kernel */
@@ -250,17 +247,35 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
    * the size we need for the complete machine
    */
   assert(offset >= 0);
-  stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
-  stack_sz *= device->max_compute_unit;
-  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
+  stack_sz *= interp_kernel_get_simd_width(ker->opaque);
+  stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
+  /* Because HSW calculates the per-thread stack offset relative to the half
+     slice, an unbalanced thread schedule across half slices could go out of
+     bounds. As the maximum number of half slices is 4 on GT4, multiply the
+     stack size by 4 to be safe. */
+  if(cl_driver_get_ver(ctx->drv) == 75)
+    stack_sz *= 4;
+  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
 }
 
-static void
-cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
-{
-  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
+static int
+cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
+  int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
+  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  size_t buf_size = global_sz * sizeof(int) * printf_num;
+  if (offset > 0) {
+    if (cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size, offset, interp_get_printf_indexbuf_bti(printf_info)) != 0)
+      return -1;
+  }
 
-  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
+  value = GBE_CURBE_PRINTF_BUF_POINTER;
+  offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
+  if (offset > 0) {
+    if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
+      return -1;
+  }
+  return 0;
 }
 
 LOCAL cl_int
@@ -277,9 +292,13 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
   size_t i, batch_sz = 0u, local_sz = 0u;
-  size_t cst_sz = ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
+  size_t cst_sz = ker->curbe_sz= interp_kernel_get_curbe_size(ker->opaque);
+  int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
   size_t thread_n = 0u;
+  int printf_num = 0;
   cl_int err = CL_SUCCESS;
+  size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
+  void* printf_info = NULL;
 
   /* Setup kernel */
   kernel.name = "KERNEL";
@@ -287,25 +306,41 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.bo = ker->bo;
   kernel.barrierID = 0;
   kernel.slm_sz = 0;
-  kernel.use_slm = gbe_kernel_use_slm(ker->opaque);
+  kernel.use_slm = interp_kernel_use_slm(ker->opaque);
 
   /* Compute the number of HW threads we need */
   TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
   kernel.curbe_sz = cst_sz;
 
+  if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
+    fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
+    return CL_OUT_OF_RESOURCES;
+  }
   /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
     kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
-    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
+    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
+      fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
       return CL_OUT_OF_RESOURCES;
+    }
   }
 
+  printf_info = interp_dup_printfset(ker->opaque);
+  cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
+
   /* Setup the kernel */
   if (queue->props & CL_QUEUE_PROFILING_ENABLE)
-    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
+    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
   else
-    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
+    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
+  if (err != 0)
+    goto error;
+  printf_num = interp_get_printf_num(printf_info);
+  if (printf_num) {
+    if (cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
+      goto error;
+  }
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
@@ -314,11 +349,14 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind all samplers */
   cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
-  cl_setup_scratch(gpgpu, ker);
+  if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
+    goto error;
+
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
 
-  cl_upload_constant_buffer(queue, ker);
+  if (cl_upload_constant_buffer(queue, ker) != 0)
+    goto error;
 
   cl_gpgpu_states_setup(gpgpu, &kernel);
 
@@ -330,13 +368,15 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
     }
     TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
-    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
+    if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
+      goto error;
   }
 
   /* Start a new batch buffer */
   batch_sz = cl_kernel_compute_batch_sz(ker);
-  cl_gpgpu_batch_reset(gpgpu, batch_sz);
-  cl_set_thread_batch_buf(cl_gpgpu_ref_batch_buf(gpgpu));
+  if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
+    goto error;
+  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
@@ -344,7 +384,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
+  return CL_SUCCESS;
+
 error:
-  return err;
+  fprintf(stderr, "error occured. \n");
+  exit(-1);
+  return CL_OUT_OF_RESOURCES;
 }
 
diff --git a/src/cl_context.c b/src/cl_context.c
index 8190e6a..152faf3 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -188,6 +188,7 @@ error:
 LOCAL void
 cl_context_delete(cl_context ctx)
 {
+  int i = 0;
   if (UNLIKELY(ctx == NULL))
     return;
 
@@ -195,6 +196,26 @@ cl_context_delete(cl_context ctx)
   if (atomic_dec(&ctx->ref_n) > 1)
     return;
 
+  /* delete the internal programs. */
+  for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+    if (ctx->internel_kernels[i]) {
+      cl_kernel_delete(ctx->internel_kernels[i]);
+      ctx->internel_kernels[i] = NULL;
+
+      assert(ctx->internal_prgs[i]);
+      cl_program_delete(ctx->internal_prgs[i]);
+      ctx->internal_prgs[i] = NULL;
+    }
+
+    if (ctx->internel_kernels[i]) {
+      cl_kernel_delete(ctx->built_in_kernels[i]);
+      ctx->built_in_kernels[i] = NULL;
+    }
+  }
+
+  cl_program_delete(ctx->built_in_prgs);
+  ctx->built_in_prgs = NULL;
+
   /* All object lists should have been freed. Otherwise, the reference counter
    * of the context cannot be 0
    */
@@ -203,7 +224,6 @@ cl_context_delete(cl_context ctx)
   assert(ctx->buffers == NULL);
   assert(ctx->drv);
   cl_free(ctx->prop_user);
-  cl_set_thread_batch_buf(NULL);
   cl_driver_delete(ctx->drv);
   ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(ctx);
@@ -251,8 +271,7 @@ cl_kernel
 cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
 {
   cl_int ret;
-  if (!ctx->internal_prgs[index])
-  {
+  if (!ctx->internal_prgs[index]) {
     size_t length = strlen(str_kernel) + 1;
     ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
 
@@ -265,20 +284,47 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
 
     ctx->internal_prgs[index]->is_built = 1;
 
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx use the same program, different kernel. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
   }
 
   return ctx->internel_kernels[index];
 }
 
 cl_kernel
-cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
                   const char * str_kernel, size_t size, const char * str_option)
 {
   cl_int ret;
   cl_int binary_status = CL_SUCCESS;
-  if (!ctx->internal_prgs[index])
-  {
+  if (!ctx->internal_prgs[index]) {
     ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
       &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
 
@@ -291,7 +337,35 @@ cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
 
     ctx->internal_prgs[index]->is_built = 1;
 
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx use the same program, different kernel. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
   }
 
   return ctx->internel_kernels[index];
diff --git a/src/cl_context.h b/src/cl_context.h
index 29bcb9f..75afbf6 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -40,18 +40,35 @@ enum _cl_gl_context_type {
 };
 
 enum _cl_internal_ker_type {
-  CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,
-  CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+  CL_INTERNAL_KERNEL_MIN = 0,
+  CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
   CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
   CL_ENQUEUE_COPY_BUFFER_RECT,
-  CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2,             //copy image 2d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3,             //copy image 3d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
+  CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
+  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1-byte aligned pattern, pattern size=1
+  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2-byte aligned pattern, pattern size=2
+  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4-byte aligned pattern, pattern size=4
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8-byte aligned pattern, pattern size=8
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 8-byte aligned pattern, pattern size=16
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 8-byte aligned pattern, pattern size=32
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 8-byte aligned pattern, pattern size=64
+  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128-byte aligned pattern, pattern size=128
+  CL_ENQUEUE_FILL_IMAGE_1D,             //fill image 1d
+  CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,       //fill image 1d array
+  CL_ENQUEUE_FILL_IMAGE_2D,             //fill image 2d
+  CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,       //fill image 2d array
+  CL_ENQUEUE_FILL_IMAGE_3D,             //fill image 3d
   CL_INTERNAL_KERNEL_MAX
 };
 
@@ -91,6 +108,8 @@ struct _cl_context {
                                     /* All programs internal used, for example clEnqueuexxx api use */
   cl_kernel  internel_kernels[CL_INTERNAL_KERNEL_MAX];
                                     /* All kernels  for clenqueuexxx api, for example clEnqueuexxx api use */
+  cl_program built_in_prgs;  /* all built-in kernels belong to this program only */
+  cl_kernel  built_in_kernels[CL_INTERNAL_KERNEL_MAX];
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
   cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
@@ -140,7 +159,7 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
 
 /* Get the internal used kernel from binary*/
-extern cl_kernel cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
                   const char * str_kernel, size_t size, const char * str_option);
 
 #endif /* __CL_CONTEXT_H__ */
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 9c18406..28bd5f0 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -20,6 +20,8 @@
 #ifndef __CL_DEVICE_DATA_H__
 #define __CL_DEVICE_DATA_H__
 
+#define INVALID_CHIP_ID -1 //returned by intel_get_device_id if no device found
+
 #define PCI_CHIP_GM45_GM                0x2A42
 #define PCI_CHIP_IGD_E_G                0x2E02
 #define PCI_CHIP_Q45_G                  0x2E12
@@ -67,6 +69,9 @@
 #define PCI_CHIP_IVYBRIDGE_M_GT1        0x0156  /* Mobile */
 #define PCI_CHIP_IVYBRIDGE_M_GT2        0x0166
 #define PCI_CHIP_IVYBRIDGE_S_GT1        0x015a  /* Server */
+#define PCI_CHIP_IVYBRIDGE_S_GT2        0x016a
+
+#define PCI_CHIP_BAYTRAIL_T 0x0F31
 
 #define IS_IVB_GT1(devid)               \
   (devid == PCI_CHIP_IVYBRIDGE_GT1 ||   \
@@ -75,9 +80,13 @@
 
 #define IS_IVB_GT2(devid)               \
   (devid == PCI_CHIP_IVYBRIDGE_GT2 ||   \
-   devid == PCI_CHIP_IVYBRIDGE_M_GT2)
+   devid == PCI_CHIP_IVYBRIDGE_M_GT2 || \
+   devid == PCI_CHIP_IVYBRIDGE_S_GT2)
+
+#define IS_BAYTRAIL_T(devid)              \
+  (devid == PCI_CHIP_BAYTRAIL_T)
 
-#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid))
+#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid) || IS_BAYTRAIL_T(devid))
 #define IS_GEN7(devid)      IS_IVYBRIDGE(devid)
 
 
@@ -90,6 +99,13 @@
 #define PCI_CHIP_HASWELL_M1          0x0406 /* GT1 mobile */
 #define PCI_CHIP_HASWELL_M2          0x0416 /* GT2 mobile */
 #define PCI_CHIP_HASWELL_M3          0x0426 /* GT3 mobile */
+#define PCI_CHIP_HASWELL_B1          0x040B /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_B2          0x041B /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_B3          0x042B /* Haswell GT3 */
+#define PCI_CHIP_HASWELL_E1          0x040E /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_E2          0x041E /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_E3          0x042E /* Haswell GT3 */
+
 /* Software Development Vehicle devices. */
 #define PCI_CHIP_HASWELL_SDV_D1      0x0C02 /* SDV GT1 desktop */
 #define PCI_CHIP_HASWELL_SDV_D2      0x0C12 /* SDV GT2 desktop */
@@ -100,6 +116,12 @@
 #define PCI_CHIP_HASWELL_SDV_M1      0x0C06 /* SDV GT1 mobile */
 #define PCI_CHIP_HASWELL_SDV_M2      0x0C16 /* SDV GT2 mobile */
 #define PCI_CHIP_HASWELL_SDV_M3      0x0C26 /* SDV GT3 mobile */
+#define PCI_CHIP_HASWELL_SDV_B1      0x0C0B /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_B2      0x0C1B /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_B3      0x0C2B /* SDV GT3 */
+#define PCI_CHIP_HASWELL_SDV_E1      0x0C0E /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_E2      0x0C1E /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_E3      0x0C2E /* SDV GT3 */
 /* Ultrabooks */
 #define PCI_CHIP_HASWELL_ULT_D1      0x0A02 /* ULT GT1 desktop */
 #define PCI_CHIP_HASWELL_ULT_D2      0x0A12 /* ULT GT2 desktop */
@@ -110,6 +132,12 @@
 #define PCI_CHIP_HASWELL_ULT_M1      0x0A06 /* ULT GT1 mobile */
 #define PCI_CHIP_HASWELL_ULT_M2      0x0A16 /* ULT GT2 mobile */
 #define PCI_CHIP_HASWELL_ULT_M3      0x0A26 /* ULT GT3 mobile */
+#define PCI_CHIP_HASWELL_ULT_B1      0x0A0B /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_B2      0x0A1B /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_B3      0x0A2B /* ULT GT3 */
+#define PCI_CHIP_HASWELL_ULT_E1      0x0A0E /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_E2      0x0A1E /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_E3      0x0A2E /* ULT GT3 */
 /* CRW */
 #define PCI_CHIP_HASWELL_CRW_D1      0x0D02 /* CRW GT1 desktop */
 #define PCI_CHIP_HASWELL_CRW_D2      0x0D12 /* CRW GT2 desktop */
@@ -120,26 +148,45 @@
 #define PCI_CHIP_HASWELL_CRW_M1      0x0D06 /* CRW GT1 mobile */
 #define PCI_CHIP_HASWELL_CRW_M2      0x0D16 /* CRW GT2 mobile */
 #define PCI_CHIP_HASWELL_CRW_M3      0x0D26 /* CRW GT3 mobile */
+#define PCI_CHIP_HASWELL_CRW_B1      0x0D0B /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_B2      0x0D1B /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_B3      0x0D2B /* CRW GT3 */
+#define PCI_CHIP_HASWELL_CRW_E1      0x0D0E /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_E2      0x0D1E /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_E3      0x0D2E /* CRW GT3 */
+
 
 #define IS_HASWELL(devid) (  \
 	(devid) == PCI_CHIP_HASWELL_D1 || (devid) == PCI_CHIP_HASWELL_D2 || \
 	(devid) == PCI_CHIP_HASWELL_D3 || (devid) == PCI_CHIP_HASWELL_S1 || \
 	(devid) == PCI_CHIP_HASWELL_S2 || (devid) == PCI_CHIP_HASWELL_S3 || \
 	(devid) == PCI_CHIP_HASWELL_M1 || (devid) == PCI_CHIP_HASWELL_M2 || \
-	(devid) == PCI_CHIP_HASWELL_M3 || (devid) == PCI_CHIP_HASWELL_SDV_D1 || \
+	(devid) == PCI_CHIP_HASWELL_M3 || (devid) == PCI_CHIP_HASWELL_B1 || \
+	(devid) == PCI_CHIP_HASWELL_B2 || (devid) == PCI_CHIP_HASWELL_B3 || \
+	(devid) == PCI_CHIP_HASWELL_E1 || (devid) == PCI_CHIP_HASWELL_E2 || \
+	(devid) == PCI_CHIP_HASWELL_E3 || (devid) == PCI_CHIP_HASWELL_SDV_D1 || \
 	(devid) == PCI_CHIP_HASWELL_SDV_D2 || (devid) == PCI_CHIP_HASWELL_SDV_D3 || \
 	(devid) == PCI_CHIP_HASWELL_SDV_S1 || (devid) == PCI_CHIP_HASWELL_SDV_S2 || \
 	(devid) == PCI_CHIP_HASWELL_SDV_S3 || (devid) == PCI_CHIP_HASWELL_SDV_M1 || \
 	(devid) == PCI_CHIP_HASWELL_SDV_M2 || (devid) == PCI_CHIP_HASWELL_SDV_M3 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_B1 || (devid) == PCI_CHIP_HASWELL_SDV_B2 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_B3 || (devid) == PCI_CHIP_HASWELL_SDV_E1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_E2 || (devid) == PCI_CHIP_HASWELL_SDV_E3 || \
 	(devid) == PCI_CHIP_HASWELL_ULT_D1 || (devid) == PCI_CHIP_HASWELL_ULT_D2 || \
 	(devid) == PCI_CHIP_HASWELL_ULT_D3 || (devid) == PCI_CHIP_HASWELL_ULT_S1 || \
 	(devid) == PCI_CHIP_HASWELL_ULT_S2 || (devid) == PCI_CHIP_HASWELL_ULT_S3 || \
 	(devid) == PCI_CHIP_HASWELL_ULT_M1 || (devid) == PCI_CHIP_HASWELL_ULT_M2 || \
-	(devid) == PCI_CHIP_HASWELL_ULT_M3 || (devid) == PCI_CHIP_HASWELL_CRW_D1 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_M3 || (devid) == PCI_CHIP_HASWELL_ULT_B1 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_B2 || (devid) == PCI_CHIP_HASWELL_ULT_B3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_E1 || (devid) == PCI_CHIP_HASWELL_ULT_E2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_E3 || (devid) == PCI_CHIP_HASWELL_CRW_D1 || \
 	(devid) == PCI_CHIP_HASWELL_CRW_D2 || (devid) == PCI_CHIP_HASWELL_CRW_D3 || \
 	(devid) == PCI_CHIP_HASWELL_CRW_S1 || (devid) == PCI_CHIP_HASWELL_CRW_S2 || \
 	(devid) == PCI_CHIP_HASWELL_CRW_S3 || (devid) == PCI_CHIP_HASWELL_CRW_M1 || \
-	(devid) == PCI_CHIP_HASWELL_CRW_M2 || (devid) == PCI_CHIP_HASWELL_CRW_M3)
+	(devid) == PCI_CHIP_HASWELL_CRW_M2 || (devid) == PCI_CHIP_HASWELL_CRW_M3 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_B1 || (devid) == PCI_CHIP_HASWELL_CRW_B2 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_B3 || (devid) == PCI_CHIP_HASWELL_CRW_E1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_E2 || (devid) == PCI_CHIP_HASWELL_CRW_E3)
 
 #define IS_GEN75(devid)  IS_HASWELL(devid)
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 0426738..ee3f2b7 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -26,6 +26,7 @@
 #include "cl_khr_icd.h"
 #include "cl_thread.h"
 #include "CL/cl.h"
+#include "cl_gbe_loader.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -37,35 +38,62 @@
 
 static struct _cl_device_id intel_ivb_gt2_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 128,
+  .max_compute_unit = 16,
   .max_thread_per_unit = 8,
-  .max_work_item_sizes = {512, 512, 512},
+  .max_work_item_sizes = {1024, 1024, 1024},
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
-  .wg_sz = 1024,
 #include "cl_gen7_device.h"
 };
 
 static struct _cl_device_id intel_ivb_gt1_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 64,
-  .max_thread_per_unit = 8,
+  .max_compute_unit = 6,
+  .max_thread_per_unit = 6,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
-  .wg_sz = 512,
 #include "cl_gen7_device.h"
 };
 
-/* XXX we clone IVB for HSW now */
-static struct _cl_device_id intel_hsw_device = {
+static struct _cl_device_id intel_baytrail_t_device = {
   INIT_ICD(dispatch)
-  .max_compute_unit = 64,
+  .max_compute_unit = 4,
   .max_thread_per_unit = 8,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
-  .wg_sz = 512,
+#include "cl_gen7_device.h"
+};
+
+/* XXX we clone IVB for HSW now */
+static struct _cl_device_id intel_hsw_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 10,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 20,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt3_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 40,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
 };
 
@@ -73,92 +101,157 @@ LOCAL cl_device_id
 cl_get_gt_device(void)
 {
   cl_device_id ret = NULL;
-  cl_set_thread_batch_buf(NULL);
   const int device_id = cl_driver_get_device_id();
+  cl_device_id device = NULL;
 
 #define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
     STRUCT.FIELD = STRING; \
     STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
+    device = &STRUCT; \
     goto BREAK;
 
   switch (device_id) {
     case PCI_CHIP_HASWELL_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Desktop");
     case PCI_CHIP_HASWELL_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Desktop");
     case PCI_CHIP_HASWELL_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Desktop");
     case PCI_CHIP_HASWELL_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Server");
     case PCI_CHIP_HASWELL_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Server");
     case PCI_CHIP_HASWELL_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Server");
     case PCI_CHIP_HASWELL_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Mobile");
     case PCI_CHIP_HASWELL_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Mobile");
     case PCI_CHIP_HASWELL_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Mobile");
+    case PCI_CHIP_HASWELL_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+    case PCI_CHIP_HASWELL_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+    case PCI_CHIP_HASWELL_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
+    case PCI_CHIP_HASWELL_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+    case PCI_CHIP_HASWELL_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+    case PCI_CHIP_HASWELL_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
     case PCI_CHIP_HASWELL_SDV_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Desktop");
     case PCI_CHIP_HASWELL_SDV_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Desktop");
     case PCI_CHIP_HASWELL_SDV_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Desktop");
     case PCI_CHIP_HASWELL_SDV_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Server");
     case PCI_CHIP_HASWELL_SDV_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Server");
     case PCI_CHIP_HASWELL_SDV_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Server");
     case PCI_CHIP_HASWELL_SDV_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Mobile");
     case PCI_CHIP_HASWELL_SDV_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Mobile");
     case PCI_CHIP_HASWELL_SDV_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Mobile");
+    case PCI_CHIP_HASWELL_SDV_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 reserved");
+    case PCI_CHIP_HASWELL_SDV_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 reserved");
+    case PCI_CHIP_HASWELL_SDV_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 reserved");
+    case PCI_CHIP_HASWELL_SDV_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 reserved");
+    case PCI_CHIP_HASWELL_SDV_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 reserved");
+    case PCI_CHIP_HASWELL_SDV_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 reserved");
     case PCI_CHIP_HASWELL_ULT_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Desktop");
     case PCI_CHIP_HASWELL_ULT_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Desktop");
     case PCI_CHIP_HASWELL_ULT_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Desktop");
     case PCI_CHIP_HASWELL_ULT_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Server");
     case PCI_CHIP_HASWELL_ULT_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Server");
     case PCI_CHIP_HASWELL_ULT_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Server");
     case PCI_CHIP_HASWELL_ULT_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Mobile");
     case PCI_CHIP_HASWELL_ULT_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile");
     case PCI_CHIP_HASWELL_ULT_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Mobile");
+    case PCI_CHIP_HASWELL_ULT_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+    case PCI_CHIP_HASWELL_ULT_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+    case PCI_CHIP_HASWELL_ULT_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+    case PCI_CHIP_HASWELL_ULT_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+    case PCI_CHIP_HASWELL_ULT_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+    case PCI_CHIP_HASWELL_ULT_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+
 	/* CRW */
     case PCI_CHIP_HASWELL_CRW_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Desktop");
     case PCI_CHIP_HASWELL_CRW_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Desktop");
     case PCI_CHIP_HASWELL_CRW_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Desktop");
     case PCI_CHIP_HASWELL_CRW_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Server");
     case PCI_CHIP_HASWELL_CRW_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Server");
     case PCI_CHIP_HASWELL_CRW_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Server");
     case PCI_CHIP_HASWELL_CRW_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Mobile");
     case PCI_CHIP_HASWELL_CRW_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Mobile");
     case PCI_CHIP_HASWELL_CRW_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Mobile");
+    case PCI_CHIP_HASWELL_CRW_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+    case PCI_CHIP_HASWELL_CRW_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+    case PCI_CHIP_HASWELL_CRW_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
+    case PCI_CHIP_HASWELL_CRW_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+    case PCI_CHIP_HASWELL_CRW_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+    case PCI_CHIP_HASWELL_CRW_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
 has_break:
-      intel_hsw_device.vendor_id = device_id;
-      intel_hsw_device.platform = intel_platform;
-      ret = &intel_hsw_device;
+      device->vendor_id = device_id;
+      device->platform = intel_platform;
+      ret = device;
       break;
 
     case PCI_CHIP_IVYBRIDGE_GT1:
@@ -177,14 +270,46 @@ ivb_gt1_break:
       DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge GT2");
     case PCI_CHIP_IVYBRIDGE_M_GT2:
       DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge M GT2");
+    case PCI_CHIP_IVYBRIDGE_S_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge S GT2");
 ivb_gt2_break:
       intel_ivb_gt2_device.vendor_id = device_id;
       intel_ivb_gt2_device.platform = intel_platform;
       ret = &intel_ivb_gt2_device;
       break;
+
+    case PCI_CHIP_BAYTRAIL_T:
+      DECL_INFO_STRING(baytrail_t_device_break, intel_baytrail_t_device, name, "Intel(R) HD Graphics Bay Trail-T");
+baytrail_t_device_break:
+      intel_baytrail_t_device.vendor_id = device_id;
+      intel_baytrail_t_device.platform = intel_platform;
+      ret = &intel_baytrail_t_device;
+      break;
+
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE:
+    case PCI_CHIP_SANDYBRIDGE_GT1:
+    case PCI_CHIP_SANDYBRIDGE_GT2:
+    case PCI_CHIP_SANDYBRIDGE_GT2_PLUS:
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE_M:
+    case PCI_CHIP_SANDYBRIDGE_M_GT1:
+    case PCI_CHIP_SANDYBRIDGE_M_GT2:
+    case PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS:
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE_S:
+    case PCI_CHIP_SANDYBRIDGE_S_GT:
+      // Intel(R) HD Graphics SandyBridge not supported yet
+      ret = NULL;
+      break;
     default:
-      printf("cl_get_gt_device(): error, unknown device\n");
-      exit(1);
+      printf("cl_get_gt_device(): error, unknown device: %x\n", device_id);
+  }
+
+  if (!CompilerSupported()) {
+    if (ret != NULL) {
+      ret->compiler_available = CL_FALSE;
+      //ret->linker_available = CL_FALSE;
+      ret->profile = "EMBEDDED_PROFILE";
+      ret->profile_sz = strlen(ret->profile) + 1;
+    }
   }
 
   return ret;
@@ -252,7 +377,11 @@ cl_get_device_info(cl_device_id     device,
 {
   if (UNLIKELY(device != &intel_ivb_gt1_device &&
                device != &intel_ivb_gt2_device &&
-               device != &intel_hsw_device))
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device
+               ))
     return CL_INVALID_DEVICE;
 
   /* Find the correct parameter */
@@ -283,6 +412,7 @@ cl_get_device_info(cl_device_id     device,
     DECL_FIELD(IMAGE_SUPPORT, image_support)
     DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
     DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
+    DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
     DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
     DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
     DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
@@ -293,11 +423,13 @@ cl_get_device_info(cl_device_id     device,
     DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
     DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
     DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
+    DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
     DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
     DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
     DECL_FIELD(GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size)
     DECL_FIELD(GLOBAL_MEM_SIZE, global_mem_size)
     DECL_FIELD(MAX_CONSTANT_BUFFER_SIZE, max_constant_buffer_size)
+    DECL_FIELD(IMAGE_MAX_BUFFER_SIZE, image_mem_size)
     DECL_FIELD(MAX_CONSTANT_ARGS, max_constant_args)
     DECL_FIELD(LOCAL_MEM_TYPE, local_mem_type)
     DECL_FIELD(LOCAL_MEM_SIZE, local_mem_size)
@@ -307,9 +439,12 @@ cl_get_device_info(cl_device_id     device,
     DECL_FIELD(ENDIAN_LITTLE, endian_little)
     DECL_FIELD(AVAILABLE, available)
     DECL_FIELD(COMPILER_AVAILABLE, compiler_available)
+    DECL_FIELD(LINKER_AVAILABLE, linker_available)
     DECL_FIELD(EXECUTION_CAPABILITIES, execution_capabilities)
     DECL_FIELD(QUEUE_PROPERTIES, queue_properties)
     DECL_FIELD(PLATFORM, platform)
+    DECL_FIELD(PRINTF_BUFFER_SIZE, printf_buffer_size)
+    DECL_FIELD(PREFERRED_INTEROP_USER_SYNC, interop_user_sync)
     DECL_STRING_FIELD(NAME, name)
     DECL_STRING_FIELD(VENDOR, vendor)
     DECL_STRING_FIELD(VERSION, version)
@@ -317,6 +452,12 @@ cl_get_device_info(cl_device_id     device,
     DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
     DECL_STRING_FIELD(EXTENSIONS, extensions);
     DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
+    DECL_FIELD(PARENT_DEVICE, parent_device)
+    DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
+    DECL_FIELD(PARTITION_PROPERTIES, partition_property)
+    DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
+    DECL_FIELD(PARTITION_TYPE, partition_type)
+    DECL_FIELD(REFERENCE_COUNT, device_reference_count)
 
     case CL_DRIVER_VERSION:
       if (param_value_size_ret) {
@@ -338,14 +479,23 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
 {
   if (UNLIKELY(device != &intel_ivb_gt1_device &&
                device != &intel_ivb_gt2_device &&
-               device != &intel_hsw_device))
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device))
     return CL_INVALID_DEVICE;
   if (ver == NULL)
     return CL_SUCCESS;
-  if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device)
+  if (device == &intel_ivb_gt1_device || 
+      device == &intel_ivb_gt2_device ||
+      device == &intel_baytrail_t_device) {
     *ver = 7;
-  else
+  } else if (device == &intel_hsw_gt1_device || device == &intel_hsw_gt2_device
+        || device == &intel_hsw_gt3_device) {
     *ver = 75;
+  } else
+    return CL_INVALID_VALUE;
+
   return CL_SUCCESS;
 }
 #undef DECL_FIELD
@@ -364,6 +514,42 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
   _DECL_FIELD(FIELD)
 
 #include "cl_kernel.h"
+#include "cl_program.h"
+static int
+cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
+{
+  const char * n = cl_kernel_get_name(kernel);
+  const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
+  const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
+    if (!strstr(device->built_in_kernels, n)){
+      return 0;
+    }else if(strstr(builtin_kernels_2d, n)){
+      return 2;
+    }else if(strstr(builtin_kernels_3d, n)){
+      return 3;
+    }else
+      return 1;
+
+}
+
+LOCAL size_t
+cl_get_kernel_max_wg_sz(cl_kernel kernel)
+{
+  size_t work_group_size;
+  int simd_width = interp_kernel_get_simd_width(kernel->opaque);
+  int vendor_id = kernel->program->ctx->device->vendor_id;
+  if (!interp_kernel_use_slm(kernel->opaque)) {
+    if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16)
+      work_group_size = simd_width * 64;
+    else
+      work_group_size = kernel->program->ctx->device->max_compute_unit *
+                        kernel->program->ctx->device->max_thread_per_unit * simd_width;
+  } else
+    work_group_size = kernel->program->ctx->device->max_work_group_size /
+                      (16 / simd_width);
+  return work_group_size;
+}
+
 LOCAL cl_int
 cl_get_kernel_workgroup_info(cl_kernel kernel,
                              cl_device_id device,
@@ -373,21 +559,54 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
                              size_t* param_value_size_ret)
 {
   int err = CL_SUCCESS;
+  int dimension = 0;
   if (UNLIKELY(device != &intel_ivb_gt1_device &&
-               device != &intel_ivb_gt2_device))
+               device != &intel_ivb_gt2_device &&
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device))
     return CL_INVALID_DEVICE;
 
   CHECK_KERNEL(kernel);
   switch (param_name) {
-    DECL_FIELD(WORK_GROUP_SIZE, device->wg_sz)
+    case CL_KERNEL_WORK_GROUP_SIZE:
+    {
+      if (param_value && param_value_size < sizeof(size_t))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(size_t);
+      if (param_value) {
+        size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
+        *(size_t*)param_value = work_group_size;
+        return CL_SUCCESS;
+      }
+    }
     DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
     case CL_KERNEL_LOCAL_MEM_SIZE:
-      {
-        size_t local_mem_sz =  gbe_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
-        _DECL_FIELD(local_mem_sz)
-      }
+    {
+      size_t local_mem_sz =  interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
+      _DECL_FIELD(local_mem_sz)
+    }
     DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
     DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
+    case CL_KERNEL_GLOBAL_WORK_SIZE:
+      dimension = cl_check_builtin_kernel_dimension(kernel, device);
+      if ( !dimension ) return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(device->max_1d_global_work_sizes);
+      if (param_value) {
+        if (dimension == 1) {
+          memcpy(param_value, device->max_1d_global_work_sizes, sizeof(device->max_1d_global_work_sizes));
+        }else if(dimension == 2){
+          memcpy(param_value, device->max_2d_global_work_sizes, sizeof(device->max_2d_global_work_sizes));
+        }else if(dimension == 3){
+          memcpy(param_value, device->max_3d_global_work_sizes, sizeof(device->max_3d_global_work_sizes));
+        }else
+          return CL_INVALID_VALUE;
+
+        return CL_SUCCESS;
+      }
     default:
       return CL_INVALID_VALUE;
   };
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 4ece26c..31bce47 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -25,11 +25,14 @@ struct _cl_device_id {
   DEFINE_ICD(dispatch)
   cl_device_type device_type;
   cl_uint  vendor_id;
-  cl_uint  max_compute_unit;
-  cl_uint  max_thread_per_unit;
-  cl_uint  max_work_item_dimensions;
-  size_t   max_work_item_sizes[3];
-  size_t   max_work_group_size;
+  cl_uint  max_compute_unit;               // maximum EU number
+  cl_uint  max_thread_per_unit;            // maximum EU threads per EU.
+  cl_uint  max_work_item_dimensions;       // should be 3.
+  size_t   max_work_item_sizes[3];         // equal to maximum work group size.
+  size_t   max_work_group_size;            // maximum work group size under simd16 mode.
+  size_t   max_1d_global_work_sizes[3];       // maximum 1d global work size for builtin kernels.
+  size_t   max_2d_global_work_sizes[3];       // maximum 2d global work size for builtin kernels.
+  size_t   max_3d_global_work_sizes[3];       // maximum 3d global work size for builtin kernels.
   cl_uint  preferred_vector_width_char;
   cl_uint  preferred_vector_width_short;
   cl_uint  preferred_vector_width_int;
@@ -51,15 +54,18 @@ struct _cl_device_id {
   cl_uint  max_read_image_args;
   cl_uint  max_write_image_args;
   size_t   image2d_max_width;
+  size_t   image_max_array_size;
   size_t   image2d_max_height;
   size_t   image3d_max_width;
   size_t   image3d_max_height;
   size_t   image3d_max_depth;
+  cl_ulong image_mem_size;
   cl_uint  max_samplers;
   size_t   max_parameter_size;
   cl_uint  mem_base_addr_align;
   cl_uint  min_data_type_align_size;
   cl_device_fp_config single_fp_config;
+  cl_device_fp_config double_fp_config;
   cl_device_mem_cache_type global_mem_cache_type;
   cl_uint  global_mem_cache_line_size;
   cl_ulong global_mem_cache_size;
@@ -68,15 +74,19 @@ struct _cl_device_id {
   cl_uint  max_constant_args;
   cl_device_local_mem_type local_mem_type;
   cl_ulong local_mem_size;
+  cl_ulong scratch_mem_size;
   cl_bool  error_correction_support;
   cl_bool  host_unified_memory;
   size_t   profiling_timer_resolution;
   cl_bool  endian_little;
   cl_bool  available;
   cl_bool  compiler_available;
+  cl_bool  linker_available;
   cl_device_exec_capabilities execution_capabilities;
   cl_command_queue_properties queue_properties;
   cl_platform_id platform;
+  size_t printf_buffer_size;
+  cl_bool interop_user_sync;
   const char *name;
   const char *vendor;
   const char *version;
@@ -94,8 +104,14 @@ struct _cl_device_id {
   size_t driver_version_sz;
   size_t built_in_kernels_sz;
   /* Kernel specific info that we're assigning statically */
-  size_t wg_sz;
   size_t preferred_wg_sz_mul;
+  /* SubDevice specific info */
+  cl_device_id parent_device;
+  cl_uint      partition_max_sub_device;
+  cl_device_partition_property partition_property[3];
+  cl_device_affinity_domain    affinity_domain;
+  cl_device_partition_property partition_type[3];
+  cl_uint      device_reference_count;
 };
 
 /* Get a device from the given platform */
@@ -123,6 +139,7 @@ extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
                                            size_t *         param_value_size_ret);
 /* Returns the Gen device ID */
 extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
+extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
 
 #endif /* __CL_DEVICE_ID_H__ */
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 96fc377..9cdba98 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -59,7 +59,7 @@ typedef enum cl_gpgpu_tiling {
   GPGPU_TILE_Y  = 2,
 } cl_gpgpu_tiling;
 
-/* Cache control options */
+/* Cache control options for gen7 */
 typedef enum cl_cache_control {
   cc_gtt      = 0x0,
   cc_l3       = 0x1,
@@ -67,6 +67,20 @@ typedef enum cl_cache_control {
   cc_llc_l3   = 0x3
 } cl_cache_control;
 
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+  l3cc_uc      = 0x0,
+  l3cc_ec       = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+  llccc_pte      = 0x0<<1,
+  llccc_uc       = 0x1<<1,
+  llccc_ec       = 0x2<<1,
+  llccc_ucllc    = 0x3<<1
+} cl_llccc_cache_control;
+
 typedef enum gpu_command_status {
   command_queued    = 3,
   command_submitted = 2,
@@ -99,13 +113,16 @@ typedef void (cl_gpgpu_sync_cb)(void*);
 extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 
 /* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t cchint);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
 /* bind samplers defined in both kernel and kernel args. */
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
+/* get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
 /* Set a 2d texture */
 typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
                                       uint32_t id,
@@ -126,11 +143,11 @@ typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, u
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
 
 /* Setup scratch */
-typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+typedef int (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
 extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
 
 /* Configure internal state */
-typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
+typedef int (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
 
 /* Set the buffer object where to report performance counters */
@@ -138,10 +155,10 @@ typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
 extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
 
 /* Fills current curbe buffer with data */
-typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+typedef int (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
 extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
 
-typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size, uint8_t bti);
 extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
 
 /* Setup all indirect states */
@@ -157,7 +174,7 @@ typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu, uint32_t index, uint32_t non_no
 extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler;
 
 /* Allocate the batch buffer and return the BO used for the batch buffer */
-typedef void (cl_gpgpu_batch_reset_cb)(cl_gpgpu, size_t sz);
+typedef int (cl_gpgpu_batch_reset_cb)(cl_gpgpu, size_t sz);
 extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset;
 
 /* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
@@ -176,24 +193,24 @@ extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
 typedef cl_gpgpu_event (cl_gpgpu_event_new_cb)(cl_gpgpu);
 extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
 
-/* new a event for a batch buffer */
+/* update the batch buffer of this event */
 typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
 extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
 
-/* new a event for a batch buffer */
-typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event);
-extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending;
+/* flush the batch buffer of this event */
+typedef void (cl_gpgpu_event_flush_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush;
 
-/* new a event for a batch buffer */
-typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event);
-extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
+/* cancel exec batch buffer of this event */
+typedef void (cl_gpgpu_event_cancel_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_cancel_cb *cl_gpgpu_event_cancel;
 
-/* new a event for a batch buffer */
+/* delete a gpgpu event */
 typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
 /* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
 extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
 
 /* Get current GPU time stamp */
@@ -208,6 +225,34 @@ extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
 typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
 extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
 
+/* Set the printf buffer */
+typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t, uint8_t);
+extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
+
+/* get the printf buffer offset in the aperture */
+typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t);
+extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
+
+/* map the printf buffer */
+typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
+
+/* unmap the printf buffer */
+typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
+
+/* release the printf buffer */
+typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
+
+/* Set the last printfset pointer */
+typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
+extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
+
+/* Get the last printfset pointer */
+typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
+extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
+
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
@@ -242,7 +287,7 @@ extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
 typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
 extern cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva;
 
-typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image);
+typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image, unsigned int offset);
 extern cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva;
 
 /* Unref a buffer and destroy it if no more ref */
@@ -297,6 +342,9 @@ extern cl_buffer_subdata_cb *cl_buffer_subdata;
 typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
 extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
 
+typedef int (cl_buffer_get_fd_cb)(cl_buffer, int *fd);
+extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
+
 /* Get the device id */
 typedef int (cl_driver_get_device_id_cb)(void);
 extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 0a9012c..72f25d9 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -47,6 +47,7 @@ LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
 LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
 LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
+LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
 
 /* cl_khr_gl_sharing */
 LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
@@ -63,6 +64,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
@@ -77,11 +79,17 @@ LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
 LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
 LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
-LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
-LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
+LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
 LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
 LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
 LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
 LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
+LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
+LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
+LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
+LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
+LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
+LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 330d230..af118ad 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -61,13 +61,18 @@ cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
   const size_t* host_origin = data->host_origin;
   const size_t* region = data->region;
 
-  if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if (!(src_ptr = cl_mem_map_auto(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
    size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
-   src_ptr = (char*)src_ptr + offset;
+   src_ptr = (char*)src_ptr + offset +  buffer->sub_offset;
 
    offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
    dst_ptr = (char *)data->ptr + offset;
@@ -92,7 +97,7 @@ cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
      }
    }
 
-  err = cl_mem_unmap_auto(data->mem_obj);
+  err = cl_mem_unmap_auto(mem);
 
 error:
   return err;
@@ -130,13 +135,18 @@ cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
   const size_t* host_origin = data->host_origin;
   const size_t* region = data->region;
 
-  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if (!(dst_ptr = cl_mem_map_auto(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
   size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
-  dst_ptr = (char *)dst_ptr + offset;
+  dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
 
   offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
   src_ptr = (char*)data->const_ptr + offset;
@@ -161,7 +171,7 @@ cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
     }
   }
 
-  err = cl_mem_unmap_auto(data->mem_obj);
+  err = cl_mem_unmap_auto(mem);
 
 error:
   return err;
@@ -225,11 +235,11 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
     err = CL_MAP_FAILURE;
     goto error;
   }
-
+  //dst need to add offset
   cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
                            image->row_pitch, image->slice_pitch,
                            data->const_ptr, data->row_pitch,
-                           data->slice_pitch, image);
+                           data->slice_pitch, image, CL_TRUE, CL_FALSE);
   err = cl_mem_unmap_auto(mem);
 
 error:
@@ -246,18 +256,22 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-  if (!(ptr = cl_mem_map_gtt(mem))) {
+  if(data->unsync_map == 1)
+    //because an unsync map was used in clEnqueueMapBuffer, force map_gtt here
+    ptr = cl_mem_map_gtt(mem);
+  else
+    ptr = cl_mem_map_auto(mem);
+
+  if (ptr == NULL) {
     err = CL_MAP_FAILURE;
     goto error;
   }
-
-  ptr = (char*)ptr + data->offset + buffer->sub_offset;
-  assert(data->ptr == ptr);
+  data->ptr = ptr;
 
   if(mem->flags & CL_MEM_USE_HOST_PTR) {
     assert(mem->host_ptr);
-    memcpy(mem->host_ptr + data->offset, ptr, data->size);
+    ptr = (char*)ptr + data->offset + buffer->sub_offset;
+    memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
   }
 
 error:
@@ -269,20 +283,31 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   void *ptr = NULL;
+  size_t row_pitch = 0;
   CHECK_IMAGE(mem, image);
 
-  if (!(ptr = cl_mem_map_gtt(mem))) {
+  if(data->unsync_map == 1)
+    //because an unsync map was used in clEnqueueMapImage, force map_gtt here
+    ptr = cl_mem_map_gtt(mem);
+  else
+    ptr = cl_mem_map_auto(mem);
+
+  if (ptr == NULL) {
     err = CL_MAP_FAILURE;
     goto error;
   }
-
-  assert(data->ptr == (char*)ptr + data->offset);
+  data->ptr = ptr;
+  if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    row_pitch = image->slice_pitch;
+  else
+    row_pitch = image->row_pitch;
 
   if(mem->flags & CL_MEM_USE_HOST_PTR) {
     assert(mem->host_ptr);
+    //src and dst need to add the offset inside cl_mem_copy_image_region
     cl_mem_copy_image_region(data->origin, data->region,
                              mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
-                             data->ptr, data->row_pitch, data->slice_pitch, image);
+                             data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
   }
 
 error:
@@ -292,11 +317,13 @@ error:
 cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
 {
   cl_int err = CL_SUCCESS;
-  int i;
+  int i, j;
   size_t mapped_size = 0;
+  size_t origin[3], region[3];
   void * v_ptr = NULL;
   void * mapped_ptr = data->ptr;
   cl_mem memobj = data->mem_obj;
+  size_t row_pitch = 0;
 
   assert(memobj->mapped_ptr_sz >= memobj->map_ref);
   INVALID_VALUE_IF(!mapped_ptr);
@@ -305,6 +332,12 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
       memobj->mapped_ptr[i].ptr = NULL;
       mapped_size = memobj->mapped_ptr[i].size;
       v_ptr = memobj->mapped_ptr[i].v_ptr;
+      for(j=0; j<3; j++) {
+        region[j] = memobj->mapped_ptr[i].region[j];
+        origin[j] = memobj->mapped_ptr[i].origin[j];
+        memobj->mapped_ptr[i].region[j] = 0;
+        memobj->mapped_ptr[i].origin[j] = 0;
+      }
       memobj->mapped_ptr[i].size = 0;
       memobj->mapped_ptr[i].v_ptr = NULL;
       memobj->map_ref--;
@@ -315,15 +348,29 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
   INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
 
   if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-    assert(mapped_ptr >= memobj->host_ptr &&
-      mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
-    /* Sync the data. */
-    memcpy(v_ptr, mapped_ptr, mapped_size);
+    if(memobj->type == CL_MEM_BUFFER_TYPE ||
+       memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+      assert(mapped_ptr >= memobj->host_ptr &&
+        mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+      /* Sync the data. */
+      memcpy(v_ptr, mapped_ptr, mapped_size);
+    } else {
+      CHECK_IMAGE(memobj, image);
+
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        row_pitch = image->slice_pitch;
+      else
+        row_pitch = image->row_pitch;
+      //v_ptr already includes the offset; host_ptr does not.
+      cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+                               memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                               image, CL_FALSE, CL_TRUE);
+    }
   } else {
     assert(v_ptr == mapped_ptr);
   }
 
-  cl_mem_unmap_gtt(memobj);
+  cl_mem_unmap_auto(memobj);
 
   /* shrink the mapped slot. */
   if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
@@ -412,10 +459,13 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
     case EnqueueCopyBufferToImage:
     case EnqueueCopyImageToBuffer:
     case EnqueueNDRangeKernel:
-      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
+    case EnqueueFillBuffer:
+    case EnqueueFillImage:
+      cl_event_flush(event);
       return CL_SUCCESS;
     case EnqueueNativeKernel:
       return cl_enqueue_native_kernel(data);
+    case EnqueueMigrateMemObj:
     default:
       return CL_SUCCESS;
   }
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 1d3ae5f..a9b3601 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -41,6 +41,10 @@ typedef enum {
   EnqueueNDRangeKernel,
   EnqueueNativeKernel,
   EnqueueMarker,
+  EnqueueBarrier,
+  EnqueueFillBuffer,
+  EnqueueFillImage,
+  EnqueueMigrateMemObj,
   EnqueueInvalid
 } enqueue_type;
 
@@ -60,6 +64,7 @@ typedef struct _enqueue_data {
   const void *      const_ptr;        /* Const ptr for memory read */
   void *            ptr;              /* Ptr for write and return value */
   const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
+  uint8_t           unsync_map;       /* Whether the clEnqueueMapBuffer/Image mapping is an unsynchronized map */
   void (*user_func)(void *);          /* pointer to a host-callable user function */
 } enqueue_data;
 
diff --git a/src/cl_event.c b/src/cl_event.c
index f838a3a..99e60eb 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -33,6 +33,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
 {
   switch(type) {
     case CL_COMMAND_COPY_BUFFER:
+    case CL_COMMAND_FILL_BUFFER:
     case CL_COMMAND_COPY_IMAGE:
     case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
     case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
@@ -45,6 +46,18 @@ cl_event_is_gpu_command_type(cl_command_type type)
   }
 }
 
+void cl_event_flush(cl_event event)
+{
+  assert(event->gpgpu_event != NULL);
+  if (event->gpgpu) {
+    cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
+    cl_gpgpu_delete(event->gpgpu);
+    event->gpgpu = NULL;
+  }
+  cl_gpgpu_event_flush(event->gpgpu_event);
+  event->queue->last_event = event;
+}
+
 cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
 {
   cl_event event = NULL;
@@ -83,8 +96,6 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
   event->enqueue_cb = NULL;
   event->waits_head = NULL;
   event->emplict = emplict;
-  if(queue && event->gpgpu_event)
-    queue->last_event = event;
 
 exit:
   return event;
@@ -99,7 +110,7 @@ void cl_event_delete(cl_event event)
   if (UNLIKELY(event == NULL))
     return;
 
-  cl_event_update_status(event);
+  cl_event_update_status(event, 0);
 
   if (atomic_dec(&event->ref_n) > 1)
     return;
@@ -112,6 +123,7 @@ void cl_event_delete(cl_event event)
   while(event->user_cb) {
     cb = event->user_cb;
     if(cb->executed == CL_FALSE) {
+      cb->executed = CL_TRUE;
       cb->pfn_notify(event, event->status, cb->user_data);
     }
     event->user_cb = cb->next;
@@ -137,6 +149,11 @@ void cl_event_delete(cl_event event)
   pthread_mutex_unlock(&event->ctx->event_lock);
   cl_context_delete(event->ctx);
 
+  if (event->gpgpu) {
+    fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
+    cl_gpgpu_delete(event->gpgpu);
+    event->gpgpu = NULL;
+  }
   cl_free(event);
 }
 
@@ -213,7 +230,7 @@ error:
 cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
                             cl_command_queue queue)
 {
-  cl_int i, j;
+  cl_int i;
 
   /* Check whether wait user events */
   for(i=0; i<num_events_in_wait_list; i++) {
@@ -224,15 +241,12 @@ cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *eve
     if((event_wait_list[i]->type == CL_COMMAND_USER) ||
        (event_wait_list[i]->enqueue_cb &&
        (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
-      for(j=0; j<num_events_in_wait_list; j++)
-        cl_event_add_ref(event_wait_list[j]);  //add defer enqueue's wait event reference
       return CL_ENQUEUE_EXECUTE_DEFER;
     }
   }
 
-  if(queue && queue->barrier_index > 0) {
-    return CL_ENQUEUE_EXECUTE_DEFER;
-  }
+  if(queue && queue->barrier_events_num )
+      return CL_ENQUEUE_EXECUTE_DEFER;
 
   /* Non user events or all user event finished, wait all enqueue events finish */
   for(i=0; i<num_events_in_wait_list; i++) {
@@ -258,40 +272,44 @@ void cl_event_new_enqueue_callback(cl_event event,
   user_event *user_events, *u_ev;
   cl_command_queue queue = event->queue;
   cl_int i;
-  GET_QUEUE_THREAD_GPGPU(data->queue);
+  cl_int err = CL_SUCCESS;
 
-  /* Allocate and inialize the structure itself */
+  /* Allocate and initialize the structure itself */
   TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
-  cb->num_events = num_events_in_wait_list;
+  cb->num_events = 0;
   TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
-  for(i=0; i<num_events_in_wait_list; i++)
-    cb->wait_list[i] = event_wait_list[i];
+  for(i=0; i<num_events_in_wait_list; i++) {
+    //user events are inserted into cb->wait_user_events, not the wait list, to avoid taking a reference twice
+    if(event_wait_list[i]->type != CL_COMMAND_USER) {
+      cb->wait_list[cb->num_events++] = event_wait_list[i];
+      cl_event_add_ref(event_wait_list[i]);  //add defer enqueue's wait event reference
+    }
+  }
   cb->event = event;
   cb->next = NULL;
   cb->wait_user_events = NULL;
 
-  if(queue && queue->barrier_index > 0) {
-    for(i=0; i<queue->barrier_index; i++) {
+  if(queue && queue->barrier_events_num > 0) {
+    for(i=0; i<queue->barrier_events_num; i++) {
       /* Insert the enqueue_callback to user event list */
       node = queue->wait_events[i]->waits_head;
       if(node == NULL)
         queue->wait_events[i]->waits_head = cb;
-      else
+      else{
         while((node != cb) && node->next)
           node = node->next;
         if(node == cb)   //wait on dup user event
           continue;
         node->next = cb;
+      }
 
       /* Insert the user event to enqueue_callback's wait_user_events */
-      TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
-      u_ev->event = queue->wait_events[i];
-      u_ev->next = cb->wait_user_events;
-      cb->wait_user_events = u_ev;
+      TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
+      cl_event_add_ref(queue->wait_events[i]);
     }
   }
 
-  /* Find out all user events that events in event_wait_list wait */
+  /* Find out all user events that the events in event_wait_list wait on */
   for(i=0; i<num_events_in_wait_list; i++) {
     if(event_wait_list[i]->status <= CL_COMPLETE)
       continue;
@@ -309,36 +327,42 @@ void cl_event_new_enqueue_callback(cl_event event,
         node->next = cb;
       }
       /* Insert the user event to enqueue_callback's wait_user_events */
-      TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
-      u_ev->event = event_wait_list[i];
-      u_ev->next = cb->wait_user_events;
-      cb->wait_user_events = u_ev;
+      TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
+      cl_event_add_ref(event_wait_list[i]);
       cl_command_queue_insert_event(event->queue, event_wait_list[i]);
+      if(data->type == EnqueueBarrier){
+        cl_command_queue_insert_barrier_event(event->queue, event_wait_list[i]);
+      }
     } else if(event_wait_list[i]->enqueue_cb != NULL) {
       user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
       while(user_events != NULL) {
         /* Insert the enqueue_callback to user event's  waits_tail */
         node = user_events->event->waits_head;
-        while((node != cb) && node->next)
-          node = node->next;
-        if(node == cb) {  //wait on dup user event
-          user_events = user_events->next;
-          continue;
+        if(node == NULL)
+          event_wait_list[i]->waits_head = cb;
+        else{
+          while((node != cb) && node->next)
+            node = node->next;
+          if(node == cb) {  //wait on dup user event
+            user_events = user_events->next;
+            continue;
+          }
+          node->next = cb;
         }
-        node->next = cb;
 
         /* Insert the user event to enqueue_callback's wait_user_events */
-        TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
-        u_ev->event = user_events->event;
-        u_ev->next = cb->wait_user_events;
-        cb->wait_user_events = u_ev;
+        TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
+        cl_event_add_ref(user_events->event);
+        cl_command_queue_insert_event(event->queue, user_events->event);
+        if(data->type == EnqueueBarrier){
+          cl_command_queue_insert_barrier_event(event->queue, user_events->event);
+        }
         user_events = user_events->next;
-        cl_command_queue_insert_event(event->queue, event_wait_list[i]);
       }
     }
   }
   if(data->queue != NULL && event->gpgpu_event != NULL) {
-    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
+    event->gpgpu = cl_thread_gpgpu_take(event->queue);
     data->ptr = (void *)event->gpgpu_event;
   }
   cb->data = *data;
@@ -351,10 +375,14 @@ error:
     while(cb->wait_user_events) {
       u_ev = cb->wait_user_events;
       cb->wait_user_events = cb->wait_user_events->next;
+      cl_event_delete(u_ev->event);
       cl_free(u_ev);
     }
-    if(cb->wait_list)
-      cl_free(cb->wait_list);
+    for(i=0; i<cb->num_events; i++) {
+      if(cb->wait_list[i]) {
+        cl_event_delete(cb->wait_list[i]);
+      }
+    }
     cl_free(cb);
   }
   goto exit;
@@ -363,7 +391,6 @@ error:
 void cl_event_set_status(cl_event event, cl_int status)
 {
   user_callback *user_cb;
-  user_event    *u_ev, *u_ev_next;
   cl_int ret, i;
   cl_event evt;
 
@@ -380,9 +407,18 @@ void cl_event_set_status(cl_event event, cl_int status)
 
   if(status <= CL_COMPLETE) {
     if(event->enqueue_cb) {
-      cl_enqueue_handle(event, &event->enqueue_cb->data);
-      if(event->gpgpu_event)
-        cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
+      if(status == CL_COMPLETE) {
+        cl_enqueue_handle(event, &event->enqueue_cb->data);
+        if(event->gpgpu_event)
+          cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complete; needs refinement
+      } else {
+        if(event->gpgpu_event) {
+          // Error then cancel the enqueued event.
+          cl_gpgpu_delete(event->gpgpu);
+          event->gpgpu = NULL;
+        }
+      }
+
+       event->status = status;  //Change the event status after enqueue and before unlock
 
       pthread_mutex_unlock(&event->ctx->event_lock);
@@ -407,8 +443,8 @@ void cl_event_set_status(cl_event event, cl_int status)
   user_cb = event->user_cb;
   while(user_cb) {
     if(user_cb->status >= status) {
-      user_cb->pfn_notify(event, event->status, user_cb->user_data);
       user_cb->executed = CL_TRUE;
+      user_cb->pfn_notify(event, event->status, user_cb->user_data);
     }
     user_cb = user_cb->next;
   }
@@ -419,23 +455,9 @@ void cl_event_set_status(cl_event event, cl_int status)
   /* Check all defer enqueue */
   enqueue_callback *cb, *enqueue_cb = event->waits_head;
   while(enqueue_cb) {
-    /* Remove this user event in enqueue_cb */
-    while(enqueue_cb->wait_user_events &&
-          enqueue_cb->wait_user_events->event == event) {
-      u_ev = enqueue_cb->wait_user_events;
-      enqueue_cb->wait_user_events = enqueue_cb->wait_user_events->next;
-      cl_free(u_ev);
-    }
-
-    u_ev = enqueue_cb->wait_user_events;
-    while(u_ev) {
-      u_ev_next = u_ev->next;
-      if(u_ev_next && u_ev_next->event == event) {
-        u_ev->next = u_ev_next->next;
-        cl_free(u_ev_next);
-      } else
-        u_ev->next = u_ev_next;
-    }
+    /* Remove this user event in enqueue_cb, update the header if needed. */
+    cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
+    cl_event_delete(event);
 
     /* Still wait on other user events */
     if(enqueue_cb->wait_user_events != NULL) {
@@ -445,10 +467,12 @@ void cl_event_set_status(cl_event event, cl_int status)
 
     //remove user event frome enqueue_cb's ctx
     cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+    cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
 
     /* All user events complete, now wait enqueue events */
     ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
-                               enqueue_cb->event->queue);
+        enqueue_cb->event->queue);
+    ret = ret;
     assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
 
     cb = enqueue_cb;
@@ -456,7 +480,11 @@ void cl_event_set_status(cl_event event, cl_int status)
 
     /* Call the pending operation */
     evt = cb->event;
-    cl_event_set_status(cb->event, CL_COMPLETE);
+    /* TODO: if this event waits on several events and one event's
+       status is an error while the others are complete, what is the
+       status of this event? The OpenCL spec does not say. Simply
+       update to the latest finished wait event's status. */
+    cl_event_set_status(cb->event, status);
     if(evt->emplict == CL_FALSE) {
       cl_event_delete(evt);
     }
@@ -464,25 +492,40 @@ void cl_event_set_status(cl_event event, cl_int status)
   event->waits_head = NULL;
 }
 
-void cl_event_update_status(cl_event event)
+void cl_event_update_status(cl_event event, int wait)
 {
   if(event->status <= CL_COMPLETE)
     return;
   if((event->gpgpu_event) &&
-     (cl_gpgpu_event_update_status(event->gpgpu_event, 0) == command_complete))
+     (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
     cl_event_set_status(event, CL_COMPLETE);
 }
 
-cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
+cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
+                cl_uint num_events_in_wait_list,
+                const cl_event *event_wait_list,
+                cl_event* event)
 {
-  enqueue_data data;
+  enqueue_data data = { 0 };
+  cl_event e;
 
-  *event = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
-  if(event == NULL)
+  e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
+  if(e == NULL)
     return CL_OUT_OF_HOST_MEMORY;
 
-  //if wait_events_num>0, the marker event need wait queue->wait_events
-  if(queue->wait_events_num > 0) {
+  if(event != NULL ){
+    *event = e;
+  }
+
+//enqueues a marker command which waits for either a list of events to complete, or if the list is
+//empty it waits for all commands previously enqueued in command_queue to complete before it completes.
+  if(num_events_in_wait_list > 0){
+    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+      data.type = EnqueueMarker;
+      cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
+      return CL_SUCCESS;
+    }
+  } else if(queue->wait_events_num > 0) {
     data.type = EnqueueMarker;
-    cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+    cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
     return CL_SUCCESS;
@@ -492,7 +535,44 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
     cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
   }
 
-  cl_event_set_status(*event, CL_COMPLETE);
+  cl_event_set_status(e, CL_COMPLETE);
+  return CL_SUCCESS;
+}
+
+cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
+                cl_uint num_events_in_wait_list,
+                const cl_event *event_wait_list,
+                cl_event* event)
+{
+  enqueue_data data = { 0 };
+  cl_event e;
+
+  e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
+  if(e == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  if(event != NULL ){
+    *event = e;
+  }
+//enqueues a barrier command which waits for either a list of events to complete, or if the list is
+//empty, waits for all commands previously enqueued in command_queue to complete before it completes.
+  if(num_events_in_wait_list > 0){
+    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+      data.type = EnqueueBarrier;
+      cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
+      return CL_SUCCESS;
+    }
+  } else if(queue->wait_events_num > 0) {
+    data.type = EnqueueBarrier;
+    cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
+    return CL_SUCCESS;
+  }
+
+  if(queue->last_event && queue->last_event->gpgpu_event) {
+    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+  }
+
+  cl_event_set_status(e, CL_COMPLETE);
   return CL_SUCCESS;
 }
 
@@ -513,14 +593,58 @@ cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
     event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else if(param_name == CL_PROFILING_COMMAND_START) {
-    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 0, &ret_val);
+    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
     event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else if (param_name == CL_PROFILING_COMMAND_END) {
-    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 1, &ret_val);
+    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
     event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
-  } else {
-    return CL_INVALID_VALUE;
   }
+  return CL_INVALID_VALUE;
+}
+
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
+{
+  user_event * u_iter = *p_u_ev;
+  user_event * u_ev;
+
+  while(u_iter)
+  {
+    if(u_iter->event == event)
+      return CL_SUCCESS;
+    u_iter = u_iter->next;
+  }
+
+  TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+  u_ev->event = event;
+  u_ev->next = *p_u_ev;
+  *p_u_ev = u_ev;
+
+
+  return CL_SUCCESS;
+error:
+  return CL_OUT_OF_HOST_MEMORY;
+}
+
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
+{
+  user_event * u_iter = *p_u_ev;
+  user_event * u_prev = *p_u_ev;
+
+  while(u_iter){
+    if(u_iter->event == event ){
+      if(u_iter == *p_u_ev){
+        *p_u_ev = u_iter->next;
+      }else{
+        u_prev->next = u_iter->next;
+      }
+      cl_free(u_iter);
+      break;
+    }
+    u_prev = u_iter;
+    u_iter = u_iter->next;
+  }
+
+  return CL_SUCCESS;
 }
diff --git a/src/cl_event.h b/src/cl_event.h
index 3c61110..cfe5ddd 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -63,6 +63,7 @@ struct _cl_event {
   cl_command_queue   queue;       /* The command queue associated with event */
   cl_command_type    type;        /* The command type associated with event */
   cl_int             status;      /* The execution status */
+  cl_gpgpu           gpgpu;       /* Current gpgpu, owned by this structure. */
   cl_gpgpu_event     gpgpu_event; /* The event object communicate with hardware */
   user_callback*     user_cb;     /* The event callback functions */
   enqueue_callback*  enqueue_cb;  /* This event's enqueue */
@@ -88,10 +89,18 @@ void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_e
 /* Set the event status and call all callbacks */
 void cl_event_set_status(cl_event, cl_int);
 /* Check and update event status */
-void cl_event_update_status(cl_event);
+void cl_event_update_status(cl_event, cl_int);
 /* Create the marker event */
-cl_int cl_event_marker(cl_command_queue, cl_event*);
+cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
+/* Create the barrier event */
+cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
 /* Do the event profiling */
 cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
+/* insert the user event */
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
+/* remove the user event */
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
+/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
+void cl_event_flush(cl_event event);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
new file mode 100644
index 0000000..7da0475
--- /dev/null
+++ b/src/cl_gbe_loader.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <iostream>
+#include <dlfcn.h>
+#include <string.h>
+#include <stdio.h>
+#include "cl_gbe_loader.h"
+#include "backend/src/GBEConfig.h"
+
+//function pointer from libgbe.so
+gbe_program_new_from_source_cb *compiler_program_new_from_source = NULL;
+gbe_program_compile_from_source_cb *compiler_program_compile_from_source = NULL;
+gbe_program_new_gen_program_cb *compiler_program_new_gen_program = NULL;
+gbe_program_link_program_cb *compiler_program_link_program = NULL;
+gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm = NULL;
+gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary = NULL;
+gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary = NULL;
+gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm = NULL;
+gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
+
+//function pointer from libgbeinterp.so
+gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
+gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
+gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_delete_cb *interp_program_delete = NULL;
+gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
+gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
+gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_kernel_get_name_cb *interp_kernel_get_name = NULL;
+gbe_kernel_get_attributes_cb *interp_kernel_get_attributes = NULL;
+gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
+gbe_kernel_get_code_size_cb *interp_kernel_get_code_size = NULL;
+gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num = NULL;
+gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size = NULL;
+gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti = NULL;
+gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type = NULL;
+gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align = NULL;
+gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width = NULL;
+gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset = NULL;
+gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size = NULL;
+gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size = NULL;
+gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size = NULL;
+gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size = NULL;
+gbe_kernel_use_slm_cb *interp_kernel_use_slm = NULL;
+gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size = NULL;
+gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size = NULL;
+gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
+gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
+gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
+gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_get_printf_num_cb* interp_get_printf_num = NULL;
+gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti = NULL;
+gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti = NULL;
+gbe_dup_printfset_cb* interp_dup_printfset = NULL;
+gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size = NULL;
+gbe_release_printf_info_cb* interp_release_printf_info = NULL;
+gbe_output_printf_cb* interp_output_printf = NULL;
+gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
+
+struct GbeLoaderInitializer
+{
+  GbeLoaderInitializer()
+  {
+    LoadCompiler();
+
+    const char* path;
+    if (!LoadInterp(path))
+      std::cerr << "unable to load " << path << " which is part of the driver, please check!" << std::endl;
+  }
+
+  bool LoadInterp(const char*& path)
+  {
+    const char* interpPath = getenv("OCL_INTERP_PATH");
+    if (interpPath == NULL)
+      interpPath = INTERP_OBJECT_DIR;
+
+    path = interpPath;
+
+    dlhInterp = dlopen(interpPath, RTLD_LAZY | RTLD_LOCAL);
+    if (dlhInterp == NULL) {
+      return false;
+    }
+
+    interp_program_new_from_binary = *(gbe_program_new_from_binary_cb**)dlsym(dlhInterp, "gbe_program_new_from_binary");
+    if (interp_program_new_from_binary == NULL)
+      return false;
+
+    interp_program_get_global_constant_size = *(gbe_program_get_global_constant_size_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_size");
+    if (interp_program_get_global_constant_size == NULL)
+      return false;
+
+    interp_program_get_global_constant_data = *(gbe_program_get_global_constant_data_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_data");
+    if (interp_program_get_global_constant_data == NULL)
+      return false;
+
+    interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
+    if (interp_program_delete == NULL)
+      return false;
+
+    interp_program_get_kernel_num = *(gbe_program_get_kernel_num_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_num");
+    if (interp_program_get_kernel_num == NULL)
+      return false;
+
+    interp_program_get_kernel_by_name = *(gbe_program_get_kernel_by_name_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_by_name");
+    if (interp_program_get_kernel_by_name == NULL)
+      return false;
+
+    interp_program_get_kernel = *(gbe_program_get_kernel_cb**)dlsym(dlhInterp, "gbe_program_get_kernel");
+    if (interp_program_get_kernel == NULL)
+      return false;
+
+    interp_kernel_get_name = *(gbe_kernel_get_name_cb**)dlsym(dlhInterp, "gbe_kernel_get_name");
+    if (interp_kernel_get_name == NULL)
+      return false;
+
+    interp_kernel_get_attributes = *(gbe_kernel_get_attributes_cb**)dlsym(dlhInterp, "gbe_kernel_get_attributes");
+    if (interp_kernel_get_attributes == NULL)
+      return false;
+
+    interp_kernel_get_code = *(gbe_kernel_get_code_cb**)dlsym(dlhInterp, "gbe_kernel_get_code");
+    if (interp_kernel_get_code == NULL)
+      return false;
+
+    interp_kernel_get_code_size = *(gbe_kernel_get_code_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_code_size");
+    if (interp_kernel_get_code_size == NULL)
+      return false;
+
+    interp_kernel_get_arg_num = *(gbe_kernel_get_arg_num_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_num");
+    if (interp_kernel_get_arg_num == NULL)
+      return false;
+
+    interp_kernel_get_arg_size = *(gbe_kernel_get_arg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_size");
+    if (interp_kernel_get_arg_size == NULL)
+      return false;
+
+    interp_kernel_get_arg_bti = *(gbe_kernel_get_arg_bti_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_bti");
+    if (interp_kernel_get_arg_bti == NULL)
+      return false;
+
+    interp_kernel_get_arg_type = *(gbe_kernel_get_arg_type_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_type");
+    if (interp_kernel_get_arg_type == NULL)
+      return false;
+
+    interp_kernel_get_arg_align = *(gbe_kernel_get_arg_align_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_align");
+    if (interp_kernel_get_arg_align == NULL)
+      return false;
+
+    interp_kernel_get_simd_width = *(gbe_kernel_get_simd_width_cb**)dlsym(dlhInterp, "gbe_kernel_get_simd_width");
+    if (interp_kernel_get_simd_width == NULL)
+      return false;
+
+    interp_kernel_get_curbe_offset = *(gbe_kernel_get_curbe_offset_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_offset");
+    if (interp_kernel_get_curbe_offset == NULL)
+      return false;
+
+    interp_kernel_get_curbe_size = *(gbe_kernel_get_curbe_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_size");
+    if (interp_kernel_get_curbe_size == NULL)
+      return false;
+
+    interp_kernel_get_stack_size = *(gbe_kernel_get_stack_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_stack_size");
+    if (interp_kernel_get_stack_size == NULL)
+      return false;
+
+    interp_kernel_get_scratch_size = *(gbe_kernel_get_scratch_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_scratch_size");
+    if (interp_kernel_get_scratch_size == NULL)
+      return false;
+
+    interp_kernel_get_required_work_group_size = *(gbe_kernel_get_required_work_group_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_required_work_group_size");
+    if (interp_kernel_get_required_work_group_size == NULL)
+      return false;
+
+    interp_kernel_use_slm = *(gbe_kernel_use_slm_cb**)dlsym(dlhInterp, "gbe_kernel_use_slm");
+    if (interp_kernel_use_slm == NULL)
+      return false;
+
+    interp_kernel_get_slm_size = *(gbe_kernel_get_slm_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_slm_size");
+    if (interp_kernel_get_slm_size == NULL)
+      return false;
+
+    interp_kernel_get_sampler_size = *(gbe_kernel_get_sampler_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_size");
+    if (interp_kernel_get_sampler_size == NULL)
+      return false;
+
+    interp_kernel_get_sampler_data = *(gbe_kernel_get_sampler_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_data");
+    if (interp_kernel_get_sampler_data == NULL)
+      return false;
+
+    interp_kernel_get_compile_wg_size = *(gbe_kernel_get_compile_wg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_compile_wg_size");
+    if (interp_kernel_get_compile_wg_size == NULL)
+      return false;
+
+    interp_kernel_get_image_size = *(gbe_kernel_get_image_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_size");
+    if (interp_kernel_get_image_size == NULL)
+      return false;
+
+    interp_kernel_get_image_data = *(gbe_kernel_get_image_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_data");
+    if (interp_kernel_get_image_data == NULL)
+      return false;
+
+    interp_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
+    if (interp_get_printf_num == NULL)
+      return false;
+
+    interp_get_printf_buf_bti = *(gbe_get_printf_buf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_buf_bti");
+    if (interp_get_printf_buf_bti == NULL)
+      return false;
+
+    interp_get_printf_indexbuf_bti = *(gbe_get_printf_indexbuf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_indexbuf_bti");
+    if (interp_get_printf_indexbuf_bti == NULL)
+      return false;
+
+    interp_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
+    if (interp_dup_printfset == NULL)
+      return false;
+
+    interp_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
+    if (interp_get_printf_sizeof_size == NULL)
+      return false;
+
+    interp_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
+    if (interp_release_printf_info == NULL)
+      return false;
+
+    interp_output_printf = *(gbe_output_printf_cb**)dlsym(dlhInterp, "gbe_output_printf");
+    if (interp_output_printf == NULL)
+      return false;
+
+    interp_kernel_get_arg_info = *(gbe_kernel_get_arg_info_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_info");
+    if (interp_kernel_get_arg_info == NULL)
+      return false;
+
+    return true;
+  }
+
+  void LoadCompiler()
+  {
+    compilerLoaded = false;
+
+    const char* nonCompiler = getenv("OCL_NON_COMPILER");
+    if (nonCompiler != NULL) {
+      if (strcmp(nonCompiler, "1") == 0)
+        return;
+    }
+
+    const char* gbePath = getenv("OCL_GBE_PATH");
+    if (gbePath == NULL)
+      gbePath = GBE_OBJECT_DIR;
+
+    dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
+    if (dlhCompiler != NULL) {
+      compiler_program_new_from_source = *(gbe_program_new_from_source_cb **)dlsym(dlhCompiler, "gbe_program_new_from_source");
+      if (compiler_program_new_from_source == NULL)
+        return;
+
+      compiler_program_compile_from_source = *(gbe_program_compile_from_source_cb **)dlsym(dlhCompiler, "gbe_program_compile_from_source");
+      if (compiler_program_compile_from_source == NULL)
+        return;
+
+      compiler_program_new_gen_program = *(gbe_program_new_gen_program_cb **)dlsym(dlhCompiler, "gbe_program_new_gen_program");
+      if (compiler_program_new_gen_program == NULL)
+        return;
+
+      compiler_program_link_program = *(gbe_program_link_program_cb **)dlsym(dlhCompiler, "gbe_program_link_program");
+      if (compiler_program_link_program == NULL)
+        return;
+
+      compiler_program_build_from_llvm = *(gbe_program_build_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_build_from_llvm");
+      if (compiler_program_build_from_llvm == NULL)
+        return;
+
+      compiler_program_new_from_llvm_binary = *(gbe_program_new_from_llvm_binary_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm_binary");
+      if (compiler_program_new_from_llvm_binary == NULL)
+        return;
+
+      compiler_program_serialize_to_binary = *(gbe_program_serialize_to_binary_cb **)dlsym(dlhCompiler, "gbe_program_serialize_to_binary");
+      if (compiler_program_serialize_to_binary == NULL)
+        return;
+
+      compiler_program_new_from_llvm = *(gbe_program_new_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm");
+      if (compiler_program_new_from_llvm == NULL)
+        return;
+
+      compiler_program_clean_llvm_resource = *(gbe_program_clean_llvm_resource_cb **)dlsym(dlhCompiler, "gbe_program_clean_llvm_resource");
+      if (compiler_program_clean_llvm_resource == NULL)
+        return;
+
+      compilerLoaded = true;
+    }
+  }
+
+  ~GbeLoaderInitializer()
+  {
+    if (dlhCompiler != NULL)
+      dlclose(dlhCompiler);
+
+    if (dlhInterp != NULL)
+      dlclose(dlhInterp);
+  }
+
+  bool compilerLoaded;
+  void *dlhCompiler;
+  void *dlhInterp;
+};
+
+static struct GbeLoaderInitializer gbeLoader;
+
+int CompilerSupported()
+{
+  if (gbeLoader.compilerLoaded)
+    return 1;
+  else
+    return 0;
+}
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
new file mode 100644
index 0000000..da9d034
--- /dev/null
+++ b/src/cl_gbe_loader.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_GBE_LOADER_H__
+#define __CL_GBE_LOADER_H__
+
+#include "program.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern gbe_program_new_from_source_cb *compiler_program_new_from_source;
+extern gbe_program_compile_from_source_cb *compiler_program_compile_from_source;
+extern gbe_program_new_gen_program_cb *compiler_program_new_gen_program;
+extern gbe_program_link_program_cb *compiler_program_link_program;
+extern gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm;
+extern gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary;
+extern gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary;
+extern gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm;
+extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
+
+extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
+extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
+extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_delete_cb *interp_program_delete;
+extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
+extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
+extern gbe_program_get_kernel_cb *interp_program_get_kernel;
+extern gbe_kernel_get_name_cb *interp_kernel_get_name;
+extern gbe_kernel_get_attributes_cb *interp_kernel_get_attributes;
+extern gbe_kernel_get_code_cb *interp_kernel_get_code;
+extern gbe_kernel_get_code_size_cb *interp_kernel_get_code_size;
+extern gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num;
+extern gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size;
+extern gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti;
+extern gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type;
+extern gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align;
+extern gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width;
+extern gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset;
+extern gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size;
+extern gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size;
+extern gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size;
+extern gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size;
+extern gbe_kernel_use_slm_cb *interp_kernel_use_slm;
+extern gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size;
+extern gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size;
+extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
+extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
+extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
+extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_get_printf_num_cb* interp_get_printf_num;
+extern gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti;
+extern gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti;
+extern gbe_dup_printfset_cb* interp_dup_printfset;
+extern gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size;
+extern gbe_release_printf_info_cb* interp_release_printf_info;
+extern gbe_output_printf_cb* interp_output_printf;
+extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
+
+int CompilerSupported();
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_GBE_LOADER_H__ */
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
index 7bf662e..682ee06 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen75_device.h
@@ -24,6 +24,7 @@
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
 .local_mem_size = 64 << 10,
+.scratch_mem_size = 2 << 20,
 
 #include "cl_gt_device.h"
 
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index e198d6f..69cc0b9 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -23,6 +23,7 @@
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
 .local_mem_size = 64 << 10,
+.scratch_mem_size = 12 << 10,
 
 #include "cl_gt_device.h"
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 110988a..e2fcee3 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -21,37 +21,43 @@
 .device_type = CL_DEVICE_TYPE_GPU,
 .vendor_id = 0, /* == device_id (set when requested) */
 .max_work_item_dimensions = 3,
-.preferred_vector_width_char = 16,
-.preferred_vector_width_short = 16,
-.preferred_vector_width_int = 16,
-.preferred_vector_width_long = 16,
-.preferred_vector_width_float = 16,
+.max_1d_global_work_sizes = {1024 * 1024 * 256, 1, 1},
+.max_2d_global_work_sizes = {8192, 8192, 1},
+.max_3d_global_work_sizes = {8192, 8192, 2048},
+.preferred_vector_width_char = 8,
+.preferred_vector_width_short = 8,
+.preferred_vector_width_int = 4,
+.preferred_vector_width_long = 2,
+.preferred_vector_width_float = 4,
 .preferred_vector_width_double = 0,
 .preferred_vector_width_half = 0,
-.native_vector_width_char = 16,
-.native_vector_width_short = 16,
-.native_vector_width_int = 16,
-.native_vector_width_long = 16,
-.native_vector_width_float = 16,
-.native_vector_width_double = 16,
-.native_vector_width_half = 16,
+.native_vector_width_char = 8,
+.native_vector_width_short = 8,
+.native_vector_width_int = 4,
+.native_vector_width_long = 2,
+.native_vector_width_float = 4,
+.native_vector_width_double = 2,
+.native_vector_width_half = 8,
 .preferred_wg_sz_mul = 16,
 .address_bits = 32,
-.max_mem_alloc_size = 128 * 1024 * 1024,
+.max_mem_alloc_size = 256 * 1024 * 1024,
 .image_support = CL_TRUE,
 .max_read_image_args = 128,
 .max_write_image_args = 8,
+.image_max_array_size = 2048,
 .image2d_max_width = 8192,
 .image2d_max_height = 8192,
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
 .image3d_max_depth = 2048,
+.image_mem_size = 8192,
 .max_samplers = 16,
 .mem_base_addr_align = sizeof(cl_long) * 16 * 8,
 .min_data_type_align_size = sizeof(cl_long) * 16,
 .single_fp_config = 0, /* XXX */
+.double_fp_config = 0,
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
-.global_mem_size = 128 * 1024 * 1024,
+.global_mem_size = 1024 * 1024 * 1024,
 .max_constant_buffer_size = 512 << 10,
 .max_constant_args = 8,
 .error_correction_support = CL_FALSE,
@@ -60,11 +66,14 @@
 .endian_little = CL_TRUE,
 .available = CL_TRUE,
 .compiler_available = CL_TRUE,
+.linker_available = CL_TRUE,
 .execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
 .queue_properties = CL_QUEUE_PROFILING_ENABLE,
 .platform = NULL, /* == intel_platform (set when requested) */
 /* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
 .single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
+.printf_buffer_size = 1 * 1024 * 1024,
+.interop_user_sync = CL_TRUE,
 
 #define DECL_INFO_STRING(FIELD, STRING) \
     .FIELD = STRING,                    \
@@ -75,8 +84,41 @@ DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
 DECL_INFO_STRING(profile, "FULL_PROFILE")
 DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)
 DECL_INFO_STRING(extensions, "")
-DECL_INFO_STRING(built_in_kernels, "")
+DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
+                                   "__cl_copy_region_align16;"
+                                   "__cl_cpy_region_unalign_same_offset;"
+                                   "__cl_copy_region_unalign_dst_offset;"
+                                   "__cl_copy_region_unalign_src_offset;"
+                                   "__cl_copy_buffer_rect;"
+                                   "__cl_copy_image_1d_to_1d;"
+                                   "__cl_copy_image_2d_to_2d;"
+                                   "__cl_copy_image_3d_to_2d;"
+                                   "__cl_copy_image_2d_to_3d;"
+                                   "__cl_copy_image_3d_to_3d;"
+                                   "__cl_copy_image_2d_to_buffer;"
+                                   "__cl_copy_image_3d_to_buffer;"
+                                   "__cl_copy_buffer_to_image_2d;"
+                                   "__cl_copy_buffer_to_image_3d;"
+                                   "__cl_fill_region_unalign;"
+                                   "__cl_fill_region_align2;"
+                                   "__cl_fill_region_align4;"
+                                   "__cl_fill_region_align8_2;"
+                                   "__cl_fill_region_align8_4;"
+                                   "__cl_fill_region_align8_8;"
+                                   "__cl_fill_region_align8_16;"
+                                   "__cl_fill_region_align128;"
+                                   "__cl_fill_image_1d;"
+                                   "__cl_fill_image_1d_array;"
+                                   "__cl_fill_image_2d;"
+                                   "__cl_fill_image_2d_array;"
+                                   "__cl_fill_image_3d;")
+
 DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
 #undef DECL_INFO_STRING
-
+.parent_device = NULL,
+.partition_max_sub_device = 1,
+.partition_property = {0},
+.affinity_domain = 0,
+.partition_type = {0},
+.device_reference_count = 1,
 
diff --git a/src/cl_image.c b/src/cl_image.c
index f89bcae..ced9789 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -28,6 +28,9 @@ cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
 {
   assert(bpp);
 
+  if(fmt == NULL)
+    return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+
   const uint32_t type = fmt->image_channel_data_type;
   const uint32_t order = fmt->image_channel_order;
   switch (type) {
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 6a0c8e6..55b707a 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -83,7 +83,14 @@ LOCAL const char*
 cl_kernel_get_name(cl_kernel k)
 {
   if (UNLIKELY(k == NULL)) return NULL;
-  return gbe_kernel_get_name(k->opaque);
+  return interp_kernel_get_name(k->opaque);
+}
+
+LOCAL const char*
+cl_kernel_get_attributes(cl_kernel k)
+{
+  if (UNLIKELY(k == NULL)) return NULL;
+  return interp_kernel_get_attributes(k->opaque);
 }
 
 LOCAL void
@@ -98,12 +105,13 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   uint32_t offset;            /* where to patch */
   enum gbe_arg_type arg_type; /* kind of argument */
   size_t arg_sz;              /* size of the argument */
-  cl_mem mem;                 /* for __global, __constant and image arguments */
+  cl_mem mem = NULL;          /* for __global, __constant and image arguments */
+  cl_context ctx = k->program->ctx;
 
   if (UNLIKELY(index >= k->arg_n))
     return CL_INVALID_ARG_INDEX;
-  arg_type = gbe_kernel_get_arg_type(k->opaque, index);
-  arg_sz = gbe_kernel_get_arg_size(k->opaque, index);
+  arg_type = interp_kernel_get_arg_type(k->opaque, index);
+  arg_sz = interp_kernel_get_arg_size(k->opaque, index);
 
   if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
     if (arg_sz == 2 && arg_type == GBE_ARG_VALUE && sz == sizeof(cl_sampler)) {
@@ -133,9 +141,10 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     // should be image, GLOBAL_PTR, CONSTANT_PTR
     if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
       return CL_INVALID_ARG_VALUE;
-    if(value != NULL) {
+    if(value != NULL)
       mem = *(cl_mem*)value;
-      if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+    if(value != NULL && mem) {
+      if( CL_SUCCESS != is_valid_mem(mem, ctx->buffers))
         return CL_INVALID_MEM_OBJECT;
 
       if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
@@ -146,7 +155,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 
   /* Copy the structure or the value directly into the curbe */
   if (arg_type == GBE_ARG_VALUE) {
-    offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     assert(offset + sz <= k->curbe_sz);
     memcpy(k->curbe + offset, value, sz);
     k->args[index].local_sz = 0;
@@ -172,12 +181,18 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     k->args[index].mem = NULL;
     k->args[index].sampler = sampler;
     cl_set_sampler_arg_slot(k, index, sampler);
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    assert(offset + 2 <= k->curbe_sz);
+    memcpy(k->curbe + offset, &sampler->clkSamplerValue, 2);
     return CL_SUCCESS;
   }
 
-  if(value == NULL) {
+  if(value != NULL)
+    mem = *(cl_mem*) value;
+
+  if(value == NULL || mem == NULL) {
     /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
-    int32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    int32_t offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     *((uint32_t *)(k->curbe + offset)) = 0;
     assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
 
@@ -197,6 +212,89 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   k->args[index].mem = mem;
   k->args[index].is_set = 1;
   k->args[index].local_sz = 0;
+  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+  return CL_SUCCESS;
+}
+
+LOCAL int
+cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
+                       size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  assert(k != NULL);
+  void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
+                           param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
+  int str_len = 0;
+  cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
+
+  switch (param_name) {
+  case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_address_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if ((cl_ulong)ret_info == 0) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    } else if ((cl_ulong)ret_info == 1 || (cl_ulong)ret_info == 4) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+    } else if ((cl_ulong)ret_info == 2) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+    } else if ((cl_ulong)ret_info == 3) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
+    } else {
+      /* If no address qualifier is specified, the default address qualifier
+         which is CL_KERNEL_ARG_ADDRESS_PRIVATE is returned. */
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
+    }
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_access_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_access_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if (!strcmp((char*)ret_info, "write_only")) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+    } else if (!strcmp((char*)ret_info, "read_only")) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+    } else if (!strcmp((char*)ret_info, "read_write")) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+    } else {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_NONE;
+    }
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_TYPE_NAME:
+  case CL_KERNEL_ARG_NAME:
+    str_len = strlen(ret_info);
+    if (param_value_size < str_len + 1)
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = str_len + 1;
+    if (!param_value) return CL_SUCCESS;
+    memcpy(param_value, ret_info, str_len);
+    ((char *)param_value)[str_len] = 0;
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_TYPE_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_type_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if (strstr((char*)ret_info, "const"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
+    if (strstr((char*)ret_info, "volatile"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
+    if (strstr((char*)ret_info, "restrict"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
+    *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
+    return CL_SUCCESS;
+
+  default:
+    assert(0);
+  }
 
   return CL_SUCCESS;
 }
@@ -205,7 +303,7 @@ LOCAL uint32_t
 cl_kernel_get_simd_width(cl_kernel k)
 {
   assert(k != NULL);
-  return gbe_kernel_get_simd_width(k->opaque);
+  return interp_kernel_get_simd_width(k->opaque);
 }
 
 LOCAL void
@@ -218,31 +316,31 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
     cl_buffer_unreference(k->bo);
 
   /* Allocate the gen code here */
-  const uint32_t code_sz = gbe_kernel_get_code_size(opaque);
-  const char *code = gbe_kernel_get_code(opaque);
+  const uint32_t code_sz = interp_kernel_get_code_size(opaque);
+  const char *code = interp_kernel_get_code(opaque);
   k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
-  k->arg_n = gbe_kernel_get_arg_num(opaque);
+  k->arg_n = interp_kernel_get_arg_num(opaque);
 
   /* Upload the code */
   cl_buffer_subdata(k->bo, 0, code_sz, code);
   k->opaque = opaque;
 
   /* Create the curbe */
-  k->curbe_sz = gbe_kernel_get_curbe_size(k->opaque);
+  k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
 
   /* Get sampler data & size */
-  k->sampler_sz = gbe_kernel_get_sampler_size(k->opaque);
+  k->sampler_sz = interp_kernel_get_sampler_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
   if (k->sampler_sz > 0)
-    gbe_kernel_get_sampler_data(k->opaque, k->samplers);
-  gbe_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
-  k->stack_size = gbe_kernel_get_stack_size(k->opaque);
+    interp_kernel_get_sampler_data(k->opaque, k->samplers);
+  interp_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
+  k->stack_size = interp_kernel_get_stack_size(k->opaque);
   /* Get image data & size */
-  k->image_sz = gbe_kernel_get_image_size(k->opaque);
+  k->image_sz = interp_kernel_get_image_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SURFACES);
   if (k->image_sz > 0) {
     TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
-    gbe_kernel_get_image_data(k->opaque, k->images);
+    interp_kernel_get_image_data(k->opaque, k->images);
   } else
     k->images = NULL;
   return;
@@ -310,7 +408,7 @@ cl_kernel_work_group_sz(cl_kernel ker,
   cl_uint i;
 
   for (i = 0; i < wk_dim; ++i) {
-    const uint32_t required_sz = gbe_kernel_get_required_work_group_size(ker->opaque, i);
+    const uint32_t required_sz = interp_kernel_get_required_work_group_size(ker->opaque, i);
     if (required_sz != 0 && required_sz != local_wk_sz[i]) {
       err = CL_INVALID_WORK_ITEM_SIZE;
       goto error;
@@ -320,7 +418,7 @@ cl_kernel_work_group_sz(cl_kernel ker,
   for (i = 1; i < wk_dim; ++i)
     sz *= local_wk_sz[i];
 
-  if (sz > ker->program->ctx->device->max_work_group_size) {
+  if (sz > cl_get_kernel_max_wg_sz(ker)) {
     err = CL_INVALID_WORK_ITEM_SIZE;
     goto error;
   }
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index fb509a2..1ed90a5 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -22,7 +22,7 @@
 
 #include "cl_internals.h"
 #include "cl_driver.h"
-#include "program.h"
+#include "cl_gbe_loader.h"
 #include "CL/cl.h"
 
 #include <stdint.h>
@@ -37,6 +37,7 @@ struct _gbe_kernel;
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
   cl_sampler sampler;   /* For sampler. */
+  unsigned char bti;
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
 } cl_argument;
@@ -58,6 +59,8 @@ struct _cl_kernel {
   cl_ulong local_mem_sz;      /* local memory size specified in kernel args. */
   size_t compile_wg_sz[3];    /* Required workgroup size by __attribute__((reqd_work_gro
                                  up_size(X, Y, Z))) qualifier.*/
+  size_t global_work_sz[3];    /* maximum global size that can be used to execute a kernel
+                                (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
   size_t stack_size;          /* stack size per work item. */
   cl_argument *args;          /* To track argument setting */
   uint32_t arg_n:31;          /* Number of arguments */
@@ -76,6 +79,9 @@ extern void cl_kernel_setup(cl_kernel k, gbe_kernel opaque);
 /* Get the kernel name */
 extern const char *cl_kernel_get_name(cl_kernel k);
 
+/* Get the kernel attributes*/
+extern const char *cl_kernel_get_attributes(cl_kernel k);
+
 /* Get the simd width as used in the code */
 extern uint32_t cl_kernel_get_simd_width(cl_kernel k);
 
@@ -93,6 +99,12 @@ extern int cl_kernel_set_arg(cl_kernel,
                              size_t      arg_size,
                              const void *arg_value);
 
+/* Get the argument information */
+extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
+                                  cl_kernel_arg_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret);
+
 /* Compute and check the work group size from the user provided local size */
 extern cl_int
 cl_kernel_work_group_sz(cl_kernel ker,
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index cb5f5cd..50a0898 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -14,14 +14,6 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  */
-#include <CL/cl.h>
-#ifndef CL_VERSION_1_2
-#include <cl_mem.h>
-typedef cl_uint             cl_kernel_arg_info;
-typedef cl_bitfield         cl_mem_migration_flags;
-#define cl_device_partition_property cl_device_partition_property_ext
-#define CL_API_SUFFIX__VERSION_1_2
-#endif
 #include <ocl_icd.h>
 
 #include "cl_platform_id.h"
@@ -148,21 +140,21 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
   CL_1_2_NOTYET(clReleaseDeviceEXT),
 #ifdef CL_VERSION_1_2
   (void *) NULL,
-  CL_1_2_NOTYET(clCreateSubDevices),
-  CL_1_2_NOTYET(clRetainDevice),
-  CL_1_2_NOTYET(clReleaseDevice),
-  CL_1_2_NOTYET(clCreateImage),
-  CL_1_2_NOTYET(clCreateProgramWithBuiltInKernels),
-  CL_1_2_NOTYET(clCompileProgram),
-  CL_1_2_NOTYET(clLinkProgram),
-  CL_1_2_NOTYET(clUnloadPlatformCompiler),
-  CL_1_2_NOTYET(clGetKernelArgInfo),
-  CL_1_2_NOTYET(clEnqueueFillBuffer),
-  CL_1_2_NOTYET(clEnqueueFillImage),
-  CL_1_2_NOTYET(clEnqueueMigrateMemObjects),
-  CL_1_2_NOTYET(clEnqueueMarkerWithWaitList),
-  CL_1_2_NOTYET(clEnqueueBarrierWithWaitList),
-  CL_1_2_NOTYET(clGetExtensionFunctionAddressForPlatform),
+  clCreateSubDevices,
+  clRetainDevice,
+  clReleaseDevice,
+  clCreateImage,
+  clCreateProgramWithBuiltInKernels,
+  clCompileProgram,
+  clLinkProgram,
+  clUnloadPlatformCompiler,
+  clGetKernelArgInfo,
+  clEnqueueFillBuffer,
+  clEnqueueFillImage,
+  clEnqueueMigrateMemObjects,
+  clEnqueueMarkerWithWaitList,
+  clEnqueueBarrierWithWaitList,
+  clGetExtensionFunctionAddressForPlatform,
   CL_GL_INTEROP(clCreateFromGLTexture),
   (void *) NULL,
   (void *) NULL,
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 40e0a99..81c4d64 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -44,9 +44,7 @@
       return CL_INVALID_VALUE;              \
     break;
 
-#define CL_MEM_OBJECT_BUFFER                        0x10F0
-#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
-#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#define MAX_TILING_SIZE                             128 * MB
 
 static cl_mem_object_type
 cl_get_mem_object_type(cl_mem mem)
@@ -100,7 +98,12 @@ cl_get_mem_object_info(cl_mem mem,
     *((size_t *)param_value) = mem->size;
     break;
   case CL_MEM_HOST_PTR:
-    *((size_t *)param_value) = (size_t)mem->host_ptr;
+    if(mem->type == CL_MEM_IMAGE_TYPE) {
+      *((size_t *)param_value) = (size_t)mem->host_ptr;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((size_t *)param_value) = (size_t)mem->host_ptr + buf->sub_offset;
+    }
     break;
   case CL_MEM_MAP_COUNT:
     *((cl_uint *)param_value) = mem->map_ref;
@@ -132,6 +135,18 @@ cl_get_mem_object_info(cl_mem mem,
   return CL_SUCCESS;
 }
 
+#define IS_1D(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D ||        \
+                      image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||  \
+                      image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+
+#define IS_2D(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D ||        \
+                      image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+#define IS_3D(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
+
+#define IS_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+                         image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
 LOCAL cl_int
 cl_get_image_info(cl_mem mem,
                   cl_image_info param_name,
@@ -151,6 +166,10 @@ cl_get_image_info(cl_mem mem,
     FIELD_SIZE(IMAGE_WIDTH, size_t);
     FIELD_SIZE(IMAGE_HEIGHT, size_t);
     FIELD_SIZE(IMAGE_DEPTH, size_t);
+    FIELD_SIZE(IMAGE_ARRAY_SIZE, size_t);
+    FIELD_SIZE(IMAGE_BUFFER, cl_mem);
+    FIELD_SIZE(IMAGE_NUM_MIP_LEVELS, cl_uint);
+    FIELD_SIZE(IMAGE_NUM_SAMPLES, cl_uint);
   default:
     return CL_INVALID_VALUE;
   }
@@ -173,10 +192,20 @@ cl_get_image_info(cl_mem mem,
     *(size_t *)param_value = image->w;
     break;
   case CL_IMAGE_HEIGHT:
-    *(size_t *)param_value = image->h;
+    *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
     break;
   case CL_IMAGE_DEPTH:
-    *(size_t *)param_value = image->depth;
+    *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
+    break;
+  case CL_IMAGE_ARRAY_SIZE:
+    *(size_t *)param_value = IS_ARRAY(image) ? image->depth : 0;
+    break;
+  case CL_IMAGE_BUFFER:
+    *(cl_mem *)param_value = image->buffer_1d;
+    break;
+  case CL_IMAGE_NUM_MIP_LEVELS:
+  case CL_IMAGE_NUM_SAMPLES:
+    *(cl_mem *)param_value = 0;
     break;
   }
 
@@ -260,6 +289,21 @@ error:
 
 }
 
+LOCAL cl_int
+is_valid_mem(cl_mem mem, cl_mem buffers)
+{
+  cl_mem tmp = buffers;
+  while(tmp){
+    if(mem == tmp){
+      if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+        return CL_INVALID_MEM_OBJECT;
+      return CL_SUCCESS;
+    }
+    tmp = tmp->next;
+  }
+  return CL_INVALID_MEM_OBJECT;
+}
+
 LOCAL cl_mem
 cl_mem_new_buffer(cl_context ctx,
                   cl_mem_flags flags,
@@ -287,9 +331,13 @@ cl_mem_new_buffer(cl_context ctx,
 		      || ((flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_WRITE_ONLY)))
               || ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
               || ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+              || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+              || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+              || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
               || ((flags & (~(CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY
                         | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
-                        | CL_MEM_USE_HOST_PTR))) != 0))) {
+                        | CL_MEM_USE_HOST_PTR | CL_MEM_HOST_WRITE_ONLY
+                        | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) != 0))) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -334,6 +382,10 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
   }
 
+  /* HSW: Byte scattered Read/Write has limitation that
+     the buffer size must be a multiple of 4 bytes. */
+  sz = ALIGN(sz, 4);
+
   /* Create the buffer in video memory */
   mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
   if (mem == NULL || err != CL_SUCCESS)
@@ -374,11 +426,22 @@ cl_mem_new_sub_buffer(cl_mem buffer,
 
   if (flags && (((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY)))
           || ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)))
-          || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR)))) {
+          || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR))
+          || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+          || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+          || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)))) {
     err = CL_INVALID_VALUE;
     goto error;
   }
 
+  if((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE)) == 0) {
+    flags |= buffer->flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE);
+  }
+  flags |= buffer->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR);
+  if((flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
+    flags |= buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS);
+  }
+
   if (create_type != CL_BUFFER_CREATE_TYPE_REGION) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -401,7 +464,7 @@ cl_mem_new_sub_buffer(cl_mem buffer,
     goto error;
   }
 
-  if (info->origin & (buffer->ctx->device->mem_base_addr_align - 1)) {
+  if (info->origin & (buffer->ctx->device->mem_base_addr_align / 8 - 1)) {
     err = CL_MISALIGNED_SUB_BUFFER_OFFSET;
     goto error;
   }
@@ -452,14 +515,37 @@ error:
   goto exit;
 }
 
+void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
+{
+  cl_buffer_unreference(buffer->bo);
+  buffer->bo = new_bo;
+  cl_buffer_reference(new_bo);
+  if (buffer->type != CL_MEM_SUBBUFFER_TYPE)
+    return;
+
+  struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)->sub_next;
+  for( ; it != (struct _cl_mem_buffer*)buffer; it = it->sub_next)
+  {
+    cl_buffer_unreference(it->base.bo);
+    it->base.bo = new_bo;
+    cl_buffer_reference(new_bo);
+  }
+}
+
 void
 cl_mem_copy_image_region(const size_t *origin, const size_t *region,
                          void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
                          const void *src, size_t src_row_pitch, size_t src_slice_pitch,
-                         const struct _cl_mem_image *image)
+                         const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src)
 {
-  size_t offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
-  dst = (char*)dst + offset;
+  if(offset_dst) {
+    size_t dst_offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
+    dst = (char*)dst + dst_offset;
+  }
+  if(offset_src) {
+    size_t src_offset = image->bpp * origin[0] + src_row_pitch * origin[1] + src_slice_pitch * origin[2];
+    src = (char*)src + src_offset;
+  }
   if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch &&
       (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch)))
   {
@@ -481,6 +567,34 @@ cl_mem_copy_image_region(const size_t *origin, const size_t *region,
   }
 }
 
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+                           const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image)
+{
+  char* dst= cl_mem_map_auto((cl_mem)dst_image);
+  char* src= cl_mem_map_auto((cl_mem)src_image);
+  size_t dst_offset = dst_image->bpp * dst_origin[0] + dst_image->row_pitch * dst_origin[1] + dst_image->slice_pitch * dst_origin[2];
+  size_t src_offset = src_image->bpp * src_origin[0] + src_image->row_pitch * src_origin[1] + src_image->slice_pitch * src_origin[2];
+  dst= (char*)dst+ dst_offset;
+  src= (char*)src+ src_offset;
+  cl_uint y, z;
+  for (z = 0; z < region[2]; z++) {
+    const char* src_ptr = src;
+    char* dst_ptr = dst;
+    for (y = 0; y < region[1]; y++) {
+      memcpy(dst_ptr, src_ptr, src_image->bpp*region[0]);
+      src_ptr += src_image->row_pitch;
+      dst_ptr += dst_image->row_pitch;
+    }
+    src = (char*)src + src_image->slice_pitch;
+    dst = (char*)dst + dst_image->slice_pitch;
+  }
+
+  cl_mem_unmap_auto((cl_mem)src_image);
+  cl_mem_unmap_auto((cl_mem)dst_image);
+
+}
+
 static void
 cl_mem_copy_image(struct _cl_mem_image *image,
 		  size_t row_pitch,
@@ -492,7 +606,7 @@ cl_mem_copy_image(struct _cl_mem_image *image,
   size_t region[3] = {image->w, image->h, image->depth};
 
   cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch,
-                           host_ptr, row_pitch, slice_pitch, image);
+                           host_ptr, row_pitch, slice_pitch, image, CL_FALSE, CL_FALSE); //offset is 0
   cl_mem_unmap_auto((cl_mem)image);
 }
 
@@ -501,12 +615,34 @@ static const uint32_t tilex_w = 512;  /* tileX width in bytes */
 static const uint32_t tilex_h = 8;    /* tileX height in number of rows */
 static const uint32_t tiley_w = 128;  /* tileY width in bytes */
 static const uint32_t tiley_h = 32;   /* tileY height in number of rows */
+static const uint32_t valign = 2;     /* vertical alignment is 2. */
+
+cl_image_tiling_t cl_get_default_tiling(void)
+{
+  static int initialized = 0;
+  static cl_image_tiling_t tiling = CL_TILE_X;
+  if (!initialized) {
+    char *tilingStr = getenv("OCL_TILING");
+    if (tilingStr != NULL) {
+      switch (tilingStr[0]) {
+        case '0': tiling = CL_NO_TILE; break;
+        case '1': tiling = CL_TILE_X; break;
+        case '2': tiling = CL_TILE_Y; break;
+        default:
+          break;
+      }
+    }
+    initialized = 1;
+  }
+
+  return tiling;
+}
 
 static cl_mem
 _cl_mem_new_image(cl_context ctx,
                   cl_mem_flags flags,
                   const cl_image_format *fmt,
-                  const cl_mem_object_type image_type,
+                  const cl_mem_object_type orig_image_type,
                   size_t w,
                   size_t h,
                   size_t depth,
@@ -517,8 +653,9 @@ _cl_mem_new_image(cl_context ctx,
 {
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
+  cl_mem_object_type image_type = orig_image_type;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
-  size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h;
+  size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0;
   cl_image_tiling_t tiling = CL_NO_TILE;
 
   /* Check flags consistency */
@@ -534,7 +671,7 @@ _cl_mem_new_image(cl_context ctx,
   /* Only a sub-set of the formats are supported */
   intel_fmt = cl_image_get_intel_format(fmt);
   if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
-    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
     goto error;
   }
 
@@ -544,10 +681,28 @@ _cl_mem_new_image(cl_context ctx,
     err = CL_INVALID_IMAGE_SIZE;  \
     goto error;                   \
   } while (0);
+
   if (UNLIKELY(w == 0)) DO_IMAGE_ERROR;
-  if (UNLIKELY(h == 0)) DO_IMAGE_ERROR;
+  if (UNLIKELY(h == 0 && (image_type != CL_MEM_OBJECT_IMAGE1D &&
+      image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+      image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)))
+    DO_IMAGE_ERROR;
 
-  if (image_type == CL_MEM_OBJECT_IMAGE2D) {
+  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+      image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+    size_t min_pitch = bpp * w;
+    if (data && pitch == 0)
+      pitch = min_pitch;
+
+    h = 1;
+    depth = 1;
+    if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
+    tiling = CL_NO_TILE;
+  } else if (image_type == CL_MEM_OBJECT_IMAGE2D) {
     size_t min_pitch = bpp * w;
     if (data && pitch == 0)
       pitch = min_pitch;
@@ -558,35 +713,43 @@ _cl_mem_new_image(cl_context ctx,
 
     /* Pick up tiling mode (we do only linear on SNB) */
     if (cl_driver_get_ver(ctx->drv) != 6)
-      tiling = CL_TILE_Y;
+      tiling = cl_get_default_tiling();
+
     depth = 1;
-  }
+  } else if (image_type == CL_MEM_OBJECT_IMAGE3D ||
+             image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+             image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+      h = 1;
+      tiling = CL_NO_TILE;
+    } else if (cl_driver_get_ver(ctx->drv) != 6)
+      tiling = cl_get_default_tiling();
 
-  if (image_type == CL_MEM_OBJECT_IMAGE3D) {
     size_t min_pitch = bpp * w;
     if (data && pitch == 0)
       pitch = min_pitch;
-    size_t min_slice_pitch = min_pitch * h;
+    size_t min_slice_pitch = pitch * h;
     if (data && slice_pitch == 0)
       slice_pitch = min_slice_pitch;
     if (UNLIKELY(w > ctx->device->image3d_max_width)) DO_IMAGE_ERROR;
     if (UNLIKELY(h > ctx->device->image3d_max_height)) DO_IMAGE_ERROR;
-    if (UNLIKELY(depth > ctx->device->image3d_max_depth)) DO_IMAGE_ERROR;
+    if (image_type == CL_MEM_OBJECT_IMAGE3D &&
+       (UNLIKELY(depth > ctx->device->image3d_max_depth))) DO_IMAGE_ERROR
+    else if (UNLIKELY(depth > ctx->device->image_max_array_size)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
     if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
 
-    /* Pick up tiling mode (we do only linear on SNB) */
-    if (cl_driver_get_ver(ctx->drv) != 6)
-      tiling = CL_TILE_Y;
-  }
+  } else
+    assert(0);
+
 #undef DO_IMAGE_ERROR
 
   /* Tiling requires to align both pitch and height */
   if (tiling == CL_NO_TILE) {
     aligned_pitch = w * bpp;
-    aligned_h     = h;
+    aligned_h  = ALIGN(h, valign);
   } else if (tiling == CL_TILE_X) {
     aligned_pitch = ALIGN(w * bpp, tilex_w);
     aligned_h     = ALIGN(h, tilex_h);
@@ -597,13 +760,26 @@ _cl_mem_new_image(cl_context ctx,
 
   sz = aligned_pitch * aligned_h * depth;
 
+  /* If sz is larger than 128MB, mapping the GTT may fail on some systems.
+     Because there is no obvious performance drop, disable tiling. */
+  if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+    tiling = CL_NO_TILE;
+    aligned_pitch = w * bpp;
+    aligned_h     = h;
+    sz = aligned_pitch * aligned_h * depth;
+  }
+
   mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
   cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
-  aligned_slice_pitch = (image_type == CL_MEM_OBJECT_IMAGE1D
-                         || image_type == CL_MEM_OBJECT_IMAGE2D) ? 0 : aligned_pitch * ALIGN(h, 2);
+  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+      image_type == CL_MEM_OBJECT_IMAGE2D ||
+      image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    aligned_slice_pitch = 0;
+  else
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, 2);
 
   cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
                     intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
@@ -629,6 +805,151 @@ error:
   goto exit;
 }
 
+static cl_mem
+_cl_mem_new_image_from_buffer(cl_context ctx,
+                              cl_mem_flags flags,
+                              const cl_image_format* image_format,
+                              const cl_image_desc *image_desc,
+                              cl_int *errcode_ret)
+{
+  cl_mem image = NULL;
+  cl_mem buffer = image_desc->buffer;
+  cl_int err = CL_SUCCESS;
+  *errcode_ret = err;
+  cl_ulong max_size;
+  cl_mem_flags merged_flags;
+  uint32_t bpp;
+  uint32_t intel_fmt = INTEL_UNSUPPORTED_FORMAT;
+  size_t offset = 0;
+
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  /* Only a sub-set of the formats are supported */
+  intel_fmt = cl_image_get_intel_format(image_format);
+  if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+
+  if (!buffer) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  if (flags & (CL_MEM_USE_HOST_PTR|CL_MEM_ALLOC_HOST_PTR|CL_MEM_COPY_HOST_PTR)) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  /* access check. */
+  if ((buffer->flags & CL_MEM_WRITE_ONLY) &&
+      (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_READ_ONLY) &&
+      (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_WRITE_ONLY) &&
+      (flags & CL_MEM_HOST_READ_ONLY)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_READ_ONLY) &&
+      (flags & CL_MEM_HOST_WRITE_ONLY)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_NO_ACCESS) &&
+      (flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((err = cl_get_device_info(ctx->device,
+                                CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                                sizeof(max_size),
+                                &max_size,
+                                NULL)) != CL_SUCCESS) {
+    goto error;
+  }
+
+  if (image_desc->image_width > max_size) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  if (image_desc->image_width*bpp > buffer->size) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  merged_flags = buffer->flags;
+  if (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)) {
+    merged_flags &= ~(CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY);
+    merged_flags |= flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY);
+  }
+  if (flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS)) {
+    merged_flags &= ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+    merged_flags |= flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+  }
+  struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer;
+  if (buffer->type == CL_MEM_SUBBUFFER_TYPE) {
+    offset = ((struct _cl_mem_buffer *)buffer)->sub_offset;
+    mem_buffer = mem_buffer->parent;
+  }
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  // Per bspec, an image should have at least a 2-line vertical alignment,
+  // thus we can't simply attach a buffer to a 1D image surface of the same size.
+  // We have to create a new image and copy the buffer data into this new image,
+  // then replace all of the buffer object's references with this image.
+  image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
+                    mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret);
+  if (image == NULL)
+    return NULL;
+  void *src = cl_mem_map(buffer);
+  void *dst = cl_mem_map(image);
+  //
+  // FIXME: we could use copy-buffer-to-image to do this on the GPU later;
+  // currently the copy-buffer-to-image function doesn't support 1D images.
+  // 
+  // There is a potential risk that this buffer was mapped and the caller
+  // still holds the pointer and wants to access it again. This scenario is
+  // not explicitly forbidden in the spec, although it should not be permitted.
+  memcpy(dst, src, mem_buffer->base.size);
+  cl_mem_unmap(buffer);
+  cl_mem_unmap(image);
+
+  if (err != 0)
+    goto error;
+ 
+  // Now replace the buffer's bo with this new bo; we need to take care of
+  // the sub-buffer case. 
+  cl_mem_replace_buffer(buffer, image->bo);
+  /* Now point to the right offset if buffer is a SUB_BUFFER. */
+  if (buffer->flags & CL_MEM_USE_HOST_PTR)
+    image->host_ptr = buffer->host_ptr + offset;
+  cl_mem_image(image)->offset = offset;
+  cl_mem_image(image)->w = image_desc->image_width;
+  cl_mem_add_ref(buffer);
+  cl_mem_image(image)->buffer_1d = buffer;
+  return image;
+
+error:
+  if (image)
+    cl_mem_delete(image);
+  image = NULL;
+  *errcode_ret = err;
+  return image;
+}
+
 LOCAL cl_mem
 cl_mem_new_image(cl_context context,
                  cl_mem_flags flags,
@@ -645,10 +966,15 @@ cl_mem_new_image(cl_context context,
                              image_desc->image_width, image_desc->image_height, image_desc->image_depth,
                              image_desc->image_row_pitch, image_desc->image_slice_pitch,
                              host_ptr, errcode_ret);
-  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
   case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+    return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+                             image_desc->image_width, image_desc->image_height, image_desc->image_array_size,
+                             image_desc->image_row_pitch, image_desc->image_slice_pitch,
+                             host_ptr, errcode_ret);
   case CL_MEM_OBJECT_IMAGE1D_BUFFER:
-    NOT_IMPLEMENTED;
+    return _cl_mem_new_image_from_buffer(context, flags, image_format,
+                                         image_desc, errcode_ret);
     break;
   case CL_MEM_OBJECT_BUFFER:
   default:
@@ -671,6 +997,15 @@ cl_mem_delete(cl_mem mem)
   }
 #endif
 
+  /* If we are an image, delete the 1D buffer if it has one. */
+  if (IS_IMAGE(mem)) {
+    if (cl_mem_image(mem)->buffer_1d) {
+      assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER);
+      cl_mem_delete(cl_mem_image(mem)->buffer_1d);
+      cl_mem_image(mem)->buffer_1d = NULL;
+    }
+  }
+
   /* Remove it from the list */
   assert(mem->ctx);
   pthread_mutex_lock(&mem->ctx->buffer_lock);
@@ -689,7 +1024,7 @@ cl_mem_delete(cl_mem mem)
     for(i=0; i<mem->mapped_ptr_sz; i++) {
       if(mem->mapped_ptr[i].ptr != NULL) {
         mem->map_ref--;
-        cl_mem_unmap_gtt(mem);
+        cl_mem_unmap_auto(mem);
       }
     }
     assert(mem->map_ref == 0);
@@ -744,59 +1079,328 @@ LOCAL cl_int
 cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
             size_t src_offset, size_t dst_offset, size_t cb)
 {
-  cl_int ret;
-  cl_kernel ker;
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {1,1,1};
+  const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
+  int aligned = 0;
+  int dw_src_offset = src_offset/4;
+  int dw_dst_offset = dst_offset/4;
+
+  if (!cb)
+    return ret;
 
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
 
-  if ((cb % 4) || (src_offset % 4) || (dst_offset % 4)) {
-    extern char cl_internal_copy_buf_align1_str[];
-    extern int cl_internal_copy_buf_align1_str_size;
+  /* All 16 bytes aligned, fast and easy one. */
+  if((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) {
+    extern char cl_internal_copy_buf_align16_str[];
+    extern size_t cl_internal_copy_buf_align16_str_size;
 
-    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
-             cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
-  } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+    cb = cb/16;
+    aligned = 1;
+  } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/
     extern char cl_internal_copy_buf_align4_str[];
-    extern int cl_internal_copy_buf_align4_str_size;
+    extern size_t cl_internal_copy_buf_align4_str_size;
 
-    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
              cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
     cb = cb/4;
-    src_offset = src_offset/4;
-    dst_offset = dst_offset/4;
+    aligned = 1;
+  }
+
+  if (aligned) {
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    if (cb < LOCAL_SZ_0) {
+      local_sz[0] = 1;
+    } else {
+      local_sz[0] = LOCAL_SZ_0;
+    }
+    global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* Now handle the unaligned cases. */
+  int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
+  unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4];
+  unsigned int last_mask = masks[(dst_offset + cb) % 4];
+  /* handle the very small range copy. */
+  if (cb < 4 && dw_num == 1) {
+    first_mask = first_mask | ~last_mask;
+  }
+
+  if (cb < LOCAL_SZ_0) {
+    local_sz[0] = 1;
   } else {
-    extern char cl_internal_copy_buf_align16_str[];
-    extern int cl_internal_copy_buf_align16_str_size;
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+
+  if (src_offset % 4 == dst_offset % 4) {
+    /* Src and dst have the same unaligned offset, just handle the
+       header and tail. */
+    extern char cl_internal_copy_buf_unalign_same_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_same_offset_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+             cl_internal_copy_buf_unalign_same_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_same_offset_str_size, NULL);
+
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
 
-    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
-             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
-    cb = cb/16;
-    src_offset = src_offset/4;
-    dst_offset = dst_offset/4;
+  /* Dst's offset < Src's offset, so one dst dword needs two sequential src dwords to fill it. */
+  if (dst_offset % 4 < src_offset % 4) {
+    extern char cl_internal_copy_buf_unalign_dst_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_dst_offset_str_size;
+
+    int align_diff = src_offset % 4 - dst_offset % 4;
+    unsigned int dw_mask = masks[align_diff];
+    int shift = align_diff * 8;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+             cl_internal_copy_buf_unalign_dst_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_dst_offset_str_size, NULL);
+
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* Dst's offset > Src's offset, so one dst dword needs two sequential src dwords to fill it. */
+  if (dst_offset % 4 > src_offset % 4) {
+    extern char cl_internal_copy_buf_unalign_src_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_src_offset_str_size;
+
+    int align_diff = dst_offset % 4 - src_offset % 4;
+    unsigned int dw_mask = masks[4 - align_diff];
+    int shift = align_diff * 8;
+    int src_less = !(src_offset % 4) && !((src_offset + cb) % 4);
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+             cl_internal_copy_buf_unalign_src_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL);
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+    cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* No case can handle this? Should be unreachable. */
+  assert(0);
+
+  return ret;
+}
+
+LOCAL cl_int
+cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image* src_image,
+           const size_t * origin, const size_t * region)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+    extern char cl_internal_fill_image_1d_str[];
+    extern size_t cl_internal_fill_image_1d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D,
+        cl_internal_fill_image_1d_str, (size_t)cl_internal_fill_image_1d_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+    extern char cl_internal_fill_image_1d_array_str[];
+    extern size_t cl_internal_fill_image_1d_array_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,
+        cl_internal_fill_image_1d_array_str, (size_t)cl_internal_fill_image_1d_array_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    extern char cl_internal_fill_image_2d_str[];
+    extern size_t cl_internal_fill_image_2d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D,
+        cl_internal_fill_image_2d_str, (size_t)cl_internal_fill_image_2d_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    extern char cl_internal_fill_image_2d_array_str[];
+    extern size_t cl_internal_fill_image_2d_array_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,
+        cl_internal_fill_image_2d_array_str, (size_t)cl_internal_fill_image_2d_array_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    extern char cl_internal_fill_image_3d_str[];
+    extern size_t cl_internal_fill_image_3d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_3D,
+        cl_internal_fill_image_3d_str, (size_t)cl_internal_fill_image_3d_str_size, NULL);
+  }else{
+    return CL_IMAGE_FORMAT_NOT_SUPPORTED;
   }
 
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
-  if (cb < LOCAL_SZ_0) {
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+  cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
+
+  ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+            cl_mem buffer, size_t offset, size_t size)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  char pattern_comb[4];
+  int is_128 = 0;
+  const void * pattern1 = NULL;
+
+  assert(offset % pattern_size == 0);
+  assert(size % pattern_size == 0);
+
+  if (!size)
+    return ret;
+
+  if (pattern_size == 128) {
+    /* 128 corresponds to the double16 pattern, but double does not work very
+       well on some platforms. We use two float16s to handle this. */
+    extern char cl_internal_fill_buf_align128_str[];
+    extern size_t cl_internal_fill_buf_align128_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+    is_128 = 1;
+    pattern_size = pattern_size / 2;
+    pattern1 = pattern + pattern_size;
+    size = size / 2;
+  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+    extern char cl_internal_fill_buf_align8_str[];
+    extern size_t cl_internal_fill_buf_align8_str_size;
+    int order = ffs(pattern_size / 8) - 1;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+  } else if (pattern_size == 4) {
+    extern char cl_internal_fill_buf_align4_str[];
+    extern size_t cl_internal_fill_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+    /* The unaligned case. But if copy size and offset are aligned to 4, we can fake
+       the pattern with the pattern duplication fill in. */
+    assert(pattern_size == 1 || pattern_size == 2);
+    extern char cl_internal_fill_buf_align4_str[];
+    extern size_t cl_internal_fill_buf_align4_str_size;
+
+    if (pattern_size == 2) {
+      memcpy(pattern_comb, pattern, sizeof(char)*2);
+      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+    } else {
+      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+        = pattern_comb[3] = *(char *)pattern;
+    }
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+    pattern_size = 4;
+    pattern = pattern_comb;
+  }
+  //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel
+  //functions. This depend on the usage but now we just use aligned 1 and 2.
+  else if (pattern_size == 2) {
+    extern char cl_internal_fill_buf_align2_str[];
+    extern size_t cl_internal_fill_buf_align2_str_size;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+               cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+  } else if (pattern_size == 1) {
+    extern char cl_internal_fill_buf_unalign_str[];
+    extern size_t cl_internal_fill_buf_unalign_str_size;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+               cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+  } else
+    assert(0);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  size = size / pattern_size;
+  offset = offset / pattern_size;
+
+  if (size < LOCAL_SZ_0) {
     local_sz[0] = 1;
   } else {
     local_sz[0] = LOCAL_SZ_0;
   }
-  global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-  cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
-  cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+  global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
+  if (is_128)
+    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
-
   return ret;
 }
 
@@ -815,33 +1419,19 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
   global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
   global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-  cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
   cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
   cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
 
-  static const char *str_kernel =
-      "kernel void __cl_cpy_buffer_rect ( \n"
-      "       global char* src, global char* dst, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int src_offset, unsigned int dst_offset, \n"
-      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
-      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
-      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
-      "  dst[dst_offset] = src[src_offset]; \n"
-      "}";
-
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
 
   /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
+  extern char cl_internal_copy_buf_rect_str[];
+  extern size_t cl_internal_copy_buf_rect_str_size;
+
+  ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
+      cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
+
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
@@ -866,12 +1456,10 @@ LOCAL cl_int
 cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
                          const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
   cl_int ret;
-  cl_kernel ker;
+  cl_kernel ker = NULL;
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  cl_int index = CL_ENQUEUE_COPY_IMAGE_0;
-  char option[40] = "";
   uint32_t fixupDataType;
   uint32_t savedIntelFmt;
 
@@ -881,15 +1469,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
   global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
   global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
 
-  if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    strcat(option, "-D SRC_IMAGE_3D");
-    index += 1;
-  }
-  if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    strcat(option, " -D DST_IMAGE_3D");
-    index += 2;
-  }
-
   switch (src_image->fmt.image_channel_data_type) {
     case CL_SNORM_INT8:
     case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
@@ -912,54 +1491,74 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
     src_image->intel_fmt = cl_image_get_intel_format(&fmt);
     dst_image->intel_fmt = src_image->intel_fmt;
   }
-  static const char *str_kernel =
-      "#ifdef SRC_IMAGE_3D \n"
-      "  #define SRC_IMAGE_TYPE image3d_t \n"
-      "  #define SRC_COORD_TYPE int4 \n"
-      "#else \n"
-      "  #define SRC_IMAGE_TYPE image2d_t \n"
-      "  #define SRC_COORD_TYPE int2 \n"
-      "#endif \n"
-      "#ifdef DST_IMAGE_3D \n"
-      "  #define DST_IMAGE_TYPE image3d_t \n"
-      "  #define DST_COORD_TYPE int4 \n"
-      "#else \n"
-      "  #define DST_IMAGE_TYPE image2d_t \n"
-      "  #define DST_COORD_TYPE int2 \n"
-      "#endif \n"
-      "kernel void __cl_copy_image ( \n"
-      "       __read_only SRC_IMAGE_TYPE src_image, __write_only DST_IMAGE_TYPE dst_image, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
-      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  int4 color; \n"
-      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
-      "  SRC_COORD_TYPE src_coord; \n"
-      "  DST_COORD_TYPE dst_coord; \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_coord.x = src_origin0 + i; \n"
-      "  src_coord.y = src_origin1 + j; \n"
-      "#ifdef SRC_IMAGE_3D \n"
-      "  src_coord.z = src_origin2 + k; \n"
-      "#endif \n"
-      "  dst_coord.x = dst_origin0 + i; \n"
-      "  dst_coord.y = dst_origin1 + j; \n"
-      "#ifdef DST_IMAGE_3D \n"
-      "  dst_coord.z = dst_origin2 + k; \n"
-      "#endif \n"
-      "  color = read_imagei(src_image, sampler, src_coord); \n"
-      "  write_imagei(dst_image, dst_coord, color); \n"
-      "}";
 
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_image->base.ctx == dst_image->base.ctx);
 
   /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+      extern char cl_internal_copy_image_1d_to_1d_str[];
+      extern size_t cl_internal_copy_image_1d_to_1d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,
+          cl_internal_copy_image_1d_to_1d_str, (size_t)cl_internal_copy_image_1d_to_1d_str_size, NULL);
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_2d_to_2d_str[];
+      extern size_t cl_internal_copy_image_2d_to_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
+          cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_image_2d_to_3d_str[];
+      extern size_t cl_internal_copy_image_2d_to_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
+          cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_3d_to_2d_str[];
+      extern size_t cl_internal_copy_image_3d_to_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
+          cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_image_3d_to_3d_str[];
+      extern size_t cl_internal_copy_image_3d_to_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
+          cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  }
+
   if (!ker) {
     ret = CL_OUT_OF_RESOURCES;
     goto fail;
@@ -991,12 +1590,10 @@ LOCAL cl_int
 cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
                          const size_t *src_origin, const size_t dst_offset, const size_t *region) {
   cl_int ret;
-  cl_kernel ker;
+  cl_kernel ker = NULL;
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0;
-  char option[40] = "";
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
@@ -1007,42 +1604,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
   global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
   global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
 
-  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    strcat(option, "-D IMAGE_3D");
-    index += 1;
-  }
-
-  static const char *str_kernel =
-      "#ifdef IMAGE_3D \n"
-      "  #define IMAGE_TYPE image3d_t \n"
-      "  #define COORD_TYPE int4 \n"
-      "#else \n"
-      "  #define IMAGE_TYPE image2d_t \n"
-      "  #define COORD_TYPE int2 \n"
-      "#endif \n"
-      "kernel void __cl_copy_image_to_buffer ( \n"
-      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
-      "       unsigned int dst_offset) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  uint4 color; \n"
-      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
-      "  COORD_TYPE src_coord; \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_coord.x = src_origin0 + i; \n"
-      "  src_coord.y = src_origin1 + j; \n"
-      "#ifdef IMAGE_3D \n"
-      "  src_coord.z = src_origin2 + k; \n"
-      "#endif \n"
-      "  color = read_imageui(image, sampler, src_coord); \n"
-      "  dst_offset += (k * region1 + j) * region0 + i; \n"
-      "  buffer[dst_offset] = color.x; \n"
-      "}";
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
@@ -1058,7 +1619,20 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_2d_to_buffer_str[];
+      extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
+          cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
+  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    extern char cl_internal_copy_image_3d_to_buffer_str[];
+    extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
+          cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
+  }
+
   if (!ker) {
     ret = CL_OUT_OF_RESOURCES;
     goto fail;
@@ -1090,12 +1664,10 @@ LOCAL cl_int
 cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
                          const size_t src_offset, const size_t *dst_origin, const size_t *region) {
   cl_int ret;
-  cl_kernel ker;
+  cl_kernel ker = NULL;
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0;
-  char option[40] = "";
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
@@ -1106,41 +1678,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
   global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
 
-  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    strcat(option, "-D IMAGE_3D");
-    index += 1;
-  }
-
-  static const char *str_kernel =
-      "#ifdef IMAGE_3D \n"
-      "  #define IMAGE_TYPE image3d_t \n"
-      "  #define COORD_TYPE int4 \n"
-      "#else \n"
-      "  #define IMAGE_TYPE image2d_t \n"
-      "  #define COORD_TYPE int2 \n"
-      "#endif \n"
-      "kernel void __cl_copy_image_to_buffer ( \n"
-      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
-      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, \n"
-      "       unsigned int src_offset) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  uint4 color = (uint4)(0); \n"
-      "  COORD_TYPE dst_coord; \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  dst_coord.x = dst_origin0 + i; \n"
-      "  dst_coord.y = dst_origin1 + j; \n"
-      "#ifdef IMAGE_3D \n"
-      "  dst_coord.z = dst_origin2 + k; \n"
-      "#endif \n"
-      "  src_offset += (k * region1 + j) * region0 + i; \n"
-      "  color.x = buffer[src_offset]; \n"
-      "  write_imageui(image, dst_coord, color); \n"
-      "}";
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
@@ -1156,7 +1693,19 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_buffer_to_image_2d_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
+          cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_buffer_to_image_3d_str[];
+      extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
+          cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
+  }
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
@@ -1200,6 +1749,7 @@ cl_mem_map_gtt(cl_mem mem)
 {
   cl_buffer_map_gtt(mem->bo);
   assert(cl_buffer_get_virtual(mem->bo));
+  mem->mapped_gtt = 1;
   return cl_buffer_get_virtual(mem->bo);
 }
 
@@ -1230,8 +1780,10 @@ cl_mem_map_auto(cl_mem mem)
 LOCAL cl_int
 cl_mem_unmap_auto(cl_mem mem)
 {
-  if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
+  if (mem->mapped_gtt == 1) {
     cl_buffer_unmap_gtt(mem->bo);
+    mem->mapped_gtt = 0;
+  }
   else
     cl_buffer_unmap(mem->bo);
   return CL_SUCCESS;
@@ -1295,14 +1847,16 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
   struct _cl_mem_image *image = NULL;
   uint32_t intel_fmt, bpp;
 
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
+    goto error;
+
   intel_fmt = cl_image_get_intel_format(&fmt);
   if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
     err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
     goto error;
   }
 
-  cl_image_byte_per_pixel(&fmt, &bpp);
-
   mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
   if (mem == NULL || err != CL_SUCCESS) {
     err = CL_OUT_OF_HOST_MEMORY;
@@ -1311,7 +1865,7 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
 
   image = cl_mem_image(mem);
 
-  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image);
+  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image, offset);
 
   image->w = width;
   image->h = height;
@@ -1337,3 +1891,13 @@ error:
   mem = NULL;
   goto exit;
 }
+
+LOCAL cl_int
+cl_mem_get_fd(cl_mem mem,
+              int* fd)
+{
+  cl_int err = CL_SUCCESS;
+  if(cl_buffer_get_fd(mem->bo, fd))
+	err = CL_INVALID_OPERATION;
+  return err;
+}
diff --git a/src/cl_mem.h b/src/cl_mem.h
index e325fa1..3174c5c 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -55,6 +55,8 @@ typedef struct _cl_mapped_ptr {
   void * ptr;
   void * v_ptr;
   size_t size;
+  size_t origin[3];  /* mapped origin */
+  size_t region[3];  /* mapped region */
 }cl_mapped_ptr;
 
 typedef struct _cl_mem_dstr_cb {
@@ -87,6 +89,7 @@ typedef  struct _cl_mem {
   cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
   int mapped_ptr_sz;        /* The array size of mapped_ptr. */
   int map_ref;              /* The mapped count. */
+  uint8_t mapped_gtt;       /* This object has mapped gtt, for unmap. */
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
 } _cl_mem;
 
@@ -102,6 +105,7 @@ struct _cl_mem_image {
   cl_image_tiling_t tiling;       /* only IVB+ supports TILE_[X,Y] (image only) */
   size_t tile_x, tile_y;          /* tile offset, used for mipmap images.  */
   size_t offset;                  /* offset for dri_bo, used when it's reloc. */
+  cl_mem buffer_1d;               /* if the image is created from buffer, it point to the buffer.*/
 };
 
 struct _cl_mem_gl_image {
@@ -172,6 +176,9 @@ extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t
 /* Query information about an image */
 extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
 
+/* Query whether mem is in buffers */
+extern cl_int is_valid_mem(cl_mem mem, cl_mem buffers);
+
 /* Create a new memory object and initialize it with possible user data */
 extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
 
@@ -200,6 +207,12 @@ extern void cl_mem_add_ref(cl_mem);
 extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
               size_t src_offset, size_t dst_offset, size_t cb);
 
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+              cl_mem buffer, size_t offset, size_t size);
+
+extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
+                                    const size_t *, const size_t *);
+
 /* api clEnqueueCopyBufferRect help function */
 extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,
@@ -254,7 +267,11 @@ void
 cl_mem_copy_image_region(const size_t *origin, const size_t *region,
                          void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
                          const void *src, size_t src_row_pitch, size_t src_slice_pitch,
-                         const struct _cl_mem_image *image);
+                         const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src);
+
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+                           const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image);
 
 extern cl_mem cl_mem_new_libva_buffer(cl_context ctx,
                                       unsigned int bo_name,
@@ -266,6 +283,8 @@ extern cl_mem cl_mem_new_libva_image(cl_context ctx,
                                      cl_image_format fmt,
                                      size_t row_pitch,
                                      cl_int *errcode);
+extern cl_int cl_mem_get_fd(cl_mem mem, int* fd);
+
 
 #endif /* __CL_MEM_H__ */
 
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index fdf0d78..e7c8d6a 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -34,7 +34,7 @@ static struct _cl_platform_id intel_platform_data = {
   INIT_ICD(dispatch)
   DECL_INFO_STRING(profile, "FULL_PROFILE")
   DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
-  DECL_INFO_STRING(name, "Experiment Intel Gen OCL Driver")
+  DECL_INFO_STRING(name, "Intel Gen OCL Driver")
   DECL_INFO_STRING(vendor, "Intel")
   DECL_INFO_STRING(icd_suffix_khr, "Intel")
 };
diff --git a/src/cl_program.c b/src/cl_program.c
index 10eecee..79dff34 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -24,6 +24,7 @@
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "cl_khr_icd.h"
+#include "cl_gbe_loader.h"
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
 
@@ -32,6 +33,9 @@
 #include <stdint.h>
 #include <string.h>
 #include <assert.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <libgen.h>
 
 static void
 cl_program_release_sources(cl_program p)
@@ -72,6 +76,11 @@ cl_program_delete(cl_program p)
     p->build_opts = NULL;
   }
 
+  if (p->build_log) {
+    free(p->build_log);
+    p->build_log = NULL;
+  }
+
   /* Remove it from the list */
   assert(p->ctx);
   pthread_mutex_lock(&p->ctx->program_lock);
@@ -92,7 +101,11 @@ cl_program_delete(cl_program p)
   cl_context_delete(p->ctx);
 
   /* Free the program as allocated by the compiler */
-  if (p->opaque) gbe_program_delete(p->opaque);
+  if (p->opaque) {
+    if (CompilerSupported())
+      compiler_program_clean_llvm_resource(p->opaque);
+    interp_program_delete(p->opaque);
+  }
 
   p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(p);
@@ -106,12 +119,13 @@ cl_program_new(cl_context ctx)
   /* Allocate the structure */
   TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
   SET_ICD(p->dispatch)
+  p->build_status = CL_BUILD_NONE;
   p->ref_n = 1;
   p->magic = CL_MAGIC_PROGRAM_HEADER;
   p->ctx = ctx;
-  p->build_log = calloc(200, sizeof(char));
+  p->build_log = calloc(1000, sizeof(char));
   if (p->build_log)
-    p->build_log_max_sz = 200;
+    p->build_log_max_sz = 1000;
   /* The queue also belongs to its context */
   cl_context_add_ref(ctx);
 
@@ -136,13 +150,13 @@ cl_program_load_gen_program(cl_program p)
   uint32_t i;
 
   assert(p->opaque != NULL);
-  p->ker_n = gbe_program_get_kernel_num(p->opaque);
+  p->ker_n = interp_program_get_kernel_num(p->opaque);
 
   /* Allocate the kernel array */
   TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n));
 
   for (i = 0; i < p->ker_n; ++i) {
-    const gbe_kernel opaque = gbe_program_get_kernel(p->opaque, i);
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
     assert(opaque != NULL);
     TRY_ALLOC (p->ker[i], cl_kernel_new(p));
     cl_kernel_setup(p->ker[i], opaque);
@@ -152,6 +166,30 @@ error:
   return err;
 }
 
+inline cl_bool isBitcodeWrapper(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+  // See if you can find the hidden message in the magic bytes :-).
+  // (Hint: it's a little-endian encoding.)
+  return BufPtr != BufEnd &&
+    BufPtr[0] == 0xDE &&
+    BufPtr[1] == 0xC0 &&
+    BufPtr[2] == 0x17 &&
+    BufPtr[3] == 0x0B;
+}
+
+inline cl_bool isRawBitcode(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+  // These bytes sort of have a hidden message, but it's not in
+  // little-endian this time, and it's a little redundant.
+  return BufPtr != BufEnd &&
+    BufPtr[0] == 'B' &&
+    BufPtr[1] == 'C' &&
+    BufPtr[2] == 0xc0 &&
+    BufPtr[3] == 0xde;
+}
+
+#define isBitcode(BufPtr,BufEnd)  (isBitcodeWrapper(BufPtr, BufEnd) || isRawBitcode(BufPtr, BufEnd))
+
 LOCAL cl_program
 cl_program_create_from_binary(cl_context             ctx,
                               cl_uint                num_devices,
@@ -193,6 +231,27 @@ cl_program_create_from_binary(cl_context             ctx,
   program->binary_sz = lengths[0];
   program->source_type = FROM_BINARY;
 
+  if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
+    if(*program->binary == 1){
+      program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+    }else if(*program->binary == 2){
+      program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+    }else{
+      err= CL_INVALID_BINARY;
+      goto error;
+    }
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->vendor_id, program->binary, program->binary_sz);
+
+    if (UNLIKELY(program->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+    program->source_type = FROM_LLVM;
+  }
+  else if (*program->binary == 0) {
+    program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  }
+
   if (binary_status)
     binary_status[0] = CL_SUCCESS;
 
@@ -209,6 +268,79 @@ error:
 }
 
 LOCAL cl_program
+cl_program_create_with_built_in_kernles(cl_context     ctx,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * devices,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+
+  assert(ctx);
+  INVALID_DEVICE_IF (num_devices != 1);
+  INVALID_DEVICE_IF (devices == NULL);
+  INVALID_DEVICE_IF (devices[0] != ctx->device);
+
+  cl_int binary_status = CL_SUCCESS;
+  extern char cl_internal_built_in_kernel_str[];
+  extern size_t cl_internal_built_in_kernel_str_size;
+  char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
+
+  ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
+                                                          &ctx->device,
+                                                          (size_t*)&cl_internal_built_in_kernel_str_size,
+                                                          (const unsigned char **)&p_built_in_kernel_str,
+                                                          &binary_status, &err);
+  if (!ctx->built_in_prgs)
+    return NULL;
+
+  err = cl_program_build(ctx->built_in_prgs, NULL);
+  if (err != CL_SUCCESS)
+    return NULL;
+
+  ctx->built_in_prgs->is_built = 1;
+
+  char delims[] = ";";
+  char* saveptr = NULL;
+  char* local_kernel_names;
+  char* kernel = NULL;
+  char* matched_kernel;
+  int i = 0;
+
+  //copy the content to local_kernel_names to protect the kernel_names.
+  TRY_ALLOC(local_kernel_names, cl_calloc(strlen(kernel_names)+1, sizeof(char) ) );
+  memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
+
+  kernel = strtok_r( local_kernel_names, delims , &saveptr);
+  while( kernel != NULL ) {
+    matched_kernel = strstr(ctx->device->built_in_kernels, kernel);
+    if(matched_kernel){
+      for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
+        assert(ctx->built_in_prgs->ker[i]);
+        const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
+        if (strcmp(ker_name, kernel) == 0) {
+          break;
+        }
+      }
+
+      ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
+    }
+    kernel = strtok_r((char*)saveptr , delims, &saveptr );
+  }
+
+  cl_free(local_kernel_names);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ctx->built_in_prgs;
+error:
+  goto exit;
+
+  return CL_SUCCESS;
+}
+
+LOCAL cl_program
 cl_program_create_from_llvm(cl_context ctx,
                             cl_uint num_devices,
                             const cl_device_id *devices,
@@ -225,7 +357,7 @@ cl_program_create_from_llvm(cl_context ctx,
   INVALID_VALUE_IF (file_name == NULL);
 
   program = cl_program_new(ctx);
-  program->opaque = gbe_program_new_from_llvm(file_name, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
+  program->opaque = compiler_program_new_from_llvm(ctx->device->vendor_id, file_name, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
   if (UNLIKELY(program->opaque == NULL)) {
     err = CL_INVALID_PROGRAM;
     goto error;
@@ -282,6 +414,7 @@ cl_program_create_from_source(cl_context ctx,
   *p = '\0';
 
   program->source_type = FROM_SOURCE;
+  program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
 
 exit:
   cl_free(lens);
@@ -295,6 +428,43 @@ error:
   goto exit;
 }
 
+/* Before we do the real work, we need to check whether our platform
+   cl version can meet -cl-std= */
+static int check_cl_version_option(cl_program p, const char* options) {
+  const char* s = NULL;
+  int ver1 = 0;
+  int ver2 = 0;
+  char version_str[64];
+
+  if (options && (s = strstr(options, "-cl-std="))) {
+
+    if (s + strlen("-cl-std=CLX.X") > options + strlen(options)) {
+      return 0;
+    }
+
+    if (s[8] != 'C' || s[9] != 'L' || s[10] > '9' || s[10] < '0' || s[11] != '.'
+        || s[12] > '9' || s[12] < '0') {
+      return 0;
+    }
+
+    ver1 = (s[10] - '0') * 10 + (s[12] - '0');
+
+    if (cl_get_device_info(p->ctx->device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
+                                  version_str, NULL) != CL_SUCCESS)
+      return 0;
+
+    assert(strstr(version_str, "OpenCL") && version_str[0] == 'O');
+    ver2 = (version_str[9] - '0') * 10 + (version_str[11] - '0');
+
+    if (ver2 < ver1)
+      return 0;
+
+    return 1;
+  }
+
+  return 1;
+}
+
 LOCAL cl_int
 cl_program_build(cl_program p, const char *options)
 {
@@ -302,9 +472,15 @@ cl_program_build(cl_program p, const char *options)
   int i = 0;
   int copyed = 0;
 
-  if (p->ref_n > 1)
-    return CL_INVALID_OPERATION;
+  if (p->ref_n > 1) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
 
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
   if (options) {
     if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
       if(p->build_opts) {
@@ -326,7 +502,12 @@ cl_program_build(cl_program p, const char *options)
   }
 
   if (p->source_type == FROM_SOURCE) {
-    p->opaque = gbe_program_new_from_source(p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    p->opaque = compiler_program_new_from_source(p->ctx->device->vendor_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
     if (UNLIKELY(p->opaque == NULL)) {
       if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
         err = CL_INVALID_BUILD_OPTIONS;
@@ -337,9 +518,24 @@ cl_program_build(cl_program p, const char *options)
 
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
-    p->source_type = FROM_LLVM;
+  } else if (p->source_type == FROM_LLVM) {
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+    if (UNLIKELY(p->opaque == NULL)) {
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, p);
   } else if (p->source_type == FROM_BINARY) {
-    p->opaque = gbe_program_new_from_binary(p->binary, p->binary_sz);
+    p->opaque = interp_program_new_from_binary(p->ctx->device->vendor_id, p->binary, p->binary_sz);
     if (UNLIKELY(p->opaque == NULL)) {
       err = CL_BUILD_PROGRAM_FAILURE;
       goto error;
@@ -347,25 +543,211 @@ cl_program_build(cl_program p, const char *options)
 
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
-    p->source_type = FROM_LLVM;
   }
+  p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    p->bin_sz += interp_kernel_get_code_size(opaque);
+  }
+
+  TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    size_t sz = interp_kernel_get_code_size(opaque);
+
+    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
+    copyed += sz;
+  }
+  p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  return CL_SUCCESS;
+
+error:
+  p->build_status = CL_BUILD_ERROR;
+  return err;
+}
+
+cl_program
+cl_program_link(cl_context            context,
+                cl_uint               num_input_programs,
+                const cl_program *    input_programs,
+                const char *          options,
+                cl_int*               errcode_ret)
+{
+  cl_program p = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_int i = 0;
+  int copyed = 0;
+  p = cl_program_new(context);
+
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
+
+  p->opaque = compiler_program_new_gen_program(context->device->vendor_id, NULL, NULL);
+
+  for(i = 0; i < num_input_programs; i++) {
+    // if program create with llvm binary, need deserilize first to get module.
+    if(input_programs[i])
+      compiler_program_link_program(p->opaque, input_programs[i]->opaque,
+        p->build_log_max_sz, p->build_log, &p->build_log_sz);
+    if (UNLIKELY(p->opaque == NULL)) {
+      err = CL_LINK_PROGRAM_FAILURE;
+      goto error;
+    }
+  }
+
+  if(options && strstr(options, "-create-library")){
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+    goto done;
+  }else{
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  }
+
+  compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+
+  /* Create all the kernels */
+  TRY (cl_program_load_gen_program, p);
 
   for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = gbe_program_get_kernel(p->opaque, i);
-    p->bin_sz += gbe_kernel_get_code_size(opaque);
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    p->bin_sz += interp_kernel_get_code_size(opaque);
   }
 
   TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
   for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = gbe_program_get_kernel(p->opaque, i);
-    size_t sz = gbe_kernel_get_code_size(opaque);
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    size_t sz = interp_kernel_get_code_size(opaque);
 
-    memcpy(p->bin + copyed, gbe_kernel_get_code(opaque), sz);
+    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
     copyed += sz;
   }
+done:
+  p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return p;
 
 error:
+  p->build_status = CL_BUILD_ERROR;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return p;
+}
+
+LOCAL cl_int
+cl_program_compile(cl_program            p,
+                   cl_uint               num_input_headers,
+                   const cl_program *    input_headers,
+                   const char **         header_include_names,
+                   const char*           options)
+{
+  cl_int err = CL_SUCCESS;
+  int i = 0;
+
+  if (p->ref_n > 1) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
+
+  if (options) {
+    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if(p->build_opts) {
+        cl_free(p->build_opts);
+        p->build_opts = NULL;
+      }
+      TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+      memcpy(p->build_opts, options, strlen(options));
+
+      p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+    }
+  }
+
+  if (options == NULL && p->build_opts) {
+    p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+    cl_free(p->build_opts);
+    p->build_opts = NULL;
+  }
+
+  char temp_header_template[]= "/tmp/beignet.XXXXXX";
+  char* temp_header_path = mkdtemp(temp_header_template);
+
+  if (p->source_type == FROM_SOURCE) {
+
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    //write the headers to /tmp/beignet.XXXXXX for include.
+    for (i = 0; i < num_input_headers; i++) {
+      if(header_include_names[i] == NULL || input_headers[i] == NULL)
+        continue;
+
+      char temp_path[255]="";
+      strncpy(temp_path, temp_header_path, strlen(temp_header_path));
+      strncat(temp_path, "/", 1);
+      strncat(temp_path, header_include_names[i], strlen(header_include_names[i]));
+      char* dirc = strdup(temp_path);
+      char* dir = dirname(dirc);
+      mkdir(dir, 0755);
+      if(access(dir, R_OK|W_OK) != 0){
+        err = CL_COMPILE_PROGRAM_FAILURE;
+        goto error;
+      }
+      free(dirc);
+
+      FILE* pfile = fopen(temp_path, "wb");
+      if(pfile){
+        fwrite(input_headers[i]->source, strlen(input_headers[i]->source), 1, pfile);
+        fclose(pfile);
+      }else{
+        err = CL_COMPILE_PROGRAM_FAILURE;
+        goto error;
+      }
+    }
+
+    p->opaque = compiler_program_compile_from_source(p->ctx->device->vendor_id, p->source, temp_header_path,
+        p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+
+    char rm_path[255]="rm ";
+    strncat(rm_path, temp_header_path, strlen(temp_header_path));
+    strncat(rm_path, " -rf", 4);
+    int temp = system(rm_path);
+
+    if(temp){
+      assert(0);
+    }
+
+    if (UNLIKELY(p->opaque == NULL)) {
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    p->source_type = FROM_LLVM;
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  }
   p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  return CL_SUCCESS;
+
+error:
+  p->build_status = CL_BUILD_ERROR;
+  cl_program_delete(p);
+  p = NULL;
   return err;
 }
 
@@ -426,3 +808,44 @@ error:
 
   return CL_OUT_OF_HOST_MEMORY;
 }
+
+LOCAL void
+cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size_ret)
+{
+  int i = 0;
+  const char *ker_name = NULL;
+  size_t len = 0;
+  if(size_ret) *size_ret = 0;
+
+  if(p->ker == NULL) {
+    return;
+  }
+
+  ker_name = cl_kernel_get_name(p->ker[i]);
+  len = strlen(ker_name);
+  if(names) {
+    strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+    if(size < len - 1) {
+      if(size_ret) *size_ret = size;
+      return;
+    }
+    size = size - len - 1;  //sub \0
+  }
+  if(size_ret) *size_ret = strlen(ker_name) + 1;  //add NULL
+
+  for (i = 1; i < p->ker_n; ++i) {
+    ker_name = cl_kernel_get_name(p->ker[i]);
+    len = strlen(ker_name);
+    if(names) {
+      strncat(names, ";", size);
+      if(size >= 1)
+        strncat(names, ker_name, size - 1);
+      if(size < len + 1) {
+        if(size_ret) *size_ret = size;
+        break;
+      }
+      size = size - len - 1;
+    }
+    if(size_ret) *size_ret += len + 1; //add ';'
+  }
+}
diff --git a/src/cl_program.h b/src/cl_program.h
index a6d75da..6dea29a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -21,7 +21,7 @@
 #define __CL_PROGRAM_H__
 
 #include "cl_internals.h"
-#include "program.h"
+#include "cl_gbe_loader.h"
 #include "CL/cl.h"
 
 #include <stdint.h>
@@ -50,9 +50,11 @@ struct _cl_program {
   char *source;           /* Program sources */
   char *binary;           /* Program binary. */
   size_t binary_sz;       /* The binary size. */
+  uint32_t binary_type;   /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
   uint32_t ker_n;         /* Number of declared kernels */
   uint32_t source_type:2; /* Built from binary, source or LLVM */
   uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
+  int32_t build_status;   /* build status. */
   char *build_opts;       /* The build options for this program */
   size_t build_log_max_sz; /*build log maximum size in byte.*/
   char *build_log;         /* The build log for this program. */
@@ -92,6 +94,13 @@ cl_program_create_from_binary(cl_context             context,
                               cl_int *               binary_status,
                               cl_int *               errcode_ret);
 
+/* Create a program with built-in kernels*/
+extern cl_program
+cl_program_create_with_built_in_kernles(cl_context     context,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * device_list,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret);
 /* Directly create a program from a LLVM source file */
 extern cl_program
 cl_program_create_from_llvm(cl_context             context,
@@ -103,6 +112,25 @@ cl_program_create_from_llvm(cl_context             context,
 /* Build the program as specified by OCL */
 extern cl_int
 cl_program_build(cl_program p, const char* options);
-
+/* Compile the program as specified by OCL */
+extern cl_int
+cl_program_compile(cl_program            p,
+                   cl_uint               num_input_headers,
+                   const cl_program *    input_headers,
+                   const char **         header_include_names,
+                   const char*           options);
+/* link the program as specified by OCL */
+extern cl_program
+cl_program_link(cl_context            context,
+                cl_uint               num_input_programs,
+                const cl_program *    input_programs,
+                const char *          options,
+                cl_int*               errcode_ret);
+/* Get the kernel names in program */
+extern void
+cl_program_get_kernel_names(cl_program p,
+                            size_t size,
+                            char *names,
+                            size_t *size_ret);
 #endif /* __CL_PROGRAM_H__ */
 
diff --git a/src/cl_thread.c b/src/cl_thread.c
index cadc3cd..5713d70 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -15,113 +15,251 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
+#include <string.h>
+#include <stdio.h>
 
 #include "cl_thread.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
-static __thread void* thread_batch_buf = NULL;
+/* Because the cl_command_queue can be used in several threads simultaneously but
+   without add ref to it, we now handle it like this:
+   Keep one threads_slot_array, every time the thread get gpgpu or batch buffer, if it
+   does not have a slot, assign it.
+   The resources are keeped in queue private, and resize it if needed.
+   When the thread exit, the slot will be set invalid.
+   When queue released, all the resources will be released. If user still enqueue, flush
+   or finish the queue after it has been released, the behavior is undefined.
+   TODO: Need to shrink the slot map.
+   */
 
-typedef struct _cl_thread_spec_data {
+static int thread_array_num = 1;
+static int *thread_slot_map = NULL;
+static int thread_magic_num = 1;
+static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_key_t destroy_key;
+
+static __thread int thread_id = -1;
+static __thread int thread_magic = -1;
+
+typedef struct _thread_spec_data {
   cl_gpgpu gpgpu ;
   int valid;
-}cl_thread_spec_data;
+  void* thread_batch_buf;
+  int thread_magic;
+} thread_spec_data;
 
-void cl_set_thread_batch_buf(void* buf) {
-  if (thread_batch_buf) {
-    cl_gpgpu_unref_batch_buf(thread_batch_buf);
-  }
-  thread_batch_buf = buf;
-}
+typedef struct _queue_thread_private {
+  thread_spec_data**  threads_data;
+  int threads_data_num;
+  pthread_mutex_t thread_data_lock;
+} queue_thread_private;
 
-void* cl_get_thread_batch_buf(void) {
-  return thread_batch_buf;
+static void thread_data_destructor(void *dummy) {
+  pthread_mutex_lock(&thread_queue_map_lock);
+  thread_slot_map[thread_id] = 0;
+  pthread_mutex_unlock(&thread_queue_map_lock);
+  free(dummy);
 }
 
-cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
 {
-  pthread_key_t* key = queue->thread_data;
-  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
-
-  if (!thread_spec_data) {
-    TRY_ALLOC_NO_ERR(thread_spec_data, CALLOC(struct _cl_thread_spec_data));
-    if (pthread_setspecific(*key, thread_spec_data)) {
-      cl_free(thread_spec_data);
-      return NULL;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+  int i = 0;
+
+  if (thread_id == -1) {
+    void * dummy = malloc(sizeof(int));
+
+    pthread_mutex_lock(&thread_queue_map_lock);
+    for (i = 0; i < thread_array_num; i++) {
+      if (thread_slot_map[i] == 0) {
+        thread_id = i;
+        break;
+      }
     }
+
+    if (i == thread_array_num) {
+      thread_array_num *= 2;
+      thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
+      memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
+      thread_id = thread_array_num/2;
+    }
+
+    thread_slot_map[thread_id] = 1;
+
+    thread_magic = thread_magic_num++;
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_setspecific(destroy_key, dummy);
+  }
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  if (thread_array_num > thread_private->threads_data_num) {// just enlarge
+    int old_num = thread_private->threads_data_num;
+    thread_private->threads_data_num = thread_array_num;
+    thread_private->threads_data = realloc(thread_private->threads_data,
+                thread_private->threads_data_num * sizeof(void *));
+    memset(thread_private->threads_data + old_num, 0,
+           sizeof(void*) * (thread_private->threads_data_num - old_num));
   }
 
-  if (!thread_spec_data->valid) {
-    TRY_ALLOC_NO_ERR(thread_spec_data->gpgpu, cl_gpgpu_new(queue->ctx->drv));
-    thread_spec_data->valid = 1;
+  assert(thread_id != -1 && thread_id < thread_array_num);
+  spec = thread_private->threads_data[thread_id];
+  if (!spec && create) {
+       spec = CALLOC(thread_spec_data);
+       spec->thread_magic = thread_magic;
+       thread_private->threads_data[thread_id] = spec;
   }
 
-error:
-  return thread_spec_data->gpgpu;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return spec;
 }
 
-void cl_invalid_thread_gpgpu(cl_command_queue queue)
+void* cl_thread_data_create(void)
 {
-  pthread_key_t* key = queue->thread_data;
-  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
+  queue_thread_private* thread_private = CALLOC(queue_thread_private);
 
-  if (!thread_spec_data) {
-    return;
+  if (thread_private == NULL)
+    return NULL;
+
+  if (thread_slot_map == NULL) {
+    pthread_mutex_lock(&thread_queue_map_lock);
+    thread_slot_map = calloc(thread_array_num, sizeof(int));
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_key_create(&destroy_key, thread_data_destructor);
   }
 
-  if (!thread_spec_data->valid) {
-    return;
+  pthread_mutex_init(&thread_private->thread_data_lock, NULL);
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
+  memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
+  thread_private->threads_data_num = thread_array_num;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return thread_private;
+}
+
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+  if (!spec->thread_magic && spec->thread_magic != thread_magic) {
+    //We may get the slot from last thread. So free the resource.
+    spec->valid = 0;
   }
 
-  assert(thread_spec_data->gpgpu);
-  cl_gpgpu_delete(thread_spec_data->gpgpu);
-  thread_spec_data->valid = 0;
+  if (!spec->valid) {
+    if (spec->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+      spec->thread_batch_buf = NULL;
+    }
+    if (spec->gpgpu) {
+      cl_gpgpu_delete(spec->gpgpu);
+      spec->gpgpu = NULL;
+    }
+    TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+    spec->valid = 1;
+  }
+
+ error:
+  return spec->gpgpu;
 }
 
-static void thread_data_destructor(void *data) {
-  cl_thread_spec_data* thread_spec_data = (cl_thread_spec_data *)data;
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
 
-  if (thread_batch_buf) {
-    cl_gpgpu_unref_batch_buf(thread_batch_buf);
-    thread_batch_buf = NULL;
+  assert(spec && spec->thread_magic == thread_magic);
+
+  if (spec->thread_batch_buf) {
+    cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
   }
+  spec->thread_batch_buf = buf;
+}
+
+void* cl_get_thread_batch_buf(cl_command_queue queue) {
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
 
-  if (thread_spec_data->valid)
-    cl_gpgpu_delete(thread_spec_data->gpgpu);
-  cl_free(thread_spec_data);
+  assert(spec && spec->thread_magic == thread_magic);
+
+  return spec->thread_batch_buf;
 }
 
-/* Create the thread specific data. */
-void* cl_thread_data_create(void)
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
 {
-  int rc = 0;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
 
-  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
-  if (thread_specific_key == NULL)
-    return NULL;
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid) {
+    return;
+  }
 
-  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
+  assert(spec->gpgpu);
+  cl_gpgpu_delete(spec->gpgpu);
+  spec->gpgpu = NULL;
+  spec->valid = 0;
+}
+
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
 
-  if (rc != 0)
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid)
     return NULL;
 
-  return thread_specific_key;
+  assert(spec->gpgpu);
+  cl_gpgpu gpgpu = spec->gpgpu;
+  spec->gpgpu = NULL;
+  spec->valid = 0;
+  return gpgpu;
 }
 
 /* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(void * data)
+void cl_thread_data_destroy(cl_command_queue queue)
 {
-  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
-
-  /* First release self spec data. */
-  cl_thread_spec_data* thread_spec_data =
-         pthread_getspecific(*thread_specific_key);
-  if (thread_spec_data && thread_spec_data->valid) {
-    cl_gpgpu_delete(thread_spec_data->gpgpu);
-    if (thread_spec_data)
-      cl_free(thread_spec_data);
+  int i = 0;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  int threads_data_num;
+  thread_spec_data** threads_data;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  assert(thread_private->threads_data_num == thread_array_num);
+  threads_data_num = thread_private->threads_data_num;
+  threads_data = thread_private->threads_data;
+  thread_private->threads_data_num = 0;
+  thread_private->threads_data = NULL;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+  cl_free(thread_private);
+  queue->thread_data = NULL;
+
+  for (i = 0; i < threads_data_num; i++) {
+    if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
+      threads_data[i]->thread_batch_buf = NULL;
+    }
+
+    if (threads_data[i] != NULL && threads_data[i]->valid) {
+      cl_gpgpu_delete(threads_data[i]->gpgpu);
+      threads_data[i]->gpgpu = NULL;
+      threads_data[i]->valid = 0;
+    }
+    cl_free(threads_data[i]);
   }
 
-  pthread_key_delete(*thread_specific_key);
-  cl_free(thread_specific_key);
+  cl_free(threads_data);
 }
diff --git a/src/cl_thread.h b/src/cl_thread.h
index c8ab63c..ecc99ad 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -27,7 +27,7 @@
 void* cl_thread_data_create(void);
 
 /* The destructor for clean the thread specific data. */
-void cl_thread_data_destroy(void * data);
+void cl_thread_data_destroy(cl_command_queue queue);
 
 /* Used to get the gpgpu struct of each thread. */
 cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
@@ -36,9 +36,12 @@ cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
 void cl_invalid_thread_gpgpu(cl_command_queue queue);
 
 /* Used to set the batch buffer of each thread. */
-void cl_set_thread_batch_buf(void* buf);
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
 
 /* Used to get the batch buffer of each thread. */
-void* cl_get_thread_batch_buf(void);
+void* cl_get_thread_batch_buf(cl_command_queue queue);
+
+/* take current gpgpu from the thread gpgpu pool. */
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
 
 #endif /* __CL_THREAD_H__ */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index fa900a7..26cf329 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -149,6 +149,35 @@ do {                                                        \
 struct _cl_mem_image *IMAGE;                                \
 IMAGE = cl_mem_image(MEM);                                  \
 
+#define FIXUP_IMAGE_REGION(IMAGE, PREGION, REGION)          \
+const size_t *REGION;                                       \
+size_t REGION ##_REC[3];                                    \
+do {                                                        \
+  if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {   \
+    REGION ##_REC[0] = PREGION[0];                          \
+    REGION ##_REC[1] = 1;                                   \
+    REGION ##_REC[2] = PREGION[1];                          \
+    REGION = REGION ##_REC;                                 \
+  } else {                                                  \
+    REGION = PREGION;                                       \
+  }                                                         \
+} while(0)
+
+#define FIXUP_IMAGE_ORIGIN(IMAGE, PREGION, REGION)          \
+const size_t *REGION;                                       \
+size_t REGION ##_REC[3];                                    \
+do {                                                        \
+  if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {   \
+    REGION ##_REC[0] = PREGION[0];                          \
+    REGION ##_REC[1] = 0;                                   \
+    REGION ##_REC[2] = PREGION[1];                          \
+    REGION = REGION ##_REC;                                 \
+  } else {                                                  \
+    REGION = PREGION;                                       \
+  }                                                         \
+} while(0)
+
+
 #define CHECK_EVENT(EVENT)                                    \
   do {                                                        \
     if (UNLIKELY(EVENT == NULL)) {                            \
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
index 62eedd0..d3da3cc 100644
--- a/src/intel/intel_batchbuffer.c
+++ b/src/intel/intel_batchbuffer.c
@@ -53,7 +53,7 @@
 #include <string.h>
 #include <assert.h>
 
-LOCAL void
+LOCAL int
 intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
 {
   if (batch->buffer != NULL) {
@@ -66,14 +66,19 @@ intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
                                "batch buffer",
                                sz,
                                64);
-  assert(batch->buffer);
-
-  dri_bo_map(batch->buffer, 1);
+  if (!batch->buffer || (dri_bo_map(batch->buffer, 1) != 0)) {
+    if (batch->buffer)
+      dri_bo_unreference(batch->buffer);
+    batch->buffer = NULL;
+    return -1;
+  }
   batch->map = (uint8_t*) batch->buffer->virtual;
   batch->size = sz;
   batch->ptr = batch->map;
   batch->atomic = 0;
   batch->last_bo = batch->buffer;
+  batch->enable_slm = 0;
+  return 0;
 }
 
 LOCAL void
@@ -119,7 +124,14 @@ intel_batchbuffer_flush(intel_batchbuffer_t *batch)
   if (!is_locked)
     intel_driver_lock_hardware(batch->intel);
 
-  dri_bo_exec(batch->buffer, used, 0, 0, 0);
+  int flag = I915_EXEC_RENDER;
+  if(batch->enable_slm) {
+    /* use the hard code here temp, must change to
+     * I915_EXEC_ENABLE_SLM when it drm accept the patch */
+    flag |= (1<<13);
+  }
+  drm_intel_gem_bo_context_exec(batch->buffer, batch->intel->ctx, used, flag);
+
   if (!is_locked)
     intel_driver_unlock_hardware(batch->intel);
 
@@ -177,4 +189,3 @@ intel_batchbuffer_delete(intel_batchbuffer_t *batch)
 
   cl_free(batch);
 }
-
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
index 74f1790..4c28a7c 100644
--- a/src/intel/intel_batchbuffer.h
+++ b/src/intel/intel_batchbuffer.h
@@ -83,6 +83,9 @@ typedef struct intel_batchbuffer
   uint32_t size;
   uint8_t *map;
   uint8_t *ptr;
+  /** HSW: can't set LRI in batch buffer, set I915_EXEC_ENABLE_SLM
+   *  flag when call exec. */
+  uint8_t enable_slm;
   int atomic;
 } intel_batchbuffer_t;
 
@@ -97,7 +100,7 @@ extern void intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t*);
 extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*);
 extern void intel_batchbuffer_terminate(intel_batchbuffer_t*);
 extern void intel_batchbuffer_flush(intel_batchbuffer_t*);
-extern void intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz);
+extern int intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz);
 
 static INLINE uint32_t
 intel_batchbuffer_space(const intel_batchbuffer_t *batch)
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index e5015ec..02ffde4 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -288,10 +288,18 @@
 #define I965_TILEWALK_XMAJOR                 0
 #define I965_TILEWALK_YMAJOR                 1
 
+#define I965_SURCHAN_SELECT_ZERO             0
+#define I965_SURCHAN_SELECT_ONE              1
+#define I965_SURCHAN_SELECT_RED              4
+#define I965_SURCHAN_SELECT_GREEN            5
+#define I965_SURCHAN_SELECT_BLUE             6
+#define I965_SURCHAN_SELECT_ALPHA            7
+
 #define URB_SIZE(intel)         (IS_IGDNG(intel->device_id) ? 1024 : \
                                  IS_G4X(intel->device_id) ? 384 : 256)
 
 // L3 cache stuff 
+#define GEN7_L3_SQC_REG1_ADDRESS_OFFSET          (0XB010)
 #define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET         (0xB020)
 #define GEN7_L3_CNTL_REG3_ADDRESS_OFFSET         (0xB024)
 
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
index b31844e..188c1fa 100644
--- a/src/intel/intel_dri_resource_sharing.c
+++ b/src/intel/intel_dri_resource_sharing.c
@@ -119,12 +119,12 @@ intel_get_gl_obj_from_texture(void *driver,
 }
 
 static GLenum
-get_cl_gl_format(gl_format format)
+get_cl_gl_format(mesa_format format)
 {
    switch (format) {
-   case MESA_FORMAT_RGBA8888:
+   case MESA_FORMAT_R8G8B8A8_UNORM:
       return GL_RGBA;
-   case MESA_FORMAT_ARGB8888:
+   case MESA_FORMAT_A8R8G8B8_UNORM:
       return GL_BGRA;
    default:
       return GL_BGRA;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index f88a105..deb83c8 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -52,12 +52,15 @@
 #include "x11/mesa_egl_extension.h"
 #endif
 
+#ifdef HAS_X11
+#include <X11/Xlibint.h>
+#include "x11/dricommon.h"
+#endif
+
 #include "intel_driver.h"
 #include "intel_gpgpu.h"
 #include "intel_batchbuffer.h"
 #include "intel_bufmgr.h"
-#include <X11/Xlibint.h>
-#include "x11/dricommon.h"
 #include "cl_mem.h"
 
 #include <assert.h>
@@ -106,6 +109,7 @@ intel_driver_delete(intel_driver_t *driver)
 {
   if (driver == NULL)
     return;
+
   if (driver->bufmgr)
     drm_intel_bufmgr_destroy(driver->bufmgr);
   cl_free(driver);
@@ -128,16 +132,32 @@ error:
 }
 
 /* just used for maximum relocation number in drm_intel */
-#define BATCH_SIZE 0x1000
+#define BATCH_SIZE 0x4000
 
 static void
 intel_driver_memman_init(intel_driver_t *driver)
 {
   driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
   assert(driver->bufmgr);
+  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
   drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
 }
 
+static void
+intel_driver_context_init(intel_driver_t *driver)
+{
+  driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+  assert(driver->ctx);
+}
+
+static void
+intel_driver_context_destroy(intel_driver_t *driver)
+{
+  if(driver->ctx)
+    drm_intel_gem_context_destroy(driver->ctx);
+  driver->ctx = NULL;
+}
+
 static void 
 intel_driver_init(intel_driver_t *driver, int dev_fd)
 {
@@ -150,6 +170,7 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
   intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, &driver->device_id);
   assert(res);
   intel_driver_memman_init(driver);
+  intel_driver_context_init(driver);
 
 #if EMULATE_GEN
   driver->gen_ver = EMULATE_GEN;
@@ -175,19 +196,22 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
 #endif /* EMULATE_GEN */
 }
 
-static void
+static cl_int
 intel_driver_open(intel_driver_t *intel, cl_context_prop props)
 {
   int cardi;
+#ifdef HAS_X11
   char *driver_name;
+#endif
   if (props != NULL
       && props->gl_type != CL_GL_NOSHARE
       && props->gl_type != CL_GL_GLX_DISPLAY
       && props->gl_type != CL_GL_EGL_DISPLAY) {
-    printf("Unsupported gl share type %d.\n", props->gl_type);
-    exit(-1);
+    fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
+    return CL_INVALID_OPERATION;
   }
 
+#ifdef HAS_X11
   intel->x11_display = XOpenDisplay(NULL);
 
   if(intel->x11_display) {
@@ -198,25 +222,31 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
       Xfree(driver_name);
     }
     else
-      printf("X server found. dri2 connection failed! \n");
-  } else {
-    printf("Can't find X server!\n");
+      fprintf(stderr, "X server found. dri2 connection failed! \n");
+  }
+#endif
+
+  if(!intel_driver_is_active(intel)) {
+    char card_name[20];
+    for(cardi = 0; cardi < 16; cardi++) {
+      sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
+      if(intel_driver_init_render(intel, card_name))
+        break;
+    }
   }
 
   if(!intel_driver_is_active(intel)) {
-    printf("Trying to open directly...\n");
     char card_name[20];
     for(cardi = 0; cardi < 16; cardi++) {
       sprintf(card_name, "/dev/dri/card%d", cardi);
-      if(intel_driver_init_master(intel, card_name)) {
-        printf("Success at %s.\n", card_name);
+      if(intel_driver_init_master(intel, card_name))
         break;
-      }
     }
   }
+
   if(!intel_driver_is_active(intel)) {
-    printf("Device open failed\n");
-    exit(-1);
+    fprintf(stderr, "Device open failed, aborting...\n");
+    return CL_DEVICE_NOT_FOUND;
   }
 
 #ifdef HAS_EGL
@@ -224,17 +254,23 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
     assert(props->egl_display);
   }
 #endif
+  return CL_SUCCESS;
 }
 
 static void
 intel_driver_close(intel_driver_t *intel)
 {
+#ifdef HAS_X11
   if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
   if(intel->x11_display) XCloseDisplay(intel->x11_display);
-  if(intel->fd) close(intel->fd);
+#endif
+  if(intel->need_close) {
+    close(intel->fd);
+    intel->need_close = 0;
+  }
   intel->dri_ctx = NULL;
   intel->x11_display = NULL;
-  intel->fd = 0;
+  intel->fd = -1;
 }
 
 LOCAL int
@@ -256,6 +292,7 @@ intel_driver_is_active(intel_driver_t *driver) {
   return driver->fd >= 0;
 }
 
+#ifdef HAS_X11
 LOCAL int 
 intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
 {
@@ -263,9 +300,10 @@ intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
   if(state->driConnectedFlag != DRI2)
     return 0;
   intel_driver_init(driver, state->fd);
-  driver->master = 0;
+  driver->need_close = 0;
   return 1;
 }
+#endif
 
 LOCAL int
 intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
@@ -277,31 +315,43 @@ intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
   // usually dev_name = "/dev/dri/card%d"
   dev_fd = open(dev_name, O_RDWR);
   if (dev_fd == -1) {
-    printf("open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+    fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
     return 0;
   }
 
-  // Check that we're authenticated and the only opener
+  // Check that we're authenticated
   memset(&client, 0, sizeof(drm_client_t));
   int ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
-  assert (ret == 0);
-
-  if (!client.auth) {
-    printf("%s not authenticated\n", dev_name);
+  if (ret == -1) {
+    fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
     close(dev_fd);
     return 0;
   }
 
-  client.idx = 1;
-  ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
-  if (ret != -1 || errno != EINVAL) {
-    printf("%s is already in use\n", dev_name);
+  if (!client.auth) {
+    fprintf(stderr, "%s not authenticated\n", dev_name);
     close(dev_fd);
     return 0;
   }
 
   intel_driver_init(driver, dev_fd);
-  driver->master = 1;
+  driver->need_close = 1;
+
+  return 1;
+}
+
+LOCAL int
+intel_driver_init_render(intel_driver_t *driver, const char* dev_name)
+{
+  int dev_fd;
+
+  // usually dev_name = "/dev/dri/renderD%d"
+  dev_fd = open(dev_name, O_RDWR);
+  if (dev_fd == -1)
+    return 0;
+
+  intel_driver_init(driver, dev_fd);
+  driver->need_close = 1;
 
   return 1;
 }
@@ -311,8 +361,10 @@ intel_driver_terminate(intel_driver_t *driver)
 {
   pthread_mutex_destroy(&driver->ctxmutex);
 
-  if(driver->master)
+  if(driver->need_close) {
     close(driver->fd);
+    driver->need_close = 0;
+  }
   driver->fd = -1;
   return 1;
 }
@@ -336,7 +388,6 @@ intel_driver_unlock_hardware(intel_driver_t *driver)
 LOCAL dri_bo*
 intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
 {
-  assert(!driver->master);
   dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
                                              sname,
                                              name);
@@ -347,7 +398,6 @@ LOCAL uint32_t
 intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
 {
   uint32_t name;
-  assert(!driver->master);
   assert(bo);
   dri_bo_flink(bo, &name);
   return name;
@@ -361,8 +411,9 @@ intel_get_device_id(void)
 
   driver = intel_driver_new();
   assert(driver != NULL);
-  intel_driver_open(driver, NULL);
+  if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
   intel_device_id = driver->device_id;
+  intel_driver_context_destroy(driver);
   intel_driver_close(driver);
   intel_driver_terminate(driver);
   intel_driver_delete(driver);
@@ -375,21 +426,20 @@ cl_intel_driver_delete(intel_driver_t *driver)
 {
   if (driver == NULL)
     return;
+  intel_driver_context_destroy(driver);
   intel_driver_close(driver);
   intel_driver_terminate(driver);
   intel_driver_delete(driver);
 }
-#include "program.h"
+
+#include "cl_gbe_loader.h"
 static intel_driver_t*
 cl_intel_driver_new(cl_context_prop props)
 {
   intel_driver_t *driver = NULL;
   TRY_ALLOC_NO_ERR (driver, intel_driver_new());
+  if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
   intel_driver_open(driver, props);
-  /* We use the first 2 slots(0,1) for all the bufs.
-   * Notify the gbe this base index, thus gbe can avoid conflicts
-   * when it allocates slots for images*/
-  gbe_set_image_base_index(3);
 exit:
   return driver;
 error:
@@ -540,10 +590,12 @@ intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
   region.tiling = get_cl_tiling(region.tiling);
   if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
     goto error;
+
+  if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
+    goto error;
   intel_fmt = cl_image_get_intel_format(&cl_format);
   if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
     goto error;
-  cl_image_byte_per_pixel(&cl_format, &bpp);
   cl_mem_object_type image_type;
   if (get_mem_type_from_target(target, &image_type) != 0)
     goto error;
@@ -607,13 +659,15 @@ cl_buffer intel_share_buffer_from_libva(cl_context ctx,
 
 cl_buffer intel_share_image_from_libva(cl_context ctx,
                                        unsigned int bo_name,
-                                       struct _cl_mem_image *image)
+                                       struct _cl_mem_image *image,
+                                       unsigned int offset)
 {
   drm_intel_bo *intel_bo;
   uint32_t intel_tiling, intel_swizzle_mode;
 
   intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
 
+  intel_bo->offset += offset;
   drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
   image->tiling = get_cl_tiling(intel_tiling);
 
@@ -642,11 +696,14 @@ static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
 static int intel_buffer_set_tiling(cl_buffer bo,
                                    cl_image_tiling_t tiling, size_t stride)
 {
-  uint32_t intel_tiling, required_tiling;
+  uint32_t intel_tiling;
   int ret;
   if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
     return -1;
+#ifndef NDEBUG
+  uint32_t required_tiling;
   required_tiling = intel_tiling;
+#endif
   ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
   assert(intel_tiling == required_tiling);
   return ret;
@@ -682,5 +739,6 @@ intel_setup_callbacks(void)
   cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
   cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
-  intel_set_gpgpu_callbacks();
+  cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
+  intel_set_gpgpu_callbacks(intel_get_device_id());
 }
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index a01d881..107fdfc 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -78,13 +78,14 @@ typedef struct _XDisplay Display;
 typedef struct intel_driver
 {
   dri_bufmgr *bufmgr;
+  drm_intel_context *ctx;
   int fd;
   int device_id;
   int gen_ver;
   sigset_t sa_mask;
   pthread_mutex_t ctxmutex;
   int locked;
-  int master;
+  int need_close;
   Display *x11_display;
   struct dri_state *dri_ctx;
 } intel_driver_t;
@@ -105,6 +106,9 @@ extern int intel_driver_init_shared(intel_driver_t*, struct dri_state*);
  */
 extern int intel_driver_init_master(intel_driver_t*, const char* dev_name);
 
+/* init driver for render node */
+extern int intel_driver_init_render(intel_driver_t*, const char* dev_name);
+
 /* terminate driver and all underlying structures */
 extern int intel_driver_terminate(intel_driver_t*);
 
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b2d8bb0..c4b9156 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -28,12 +28,14 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stddef.h>
+#include <errno.h>
 
 #include "intel/intel_gpgpu.h"
 #include "intel/intel_defines.h"
 #include "intel/intel_structs.h"
 #include "intel/intel_batchbuffer.h"
 #include "intel/intel_driver.h"
+#include "program.h" // for BTI_RESERVED_NUM
 
 #include "cl_alloc.h"
 #include "cl_utils.h"
@@ -60,9 +62,8 @@ typedef struct surface_heap {
 } surface_heap_t;
 
 typedef struct intel_event {
-  intel_batchbuffer_t *batch;
-  drm_intel_bo* buffer;
-  drm_intel_bo* ts_buf;
+  drm_intel_bo *buffer;
+  drm_intel_bo *ts_buf;
   int status;
 } intel_event_t;
 
@@ -78,6 +79,9 @@ enum {max_sampler_n = 16 };
 /* Handle GPGPU state */
 struct intel_gpgpu
 {
+  void* ker_opaque;
+  size_t global_wk_sz[3];
+  void* printf_info;
   intel_driver_t *drv;
   intel_batchbuffer_t *batch;
   cl_gpgpu_kernel *ker;
@@ -88,33 +92,48 @@ struct intel_gpgpu
 
   unsigned long img_bitmap;              /* image usage bitmap. */
   unsigned int img_index_base;          /* base index for image surface.*/
-  drm_intel_bo *binded_img[max_img_n];  /* all images binded for the call */
 
   unsigned long sampler_bitmap;          /* sampler usage bitmap. */
 
   struct { drm_intel_bo *bo; } stack_b;
-  struct { drm_intel_bo *bo; } idrt_b;
-  struct { drm_intel_bo *bo; } surface_heap_b;
-  struct { drm_intel_bo *bo; } vfe_state_b;
-  struct { drm_intel_bo *bo; } curbe_b;
-  struct { drm_intel_bo *bo; } sampler_state_b;
-  struct { drm_intel_bo *bo; } sampler_border_color_state_b;
   struct { drm_intel_bo *bo; } perf_b;
   struct { drm_intel_bo *bo; } scratch_b;
   struct { drm_intel_bo *bo; } constant_b;
   struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
+  struct { drm_intel_bo *bo;
+           drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
+
+  struct { drm_intel_bo *bo; } aux_buf;
+  struct {
+    uint32_t surface_heap_offset;
+    uint32_t curbe_offset;
+    uint32_t idrt_offset;
+    uint32_t sampler_state_offset;
+    uint32_t sampler_border_color_state_offset;
+  } aux_offset;
 
   uint32_t per_thread_scratch;
   struct {
     uint32_t num_cs_entries;
     uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
-  } urb;
+  } curb;
 
   uint32_t max_threads;      /* max threads requested by the user */
 };
 
 typedef struct intel_gpgpu intel_gpgpu_t;
 
+typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
+intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
+
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
+
+typedef void (intel_gpgpu_post_action_t)(intel_gpgpu_t *gpgpu, int32_t flush_mode);
+intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL;
+
+typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr);
+intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL;
 
 static void
 intel_gpgpu_sync(void *buf)
@@ -144,18 +163,12 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
     return;
   if(gpgpu->time_stamp_b.bo)
     drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
-  if (gpgpu->surface_heap_b.bo)
-    drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
-  if (gpgpu->idrt_b.bo)
-    drm_intel_bo_unreference(gpgpu->idrt_b.bo);
-  if (gpgpu->vfe_state_b.bo)
-    drm_intel_bo_unreference(gpgpu->vfe_state_b.bo);
-  if (gpgpu->curbe_b.bo)
-    drm_intel_bo_unreference(gpgpu->curbe_b.bo);
-  if (gpgpu->sampler_state_b.bo)
-    drm_intel_bo_unreference(gpgpu->sampler_state_b.bo);
-  if (gpgpu->sampler_border_color_state_b.bo)
-    drm_intel_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
+  if(gpgpu->printf_b.bo)
+    drm_intel_bo_unreference(gpgpu->printf_b.bo);
+  if(gpgpu->printf_b.ibo)
+    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+  if (gpgpu->aux_buf.bo)
+    drm_intel_bo_unreference(gpgpu->aux_buf.bo);
   if (gpgpu->perf_b.bo)
     drm_intel_bo_unreference(gpgpu->perf_b.bo);
   if (gpgpu->stack_b.bo)
@@ -196,10 +209,22 @@ intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+  return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+  return llccc_ec | l3cc_ec;
+}
+
 static void
 intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
 {
-  const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
+  const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
   BEGIN_BATCH(gpgpu->batch, 10);
   OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
   /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
@@ -209,10 +234,11 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
    * binding table pointer at 11 bits. So, we cannot use pointers directly while
    * using the surface heap
    */
-  OUT_RELOC(gpgpu->batch, gpgpu->surface_heap_b.bo,
+  assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
             I915_GEM_DOMAIN_INSTRUCTION,
             I915_GEM_DOMAIN_INSTRUCTION,
-            0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);
+            gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY));
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
@@ -234,26 +260,66 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
+  return size / 1024 - 1;
+}
+
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+    size = size >> 11;
+    uint32_t index = 0;
+    while((size >>= 1) > 0)
+      index++;   //get leading one
+
+    //non pow 2 size
+    if(size & (size - 1)) index++;
+    return index;
+}
+
+static cl_int
+intel_gpgpu_get_max_curbe_size(uint32_t device_id)
+{
+  if (IS_BAYTRAIL_T(device_id) ||
+      IS_IVB_GT1(device_id))
+    return 992;
+  else
+    return 2016;
+}
+
+static cl_int
+intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu)
+{
+  int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries;
+  int max_curbe_size = intel_gpgpu_get_max_curbe_size(gpgpu->drv->device_id);
+
+  if (curbe_size > max_curbe_size) {
+    fprintf(stderr, "warning, curbe size exceed limitation.\n");
+    return max_curbe_size;
+  } else
+    return curbe_size;
+}
+
 static void
 intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 {
+  int32_t scratch_index;
   BEGIN_BATCH(gpgpu->batch, 8);
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
   if(gpgpu->per_thread_scratch > 0) {
+    scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
     OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
               I915_GEM_DOMAIN_RENDER,
               I915_GEM_DOMAIN_RENDER,
-              gpgpu->per_thread_scratch/1024 - 1);
+              scratch_index);
   }
   else {
     OUT_BATCH(gpgpu->batch, 0);
   }
   /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */
-  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4);
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4);
   OUT_BATCH(gpgpu->batch, 0);
   /* curbe_size */
-  OUT_BATCH(gpgpu->batch, 480);
+  OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu));
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, 0);
@@ -266,15 +332,8 @@ intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
   OUT_BATCH(gpgpu->batch, 0);                     /* mbz */
-// XXX
-#if 1
-  OUT_BATCH(gpgpu->batch,
-            gpgpu->urb.size_cs_entry*
-            gpgpu->urb.num_cs_entries*32);
-#else
-  OUT_BATCH(gpgpu->batch, 5120);
-#endif
-  OUT_RELOC(gpgpu->batch, gpgpu->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+  OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -285,20 +344,22 @@ intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
   OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
   OUT_BATCH(gpgpu->batch, 0);                    /* mbz */
   OUT_BATCH(gpgpu->batch, 1 << 5);
-  OUT_RELOC(gpgpu->batch, gpgpu->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
 static const uint32_t gpgpu_l3_config_reg1[] = {
   0x00080040, 0x02040040, 0x00800040, 0x01000038,
   0x02000030, 0x01000038, 0x00000038, 0x00000040,
-  0x0A140091, 0x09100091, 0x08900091, 0x08900091
+  0x0A140091, 0x09100091, 0x08900091, 0x08900091,
+  0x010000a1
 };
 
 static const uint32_t gpgpu_l3_config_reg2[] = {
   0x00000000, 0x00000000, 0x00080410, 0x00080410,
   0x00040410, 0x00040420, 0x00080420, 0x00080020,
-  0x00204080, 0x00244890, 0x00284490, 0x002444A0
+  0x00204080, 0x00244890, 0x00284490, 0x002444A0,
+  0x00040810
 };
 
 /* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
@@ -319,9 +380,8 @@ intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
 static void
 intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 {
-  BEGIN_BATCH(gpgpu->batch, SIZEOF32(gen6_pipe_control_t));
   gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
-    intel_batchbuffer_alloc_space(gpgpu->batch, 0);
+    intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
   memset(pc, 0, sizeof(*pc));
   pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
   pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
@@ -332,27 +392,89 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
   pc->dw1.texture_cache_invalidation_enable = 1;
   pc->dw1.cs_stall = 1;
   pc->dw1.dc_flush_enable = 1;
+  //pc->dw1.instruction_cache_invalidate_enable = 1;
   ADVANCE_BATCH(gpgpu->batch);
 }
 
 static void
-intel_gpgpu_set_L3(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
-  BEGIN_BATCH(gpgpu->batch, 6);
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00730000);
+
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
   if (use_slm)
-    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
   else
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
 
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
   if (use_slm)
-    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
   else
     OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
   ADVANCE_BATCH(gpgpu->batch);
+
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_baytrail(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  BEGIN_BATCH(gpgpu->batch, 9);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00D30000);    /* General credit : High credit = 26 : 6 */
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, 0x01020021);  /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */
+  else
+    OUT_BATCH(gpgpu->batch, 0x02040040);  /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x0);           /* {I/S=0, Const=0, Tex=0} */
+
+  ADVANCE_BATCH(gpgpu->batch);
+
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  /* still set L3 in batch buffer for fulsim. */
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00610000);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+    ADVANCE_BATCH(gpgpu->batch);
+
+  //if(use_slm)
+  //  gpgpu->batch->enable_slm = 1;
   intel_gpgpu_pipe_control(gpgpu);
 }
 
@@ -361,6 +483,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
 {
   intel_batchbuffer_start_atomic(gpgpu->batch, 256);
   intel_gpgpu_pipe_control(gpgpu);
+  assert(intel_gpgpu_set_L3);
   intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_slm);
   intel_gpgpu_select_pipeline(gpgpu);
   intel_gpgpu_set_base_address(gpgpu);
@@ -388,6 +511,24 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
 }
 
 static void
+intel_gpgpu_post_action_gen7(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+  if(flush_mode)
+    intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_post_action_gen75(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+  /* flush force for set L3 */
+  intel_gpgpu_pipe_control(gpgpu);
+
+  /* Restore L3 control to disable SLM mode,
+     otherwise, may affect 3D pipeline */
+  intel_gpgpu_set_L3(gpgpu, 0);
+}
+
+static void
 intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
 {
   /* Insert PIPE_CONTROL for time stamp of end*/
@@ -409,14 +550,14 @@ intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
     ADVANCE_BATCH(gpgpu->batch);
   }
 
-  if(flush_mode) intel_gpgpu_pipe_control(gpgpu);
+  intel_gpgpu_post_action(gpgpu, flush_mode);
   intel_batchbuffer_end_atomic(gpgpu->batch);
 }
 
-static void
+static int
 intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
 {
-  intel_batchbuffer_reset(gpgpu->batch, sz);
+  return intel_batchbuffer_reset(gpgpu->batch, sz);
 }
 /* check we do not get a 0 starting address for binded buf */
 static void
@@ -428,20 +569,28 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
 }
 
 static void
+intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
+{
+  assert(batch);
+  intel_batchbuffer_emit_mi_flush(batch);
+  intel_batchbuffer_flush(batch);
+}
+
+static void
 intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
-  intel_batchbuffer_emit_mi_flush(gpgpu->batch);
-  intel_batchbuffer_flush(gpgpu->batch);
+  if (!gpgpu->batch || !gpgpu->batch->buffer)
+    return;
+  intel_gpgpu_flush_batch_buffer(gpgpu->batch);
   intel_gpgpu_check_binded_buf_address(gpgpu);
 }
 
-static void
+static int
 intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
                        uint32_t max_threads,
                        uint32_t size_cs_entry,
                        int profiling)
 {
-  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo *bo;
 
   /* Binded buffers */
@@ -451,175 +600,242 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
 
   /* URB */
-  gpgpu->urb.num_cs_entries = 64;
-  gpgpu->urb.size_cs_entry = size_cs_entry;
+  gpgpu->curb.num_cs_entries = 64;
+  gpgpu->curb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
+  if (gpgpu->printf_b.ibo)
+    dri_bo_unreference(gpgpu->printf_b.ibo);
+  gpgpu->printf_b.ibo = NULL;
+  if (gpgpu->printf_b.bo)
+    dri_bo_unreference(gpgpu->printf_b.bo);
+  gpgpu->printf_b.bo = NULL;
+
   /* Set the profile buffer*/
   if(gpgpu->time_stamp_b.bo)
     dri_bo_unreference(gpgpu->time_stamp_b.bo);
   gpgpu->time_stamp_b.bo = NULL;
   if (profiling) {
     bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
-    assert(bo);
     gpgpu->time_stamp_b.bo = bo;
+    if (!bo)
+      fprintf(stderr, "Could not allocate buffer for profiling.\n");
   }
 
-  /* Constant URB  buffer */
-  if(gpgpu->curbe_b.bo)
-    dri_bo_unreference(gpgpu->curbe_b.bo);
-  uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
-  size_cb = ALIGN(size_cb, 4096);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
-  assert(bo);
-  gpgpu->curbe_b.bo = bo;
-
-  /* surface state */
-  if(gpgpu->surface_heap_b.bo)
-    dri_bo_unreference(gpgpu->surface_heap_b.bo);
-  bo = dri_bo_alloc(bufmgr,
-                    "SURFACE_HEAP",
-                    sizeof(surface_heap_t),
-                    32);
-  assert(bo);
-  dri_bo_map(bo, 1);
-  memset(bo->virtual, 0, sizeof(surface_heap_t));
-  gpgpu->surface_heap_b.bo = bo;
-
-  /* Interface descriptor remap table */
-  if(gpgpu->idrt_b.bo)
-    dri_bo_unreference(gpgpu->idrt_b.bo);
-  bo = dri_bo_alloc(bufmgr,
-                    "IDRT",
-                    MAX_IF_DESC * sizeof(struct gen6_interface_descriptor),
-                    32);
-  assert(bo);
-  gpgpu->idrt_b.bo = bo;
-
-  /* vfe state */
-  if(gpgpu->vfe_state_b.bo)
-    dri_bo_unreference(gpgpu->vfe_state_b.bo);
-  gpgpu->vfe_state_b.bo = NULL;
-
-  /* sampler state */
-  if (gpgpu->sampler_state_b.bo)
-    dri_bo_unreference(gpgpu->sampler_state_b.bo);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr,
-                    "SAMPLER_STATE",
-                    GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
-                    32);
-  assert(bo);
-  dri_bo_map(bo, 1);
-  memset(bo->virtual, 0, sizeof(gen6_sampler_state_t) * GEN_MAX_SAMPLERS);
-  gpgpu->sampler_state_b.bo = bo;
-
-  /* sampler border color state */
-  if (gpgpu->sampler_border_color_state_b.bo)
-    dri_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr,
-                    "SAMPLER_BORDER_COLOR_STATE",
-                    sizeof(gen7_sampler_border_color_t),
-                    32);
-  assert(bo);
-  dri_bo_map(bo, 1);
-  memset(bo->virtual, 0, sizeof(gen7_sampler_border_color_t));
-  gpgpu->sampler_border_color_state_b.bo = bo;
-
   /* stack */
   if (gpgpu->stack_b.bo)
     dri_bo_unreference(gpgpu->stack_b.bo);
   gpgpu->stack_b.bo = NULL;
+
+  /* Set the auxiliary buffer*/
+  uint32_t size_aux = 0;
+  if(gpgpu->aux_buf.bo)
+    dri_bo_unreference(gpgpu->aux_buf.bo);
+  gpgpu->aux_buf.bo = NULL;
+
+  //surface heap must be 4096 bytes aligned because state base address use 20bit for the address
+  size_aux = ALIGN(size_aux, 4096);
+  gpgpu->aux_offset.surface_heap_offset = size_aux;
+  size_aux += sizeof(surface_heap_t);
+
+  //curbe must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.curbe_offset = size_aux;
+  size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
+
+  //idrt must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.idrt_offset = size_aux;
+  size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
+
+  //sampler state must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.sampler_state_offset = size_aux;
+  size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+
+  //sampler border color state must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.sampler_border_color_state_offset = size_aux;
+  size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
+
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 0);
+  if (!bo || dri_bo_map(bo, 1) != 0) {
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    if (bo)
+      dri_bo_unreference(bo);
+    if (profiling && gpgpu->time_stamp_b.bo)
+      dri_bo_unreference(gpgpu->time_stamp_b.bo);
+    gpgpu->time_stamp_b.bo = NULL;
+    return -1;
+  }
+  memset(bo->virtual, 0, size_aux);
+  gpgpu->aux_buf.bo = bo;
+  return 0;
 }
 
 static void
 intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
 {
-  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   heap->binding_table[index] = offsetof(surface_heap_t, surface) +
                                index * sizeof(gen7_surface_state_t);
-  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                     I915_GEM_DOMAIN_RENDER,
                     I915_GEM_DOMAIN_RENDER,
                     obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
                     heap->binding_table[index] +
                     offsetof(gen7_surface_state_t, ss1),
                     obj_bo);
 }
 
 static dri_bo*
-intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
   uint32_t s = size - 1;
   assert(size != 0);
 
-  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
-  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
   memset(ss2, 0, sizeof(gen7_surface_state_t));
   ss2->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
   ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
   ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
   ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
-  ss2->ss5.cache_control = cc_llc_l3;
-  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
 
   if(gpgpu->constant_b.bo)
     dri_bo_unreference(gpgpu->constant_b.bo);
   gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
-  assert(gpgpu->constant_b.bo);
+  if (gpgpu->constant_b.bo == NULL)
+    return NULL;
   ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
-  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                       I915_GEM_DOMAIN_RENDER,
                       I915_GEM_DOMAIN_RENDER,
                       0,
-                      heap->binding_table[2] +
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[bti] +
                       offsetof(gen7_surface_state_t, ss1),
                       gpgpu->constant_b.bo);
   return gpgpu->constant_b.bo;
 }
 
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+  uint32_t s = size - 1;
+  assert(size != 0);
+
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
+  memset(ss2, 0, sizeof(gen7_surface_state_t));
+  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
+  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
+  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
+  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
+  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  ss2->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+  ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+  ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+  ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+
+  if(gpgpu->constant_b.bo)
+    dri_bo_unreference(gpgpu->constant_b.bo);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+  if (gpgpu->constant_b.bo == NULL)
+    return NULL;
+  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[bti] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      gpgpu->constant_b.bo);
+  return gpgpu->constant_b.bo;
+}
 
-/* Map address space with two 2GB surfaces. One surface for untyped message and
- * one surface for byte scatters / gathers. Actually the HW does not require two
- * surfaces but Fulsim complains
- */
 static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
+intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
 {
-  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
-  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
-  gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
+  uint32_t s = size - 1;
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
-  memset(ss1, 0, sizeof(gen7_surface_state_t));
-  ss1->ss0.surface_type = ss0->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss1->ss0.surface_format = ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
-  ss1->ss2.width  = ss0->ss2.width  = 127;   /* bits 6:0 of sz */
-  ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
-  ss0->ss3.depth  = 1023; /* bits 30:21 of sz */
-  ss1->ss3.depth  = 1023;  /* bits 30:21 of sz */
-  ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
-  heap->binding_table[0] = offsetof(surface_heap_t, surface);
-  heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+  ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+  ss0->ss1.base_addr = buf->offset + internal_offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      internal_offset,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[index] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      buf);
+}
+
+
+static int
+intel_is_surface_array(cl_mem_object_type type)
+{
+  if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+        type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+    return 1;
+
+  return 0;
 }
 
 static int
 intel_get_surface_type(cl_mem_object_type type)
 {
   switch (type) {
-  case CL_MEM_OBJECT_IMAGE1D: return I965_SURFACE_1D;
-  case CL_MEM_OBJECT_IMAGE2D: return I965_SURFACE_2D;
-  case CL_MEM_OBJECT_IMAGE3D: return I965_SURFACE_3D;
   case CL_MEM_OBJECT_IMAGE1D_BUFFER:
-  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+  case CL_MEM_OBJECT_IMAGE1D:
   case CL_MEM_OBJECT_IMAGE1D_ARRAY:
-    NOT_IMPLEMENTED;
-    break;
+    return I965_SURFACE_1D;
+
+  case CL_MEM_OBJECT_IMAGE2D:
+  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+    return I965_SURFACE_2D;
+
+  case CL_MEM_OBJECT_IMAGE3D:
+    return I965_SURFACE_3D;
+
   default:
       assert(0);
   }
   return 0;
 }
 
+/* Get fixed surface type. If it is a 1D array image with a large index,
+   we need to fixup it to 2D type due to a Gen7/Gen75's sampler issue
+   on a integer type surface with clamp address mode and nearest filter mode.
+*/
+static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
+{
+  uint32_t surface_type;
+  if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) &&
+      index >= 128 + BTI_RESERVED_NUM &&
+      type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    surface_type = I965_SURFACE_2D;
+  else
+    surface_type = intel_get_surface_type(type);
+  return surface_type;
+}
+
 static void
 intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               uint32_t index,
@@ -633,12 +849,61 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               int32_t pitch,
                               int32_t tiling)
 {
-  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
   gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
 
   memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  if (intel_is_surface_array(type)) {
+    ss->ss0.surface_array = 1;
+    ss->ss0.surface_array_spacing = 1;
+  }
+  ss->ss0.surface_format = format;
+  ss->ss1.base_addr = obj_bo->offset;
+  ss->ss2.width = w - 1;
 
-  ss->ss0.surface_type = intel_get_surface_type(type);
+  ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+  ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+  ss->ss4.not_str_buf.min_array_element = 0;
+  ss->ss3.pitch = pitch - 1;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  if (tiling == GPGPU_TILE_X) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+  } else if (tiling == GPGPU_TILE_Y) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+  }
+  ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
+                              uint32_t index,
+                              dri_bo* obj_bo,
+                              uint32_t obj_bo_offset,
+                              uint32_t format,
+                              cl_mem_object_type type,
+                              int32_t w,
+                              int32_t h,
+                              int32_t depth,
+                              int32_t pitch,
+                              int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  if (intel_is_surface_array(type)) {
+    ss->ss0.surface_array = 1;
+    ss->ss0.surface_array_spacing = 1;
+  }
   ss->ss0.surface_format = format;
   ss->ss1.base_addr = obj_bo->offset;
   ss->ss2.width = w - 1;
@@ -647,7 +912,11 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   ss->ss4.not_str_buf.rt_view_extent = depth - 1;
   ss->ss4.not_str_buf.min_array_element = 0;
   ss->ss3.pitch = pitch - 1;
-  ss->ss5.cache_control = cc_llc_l3;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+  ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+  ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+  ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
   if (tiling == GPGPU_TILE_X) {
     ss->ss0.tiled_surface = 1;
     ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
@@ -657,26 +926,31 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   }
   ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
   intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
-  gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
+
+  assert(index < GEN_MAX_SURFACES);
 }
 
 static void
 intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
-                     uint32_t internal_offset, uint32_t cchint)
+                     uint32_t internal_offset, uint32_t size, uint8_t bti)
 {
   assert(gpgpu->binded_n < max_buf_n);
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
   gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
   gpgpu->binded_offset[gpgpu->binded_n] = offset;
   gpgpu->binded_n++;
+  intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
 }
 
-static void
+static int
 intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo* old = gpgpu->scratch_b.bo;
   uint32_t total = per_thread_size * gpgpu->max_threads;
+  /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */
+  if (IS_HASWELL(gpgpu->drv->device_id))
+      total *= 2;
 
   gpgpu->per_thread_scratch = per_thread_size;
 
@@ -685,52 +959,39 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
     old = NULL;
   }
 
-  if(!old)
+  if(!old && total) {
     gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+    if (gpgpu->scratch_b.bo == NULL)
+      return -1;
+  }
+  return 0;
 }
 static void
-intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
-  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, cchint);
-}
 
-static void
-intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
-                       uint32_t index,
-                       cl_buffer *obj_bo,
-                       uint32_t obj_bo_offset,
-                       uint32_t format,
-                       cl_mem_object_type type,
-                       int32_t w,
-                       int32_t h,
-                       int32_t depth,
-                       int32_t pitch,
-                       cl_gpgpu_tiling tiling)
-{
-  intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, obj_bo_offset, format, type, w, h, depth, pitch, tiling);
-  assert(index < GEN_MAX_SURFACES);
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
 }
 
 static void
 intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gen6_interface_descriptor_t *desc;
-  drm_intel_bo *bo = NULL, *ker_bo = NULL;
+  drm_intel_bo *ker_bo = NULL;
 
-  bo = gpgpu->idrt_b.bo;
-  dri_bo_map(bo, 1);
-  assert(bo->virtual);
-  desc = (gen6_interface_descriptor_t*) bo->virtual;
+  desc = (gen6_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
 
   memset(desc, 0, sizeof(*desc));
   ker_bo = (drm_intel_bo *) kernel->bo;
   desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
-  desc->desc1.single_program_flow = 1;
+  desc->desc1.single_program_flow = 0;
   desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
   desc->desc5.rounding_mode = 0; /* round to nearest even */
-  desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
+
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
+  desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5;
   desc->desc3.binding_table_entry_count = 0; /* no prefetch */
   desc->desc3.binding_table_pointer = 0;
   desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
@@ -757,21 +1018,20 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   else
     desc->desc5.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */
 
-  dri_bo_emit_reloc(bo,
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                     I915_GEM_DOMAIN_INSTRUCTION, 0,
                     0,
-                    offsetof(gen6_interface_descriptor_t, desc0),
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0),
                     ker_bo);
 
-  dri_bo_emit_reloc(bo,
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                     I915_GEM_DOMAIN_SAMPLER, 0,
-                    0,
-                    offsetof(gen6_interface_descriptor_t, desc2),
-                    gpgpu->sampler_state_b.bo);
-  dri_bo_unmap(bo);
+                    gpgpu->aux_offset.sampler_state_offset,
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2),
+                    gpgpu->aux_buf.bo);
 }
 
-static void
+static int
 intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
   unsigned char *curbe = NULL;
@@ -779,23 +1039,27 @@ intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
   uint32_t i, j;
 
   /* Upload the data first */
-  dri_bo_map(gpgpu->curbe_b.bo, 1);
-  assert(gpgpu->curbe_b.bo->virtual);
-  curbe = (unsigned char *) gpgpu->curbe_b.bo->virtual;
+  if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    return -1;
+  }
+  assert(gpgpu->aux_buf.bo->virtual);
+  curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
   memcpy(curbe, data, size);
 
   /* Now put all the relocations for our flat address space */
   for (i = 0; i < k->thread_n; ++i)
     for (j = 0; j < gpgpu->binded_n; ++j) {
       *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
-      drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
-                              gpgpu->binded_offset[j]+i*k->curbe_sz,
+      drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
+                              gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
                               gpgpu->binded_buf[j],
                               gpgpu->target_buf_offset[j],
                               I915_GEM_DOMAIN_RENDER,
                               I915_GEM_DOMAIN_RENDER);
     }
-  dri_bo_unmap(gpgpu->curbe_b.bo);
+  dri_bo_unmap(gpgpu->aux_buf.bo);
+  return 0;
 }
 
 static void
@@ -803,7 +1067,7 @@ intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
 {
   if (n) {
     const size_t sz = n * sizeof(gen6_sampler_state_t);
-    memcpy(gpgpu->sampler_state_b.bo->virtual, data, sz);
+    memcpy(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset, data, sz);
   }
 }
 
@@ -831,9 +1095,10 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
   uint32_t wrap_mode;
   gen7_sampler_state_t *sampler;
 
-  sampler = (gen7_sampler_state_t *)(gpgpu->sampler_state_b.bo->virtual)  + index;
+  sampler = (gen7_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset)  + index;
   memset(sampler, 0, sizeof(*sampler));
-  sampler->ss2.default_color_pointer = (gpgpu->sampler_border_color_state_b.bo->offset) >> 5;
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0);
+  sampler->ss2.default_color_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) >> 5;
   if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
     sampler->ss3.non_normalized_coord = 1;
   else
@@ -877,12 +1142,13 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
                                    GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
 
-  dri_bo_emit_reloc(gpgpu->sampler_state_b.bo,
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                     I915_GEM_DOMAIN_SAMPLER, 0,
-                    0,
+                    gpgpu->aux_offset.sampler_border_color_state_offset,
+                    gpgpu->aux_offset.sampler_state_offset +
                     index * sizeof(gen7_sampler_state_t) +
                     offsetof(gen7_sampler_state_t, ss2),
-                    gpgpu->sampler_border_color_state_b.bo);
+                    gpgpu->aux_buf.bo);
 
 }
 
@@ -890,22 +1156,9 @@ static void
 intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
 {
   int index;
-#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-  //assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
-#else
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
-#endif
-  for(index = 0; index < sampler_sz; index++) {
+  for(index = 0; index < sampler_sz; index++)
     intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
-#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-    /* Duplicate the sampler to 8 + index and fixup the address mode
-     * to repeat.*/
-    if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
-      intel_gpgpu_insert_sampler(gpgpu, index + 8,
-                                 (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_CLAMP_TO_EDGE);
-    }
-#endif
-  }
 }
 
 static void
@@ -913,10 +1166,7 @@ intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gpgpu->ker = kernel;
   intel_gpgpu_build_idrt(gpgpu, kernel);
-  intel_gpgpu_map_address_space(gpgpu);
-  dri_bo_unmap(gpgpu->surface_heap_b.bo);
-  dri_bo_unmap(gpgpu->sampler_state_b.bo);
-  dri_bo_unmap(gpgpu->sampler_border_color_state_b.bo);
+  dri_bo_unmap(gpgpu->aux_buf.bo);
 }
 
 static void
@@ -953,6 +1203,7 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
   BEGIN_BATCH(gpgpu->batch, 11);
   OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
   OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  assert(thread_n <= 64);
   if (simd_sz == 16)
     OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
   else
@@ -979,11 +1230,10 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
   intel_event_t *event = NULL;
   TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
 
-  event->status = command_queued;
-  event->batch = NULL;
   event->buffer = gpgpu->batch->buffer;
-  if(event->buffer != NULL)
+  if (event->buffer)
     drm_intel_bo_reference(event->buffer);
+  event->status = command_queued;
 
   if(gpgpu->time_stamp_b.bo) {
     event->ts_buf = gpgpu->time_stamp_b.bo;
@@ -998,6 +1248,17 @@ error:
   goto exit;
 }
 
+/*
+   The upper layer already flushed the batch buffer, just update
+   internal status to command_submitted.
+*/
+static void
+intel_gpgpu_event_flush(intel_event_t *event)
+{
+  assert(event->status == command_queued);
+  event->status = command_running;
+}
+
 static int
 intel_gpgpu_event_update_status(intel_event_t *event, int wait)
 {
@@ -1005,7 +1266,7 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait)
     return event->status;
 
   if (event->buffer &&
-      event->batch == NULL &&        //have flushed
+      event->status == command_running &&
       !drm_intel_bo_busy(event->buffer)) {
     event->status = command_complete;
     drm_intel_bo_unreference(event->buffer);
@@ -1026,30 +1287,8 @@ intel_gpgpu_event_update_status(intel_event_t *event, int wait)
 }
 
 static void
-intel_gpgpu_event_pending(intel_gpgpu_t *gpgpu, intel_event_t *event)
-{
-  assert(event->buffer);           //This is gpu enqueue command
-  assert(event->batch == NULL);    //This command haven't pengding.
-  event->batch = intel_batchbuffer_new(gpgpu->drv);
-  assert(event->batch);
-  *event->batch = *gpgpu->batch;
-  if(event->batch->buffer)
-    drm_intel_bo_reference(event->batch->buffer);
-}
-
-static void
-intel_gpgpu_event_resume(intel_event_t *event)
-{
-  assert(event->batch);           //This command have pending.
-  intel_batchbuffer_flush(event->batch);
-  intel_batchbuffer_delete(event->batch);
-  event->batch = NULL;
-}
-
-static void
 intel_gpgpu_event_delete(intel_event_t *event)
 {
-  assert(event->batch == NULL);   //This command must have been flushed.
   if(event->buffer)
     drm_intel_bo_unreference(event->buffer);
   if(event->ts_buf)
@@ -1057,6 +1296,33 @@ intel_gpgpu_event_delete(intel_event_t *event)
   cl_free(event);
 }
 
+/* IVB and HSW's result MUST shift in x86_64 system */
+static uint64_t
+intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
+{
+  uint64_t result = 0;
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+  /* In x86_64 system, the low 32bits of timestamp count are stored in the high 32 bits of
+     result which got from drm_intel_reg_read, and 32-35 bits are lost; but match bspec in
+     i386 system. It seems the kernel readq bug. So shift 32 bit in x86_64, and only remain
+     32 bits data in i386.
+  */
+#ifdef __i386__
+  return result & 0x0ffffffff;
+#else
+  return result >> 32;
+#endif  /* __i386__  */
+}
+
+/* baytrail's result should clear high 4 bits */
+static uint64_t
+intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
+{
+  uint64_t result = 0;
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+  return result & 0x0ffffffff;
+}
+
 /* We want to get the current time of GPU. */
 static void
 intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
@@ -1064,9 +1330,8 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
   uint64_t result = 0;
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
 
-  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
-  result = result & 0xFFFFFFFFF0000000;
-  result = result >> 28;
+  /* Get the ts that match the bspec */
+  result = intel_gpgpu_read_ts_reg(bufmgr);
   result *= 80;
 
   *ret_ts = result;
@@ -1075,8 +1340,8 @@ intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
 
 /* Get the GPU execute time. */
 static void
-intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
-                                int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
+				     int index, uint64_t* ret_ts)
 {
   uint64_t result = 0;
 
@@ -1089,26 +1354,115 @@ intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
   /* According to BSpec, the timestamp counter should be 36 bits,
      but comparing to the timestamp counter from IO control reading,
      we find the first 4 bits seems to be fake. In order to keep the
-     timestamp counter conformable, we just skip the first 4 bits. */
-  result = ((result & 0x0FFFFFFFF) << 4) * 80; //convert to nanoseconds
+     timestamp counter conformable, we just skip the first 4 bits.
+  */
+  result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
   *ret_ts = result;
 
   drm_intel_gem_bo_unmap_gtt(event->ts_buf);
 }
 
+static int
+intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset, uint8_t bti)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) { // the index buffer.
+    if (gpgpu->printf_b.ibo)
+      dri_bo_unreference(gpgpu->printf_b.ibo);
+    gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    if (gpgpu->printf_b.bo)
+      dri_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
+    bo = gpgpu->printf_b.bo;
+  } else
+    assert(0);
+
+  if (!bo || (drm_intel_bo_map(bo, 1) != 0)) {
+    if (gpgpu->printf_b.bo)
+      drm_intel_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = NULL;
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    return -1;
+  }
+  memset(bo->virtual, 0, size);
+  drm_intel_bo_unmap(bo);
+  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti);
+  return 0;
+}
+
+static void*
+intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) {
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    bo = gpgpu->printf_b.bo;
+  } else
+    assert(0);
+
+  drm_intel_bo_map(bo, 1);
+  return bo->virtual;
+}
+
+static void
+intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) {
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    bo = gpgpu->printf_b.bo;
+  } else
+  assert(0);
+
+  drm_intel_bo_unmap(bo);
+}
+
+static void
+intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  if (i == 0) {
+    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+    gpgpu->printf_b.ibo = NULL;
+  } else if (i == 1) {
+    drm_intel_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = NULL;
+  } else
+    assert(0);
+}
+
+static void
+intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
+{
+  gpgpu->printf_info = printf_info;
+  gpgpu->global_wk_sz[0] = global_sz[0];
+  gpgpu->global_wk_sz[1] = global_sz[1];
+  gpgpu->global_wk_sz[2] = global_sz[2];
+}
+
+static void*
+intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
+{
+  global_sz[0] = gpgpu->global_wk_sz[0];
+  global_sz[1] = gpgpu->global_wk_sz[1];
+  global_sz[2] = gpgpu->global_wk_sz[2];
+  return gpgpu->printf_info;
+}
+
 LOCAL void
-intel_set_gpgpu_callbacks(void)
+intel_set_gpgpu_callbacks(int device_id)
 {
   cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
   cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
   cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync;
-  cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image;
   cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
   cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
   cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
-  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
   cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
   cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
   cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
@@ -1119,13 +1473,41 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+  cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
   cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
-  cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
-  cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
   cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
   cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
   cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
   cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
+  cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
+  cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
+  cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
+  cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
+  cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
+  cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+
+  if (IS_HASWELL(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
+    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen75;
+    intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
+    intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
+  }
+  else if (IS_IVYBRIDGE(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen7;
+    if (IS_BAYTRAIL_T(device_id)) {
+      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
+    } else {
+      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    }
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
+  }
 }
-
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index 9918b35..d593ac7 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -28,7 +28,7 @@
 #include <stdint.h>
 
 /* Set the gpgpu related call backs */
-extern void intel_set_gpgpu_callbacks(void);
+extern void intel_set_gpgpu_callbacks(int device_id);
 
 #endif /* __INTEL_GPGPU_H__ */
 
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index 36b5971..ef76bb4 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -234,7 +234,16 @@ typedef struct gen7_surface_state
   } ss5;
 
   uint32_t ss6; /* unused */
-  uint32_t ss7; /* unused */
+
+  struct {
+    uint32_t min_lod:12;
+    uint32_t pad0:4;
+    uint32_t shader_a:3;
+    uint32_t shader_b:3;
+    uint32_t shader_g:3;
+    uint32_t shader_r:3;
+    uint32_t pad1:4;
+  } ss7;
 } gen7_surface_state_t;
 
 STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
@@ -342,8 +351,12 @@ typedef struct gen6_pipe_control
   } dw2;
 
   struct {
-    uint64_t data;
-  } qw0;
+    uint32_t data;
+  } dw3;
+
+  struct {
+    uint32_t data;
+  } dw4;
 } gen6_pipe_control_t;
 
 typedef struct gen6_sampler_state
diff --git a/src/kernels/cl_internal_copy_buf_align1.cl b/src/kernels/cl_internal_copy_buf_align1.cl
deleted file mode 100644
index cd3ec7b..0000000
--- a/src/kernels/cl_internal_copy_buf_align1.cl
+++ /dev/null
@@ -1,8 +0,0 @@
-kernel void __cl_cpy_region_align1 ( global char* src, unsigned int src_offset,
-                                     global char* dst, unsigned int dst_offset,
-				     unsigned int size)
-{
-    int i = get_global_id(0);
-    if (i < size)
-        dst[i+dst_offset] = src[i+src_offset];
-}
diff --git a/src/kernels/cl_internal_copy_buf_align16.cl b/src/kernels/cl_internal_copy_buf_align16.cl
index 75b1a4a..1abb4e9 100644
--- a/src/kernels/cl_internal_copy_buf_align16.cl
+++ b/src/kernels/cl_internal_copy_buf_align16.cl
@@ -1,4 +1,4 @@
-kernel void __cl_cpy_region_align16 ( global float* src, unsigned int src_offset,
+kernel void __cl_copy_region_align16 ( global float* src, unsigned int src_offset,
                                       global float* dst, unsigned int dst_offset,
 				      unsigned int size)
 {
diff --git a/src/kernels/cl_internal_copy_buf_align4.cl b/src/kernels/cl_internal_copy_buf_align4.cl
index 44a0f81..27174ca 100644
--- a/src/kernels/cl_internal_copy_buf_align4.cl
+++ b/src/kernels/cl_internal_copy_buf_align4.cl
@@ -1,4 +1,4 @@
-kernel void __cl_cpy_region_align4 ( global float* src, unsigned int src_offset,
+kernel void __cl_copy_region_align4 ( global float* src, unsigned int src_offset,
                                      global float* dst, unsigned int dst_offset,
 				     unsigned int size)
 {
diff --git a/src/kernels/cl_internal_copy_buf_rect.cl b/src/kernels/cl_internal_copy_buf_rect.cl
new file mode 100644
index 0000000..71e7484
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  dst[dst_offset] = src[src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
new file mode 100644
index 0000000..e02d0e5
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
@@ -0,0 +1,28 @@
+kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    /* last dw, need to be careful, not to overflow the source. */
+    if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift);
+    } else {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift)
+             | ((src[src_offset + i + 1] & dw_mask) << (32 - shift));
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
new file mode 100644
index 0000000..83b6e97
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask)
+{
+    int i = get_global_id(0);
+    if (i > size -1)
+       return;
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask)
+             | (src[src_offset] & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (src[i+src_offset] & last_mask)
+            | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = src[i+src_offset];
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
new file mode 100644
index 0000000..ce0aa1d
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
@@ -0,0 +1,29 @@
+kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask, int src_less)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    if (i == 0) {
+        tmp = ((src[src_offset + i] & dw_mask) << shift);
+    } else if (src_less && i == size - 1) { // not exceed the bound of source
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift));
+    } else {
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift))
+             | ((src[src_offset + i] & dw_mask) << shift);
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
new file mode 100644
index 0000000..a218b58
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
new file mode 100644
index 0000000..84d3b27
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_1d_to_1d.cl b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
new file mode 100644
index 0000000..dca82b2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_1d_to_1d(__read_only image1d_t src_image, __write_only image1d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int src_coord;
+  int dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord = src_origin0 + i;
+  dst_coord = dst_origin0 + i;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
new file mode 100644
index 0000000..c5eaab1
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
new file mode 100644
index 0000000..4c73a74
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
+                                         unsigned int region0, unsigned int region1, unsigned int region2,
+                                         unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
new file mode 100644
index 0000000..b6c352e
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
new file mode 100644
index 0000000..e0effa0
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
new file mode 100644
index 0000000..de80a0a
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
new file mode 100644
index 0000000..dcfc8a2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
@@ -0,0 +1,22 @@
+#define IMAGE_TYPE image3d_t
+#define COORD_TYPE int4
+kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  COORD_TYPE src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_fill_buf_align128.cl b/src/kernels/cl_internal_fill_buf_align128.cl
new file mode 100644
index 0000000..552820c
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align128.cl
@@ -0,0 +1,9 @@
+kernel void __cl_fill_region_align128 ( global float16* dst, float16 pattern0,
+                                        unsigned int offset, unsigned int size, float16 pattern1)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i*2+offset] = pattern0;
+        dst[i*2+offset+1] = pattern1;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align2.cl b/src/kernels/cl_internal_fill_buf_align2.cl
new file mode 100644
index 0000000..0b9a4cf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align2.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align2 ( global char2 * dst, char2 pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align4.cl b/src/kernels/cl_internal_fill_buf_align4.cl
new file mode 100644
index 0000000..aefd92f
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align4 ( global float* dst, float pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align8.cl b/src/kernels/cl_internal_fill_buf_align8.cl
new file mode 100644
index 0000000..edaff77
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align8.cl
@@ -0,0 +1,14 @@
+#define COMPILER_ABS_FUNC_N(N) \
+    kernel void __cl_fill_region_align8_##N ( global float##N* dst, float##N pattern, \
+                                              unsigned int offset, unsigned int size) { \
+         int i = get_global_id(0); \
+         if (i < size) { \
+             dst[i+offset] = pattern; \
+         }  \
+    }
+
+
+COMPILER_ABS_FUNC_N(2)
+COMPILER_ABS_FUNC_N(4)
+COMPILER_ABS_FUNC_N(8)
+COMPILER_ABS_FUNC_N(16)
diff --git a/src/kernels/cl_internal_fill_buf_unalign.cl b/src/kernels/cl_internal_fill_buf_unalign.cl
new file mode 100644
index 0000000..90762b0
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_unalign.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_unalign ( global char * dst, char pattern,
+			               unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_image_1d.cl b/src/kernels/cl_internal_fill_image_1d.cl
new file mode 100644
index 0000000..b3b0cbf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d.cl
@@ -0,0 +1,14 @@
+kernel void __cl_fill_image_1d( __write_only image1d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord = origin0 + i;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_1d_array.cl b/src/kernels/cl_internal_fill_image_1d_array.cl
new file mode 100644
index 0000000..f1eb241
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d_array.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_1d_array( __write_only image1d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d.cl b/src/kernels/cl_internal_fill_image_2d.cl
new file mode 100644
index 0000000..0e29f3e
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_2d( __write_only image2d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d_array.cl b/src/kernels/cl_internal_fill_image_2d_array.cl
new file mode 100644
index 0000000..f29c9e7
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d_array.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_2d_array( __write_only image2d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_3d.cl b/src/kernels/cl_internal_fill_image_3d.cl
new file mode 100644
index 0000000..042b8ab
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_3d.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_3d( __write_only image3d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/performance.c b/src/performance.c
new file mode 100644
index 0000000..85cd481
--- /dev/null
+++ b/src/performance.c
@@ -0,0 +1,324 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+#define MAX_KERNEL_BUILD_OPT 1000
+typedef struct kernel_storage_node
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  char build_option[MAX_KERNEL_BUILD_OPT];
+  int current_count;
+  float kernel_sum_time;
+  struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+  uintptr_t context_id;
+  kernel_storage_node *kernels_storage;
+  char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_max_time;
+  int kernel_count;
+  struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+  context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+  if(NULL != prev_context_pointer )
+  {
+    if(prev_context_pointer->context_id == (uintptr_t)context)
+      return prev_context_pointer;
+  }
+
+  if(NULL == record.context_storage)
+  {
+    record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+    record.context_storage->context_id = (uintptr_t)context;
+    record.context_storage->kernels_storage = NULL;
+    record.context_storage->kernel_max_time = 0.0f;
+    record.context_storage->next = NULL;
+    record.context_storage->kernel_count = 0;
+    return record.context_storage;
+  }
+
+  context_storage_node *pre = record.context_storage;
+  context_storage_node *cur = record.context_storage;
+  while(NULL !=cur && (uintptr_t)context != cur->context_id )
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+  pre = pre->next;
+  pre->context_id = (uintptr_t)context;
+  pre->kernels_storage = NULL;
+  pre->kernel_max_time = 0.0f;
+  pre->next = NULL;
+  pre->kernel_count = 0;
+  return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
+{
+  if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+     p_context == prev_context_pointer &&
+     !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
+     !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
+    return prev_kernel_pointer;
+
+  if(NULL == p_context)
+    return NULL;
+
+  if(NULL == p_context->kernels_storage)
+  {
+    p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+    p_context->kernel_count++;
+    strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+    strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+    p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
+    p_context->kernels_storage->current_count = 0;
+    p_context->kernels_storage->kernel_sum_time = 0.0f;
+    p_context->kernels_storage->next = NULL;
+    return p_context->kernels_storage;
+  }
+
+  kernel_storage_node *pre = p_context->kernels_storage;
+  kernel_storage_node *cur = p_context->kernels_storage;
+  while(NULL != cur &&
+        (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
+         strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  p_context->kernel_count++;
+  pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+  pre = pre->next;
+  pre->current_count = 0;
+  pre->kernel_sum_time = 0.0f;
+  pre->next = NULL;
+  strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+  pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+  pre->build_option[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  return pre;
+}
+
+static void free_storage()
+{
+  context_storage_node *p_context = record.context_storage;
+  while(NULL != p_context)
+  {
+    context_storage_node *p_tmp_context = p_context->next;
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    while(NULL != p_kernel)
+    {
+      kernel_storage_node *p_tmp_kernel = p_kernel->next;
+      free(p_kernel);
+      p_kernel = p_tmp_kernel;
+    }
+    free(p_context);
+    p_context = p_tmp_context;
+  }
+}
+
+typedef struct time_element
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_sum_time;
+  int kernel_execute_count;
+  double dev;
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  uint32_t time_index;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+  if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+    return 1;
+  else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+    return -1;
+  else
+    return 0;
+}
+
+static void print_time_info()
+{
+  context_storage_node *p_context = record.context_storage;
+  if(NULL == p_context)
+  {
+    printf("Nothing to output !\n");
+    return;
+  }
+
+  int tmp_context_id = 0;
+  while(NULL != p_context)
+  {
+    printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+    printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
+
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    kernel_storage_node *p_tmp_kernel = p_kernel;
+    time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+    memset(te, 0, sizeof(time_element)*p_context->kernel_count);
+    int i = -1, j = 0, k = 0;
+    while(NULL != p_tmp_kernel)
+    {
+      for(k=0; k<=i; k++)
+      {
+        if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
+          break;
+      }
+      if(k == i+1)
+      {
+        i++;
+        k = i;
+      }
+      te[k].kernel_execute_count += p_tmp_kernel->current_count;
+      strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
+      te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+      te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
+      for(j=0; j != p_tmp_kernel->current_count; ++j)
+        te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
+      p_tmp_kernel = p_tmp_kernel->next;
+    }
+
+    for(k=0; k<=i; k++)
+    {
+      float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
+      double sumsquare = 0.0;
+      for(j=0; j<te[k].time_index; ++j)
+        sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
+      te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
+    }
+
+    float sum_time = 0.0f;
+    qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+    for(j=0; j<=i; ++j)
+      sum_time += te[j].kernel_sum_time;
+
+    for(j=0; j<=i; ++j)
+    {
+      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%]\n",
+             te[j].kernel_name,
+             te[j].kernel_sum_time / sum_time * 100,
+             te[j].kernel_sum_time,
+             te[j].kernel_execute_count,
+             te[j].kernel_sum_time / te[j].kernel_execute_count,
+             te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
+    }
+    free(te);
+    printf("    Total : %.2f\n", sum_time);
+    if(2 != b_output_kernel_perf)
+    {
+      printf("[------------  CONTEXT ENDS------------]\n\n");
+      p_context = p_context->next;
+      continue;
+    }
+    p_tmp_kernel = p_kernel;
+    printf("\n  ->>>> KERNELS TIME DETAIL <<<<-\n");
+    while(NULL != p_kernel)
+    {
+      printf("    [Kernel Name : %30s   Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+      if(*p_kernel->build_option != '\0')
+      {
+        int count = 0;
+        printf("      ->Build Options : ");
+        while(p_kernel->build_option[count] != '\0' )
+        {
+          printf("%c", p_kernel->build_option[count++]);
+          if(count % 100 == 0)
+            printf("\n                         ");
+        }
+        printf("\n");
+      }
+      for(i=0; i!=p_kernel->current_count; ++i)
+        printf("      Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
+      p_kernel = p_kernel->next;
+    }
+    printf("[------------  CONTEXT ENDS------------]\n\n");
+    p_context = p_context->next;
+  }
+  free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
+{
+  if(!atexit_registered)
+  {
+    atexit_registered = 1;
+    atexit(print_time_info);
+  }
+  context_storage_node *p_context = find_context(context);
+  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
+  prev_context_pointer = p_context;
+  prev_kernel_pointer = p_kernel;
+  p_kernel->kernel_times[p_kernel->current_count++] = time;
+  p_kernel->kernel_sum_time += time;
+  if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+  {
+    p_context->kernel_max_time = p_kernel->kernel_sum_time;
+    strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+  char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+  if(NULL == env || !strncmp(env,"0", 1))
+    b_output_kernel_perf = 0;
+  else if(!strncmp(env,"1", 1))
+    b_output_kernel_perf = 1;
+  else
+    b_output_kernel_perf = 2;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  pthread_mutex_lock(&mutex);
+  gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
+{
+  clFinish(cq);
+  gettimeofday(&end, NULL);
+  float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+  insert(context, kernel_name, build_opt, t);
+  pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644
index 0000000..1e75054
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 0614ee6..9c531de 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -3,10 +3,8 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
 
 ##### Math Function Part:
 EXEC_PROGRAM(mkdir ${CMAKE_CURRENT_SOURCE_DIR} ARGS generated -p)
-EXEC_PROGRAM(python ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
+EXEC_PROGRAM(${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
 string(REGEX REPLACE " " ";" ADDMATHFUNC ${GEN_MATH_STRING})
-string(REGEX REPLACE " " "\n" NAMEMATHLIST ${GEN_MATH_STRING})
-MESSAGE(STATUS "Generated Builtin Math Functions:\n" ${NAMEMATHLIST})
 
 string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/\\1.cl" KERNEL_MATH_LIST ${GEN_MATH_STRING})
 string(REGEX REPLACE " " ";" KERNEL_MATH_LIST ${KERNEL_MATH_LIST})
@@ -18,7 +16,8 @@ configure_file (
   "setenv.sh"
   )
 
-link_directories (${LLVM_LIBRARY_DIR})
+#XXX only need GL if required
+link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
 set (utests_sources
   utest_error.c
   compiler_basic_arithmetic.cpp
@@ -32,6 +31,7 @@ set (utests_sources
   compiler_insert_to_constant.cpp
   compiler_argument_structure.cpp
   compiler_arith_shift_right.cpp
+  compiler_mixed_pointer.cpp
   compiler_array0.cpp
   compiler_array.cpp
   compiler_array1.cpp
@@ -44,6 +44,7 @@ set (utests_sources
   compiler_convert_uchar_sat.cpp
   compiler_copy_buffer.cpp
   compiler_copy_image.cpp
+  compiler_copy_image_1d.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
   compiler_degrees.cpp
@@ -53,6 +54,7 @@ set (utests_sources
   compiler_abs_diff.cpp
   compiler_fill_image.cpp
   compiler_fill_image0.cpp
+  compiler_fill_image_1d.cpp
   compiler_fill_image_3d.cpp
   compiler_fill_image_3d_2.cpp
   compiler_function_argument0.cpp
@@ -119,6 +121,7 @@ set (utests_sources
   compiler_volatile.cpp
   compiler_copy_image1.cpp
   compiler_get_image_info.cpp
+  compiler_get_image_info_array.cpp
   compiler_vect_compare.cpp
   compiler_vector_load_store.cpp
   compiler_vector_inc.cpp
@@ -134,7 +137,6 @@ set (utests_sources
   builtin_shuffle.cpp
   builtin_shuffle2.cpp
   builtin_sign.cpp
-  builtin_sinpi.cpp
   builtin_lgamma.cpp
   builtin_lgamma_r.cpp
   builtin_tgamma.cpp
@@ -152,10 +154,9 @@ set (utests_sources
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
   runtime_event.cpp
-  compiler_double.cpp
-  compiler_double_2.cpp
-  compiler_double_3.cpp
-  compiler_double_4.cpp
+  runtime_barrier_list.cpp
+  runtime_marker_list.cpp
+  runtime_compile_link.cpp
   compiler_long.cpp
   compiler_long_2.cpp
   compiler_long_convert.cpp
@@ -165,28 +166,51 @@ set (utests_sources
   compiler_long_mult.cpp
   compiler_long_cmp.cpp
   compiler_function_argument3.cpp
+  compiler_function_qualifiers.cpp
   compiler_bool_cross_basic_block.cpp
   compiler_private_data_overflow.cpp
-  load_program_from_bin.cpp
+  compiler_getelementptr_bitcast.cpp
+  compiler_simd_any.cpp
+  compiler_simd_all.cpp
+  compiler_double_precision.cpp
+  load_program_from_bin_file.cpp
+  load_program_from_gen_bin.cpp
+  get_arg_info.cpp
+  profiling_exec.cpp
   enqueue_copy_buf.cpp
+  enqueue_copy_buf_unaligned.cpp
+  test_printf.cpp
+  enqueue_fill_buf.cpp
+  enqueue_built_in_kernels.cpp
+  builtin_kernel_max_global_size.cpp
+  image_1D_buffer.cpp
+  compare_image_2d_and_1d_array.cpp
+  compiler_constant_expr.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
   utest_helper.cpp)
 
 SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
-ADD_CUSTOM_COMMAND(
-    OUTPUT ${kernel_bin}.bin
-    COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl
-    )
+
+if(GEN_PCI_ID)
+  ADD_CUSTOM_COMMAND(
+  OUTPUT ${kernel_bin}.bin
+  COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin -t${GEN_PCI_ID}
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+else(GEN_PCI_ID)
+  ADD_CUSTOM_COMMAND(
+  OUTPUT ${kernel_bin}.bin
+  COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+endif(GEN_PCI_ID)
 
 ADD_CUSTOM_TARGET(kernel_bin.bin
     DEPENDS ${kernel_bin}.bin)
 
 add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated
     COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/generated -p
-    COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/utest_math_gen.py > /dev/null 2>&1
+    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/utest_math_gen.py > /dev/null 2>&1
     COMMAND echo ${KERNEL_GITIGNORE_LIST} |sed 's/ /\\n/g' > ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/.gitignore
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     )
@@ -199,11 +223,14 @@ if (EGL_FOUND AND MESA_SOURCE_FOUND)
 SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
 SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
 SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
-endif (EGL_FOUND AND MESA_SOURCE_FOUND)
+SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
+else()
+SET(UTESTS_REQUIRED_EGL_LIB "")
+endif()
 
 ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
 
-TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT})
 
 ADD_EXECUTABLE(utest_run utest_run.cpp)
 TARGET_LINK_LIBRARIES(utest_run utests)
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
new file mode 100644
index 0000000..c777564
--- /dev/null
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void builtin_kernel_max_global_size(void)
+{
+  char* built_in_kernel_names;
+  size_t built_in_kernels_size;
+  cl_int err = CL_SUCCESS;
+  size_t ret_sz;
+
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+  built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+  OCL_ASSERT(ret_sz == built_in_kernels_size);
+  cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+  OCL_ASSERT(built_in_prog != NULL);
+  cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, "__cl_copy_region_unalign_src_offset",  &err);
+  OCL_ASSERT(builtin_kernel_1d != NULL);
+  size_t param_value_size;
+  void* param_value;
+  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, &param_value_size);
+  param_value = malloc(param_value_size);
+  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
+  OCL_ASSERT(*(size_t*)param_value == 256 * 1024 *1024);
+  clReleaseKernel(builtin_kernel_1d);
+  clReleaseProgram(built_in_prog);
+  free(param_value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_max_global_size);
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
new file mode 100644
index 0000000..f2c828e
--- /dev/null
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -0,0 +1,79 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compare_image_2d_and_1d_array(void)
+{
+  const int w = 64;
+  const int h = 32;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  // Create the 1D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  uint32_t* image_data1 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+  uint32_t* image_data2 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+  for (int j = 0; j < h; j++) {
+    for (int i = 0; i < w; i++) {
+      char a = 0;
+      if (j % 2 == 0)
+        a = (j + 3) & 0x3f;
+
+      image_data2[w * j + i] = image_data1[w * j + i] = a << 24 | a << 16 | a << 8 | a;
+    }
+  }
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1);
+
+  // Create the 2D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+  desc.image_width = w;
+  desc.image_array_size = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2);
+
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_LINEAR);
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("compare_image_2d_and_1d_array");
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_sampler), &sampler);
+  globals[0] = 32;
+  globals[1] = 16;
+  locals[0] = 32;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; i++) {
+      // Because the array index will not join the sample calculation, the result should
+      // be different between the 2D and 1D_array.
+      if (j % 2 == 0)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+    }
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compare_image_2d_and_1d_array);
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
index 7951ff7..ad661c0 100644
--- a/utests/compiler_async_copy.cpp
+++ b/utests/compiler_async_copy.cpp
@@ -52,4 +52,4 @@ DEF(uint, uint, 2);
 DEF(int64_t, long, 2);
 DEF(uint64_t, ulong, 2);
 DEF(float, float, 2);
-DEF(double, double, 2);
+//DEF(double, double, 2);
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
index 132f917..2e9eaeb 100644
--- a/utests/compiler_async_stride_copy.cpp
+++ b/utests/compiler_async_stride_copy.cpp
@@ -19,7 +19,7 @@ static void compiler_async_stride_copy(void)
 
   OCL_MAP_BUFFER(1);
   for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
-      ((char*)buf_data[1])[i] = rand() && 0xff;
+      ((char*)buf_data[1])[i] = rand() & 0xff;
   OCL_UNMAP_BUFFER(1);
 
   // Run the kernel
@@ -33,10 +33,10 @@ static void compiler_async_stride_copy(void)
   char *dst = (char*)buf_data[0];
   char *src = (char*)buf_data[1];
   for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
-    OCL_ASSERT(dst[i + 0] == src[i + 0] + 3);
-    OCL_ASSERT(dst[i + 1] == src[i + 1] + 3);
-    OCL_ASSERT(dst[i + 2] == src[i + 2] + 3);
-    OCL_ASSERT(dst[i + 3] == src[i + 3] + 3);
+    OCL_ASSERT(dst[i + 0] == (char)(src[i + 0] + 3));
+    OCL_ASSERT(dst[i + 1] == (char)(src[i + 1] + 3));
+    OCL_ASSERT(dst[i + 2] == (char)(src[i + 2] + 3));
+    OCL_ASSERT(dst[i + 3] == (char)(src[i + 3] + 3));
   }
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
index 0e5ec41..ba05de0 100644
--- a/utests/compiler_basic_arithmetic.cpp
+++ b/utests/compiler_basic_arithmetic.cpp
@@ -15,7 +15,6 @@ static void test_exec(const char* kernel_name)
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
-std::cout <<"kernel name: " << kernel_name << std::endl;
   buf_data[0] = (T*) malloc(sizeof(T) * n);
   buf_data[1] = (T*) malloc(sizeof(T) * n);
   for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
diff --git a/utests/compiler_box_blur_image.cpp b/utests/compiler_box_blur_image.cpp
index 351f08e..d94a97c 100644
--- a/utests/compiler_box_blur_image.cpp
+++ b/utests/compiler_box_blur_image.cpp
@@ -4,6 +4,7 @@ static void compiler_box_blur_image()
 {
   int w, h;
   cl_image_format format = { };
+  cl_image_desc desc = { };
   size_t origin[3] = { };
   size_t region[3];
   int *src, *dst;
@@ -15,11 +16,17 @@ static void compiler_box_blur_image()
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = 1;
+  desc.image_row_pitch = w*sizeof(uint32_t);
 
   /* Run the kernel */
-  OCL_CREATE_IMAGE2D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, w*sizeof(uint32_t), src);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, src);
   free(src);
-  OCL_CREATE_IMAGE2D(buf[1], 0, &format, w, h, 0, NULL);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   globals[0] = w;
diff --git a/utests/compiler_constant_expr.cpp b/utests/compiler_constant_expr.cpp
new file mode 100644
index 0000000..8bed03b
--- /dev/null
+++ b/utests/compiler_constant_expr.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void compiler_constant_expr(void)
+{
+  const size_t n = 48;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_constant_expr");
+  buf_data[0] = (uint32_t*) malloc(sizeof(float) * n);
+  for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+    float expect = pow(((float*)buf_data[0])[i], (i % 3) + 1);
+    float err = fabs(((float*)buf_data[1])[i] - expect);
+    OCL_ASSERT(err <= 100 * cl_FLT_ULP(expect));
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_constant_expr);
+
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
index 58827f2..150fd8a 100644
--- a/utests/compiler_copy_image.cpp
+++ b/utests/compiler_copy_image.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_copy_image(void)
@@ -5,8 +6,12 @@ static void compiler_copy_image(void)
   const size_t w = 512;
   const size_t h = 512;
   cl_image_format format;
+  cl_image_desc desc;
   cl_sampler sampler;
 
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_copy_image");
   buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
@@ -16,9 +21,14 @@ static void compiler_copy_image(void)
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
-  OCL_CREATE_IMAGE2D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, w * sizeof(uint32_t), buf_data[0]);
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
 
-  OCL_CREATE_IMAGE2D(buf[1], 0, &format, w, h, 0, NULL);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
   buf_data[0] = NULL;
@@ -41,6 +51,8 @@ static void compiler_copy_image(void)
       OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_image);
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
index d469fbd..659dddc 100644
--- a/utests/compiler_copy_image1.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_copy_image1(void)
@@ -5,8 +6,12 @@ static void compiler_copy_image1(void)
   const size_t w = 512;
   const size_t h = 512;
   cl_image_format format;
+  cl_image_desc desc;
   cl_sampler sampler;
 
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_copy_image1");
   buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
@@ -16,14 +21,19 @@ static void compiler_copy_image1(void)
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
-  OCL_CREATE_IMAGE2D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, w * sizeof(uint32_t), buf_data[0]);
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
 
-  OCL_CREATE_IMAGE2D(buf[1], 0, &format, w, h, 0, NULL);
-  OCL_CREATE_IMAGE2D(buf[2], 0, &format, w, h, 0, NULL);
-  OCL_CREATE_IMAGE2D(buf[3], 0, &format, w, h, 0, NULL);
-  OCL_CREATE_IMAGE2D(buf[4], 0, &format, w, h, 0, NULL);
-  OCL_CREATE_IMAGE2D(buf[5], 0, &format, w, h, 0, NULL);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[2], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[3], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[4], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[5], 0, &format, &desc, NULL);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
@@ -66,6 +76,8 @@ static void compiler_copy_image1(void)
   OCL_UNMAP_BUFFER(3);
   OCL_UNMAP_BUFFER(4);
   OCL_UNMAP_BUFFER(5);
+
+  OCL_CALL(clReleaseSampler, sampler);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);
diff --git a/utests/compiler_copy_image_1d.cpp b/utests/compiler_copy_image_1d.cpp
new file mode 100644
index 0000000..5af6a77
--- /dev/null
+++ b/utests/compiler_copy_image_1d.cpp
@@ -0,0 +1,52 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image_1d(void)
+{
+  const size_t w = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image_1d");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w);
+  for (uint32_t i = 0; i < w; i++)
+      ((uint32_t*)buf_data[0])[i] = i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  desc.image_width = w;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  globals[0] = w;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < w; i++) {
+      //printf (" %x", ((uint32_t*)buf_data[1])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_1d);
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
index ff493e7..de7cd45 100644
--- a/utests/compiler_copy_image_3d.cpp
+++ b/utests/compiler_copy_image_3d.cpp
@@ -7,8 +7,12 @@ static void compiler_copy_image_3d(void)
   const size_t h = 512;
   const size_t depth = 4;
   cl_image_format format;
+  cl_image_desc desc;
   cl_sampler sampler;
 
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_copy_image_3d");
   buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * depth);
@@ -19,10 +23,23 @@ static void compiler_copy_image_3d(void)
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNORM_INT8;
-  OCL_CREATE_IMAGE3D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, depth, w*4, w*h*4, buf_data[0]);
-  OCL_CREATE_IMAGE3D(buf[1], 0, &format, w, h, depth, 0, 0, NULL);
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  memset(&desc, 0, sizeof(desc));
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = 1;
   for(uint32_t i = 0; i < depth; i++)
-   OCL_CREATE_IMAGE2D(buf[2 + i], 0, &format, w, h, 0, NULL);
+   OCL_CREATE_IMAGE(buf[2 + i], 0, &format, &desc, NULL);
+
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
   buf_data[0] = NULL;
@@ -53,6 +70,8 @@ static void compiler_copy_image_3d(void)
 
   for(uint32_t i = 0; i < depth + 2; i++)
     OCL_UNMAP_BUFFER_GTT(i);
+
+  OCL_CALL(clReleaseSampler, sampler);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_3d);
diff --git a/utests/compiler_double_precision.cpp b/utests/compiler_double_precision.cpp
new file mode 100644
index 0000000..217fd18
--- /dev/null
+++ b/utests/compiler_double_precision.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void double_precision_check(void)
+{
+  const size_t n = 16; //8192 * 4;
+
+  double d0 = 0.12345678912345678;
+  double d1 = 0.12355678922345678;
+  float cpu_result = d1 - d0;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("double_precision_check");
+  //OCL_CREATE_KERNEL("compiler_array");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = 0;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  bool precisionOK = true;
+  for (uint32_t i = 0; i < n; ++i) {
+    float error = ((float*)buf_data[1])[i] - cpu_result;
+    if (error != 0)
+      precisionOK = false;
+    OCL_ASSERT((fabs(error) < 1e-4));
+  }
+  if (!precisionOK)
+    printf("\n  - WARN: GPU doesn't have correct double precision. Got %.7G, expected %.7G\n", ((float*)buf_data[1])[0], cpu_result);
+}
+
+MAKE_UTEST_FROM_FUNCTION(double_precision_check);
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
index 437fcf4..87d2fcd 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -46,7 +46,7 @@ static void compiler_fill_gl_image(void)
   glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
 
   OCL_CREATE_KERNEL("test_fill_gl_image");
-  OCL_CREATE_GL_IMAGE2D(buf[0], 0, GL_TEXTURE_2D, 0, tex);
+  OCL_CREATE_GL_IMAGE(buf[0], 0, GL_TEXTURE_2D, 0, tex);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_fill_image.cpp b/utests/compiler_fill_image.cpp
index 2f9fe3d..5a38b8c 100644
--- a/utests/compiler_fill_image.cpp
+++ b/utests/compiler_fill_image.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_fill_image(void)
@@ -6,14 +7,22 @@ static void compiler_fill_image(void)
   const size_t h = 512;
   uint32_t color = 0x12345678;
   cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image");
 
-  OCL_CREATE_IMAGE2D(buf[0], 0, &format, w, h, 0, NULL);
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
index 1ab13be..e6e0b1d 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_fill_image0.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_fill_image0(void)
@@ -5,14 +6,22 @@ static void compiler_fill_image0(void)
   const size_t w = 512;
   const size_t h = 512;
   cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image0");
 
-  OCL_CREATE_IMAGE2D(buf[0], 0, &format, w, h, 0, NULL);
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_fill_image_1d.cpp b/utests/compiler_fill_image_1d.cpp
new file mode 100644
index 0000000..e644c5f
--- /dev/null
+++ b/utests/compiler_fill_image_1d.cpp
@@ -0,0 +1,50 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_1d(void)
+{
+  const size_t w = 2048;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  desc.image_width = w;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_1d");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t i = 0; i < w; i++) {
+      ((uint32_t*)buf_data[0])[i] = 0;
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w/2;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  //printf("------ The image result is: -------\n");
+  for (uint32_t i = 0; i < w/2; i++) {
+      //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0x03020100);
+  }
+  for (uint32_t i = w/2; i < w; i++) {
+      //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0);
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_1d);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
index 6a679fb..ec96e80 100644
--- a/utests/compiler_fill_image_3d.cpp
+++ b/utests/compiler_fill_image_3d.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_fill_image_3d(void)
@@ -7,14 +8,24 @@ static void compiler_fill_image_3d(void)
   const size_t depth = 5;
   uint32_t color = 0x12345678;
   cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image_3d");
 
-  OCL_CREATE_IMAGE3D(buf[0], 0, &format, w, h, depth, 0, 0, NULL);
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp
index f5ff792..410ace8 100644
--- a/utests/compiler_fill_image_3d_2.cpp
+++ b/utests/compiler_fill_image_3d_2.cpp
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "utest_helper.hpp"
 
 static void compiler_fill_image_3d_2(void)
@@ -6,14 +7,24 @@ static void compiler_fill_image_3d_2(void)
   const size_t h = 512;
   const size_t depth = 5;
   cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image_3d_2");
 
-  OCL_CREATE_IMAGE3D(buf[0], 0, &format, w, h, depth, 0, 0, NULL);
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_function_qualifiers.cpp b/utests/compiler_function_qualifiers.cpp
index 55ddd84..622313c 100644
--- a/utests/compiler_function_qualifiers.cpp
+++ b/utests/compiler_function_qualifiers.cpp
@@ -3,6 +3,16 @@
 void compiler_function_qualifiers(void)
 {
   OCL_CREATE_KERNEL("compiler_function_qualifiers");
+
+  size_t param_value_size;
+  void* param_value;
+  cl_int err;
+
+  err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, NULL, &param_value_size);
+  OCL_ASSERT(err == CL_SUCCESS);
+  param_value = malloc(param_value_size);
+  err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, param_value_size, param_value, NULL);
+  OCL_ASSERT(err == CL_SUCCESS);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_function_qualifiers);
diff --git a/utests/compiler_get_image_info.cpp b/utests/compiler_get_image_info.cpp
index 4454d03..3b9d132 100644
--- a/utests/compiler_get_image_info.cpp
+++ b/utests/compiler_get_image_info.cpp
@@ -6,14 +6,24 @@ static void compiler_get_image_info(void)
   const size_t h = 512;
   const size_t depth = 3;
   cl_image_format format;
+  cl_image_desc desc;
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = NULL;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_get_image_info");
 
-  OCL_CREATE_IMAGE3D(buf[0], 0, &format, w, h, depth, 0, 0, NULL);
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
   OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
   OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
 
diff --git a/utests/compiler_get_image_info_array.cpp b/utests/compiler_get_image_info_array.cpp
new file mode 100644
index 0000000..970877d
--- /dev/null
+++ b/utests/compiler_get_image_info_array.cpp
@@ -0,0 +1,64 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_get_image_info_array(void)
+{
+  const int w = 256;
+  const int h = 512;
+  const int array_size1 = 10;
+  const int array_size2 = 3;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  // Create the 1D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+  desc.image_width = w;
+  desc.image_array_size = array_size1;
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Create the 2D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_array_size = array_size2;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_get_image_info_array");
+
+  OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 32;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(2);
+  OCL_ASSERT(((int*)buf_data[2])[0] == w);
+  OCL_ASSERT(((int*)buf_data[2])[1] == array_size1);
+  OCL_ASSERT(((int*)buf_data[2])[2] == CL_UNSIGNED_INT8);
+  OCL_ASSERT(((int*)buf_data[2])[3] == CL_RGBA);
+
+  OCL_ASSERT(((int*)buf_data[2])[4] == w);
+  OCL_ASSERT(((int*)buf_data[2])[5] == h);
+  OCL_ASSERT(((int*)buf_data[2])[6] == array_size2);
+  OCL_ASSERT(((int*)buf_data[2])[7] == CL_UNSIGNED_INT8);
+  OCL_ASSERT(((int*)buf_data[2])[8] == CL_RGBA);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info_array);
diff --git a/utests/compiler_getelementptr_bitcast.cpp b/utests/compiler_getelementptr_bitcast.cpp
new file mode 100644
index 0000000..a57ff36
--- /dev/null
+++ b/utests/compiler_getelementptr_bitcast.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void compiler_getelementptr_bitcast(void)
+{
+  const size_t n = 16;
+  float cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_getelementptr_bitcast");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+
+  //must be 1 to pass the test, it is required by the special usage in the kernel
+  locals[0] = 1;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i){
+      unsigned char* c = (unsigned char*)&cpu_src[i];
+      cpu_dst[i] = c[2];
+    }
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i){
+      //printf("src:%f, gpu_dst: %f, cpu_dst: %f\n", cpu_src[i], ((float *)buf_data[1])[i], cpu_dst[i]);
+      OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_getelementptr_bitcast);
diff --git a/utests/compiler_local_slm.cpp b/utests/compiler_local_slm.cpp
index 48a072f..3a0c1ed 100644
--- a/utests/compiler_local_slm.cpp
+++ b/utests/compiler_local_slm.cpp
@@ -11,8 +11,7 @@ void compiler_local_slm(void)
   OCL_NDRANGE(1);
   OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i)
-//    std::cout << ((int32_t*)buf_data[0])[i] << std::endl;
-    OCL_ASSERT(((int32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16));
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16));
   OCL_UNMAP_BUFFER(0);
 }
 
diff --git a/utests/compiler_mixed_pointer.cpp b/utests/compiler_mixed_pointer.cpp
new file mode 100644
index 0000000..9531fb2
--- /dev/null
+++ b/utests/compiler_mixed_pointer.cpp
@@ -0,0 +1,119 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src1, int *src2, int *dst) {
+  int * tmp = NULL;
+
+  switch(global_id) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[global_id] = tmp[global_id];
+
+}
+static void cpu1(int global_id, int *src, int *dst1, int *dst2) {
+  int * tmp = global_id < 5 ? dst1 : dst2;
+  tmp[global_id] = src[global_id];
+}
+
+void compiler_mixed_pointer(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16], cpu_src1[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mixed_pointer");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_src1[i] = ((int32_t*)buf_data[1])[i] = 65536-i;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_src1, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst[i], ((int32_t*)buf_data[2])[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst[i]);
+    }
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer);
+
+void compiler_mixed_pointer1(void)
+{
+  const size_t n = 16;
+  int cpu_dst1[16], cpu_dst2[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_mixed_pointer", "compiler_mixed_pointer1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_dst1[i] = ((int32_t*)buf_data[1])[i] = 0xff;
+      cpu_dst2[i] = ((int32_t*)buf_data[2])[i] = 0xff;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu1(i, cpu_src, cpu_dst1, cpu_dst2);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst1[i], ((int32_t*)buf_data[1])[i]);
+//      printf(" %d  %d\n", ((int32_t*)buf_data[2])[i], cpu_dst2[i]);
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst1[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst2[i]);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer1);
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 50526e8..8f1e66e 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -1,4 +1,5 @@
 #include "utest_helper.hpp"
+#include "string.h"
 
 static void compiler_movforphi_undef(void)
 {
@@ -6,6 +7,7 @@ static void compiler_movforphi_undef(void)
   const size_t h = 16;
   cl_sampler sampler;
   cl_image_format format;
+  cl_image_desc desc;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_movforphi_undef");
@@ -16,9 +18,15 @@ static void compiler_movforphi_undef(void)
 
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
-  OCL_CREATE_IMAGE2D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, w * sizeof(uint32_t), buf_data[0]);
+  memset(&desc, 0, sizeof(desc));
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
 
-  OCL_CREATE_IMAGE2D(buf[1], 0, &format, w, h, 0, NULL);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
   buf_data[0] = NULL;
@@ -46,6 +54,8 @@ static void compiler_movforphi_undef(void)
     }
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_movforphi_undef);
diff --git a/utests/compiler_saturate_sub.cpp b/utests/compiler_saturate_sub.cpp
index 48947b7..1c95e2d 100644
--- a/utests/compiler_saturate_sub.cpp
+++ b/utests/compiler_saturate_sub.cpp
@@ -108,7 +108,7 @@ compiler_saturate_sub(int8_t, test_char)
 compiler_saturate_sub(uint8_t, test_uchar)
 compiler_saturate_sub(int16_t, test_short)
 compiler_saturate_sub(uint16_t, test_ushort)
-//compiler_saturate_sub(int32_t, test_int) // TODO due to the possible hardware bug, we disable this, uncomment it when it's done.
+compiler_saturate_sub(int32_t, test_int)
 compiler_saturate_sub(uint32_t, test_uint)
 //compiler_saturate_sub(int64_t, test_long)
 //compiler_saturate_sub(uint64_t, test_ulong)
diff --git a/utests/compiler_simd_all.cpp b/utests/compiler_simd_all.cpp
new file mode 100644
index 0000000..086c54f
--- /dev/null
+++ b/utests/compiler_simd_all.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_all(void)
+{
+  const size_t n = 40;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_simd_all");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = n;
+  locals[0] = 10;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+    if (i % 2 == 1) {
+      if (i < (int32_t)locals[0])
+        OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+      else
+        OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+    }
+    else
+      OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_all);
diff --git a/utests/compiler_simd_any.cpp b/utests/compiler_simd_any.cpp
new file mode 100644
index 0000000..dcc5ef1
--- /dev/null
+++ b/utests/compiler_simd_any.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_any(void)
+{
+  const size_t n = 40;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_simd_any");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = n;
+  locals[0] = 10;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+    if (i % 2 == 1) {
+      if (i < (int32_t)locals[0])
+        OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+      else
+        OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+    }
+    else
+      OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_any);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index b44abc7..5a1a8d1 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -58,6 +58,6 @@ test_all_vector(uint16_t, ushort, true)
 test_all_vector(int32_t, int, true)
 test_all_vector(uint32_t, uint, true)
 test_all_vector(float, float, true)
-test_all_vector(double, double, true)
+//test_all_vector(double, double, true)
 test_all_vector(int64_t, long, true)
 test_all_vector(uint64_t, ulong, false)
diff --git a/utests/enqueue_built_in_kernels.cpp b/utests/enqueue_built_in_kernels.cpp
new file mode 100644
index 0000000..52b8848
--- /dev/null
+++ b/utests/enqueue_built_in_kernels.cpp
@@ -0,0 +1,21 @@
+#include "utest_helper.hpp"
+
+void enqueue_built_in_kernels(void)
+{
+  char* built_in_kernel_names;
+  size_t built_in_kernels_size;
+  cl_int err = CL_SUCCESS;
+  size_t ret_sz;
+
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+  built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+  OCL_ASSERT(ret_sz == built_in_kernels_size);
+  cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+  OCL_ASSERT(built_in_prog != NULL);
+  free(built_in_kernel_names);
+  clReleaseProgram(built_in_prog);
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_built_in_kernels);
diff --git a/utests/enqueue_copy_buf.cpp b/utests/enqueue_copy_buf.cpp
index 969eaa8..b647b7e 100644
--- a/utests/enqueue_copy_buf.cpp
+++ b/utests/enqueue_copy_buf.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
 {
     unsigned int i;
     OCL_MAP_BUFFER(0);
@@ -56,7 +56,7 @@ void enqueue_copy_buf(void)
     OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
     OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
 
-    for (i=0; i<sz; i+=8) {
+    for (i=0; i<sz; i+=7) {
         for (j=0; j<sz; j+=10) {
             test_copy_buf(sz, i, j, sz/2);
         }
diff --git a/utests/enqueue_copy_buf_unaligned.cpp b/utests/enqueue_copy_buf_unaligned.cpp
new file mode 100644
index 0000000..e1bd0aa
--- /dev/null
+++ b/utests/enqueue_copy_buf_unaligned.cpp
@@ -0,0 +1,118 @@
+#include "utest_helper.hpp"
+
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+    unsigned int i;
+    OCL_MAP_BUFFER(0);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[0])[i] = (rand() & 31);
+    }
+
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_MAP_BUFFER(1);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[1])[i] = 64;
+    }
+
+    OCL_UNMAP_BUFFER(1);
+
+    if (src_off + cb > sz || dst_off + cb > sz) {
+        /* Expect Error. */
+        OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                       src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+        return;
+    }
+
+    OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+
+#if 0
+    printf ("@@@@@@@@@ cb is %d\n", cb);
+    printf ("@@@@@@@@@ src_off is %d\n", src_off);
+    printf ("@@@@@@@@@ dst_off is %d\n", dst_off);
+    printf("\n########### Src buffer: \n");
+    for (i = 0; i < sz; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+    printf("\n########### dst buffer: \n");
+    for (i = 0; i < sz; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[1])[i]);
+#endif
+
+    // Check results
+    for (i = 0; i < cb; ++i) {
+        if (((char*)buf_data[0])[i +src_off] != ((char*)buf_data[1])[i + dst_off]) {
+            printf ("different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    for (i = 0; i < dst_off; ++i) {
+        if (((char*)buf_data[1])[i] != 64) {
+            printf ("wrong write, different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    for (i = dst_off + cb; i < sz; ++i) {
+        if (((char*)buf_data[1])[i] != 64) {
+            printf ("wrong write, different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+}
+
+void enqueue_copy_buf_unaligned(void)
+{
+    size_t i;
+    size_t j;
+    const size_t sz = 1024;
+    int offset = 0;
+
+    OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+#if 1
+    /* Test the same offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 3);
+            test_copy_buf(sz, i + offset, j + offset, ((rand() & 31) + 1));
+        }
+    }
+#endif
+
+#if 1
+    /* Test the dst small offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 2);
+            test_copy_buf(sz, i + offset + 1, j + offset, ((rand() & 31) + 1));
+        }
+    }
+#endif
+
+#if 1
+    /* Test the dst big offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 2);
+            test_copy_buf(sz, i + offset, j + offset + 1, ((rand() & 31) + 1));
+        }
+    }
+#endif
+//            test_copy_buf(sz, 0, 1, 17);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf_unaligned);
diff --git a/utests/enqueue_fill_buf.cpp b/utests/enqueue_fill_buf.cpp
new file mode 100644
index 0000000..272b81f
--- /dev/null
+++ b/utests/enqueue_fill_buf.cpp
@@ -0,0 +1,90 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+static char pattern_serials[128];
+
+static void test_fill_buf(size_t sz, size_t offset, size_t size, size_t pattern_sz)
+{
+  unsigned int i;
+  int ret = 0;
+  OCL_MAP_BUFFER(0);
+  memset(((char*)buf_data[0]), 0, sz);
+  OCL_UNMAP_BUFFER(0);
+
+  for (i=0; i < pattern_sz; i++) {
+    pattern_serials[i] = (rand() & 63);
+  }
+
+  if (offset + size > sz) {
+    /* Expect Error. */
+    OCL_ASSERT(clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+                                   pattern_sz, offset, size, 0, NULL, NULL));
+    return;
+  }
+
+  ret = clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+                            pattern_sz, offset, size, 0, NULL, NULL);
+  OCL_ASSERT(!ret);
+
+  OCL_MAP_BUFFER(0);
+
+#if 0
+  printf("\n==== pattern size is %d, offset is %d, size is %d ====\n",
+         pattern_sz, offset, size);
+  printf("\n###########  buffer: \n");
+  for (i = 0; i < sz; ++i)
+    printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+#endif
+
+  // Check results
+  int j = 0;
+  for (i = 0; i < sz; ++i) {
+    if (i < offset || i >= offset + size) {
+      if (((char*)buf_data[0])[i] != 0) {
+        printf ("\nnon zero index is %d\n", i);
+        OCL_ASSERT(0);
+      }
+      continue;
+    }
+
+    if (((char*)buf_data[0])[i] != pattern_serials[j]) {
+      printf ("\ndifferent index is %d\n", i);
+      OCL_ASSERT(0);
+    }
+    j++;
+    if (j == (int)pattern_sz) j = 0;
+  }
+
+  OCL_UNMAP_BUFFER(0);
+
+}
+
+void enqueue_fill_buf(void)
+{
+  size_t offset;
+  size_t pattern_sz;
+  const size_t sz = 1024;
+  size_t size = 0;
+  static int valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  unsigned int i = 0;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+
+  for (i = 0; i < sizeof(valid_sz)/sizeof(int); i++) {
+
+	pattern_sz = valid_sz[i];
+	size = ((rand()%1024)/pattern_sz) * pattern_sz;
+	offset = ((rand()%1024)/pattern_sz) * pattern_sz;
+	while (size + offset + 1 > sz) {
+      if (size > offset) {
+        size = size - offset;
+      } else
+        offset = offset - size;
+	}
+
+	test_fill_buf(sz, offset, size, pattern_sz);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_fill_buf);
diff --git a/utests/get_arg_info.cpp b/utests/get_arg_info.cpp
new file mode 100644
index 0000000..c1ea1ef
--- /dev/null
+++ b/utests/get_arg_info.cpp
@@ -0,0 +1,85 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+void test_get_arg_info(void)
+{
+  int ret;
+  uint32_t ret_val;
+  cl_kernel_arg_type_qualifier type_qual;
+  size_t ret_sz;
+  char name[64];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_get_arg_info");
+
+  //Arg 0
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_GLOBAL);
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("float*") + 1);
+  OCL_ASSERT(!strcmp(name, "float*"));
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("src") + 1);
+  OCL_ASSERT(!strcmp(name, "src"));
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                           sizeof(type_qual), &type_qual, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+  OCL_ASSERT(type_qual == (CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE));
+
+  //Arg 1
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_LOCAL);
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("int*") + 1);
+  OCL_ASSERT(!strcmp(name, "int*"));
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("dst") + 1);
+  OCL_ASSERT(!strcmp(name, "dst"));
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                           sizeof(type_qual), &type_qual, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+  OCL_ASSERT(type_qual == CL_KERNEL_ARG_TYPE_NONE);
+
+  //Arg 2
+  ret = clGetKernelArgInfo(kernel, 2, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("test_arg_struct") + 1);
+  OCL_ASSERT(!strcmp(name, "test_arg_struct"));
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_get_arg_info);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index 4148ce9..807739b 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -502,9 +502,21 @@ void get_image_info(void)
   const size_t w = 512;
   const size_t h = 512;
   cl_image_format format;
+  cl_image_desc desc;
+
   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_UNSIGNED_INT8;
-  OCL_CREATE_IMAGE2D(buf[0], 0, &format, w, h, 0, NULL);
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  desc.image_depth = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = NULL;
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
   cl_mem image = buf[0];
 
   cl_image_format ret_format;
@@ -534,7 +546,7 @@ void get_image_info(void)
 
   size_t depth;
   OCL_CALL(clGetImageInfo, image, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
-  OCL_ASSERT(depth == 1);
+  OCL_ASSERT(depth == 0);
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_image_info);
@@ -571,7 +583,7 @@ void get_mem_info(void)
     expect_ref = 2048;
     maps.insert(make_pair(CL_MEM_SIZE,
                           (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
-    expect_ref = 0;
+    expect_ref = 1024;
     maps.insert(make_pair(CL_MEM_HOST_PTR,
                           (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
     expect_ref = 1;
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
new file mode 100644
index 0000000..d8d761f
--- /dev/null
+++ b/utests/image_1D_buffer.cpp
@@ -0,0 +1,80 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+void image_1D_buffer(void)
+{
+  size_t buffer_sz = 1024;
+  char *buf_content = (char *)malloc(buffer_sz * sizeof(char));
+  int error;
+  cl_image_desc image_desc;
+  cl_image_format image_format;
+  cl_sampler sampler;
+  cl_mem image1, image2;
+  cl_mem ret_mem = NULL;
+
+  OCL_CREATE_KERNEL("image_1D_buffer");
+
+  for (int32_t i = 0; i < (int32_t)buffer_sz; ++i)
+    buf_content[i] = (rand() & 127);
+
+  cl_mem buff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                      buffer_sz, buf_content, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  memset(&image_desc, 0x0, sizeof(cl_image_desc));
+  memset(&image_format, 0x0, sizeof(cl_image_format));
+
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+  image_desc.image_row_pitch = buffer_sz;
+  image_desc.image_width = buffer_sz / sizeof(uint32_t); //assume rgba32
+  image_desc.buffer = buff;
+
+  image_format.image_channel_order = CL_RGBA;
+  image_format.image_channel_data_type = CL_UNSIGNED_INT8;
+
+  image1 = clCreateImage(ctx, CL_MEM_READ_ONLY, &image_format,
+                        &image_desc, NULL, &error );
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  error = clGetImageInfo(image1, CL_IMAGE_BUFFER, sizeof(ret_mem), &ret_mem, NULL);
+  OCL_ASSERT(error == CL_SUCCESS);
+  OCL_ASSERT(ret_mem == buff);
+
+
+  memset(&image_desc, 0x0, sizeof(cl_image_desc));
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  image_desc.image_width = buffer_sz / sizeof(uint32_t);
+  image2 = clCreateImage(ctx, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
+                         &image_format, &image_desc, buf_content, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  // Create sampler to use
+  sampler = clCreateSampler(ctx, false, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  cl_mem result_buf = buf[0] = clCreateBuffer(ctx, 0, buffer_sz, NULL, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &image1);
+  OCL_SET_ARG(1, sizeof(cl_mem), &image2);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  OCL_SET_ARG(3, sizeof(cl_mem), &result_buf);
+
+  globals[0] = buffer_sz/sizeof(int32_t);
+  locals[0] = 16;
+
+  OCL_NDRANGE(1);
+
+  /* Now check the result. */
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < buffer_sz/sizeof(int32_t); i++)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 1);
+  OCL_UNMAP_BUFFER(0);
+
+  clReleaseSampler(sampler);
+  clReleaseMemObject(image1);
+  clReleaseMemObject(image2);
+  clReleaseMemObject(buff);
+}
+
+MAKE_UTEST_FROM_FUNCTION(image_1D_buffer);
diff --git a/utests/load_program_from_bin.cpp b/utests/load_program_from_bin_file.cpp
similarity index 95%
copy from utests/load_program_from_bin.cpp
copy to utests/load_program_from_bin_file.cpp
index d45c2bd..feefacc 100644
--- a/utests/load_program_from_bin.cpp
+++ b/utests/load_program_from_bin_file.cpp
@@ -9,7 +9,7 @@ static void cpu(int global_id, float *src, float *dst) {
     dst[global_id] = ceilf(src[global_id]);
 }
 
-static void test_load_program_from_bin(void)
+static void test_load_program_from_bin_file(void)
 {
     const size_t n = 16;
     float cpu_dst[16], cpu_src[16];
@@ -74,4 +74,4 @@ static void test_load_program_from_bin(void)
     }
 }
 
-MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin);
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin_file);
diff --git a/utests/load_program_from_bin.cpp b/utests/load_program_from_gen_bin.cpp
similarity index 60%
rename from utests/load_program_from_bin.cpp
rename to utests/load_program_from_gen_bin.cpp
index d45c2bd..3db13b2 100644
--- a/utests/load_program_from_bin.cpp
+++ b/utests/load_program_from_gen_bin.cpp
@@ -9,7 +9,7 @@ static void cpu(int global_id, float *src, float *dst) {
     dst[global_id] = ceilf(src[global_id]);
 }
 
-static void test_load_program_from_bin(void)
+static void test_load_program_from_gen_bin(void)
 {
     const size_t n = 16;
     float cpu_dst[16], cpu_src[16];
@@ -18,21 +18,37 @@ static void test_load_program_from_bin(void)
     char *ker_path = NULL;
 
     cl_file_map_t *fm = cl_file_map_new();
-    ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
+    ker_path = cl_do_kiss_path("compiler_ceil.cl", device);
     OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
 
-    const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
-    const size_t sz = cl_file_map_size(fm);
+    const char *src = (const char *)cl_file_map_begin(fm);
 
-    program = clCreateProgramWithBinary(ctx, 1,
-              &device, &sz, &src, &binary_status, &status);
+    program = clCreateProgramWithSource(ctx, 1, &src, NULL, &status);
 
     OCL_ASSERT(program && status == CL_SUCCESS);
 
     /* OCL requires to build the program even if it is created from a binary */
     OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
 
-    kernel = clCreateKernel(program, "compiler_ceil", &status);
+    size_t      binarySize;
+    unsigned char *binary = NULL;
+
+    status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+    OCL_ASSERT(status == CL_SUCCESS);
+    // Create a buffer and get the gen binary
+    binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+    OCL_ASSERT(binary != NULL);
+
+    status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( &binary), &binary, NULL );
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    cl_program bin_program = clCreateProgramWithBinary(ctx, 1,
+              &device, &binarySize, (const unsigned char**)&binary, &binary_status, &status);
+    OCL_ASSERT(bin_program && status == CL_SUCCESS);
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_ASSERT(clBuildProgram(bin_program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(bin_program, "compiler_ceil", &status);
     OCL_ASSERT(status == CL_SUCCESS);
 
     OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
@@ -74,4 +90,4 @@ static void test_load_program_from_bin(void)
     }
 }
 
-MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin);
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_gen_bin);
diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
new file mode 100644
index 0000000..afa55ba
--- /dev/null
+++ b/utests/profiling_exec.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void cpu_exec (int n, float* src, float* dst)
+{
+    int i = 0;
+    for (; i < n; i++) {
+	float f = src[i];
+	f = f < 0 ? -f : f;
+	dst[i] = f;
+    }
+}
+
+#define QUEUE_SECONDS_LIMIT 10
+#define SUBMIT_SECONDS_LIMIT 20
+#define COMMAND_SECONDS_LIMIT 10
+
+static void check_profiling_time(cl_ulong queued, cl_ulong submit, cl_ulong start, cl_ulong end)
+{
+    size_t profiling_resolution = 0;
+    OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
+             sizeof(profiling_resolution), &profiling_resolution, NULL);
+
+    /* Convert the time to second. */
+    double queue_to_submit = (double)(submit - queued)*1e-9;
+    double submit_to_start = (double)(start - submit)*1e-9;
+    double start_to_end = (double)(end - start)*1e-9;
+
+    //printf("Profiling info:\n");
+    //printf("Time from queue to submit : %fms\n", (double)(queue_to_submit) * 1000.f );
+    //printf( "Time from submit to start : %fms\n", (double)(submit_to_start) * 1000.f );
+    //printf( "Time from start to end: %fms\n", (double)(start_to_end) * 1000.f );
+
+    OCL_ASSERTM(queued <= submit, "Enqueue time is later than submit time, invalid\n");
+    OCL_ASSERTM(submit <= start, "Submit time is later than start time, invalid\n");
+    OCL_ASSERTM(start <= end, "Start time is later than end time, invalid\n");
+
+    OCL_ASSERTM(queue_to_submit <= QUEUE_SECONDS_LIMIT, "Too large time from queue to submit\n");
+    OCL_ASSERTM(submit_to_start <= QUEUE_SECONDS_LIMIT, "Too large time from submit to start\n");
+    OCL_ASSERTM(start_to_end <= QUEUE_SECONDS_LIMIT, "Too large time from start to end\n");
+}
+
+static void profiling_exec(void)
+{
+    const size_t n = 512;
+    cl_int status = CL_SUCCESS;
+    cl_command_queue profiling_queue = NULL;
+    cl_command_queue tmp_queue = NULL;
+    float* cpu_src = (float *)malloc(n*sizeof(float));
+    float* cpu_dst = (float *)malloc(n*sizeof(float));
+    cl_event exec_event;
+    cl_ulong time_queue, time_submit, time_start, time_end;
+
+
+    /* Because the profiling prop, we can not use default queue. */
+    profiling_queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    /* save the default queue. */
+    tmp_queue = queue;
+    queue = profiling_queue;
+
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = n;
+    locals[0] = 256;
+
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+	cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    cpu_exec(n, cpu_src, cpu_dst);
+
+    // Run the kernel on GPU
+    OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 1, NULL, globals, locals, 0, NULL, &exec_event);
+    OCL_CALL(clWaitForEvents, 1, &exec_event);
+
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_queue, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &time_submit, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
+
+    check_profiling_time(time_queue, time_submit, time_start, time_end);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+	OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+
+    queue = tmp_queue;
+    clReleaseCommandQueue(profiling_queue);
+    free(cpu_dst);
+    free(cpu_src);
+}
+
+MAKE_UTEST_FROM_FUNCTION(profiling_exec);
diff --git a/utests/runtime_event.cpp b/utests/runtime_barrier_list.cpp
similarity index 74%
copy from utests/runtime_event.cpp
copy to utests/runtime_barrier_list.cpp
index b974f6a..135996f 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_barrier_list.cpp
@@ -1,20 +1,25 @@
 #include "utest_helper.hpp"
 
 #define BUFFERSIZE  32*1024
-void runtime_event(void)
+void runtime_barrier_list(void)
 {
   const size_t n = BUFFERSIZE;
   cl_int cpu_src[BUFFERSIZE];
-  cl_event ev[3];
+  cl_int cpu_src_2[BUFFERSIZE];
+  cl_event ev[5];
   cl_int status = 0;
   cl_int value = 34;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_event");
   OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
 
   for(cl_uint i=0; i<BUFFERSIZE; i++)
+  {
     cpu_src[i] = 3;
+    cpu_src_2[i] = 5;
+  }
 
   OCL_CREATE_USER_EVENT(ev[0]);
 
@@ -26,15 +31,25 @@ void runtime_event(void)
   // Run the kernel
   globals[0] = n;
   locals[0] = 32;
+
   clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
 
-  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+  for (cl_uint i = 0; i < 3; ++i) {
     clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
     OCL_ASSERT(status >= CL_SUBMITTED);
   }
 
+
   buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
 
+  clEnqueueBarrierWithWaitList(queue, 0, NULL, &ev[3]);
+
+  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+  OCL_FINISH();
+  clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status != CL_COMPLETE);
+
   OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
 
   clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
@@ -57,4 +72,4 @@ void runtime_event(void)
   }
 }
 
-MAKE_UTEST_FROM_FUNCTION(runtime_event);
+MAKE_UTEST_FROM_FUNCTION(runtime_barrier_list);
diff --git a/utests/runtime_compile_link.cpp b/utests/runtime_compile_link.cpp
new file mode 100644
index 0000000..4a39b6a
--- /dev/null
+++ b/utests/runtime_compile_link.cpp
@@ -0,0 +1,162 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+#define BUFFERSIZE  32*1024
+
+int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+  cl_int err;
+  char* ker_path = cl_do_kiss_path(name, device);
+
+  cl_file_map_t *fm = cl_file_map_new();
+  err = cl_file_map_open(fm, ker_path);
+  if(err != CL_FILE_MAP_SUCCESS)
+    OCL_ASSERT(0);
+  const char *src = cl_file_map_begin(fm);
+
+  *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+  free(ker_path);
+  cl_file_map_delete(fm);
+  return 0;
+
+}
+
+void runtime_compile_link(void)
+{
+
+  cl_int err;
+
+  const char* header_file_name="runtime_compile_link.h";
+  cl_program foo_pg;
+  init_program(header_file_name, ctx, &foo_pg);
+
+  const char* myinc_file_name="include/runtime_compile_link_inc.h";
+  cl_program myinc_pg;
+  init_program(myinc_file_name, ctx, &myinc_pg);
+
+  const char* file_name_A="runtime_compile_link_a.cl";
+  cl_program program_A;
+  init_program(file_name_A, ctx, &program_A);
+
+  cl_program input_headers[2] = { foo_pg, myinc_pg};
+  const char * input_header_names[2] = {header_file_name, myinc_file_name}; 
+
+  err = clCompileProgram(program_A,
+                                0, NULL, // num_devices & device_list
+                                NULL, // compile_options
+                                2, // num_input_headers
+                                input_headers,
+                                input_header_names,
+                                NULL, NULL);
+
+  OCL_ASSERT(err==CL_SUCCESS);
+  const char* file_name_B="runtime_compile_link_b.cl";
+  cl_program program_B;
+  init_program(file_name_B, ctx, &program_B);
+
+  err = clCompileProgram(program_B,
+                                0, NULL, // num_devices & device_list
+                                NULL, // compile_options
+                                2, // num_input_headers
+                                input_headers,
+                                input_header_names,
+                                NULL, NULL);
+
+  OCL_ASSERT(err==CL_SUCCESS);
+  cl_program input_programs[2] = { program_A, program_B};
+  cl_program linked_program = clLinkProgram(ctx, 0, NULL, "-create-library", 2, input_programs, NULL, NULL, &err);
+
+  OCL_ASSERT(linked_program != NULL);
+  OCL_ASSERT(err == CL_SUCCESS);
+  size_t      binarySize;
+  unsigned char *binary;
+
+  // Get the size of the resulting binary (only one device)
+  err= clGetProgramInfo( linked_program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  // Create a buffer and get the actual binary
+  binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+  if (binary == NULL) {
+    OCL_ASSERT(0);
+    return ;
+  }
+
+  unsigned char *buffers[ 1 ] = { binary };
+  // Do another sanity check here first
+  size_t size;
+  cl_int loadErrors[ 1 ];
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, 0, NULL, &size );
+  OCL_ASSERT(err==CL_SUCCESS);
+  if( size != sizeof( buffers ) ){
+    free(binary);
+    return ;
+  }
+
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_device_id deviceID;
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_DEVICES, sizeof( deviceID), &deviceID, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_program program_with_binary = clCreateProgramWithBinary(ctx, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &err);
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_program new_linked_program = clLinkProgram(ctx, 1, &deviceID, NULL, 1, &program_with_binary, NULL, NULL, &err);
+  OCL_ASSERT(err==CL_SUCCESS);
+  // link success, run this kernel.
+
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+  src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+  src1[2] = -1ll, src2[2] = 0;
+  src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+  src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+  src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+  src1[6] = 0, src2[6] = -1ll;
+  src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+  for(size_t i=8; i<n; i++) {
+    src1[i] = i;
+    src2[i] = i;
+  }
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  kernel = clCreateKernel(new_linked_program, "runtime_compile_link_a", &err);
+
+  OCL_ASSERT(err == CL_SUCCESS);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 0, NULL, NULL);
+
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_compile_link);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index b974f6a..f8170a3 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -28,7 +28,7 @@ void runtime_event(void)
   locals[0] = 32;
   clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
 
-  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+  for (cl_uint i = 0; i < 3; ++i) {
     clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
     OCL_ASSERT(status >= CL_SUBMITTED);
   }
diff --git a/utests/runtime_event.cpp b/utests/runtime_marker_list.cpp
similarity index 74%
copy from utests/runtime_event.cpp
copy to utests/runtime_marker_list.cpp
index b974f6a..f64b1d1 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_marker_list.cpp
@@ -1,20 +1,25 @@
 #include "utest_helper.hpp"
 
 #define BUFFERSIZE  32*1024
-void runtime_event(void)
+void runtime_marker_list(void)
 {
   const size_t n = BUFFERSIZE;
   cl_int cpu_src[BUFFERSIZE];
-  cl_event ev[3];
+  cl_int cpu_src_2[BUFFERSIZE];
+  cl_event ev[5];
   cl_int status = 0;
   cl_int value = 34;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_event");
   OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
 
   for(cl_uint i=0; i<BUFFERSIZE; i++)
+  {
     cpu_src[i] = 3;
+    cpu_src_2[i] = 5;
+  }
 
   OCL_CREATE_USER_EVENT(ev[0]);
 
@@ -26,15 +31,25 @@ void runtime_event(void)
   // Run the kernel
   globals[0] = n;
   locals[0] = 32;
+
   clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
 
-  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+  for (cl_uint i = 0; i < 3; ++i) {
     clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
     OCL_ASSERT(status >= CL_SUBMITTED);
   }
 
+
   buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
 
+  clEnqueueMarkerWithWaitList(queue, 0, NULL, &ev[3]);
+
+  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+  OCL_FINISH();
+  clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
   OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
 
   clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
@@ -57,4 +72,4 @@ void runtime_event(void)
   }
 }
 
-MAKE_UTEST_FROM_FUNCTION(runtime_event);
+MAKE_UTEST_FROM_FUNCTION(runtime_marker_list);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
index ad77369..b0f575f 100644
--- a/utests/setenv.sh.in
+++ b/utests/setenv.sh.in
@@ -3,3 +3,5 @@
 export OCL_PCM_PATH=@LOCAL_PCM_OBJECT_DIR@
 export OCL_PCH_PATH=@LOCAL_PCH_OBJECT_DIR@
 export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
+export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
+export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
index f65e8ff..d32fd65 100644
--- a/utests/sub_buffer.cpp
+++ b/utests/sub_buffer.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void sub_bufffer_check(void)
+void sub_buffer_check(void)
 {
     cl_int error;
     cl_ulong max_alloc_size;
@@ -39,7 +39,7 @@ void sub_bufffer_check(void)
                 continue;
             }
             /* invalid align, should be failed. */
-            if(off & (address_align-1)) {
+            if(off & ((address_align/8)-1)) {
                 OCL_ASSERT(error != CL_SUCCESS);
                 continue;
             }
@@ -85,7 +85,7 @@ void sub_bufffer_check(void)
                 continue;
             }
             /* invalid align, should be failed. */
-            if(off & (address_align-1)) {
+            if(off & (address_align/8-1)) {
                 OCL_ASSERT(error != CL_SUCCESS);
                 continue;
             }
@@ -128,8 +128,8 @@ void sub_bufffer_check(void)
         }
     }
 
-
+    clReleaseMemObject(main_buf);
     free(main_buf_content);
 }
 
-MAKE_UTEST_FROM_FUNCTION(sub_bufffer_check);
+MAKE_UTEST_FROM_FUNCTION(sub_buffer_check);
diff --git a/utests/test_printf.cpp b/utests/test_printf.cpp
new file mode 100644
index 0000000..3601574
--- /dev/null
+++ b/utests/test_printf.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void test_printf(void)
+{
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_printf");
+  globals[0] = 16;
+  locals[0] = 16;
+  globals[1] = 4;
+  locals[1] = 4;
+  globals[2] = 8;
+  locals[2] = 2;
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf);
diff --git a/utests/utest.cpp b/utests/utest.cpp
index 718916f..b491cae 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -26,36 +26,122 @@
 #include <vector>
 #include <string>
 #include <iostream>
+#include <sys/ioctl.h>
+#include <unistd.h>
 #include <cstring>
+#include <stdlib.h>
+#include <csignal>
+
+struct signalMap
+{
+  const char* signalName;
+  int signalNum;
+};
 
 using namespace std;
 vector<UTest> *UTest::utestList = NULL;
+// Initialize and declare statistics struct
+RStatistics UTest::retStatistics;
+
 void releaseUTestList(void) { delete UTest::utestList; }
+void runSummaryAtExit(void) {
+  // If case crashes, count it as fail, and accumulate finishrun
+  if(UTest::retStatistics.finishrun != UTest::utestList->size()) {
+    UTest::retStatistics.finishrun++;
+    UTest::retStatistics.failCount++;
+  }
+  printf("\nsummary:\n----------\n");
+  printf("  total: %zu\n",UTest::utestList->size());
+  printf("  run: %zu\n",UTest::retStatistics.finishrun);
+  printf("  pass: %zu\n",UTest::retStatistics.passCount);
+  printf("  fail: %zu\n",UTest::retStatistics.failCount);
+  printf("  pass rate: %f\n",1-(float)UTest::retStatistics.failCount/(float)UTest::utestList->size());
+
+  releaseUTestList();
+}
+
+void signalHandler( int signum )
+{
+  const char* name = NULL;
+
+  signalMap arr[] = {
+    {"SIGILL",  SIGILL},
+    {"SIGFPE",  SIGFPE},
+    {"SIGABRT", SIGABRT},
+    {"SIGBUS",  SIGBUS},
+    {"SIGSEGV", SIGSEGV},
+    {"SIGHUP",  SIGHUP},
+    {"SIGINT",  SIGINT},
+    {"SIGQUIT", SIGQUIT},
+    {"SIGTERM", SIGTERM},
+    {NULL,      -1}
+  };
+
+  for(int i=0; arr[i].signalNum != -1 && arr[i].signalName != NULL; i++) {
+    if(arr[i].signalNum == signum)
+
+      name = arr[i].signalName;
+  }
+
+  printf("    Interrupt signal (%s) received.", name);
+
+  exit(signum);
+}
+
+void catch_signal(void){
+  struct sigaction sa;
+  int sigs[] = {
+    SIGILL, SIGFPE, SIGABRT, SIGBUS,
+    SIGSEGV, SIGHUP, SIGINT, SIGQUIT,
+    SIGTERM
+  };
+
+  sa.sa_handler = signalHandler;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_RESETHAND;
+
+  for(unsigned int i = 0; i < sizeof(sigs)/sizeof(sigs[0]); ++i) {
+    if (sigaction(sigs[i], &sa, NULL) == -1)
+      perror("Could not set signal handler");
+  }
+}
 
 UTest::UTest(Function fn, const char *name, bool haveIssue, bool needDestroyProgram)
        : fn(fn), name(name), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
+
   if (utestList == NULL) {
     utestList = new vector<UTest>;
-    atexit(releaseUTestList);
+
+    catch_signal();
+    atexit(runSummaryAtExit);
   }
   utestList->push_back(*this);
 }
 
+
 static bool strequal(const char *s1, const char *s2) {
   if (strcmp(s1, s2) == 0) return true;
   return false;
 }
 
+void UTest::do_run(struct UTest utest){
+  // Print function name
+  printf("%s()", utest.name);
+  fflush(stdout);
+
+  // Run one case in utestList, print result [SUCCESS] or [FAILED]
+  (utest.fn)();
+}
+
 void UTest::run(const char *name) {
   if (name == NULL) return;
   if (utestList == NULL) return;
-  for (size_t i = 0; i < utestList->size(); ++i) {
-    const UTest &utest = (*utestList)[i];
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
     if (utest.name == NULL || utest.fn == NULL ) continue;
     if (strequal(utest.name, name)) {
-      std::cout << utest.name << ":" << std::endl;
-      (utest.fn)();
-      std::cout << std::endl;
+      do_run(utest);
       cl_kernel_destroy(true);
       cl_buffer_destroy();
     }
@@ -64,12 +150,11 @@ void UTest::run(const char *name) {
 
 void UTest::runAll(void) {
   if (utestList == NULL) return;
-  for (size_t i = 0; i < utestList->size(); ++i) {
-    const UTest &utest = (*utestList)[i];
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
     if (utest.fn == NULL) continue;
-    std::cout << utest.name << ":" << std::endl;
-    (utest.fn)();
-    std::cout << std::endl;
+    do_run(utest);
     cl_kernel_destroy(utest.needDestroyProgram);
     cl_buffer_destroy();
   }
@@ -77,12 +162,11 @@ void UTest::runAll(void) {
 
 void UTest::runAllNoIssue(void) {
   if (utestList == NULL) return;
-  for (size_t i = 0; i < utestList->size(); ++i) {
-    const UTest &utest = (*utestList)[i];
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
     if (utest.fn == NULL || utest.haveIssue) continue;
-    std::cout << utest.name << ":" << std::endl;
-    (utest.fn)();
-    std::cout << std::endl;
+    do_run(utest);
     cl_kernel_destroy(utest.needDestroyProgram);
     cl_buffer_destroy();
   }
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 01d4a8c..375ef70 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -31,6 +31,14 @@
 #include <vector>
 #include <iostream>
 
+/*! struct for statistics */
+struct RStatistics
+{
+  size_t passCount;
+  size_t failCount;
+  size_t finishrun;
+};
+
 /*! Quick and dirty unit test system with registration */
 struct UTest
 {
@@ -58,6 +66,10 @@ struct UTest
   static void runAll(void);
   /*! List all test cases */
   static void listAllCases(void);
+  /*! Statistics struct */
+  static RStatistics retStatistics;
+  /*! Do run a test case actually */
+  static void do_run(struct UTest utest);
 };
 
 /*! Register a new unit test */
@@ -78,17 +90,23 @@ struct UTest
   static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
   static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
 
+/*! Turn a function into a unit performance test */
+#define MAKE_BENCHMARK_FROM_FUNCTION(FN) \
+  static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN);
 
 /*! No assert is expected */
 #define UTEST_EXPECT_SUCCESS(EXPR) \
  do { \
     try { \
       EXPR; \
-      std::cout << "  " << #EXPR << "    [SUCCESS]" << std::endl; \
+      std::cout << "    [SUCCESS]" << std::endl; \
+      UTest::retStatistics.passCount += 1; \
     } \
     catch (Exception e) { \
-      std::cout << "  " << #EXPR << "    [FAILED]" << std::endl; \
+      std::cout << "    [FAILED]" << std::endl; \
       std::cout << "    " << e.what() << std::endl; \
+      UTest::retStatistics.failCount++; \
     } \
   } while (0)
 
@@ -96,12 +114,26 @@ struct UTest
  do { \
     try { \
       EXPR; \
-      std::cout << "  " << #EXPR << "    [FAILED]" <<  std::endl; \
+      std::cout << "    [FAILED]" << std::endl; \
+      retStatistics.failCount++; \
     } \
     catch (gbe::Exception e) { \
-      std::cout << "  " << #EXPR << "    [SUCCESS]" << std::endl; \
+      std::cout << "    [SUCCESS]" << std::endl; \
+      retStatistics.passCount++; \
     } \
   } while (0)
 
+#define BENCHMARK(EXPR) \
+ do { \
+    int ret = 0; \
+    try { \
+      ret = EXPR; \
+      printf("  %s  [SUCCESS] [Result: %d]\n", #EXPR, ret);\
+    } \
+    catch (Exception e) { \
+      std::cout << "  " << #EXPR << "    [FAILED]" << std::endl; \
+      std::cout << "    " << e.what() << std::endl; \
+    } \
+  } while (0)
 #endif /* __UTEST_UTEST_HPP__ */
 
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 626ac96..7522001 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -6,7 +6,7 @@ FLT_MIN_NEGA='-0x1.fffffep127f'
 FLT_MIN_POSI='0x1.0p-126f'
 FLT_MAX_NEGA='-0x1.0p-126f'
 
-paraTypeList={'float':'%.20f','int':'%d','double':'%lf','uint':'%d','string':'%s'}
+paraTypeList={'float':'%e','int':'%d','double':'%lf','uint':'%d','string':'%s'}
 
 
 def ulpUnit(ulpSize):
@@ -20,7 +20,14 @@ def udebug(ulpSize,returnType):
   #ulpNum=re.findall(r"([0-9]+)",ulpSize)[0]
   text='''
     static const char* INFORNAN;
-    static %s ULPSIZE;
+    static %s ULPSIZE, ULPSIZE_FACTOR;
+
+    const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+
+    if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+      ULPSIZE_FACTOR = 1000;
+    else
+      ULPSIZE_FACTOR = 1;
     
     if (isinf(cpu_data[index])){
       INFORNAN="INF";
@@ -29,7 +36,8 @@ def udebug(ulpSize,returnType):
       INFORNAN="NAN";
     }
     else{
-      ULPSIZE=cl_%s(cpu_data[index]) * %s;
+       ULPSIZE=ULPSIZE_FACTOR * cl_%s((cpu_data[index] == 0) ? 1 : cpu_data[index])
+               * ((ULPSIZE_FACTOR == 1) ? %s : ( (%s == 0) ? 1 : %s));
     }
 
 #if udebug 
@@ -67,6 +75,7 @@ def udebug(ulpSize,returnType):
   }
 }\n'''%(returnType,\
         ulpUnit(ulpSize),ulpNum(ulpSize),\
+        ulpNum(ulpSize), ulpNum(ulpSize),\
         paraTypeList['string'],paraTypeList['string'],\
         paraTypeList['string'],paraTypeList['string'],\
         paraTypeList['string'],paraTypeList['string'],\
@@ -128,6 +137,7 @@ which can print more values and information to assist debuging the issue.
 #include <stdio.h>
 #include <math.h>
 #include <algorithm>
+#include <string.h>
 
 #define udebug 0
 #define FLT_MAX 0x1.fffffep127f
@@ -298,7 +308,10 @@ static void %s_%s(void)
 
     #funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index])"
     #funcdiff += (self.retType(index) == "int") and ');' or '/(cpu_data[index]>1?cpu_data[index]:1));'
+    valuejudge = "    if (std::fpclassify(gpu_data[index]) == FP_SUBNORMAL){ gpu_data[index] = 0; }\n"
+    valuejudge += "    if (std::fpclassify(cpu_data[index]) == FP_SUBNORMAL){ cpu_data[index] = 0; }\n"
     funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index]));"
+    funcline += [ valuejudge ]
     funcline += [ funcdiff ]
     funcline += [ funcsprintfa + funcsprintfb ]
 
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 91633f0..cb4dd66 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -337,20 +337,26 @@ cl_ocl_init(void)
   GET_PLATFORM_STR_INFO(extensions, EXTENSIONS);
 
   /* Get the device (only GPU device is supported right now) */
-  OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
-  {
-    size_t param_value_size;
-    GET_DEVICE_STR_INFO(profile, PROFILE);
-    GET_DEVICE_STR_INFO(name, NAME);
-    GET_DEVICE_STR_INFO(vendor, VENDOR);
-    GET_DEVICE_STR_INFO(version, VERSION);
-    GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
-    GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
+  try {
+    OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    {
+      size_t param_value_size;
+      GET_DEVICE_STR_INFO(profile, PROFILE);
+      GET_DEVICE_STR_INFO(name, NAME);
+      GET_DEVICE_STR_INFO(vendor, VENDOR);
+      GET_DEVICE_STR_INFO(version, VERSION);
+      GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+      GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
 #ifdef HAS_EGL
-    if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
-      hasGLExt = true;
-    }
+      if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
+        hasGLExt = true;
+      }
 #endif
+    }
+  } catch (...) {
+     fprintf(stderr, "error calling clGetDeviceIDs\n");
+     status = CL_DEVICE_NOT_FOUND;
+     goto error;
   }
 
 #ifdef HAS_EGL
@@ -531,6 +537,7 @@ int *cl_read_bmp(const char *filename, int *width, int *height)
   char magic[2];
   int ret;
   ret = fread(&magic[0], 1, 2, fp);
+  ret = ret;
   assert(2 == ret);
   assert(magic[0] == 'B' && magic[1] == 'M');
 
@@ -648,20 +655,6 @@ int cl_check_image(const int *img, int w, int h, const char *bmp)
   return (float(discrepancy) / float(n) > max_error_ratio) ? 0 : 1;
 }
 
-typedef struct
-{
-  unsigned int mantissa:23;
-  unsigned int exponent:8;
-  unsigned int sign:1;
-} FLOAT;
-
-typedef union
-{
-  float f;
-  unsigned int i;
-  FLOAT spliter;
-} SF;
-
 const float cl_FLT_ULP(float float_number)
 {
   SF floatBin, ulpBin, ulpBinBase;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 0937bf2..de4d277 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -104,12 +104,6 @@ extern EGLSurface  eglSurface;
 #define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
     OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)
 
-#define OCL_CREATE_IMAGE2D(IMAGE, FLAGS, FORMAT, WIDTH, HEIGHT, PITCH, DATA) \
-    OCL_CALL2(clCreateImage2D, IMAGE, ctx, FLAGS, FORMAT, WIDTH, HEIGHT, PITCH, DATA)
-
-#define OCL_CREATE_IMAGE3D(IMAGE, FLAGS, FORMAT, WIDTH, HEIGHT, DEPTH, RPITCH, SPITCH, DATA) \
-    OCL_CALL2(clCreateImage3D, IMAGE, ctx, FLAGS, FORMAT, WIDTH, HEIGHT, DEPTH, RPITCH, SPITCH, DATA)
-
 #define OCL_READ_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
     OCL_CALL(clEnqueueReadImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
 
@@ -119,12 +113,6 @@ extern EGLSurface  eglSurface;
 #define OCL_CREATE_GL_IMAGE(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
     OCL_CALL2(clCreateFromGLTexture, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
 
-#define OCL_CREATE_GL_IMAGE2D(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
-    OCL_CALL2(clCreateFromGLTexture2D, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
-
-#define OCL_CREATE_GL_IMAGE3D(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
-    OCL_CALL2(clCreateFromGLTexture3D, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
-
 #define OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(ID) \
     OCL_CALL(clEnqueueAcquireGLObjects, queue, 1, &buf[ID], 0, 0, 0)
 
@@ -184,6 +172,21 @@ enum {
   BIN = 2
 };
 
+/* The SF is float type spliter*/
+typedef struct
+{
+  unsigned int mantissa:23;
+  unsigned int exponent:8;
+  unsigned int sign:1;
+} FLOAT;
+
+typedef union
+{
+  float f;
+  unsigned int i;
+  FLOAT spliter;
+} SF;
+
 /* Init OpenCL */
 extern int cl_ocl_init(void);
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index f268739..30a9b24 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -14,7 +14,76 @@ import os,sys
 #    values
 #    ulp
 
-base_input_values = [ 0, 1, 3.14]
+# reduce pi*x limitation to [-pi,pi]
+reduce1='''
+static float reduce1( float x )
+{
+  SF fx, fy;
+  fx.f = fy.f = x;
+  int n;
+
+  fy.spliter.exponent = fx.spliter.exponent - 1;
+  n = (int)fy.f;
+
+  fx.f = fx.f - 2.0 * n;
+
+  // reduce to [-1.0, 1.0]
+  fx.f = (fx.f < -1)?(fx.f + 2.0):((fx.f > 1)?(fx.f - 2.0):fx.f);
+
+  return fx.f;
+}
+'''
+# define fuction: cospi
+cospi='''
+static float cospi(float x){
+  float r = x;
+  if ( x > 1 || x < -1) r = reduce1(x);
+
+  // reduce to [0.0, 1.0]
+  if (r < 0)
+    r = fabs(r);
+
+  if (r >= 0 && r <= 0.25)
+    return  cosf(r * M_PI);
+  else if (r > 0.25 && r <= 0.5)
+    return  sinf((0.5 - r) * M_PI);
+  else if (r > 0.5 && r <= 0.75)
+    return sinf(-(r-0.5) * M_PI);
+  else if (r > 0.75 && r <= 1.0){
+    return -cosf((1 -  r) * M_PI);}
+
+  // Error return
+  return 0xffffffff;
+}
+'''
+# define function: sinpi
+sinpi='''
+static float sinpi(float x){
+  float r = x;
+  if ( x > 1 || x < -1) r = reduce1(x);
+
+  // reduce to [-0.5, 0.5]
+  if (r < -0.5)
+    r = -1 - r;
+  else if (r > 0.5)
+    r = 1 - r;
+
+  if (r > 0.25 && r <= 0.5)
+    return  cosf((0.5 - r) * M_PI);
+  else if (r >= 0 && r <= 0.25)
+    return  sinf(r * M_PI);
+  else if (r >= -0.25 && r < 0)
+    return -sinf(r * -M_PI);
+  else if (r >= -0.5 && r < -0.25){
+    return -cosf((0.5 + r) * M_PI);}
+
+  // Error return
+  return 0xffffffff;
+}
+'''
+
+base_input_values = [ 0, 1, 3.14, 5.15, 6.01, 7.89]
+base_input_values1 = [ 1, 3.14, 5.15, 6.01, 7.89]
 def main():
   ##### gentype acos(gentype)
   acos_input_values = base_input_values
@@ -144,23 +213,20 @@ static float atanpi(float x){
   cospi_input_values = base_input_values
   cospi_input_type = ['float','float2','float4','float8','float16']
   cospi_output_type = ['float','float2','float4','float8','float16']
-  cospi_cpu_func='''
-static float cospi(float x){
-  return cos(M_PI * x);
-} '''
+  cospi_cpu_func=reduce1+cospi
   cospiUtests = func('cospi','cospi',[cospi_input_type],cospi_output_type,[cospi_input_values],'2 * FLT_ULP',cospi_cpu_func)
   
-  ##### gentype erf(gentype)
-  erf_input_values = base_input_values
-  erf_input_type = ['float','float2','float4','float8','float16']
-  erf_output_type = ['float','float2','float4','float8','float16']
-  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
-  
-  ##### gentype erfc(gentype)
-  erfc_input_values = base_input_values
-  erfc_input_type = ['float','float2','float4','float8','float16']
-  erfc_output_type = ['float','float2','float4','float8','float16']
-  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
+#  ##### gentype erf(gentype)
+#  erf_input_values = base_input_values
+#  erf_input_type = ['float','float2','float4','float8','float16']
+#  erf_output_type = ['float','float2','float4','float8','float16']
+#  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
+
+#  ##### gentype erfc(gentype)
+#  erfc_input_values = base_input_values
+#  erfc_input_type = ['float','float2','float4','float8','float16']
+#  erfc_output_type = ['float','float2','float4','float8','float16']
+#  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
   
   ##### gentype exp(gentype x)
   exp_input_values = base_input_values
@@ -364,21 +430,21 @@ static float minmag(float x, float y){
   nextafter_input_type1 = ['float','float2','float4','float8','float16']
   nextafter_input_type2 = ['float','float2','float4','float8','float16']
   nextafter_output_type = ['float','float2','float4','float8','float16']
-  nextafterUtests = func('nextafter','nextafter',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
+  nextafterUtests = func('nextafter','nextafterf',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
   
   ##### gentype pow(gentype x, gentype y)
-  pow_base_values = base_input_values
+  pow_base_values = base_input_values1
   pow_input_values1 = []
   pow_input_values2 = []
   pow_input_values1,pow_input_values2=gene2ValuesLoop(pow_input_values1,pow_input_values2,pow_base_values)
   pow_input_type1 = ['float','float2','float4','float8','float16']
   pow_input_type2 = ['float','float2','float4','float8','float16']
   pow_output_type = ['float','float2','float4','float8','float16']
-  powUtests = func('pow','pow',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
+  powUtests = func('pow','powf',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
   
   ##### floatn pown(floatn x, intn y)
-  pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
-  pown_input_values2 = [-1,-2,-3,4,5,6,7,8,9,10,11,12,13,14,15,16,12]
+  pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, 0.5, 1, 0.0,1500.24,-1500.24]
+  pown_input_values2 = [-1,-2,-3,4,5,6,7,8,10,12,14,16,12]
   pown_input_type1 = ['float','float2','float4','float8','float16']
   pown_input_type2 = ['int','int2','int4','int8','int16']
   pown_output_type = ['float','float2','float4','float8','float16']
@@ -389,14 +455,14 @@ static float pown(float x, int y){
   pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
   
   ##### gentype powr(gentype x, gentype y)
-  powr_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
-  powr_input_values2 = [1,2,3.14,4,5,6,7,8,9.889,10,11,12,13,14.33,15,0,12]
+  powr_input_values1 = [80, -80, 3.14, -3.14, 0.5, 1, -1, 0.0,6,1500.24,-1500.24]
+  powr_input_values2 = [5,6,7,8,10,11,12,13,14,0,12]
   powr_input_type1 = ['float','float2','float4','float8','float16']
   powr_input_type2 = ['float','float2','float4','float8','float16']
   powr_output_type = ['float','float2','float4','float8','float16']
   powr_cpu_func='''
 static float powr(float x, int y){
-    return pow(x,y);
+    return powf(x,y);
 } '''
   powrUtests = func('powr','powr',[powr_input_type1,powr_input_type2],powr_output_type,[powr_input_values1,powr_input_values2],'16 * FLT_ULP', powr_cpu_func)
   
@@ -417,8 +483,8 @@ static float powr(float x, int y){
   rintUtests = func('rint','rint',[rint_input_type],rint_output_type,[rint_input_values],'0 * FLT_ULP')
   
   ##### floatn rootn(floatn x, intn y)
-  rootn_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24,2,3,4]
-  rootn_input_values2 = [-1,-2,-3,2,3,6,7,8,9,2,11,12,13,14,15,16,2,2,2,2]
+  rootn_input_values1 = [0.0, 0.0012,  0.5, 1, 3.14, 12345]
+  rootn_input_values2 = [-1, 1, -20, 20, -123, 456]
   rootn_input_type1 = ['float','float2','float4','float8','float16']
   rootn_input_type2 = ['int','int2','int4','int8','int16']
   rootn_output_type = ['float','float2','float4','float8','float16']
@@ -466,13 +532,10 @@ static float rsqrt(float x)
   sinhUtests = func('sinh','sinh',[sinh_input_type],sinh_output_type,[sinh_input_values],'4 * FLT_ULP')
   
   ##### gentype sinpi(gentype x)
-  sinpi_input_values = base_input_values
+  sinpi_input_values = [0, 1, 3.14, -0.88, -0.12, -0.5, 0.5, -0.49, 0.49, 0.51, -0.51, -0.1, 0.1]
   sinpi_input_type = ['float','float2','float4','float8','float16']
   sinpi_output_type = ['float','float2','float4','float8','float16']
-  sinpi_cpu_func='''
-static float sinpi(float x){
-  return sin(M_PI*x);
-} '''
+  sinpi_cpu_func=reduce1+sinpi
   sinpiUtests = func('sinpi','sinpi',[sinpi_input_type],sinpi_output_type,[sinpi_input_values],'4 * FLT_ULP',sinpi_cpu_func)
   
   ##### gentype sqrt(gentype)
@@ -494,20 +557,15 @@ static float sinpi(float x){
   tanhUtests = func('tanh','tanh',[tanh_input_type],tanh_output_type,[tanh_input_values],'5 * FLT_ULP')
   
   ##### gentype tanpi(gentype x)
-  tanpi_input_values = base_input_values
+  tanpi_input_values = [ 0, 3.14, 5.15, 6.01, 7.89]
   tanpi_input_type = ['float','float2','float4','float8','float16']
   tanpi_output_type = ['float','float2','float4','float8','float16']
-  tanpi_cpu_func='''
+  tanpi_cpu_func=reduce1+sinpi+cospi+'''
 static float tanpi(float x){
-  return tan(M_PI*x);
-} '''
-  tanpiUtests = func('tanpi','tanpi',[tanpi_input_type],tanpi_output_type,[tanpi_input_values],'4 * FLT_ULP',tanpi_cpu_func)
-  
-  ##### gentype tgamma(gentype)
-  tgamma_input_values = base_input_values
-  tgamma_input_type = ['float','float2','float4','float8','float16']
-  tgamma_output_type = ['float','float2','float4','float8','float16']
-  tgammaUtests = func('tgamma','tgamma',[tgamma_input_type],tgamma_output_type,[tgamma_input_values],'16 * FLT_ULP')
+  return sinpi(x)/cospi(x);
+}
+'''
+  tanpiUtests = func('tanpi','tanpi',[tanpi_input_type],tanpi_output_type,[tanpi_input_values],'400 * FLT_ULP',tanpi_cpu_func)
   
   ##### gentype trunc(gentype)
   trunc_input_values = base_input_values

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list