[Pkg-opencl-devel] [beignet] 53/66: Imported Upstream version 0.2+git20130928+187c17e

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:08 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 3756b221f8602b8211a4d3fce985dbd2c8c6b9e4
Author: Simon Richter <sjr at debian.org>
Date:   Sat Sep 28 14:19:29 2013 +1000

    Imported Upstream version 0.2+git20130928+187c17e
---
 CMake/FindEGL.cmake                                |   18 +
 CMake/FindGBM.cmake                                |   36 -
 CMakeLists.txt                                     |   18 +-
 backend/CMakeLists.txt                             |    2 +-
 backend/src/CMakeLists.txt                         |   13 +-
 backend/src/backend/context.cpp                    |   82 +-
 backend/src/backend/context.hpp                    |    5 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |    5 +
 backend/src/backend/gen_context.cpp                |  649 ++++++++++-
 backend/src/backend/gen_context.hpp                |   15 +
 backend/src/backend/gen_defs.hpp                   |   19 +
 backend/src/backend/gen_encoder.cpp                |   69 +-
 backend/src/backend/gen_encoder.hpp                |    6 +-
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    9 +
 backend/src/backend/gen_insn_selection.cpp         |  334 +++++-
 backend/src/backend/gen_insn_selection.hpp         |    6 +-
 backend/src/backend/gen_insn_selection.hxx         |   11 +
 backend/src/backend/gen_program.cpp                |   43 +-
 backend/src/backend/gen_program.hpp                |   10 +-
 backend/src/backend/gen_reg_allocation.cpp         |  100 +-
 backend/src/backend/gen_register.hpp               |   14 +
 backend/src/backend/program.cpp                    |  320 +++++-
 backend/src/backend/program.h                      |    3 +-
 backend/src/backend/program.hpp                    |   59 +-
 backend/src/builtin_vector_proto.def               |   25 +-
 backend/src/gbe_bin_generater.cpp                  |  308 ++++++
 backend/src/gen_builtin_vector.py                  |    5 +-
 backend/src/gen_convert.sh                         |   30 +-
 backend/src/ir/constant.cpp                        |  101 ++
 backend/src/ir/constant.hpp                        |   28 +-
 backend/src/ir/context.hpp                         |    1 +
 backend/src/ir/image.cpp                           |  139 +++
 backend/src/ir/image.hpp                           |   25 +-
 backend/src/ir/instruction.cpp                     |  122 ++-
 backend/src/ir/instruction.hpp                     |   32 +-
 backend/src/ir/instruction.hxx                     |    5 +
 backend/src/ir/profile.cpp                         |    5 +-
 backend/src/ir/profile.hpp                         |    4 +-
 backend/src/ir/sampler.cpp                         |   98 ++
 backend/src/ir/sampler.hpp                         |   25 +-
 backend/src/llvm/llvm_gen_backend.cpp              |  380 ++++---
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   38 +-
 backend/src/llvm/llvm_scalarize.cpp                |    1 -
 backend/src/ocl_common_defines.h                   |   57 +-
 backend/src/ocl_convert.h                          |  411 +++++++
 backend/src/ocl_stdlib.tmpl.h                      | 1136 +++++++++++++++++---
 backend/src/sys/platform.hpp                       |   44 +
 kernels/builtin_atan2.cl                           |    4 +
 kernels/builtin_lgamma.cl                          |    4 +
 kernels/builtin_lgamma_r.cl                        |    4 +
 kernels/builtin_sinpi.cl                           |    4 +
 kernels/builtin_tgamma.cl                          |    4 +
 kernels/compiler_abs_diff.cl                       |    2 +
 kernels/compiler_bool_cross_basic_block.cl         |   21 +
 kernels/compiler_box_blur_image.cl                 |    2 +-
 kernels/compiler_function_constant0.cl             |    2 +-
 kernels/compiler_global_constant.cl                |   59 +-
 kernels/compiler_global_constant_2.cl              |   13 +-
 kernels/compiler_group_size.cl                     |   17 +
 kernels/compiler_long_convert.cl                   |   12 +
 kernels/compiler_upsample_long.cl                  |    2 +-
 kernels/compiler_vector_inc.cl                     |   13 +
 kernels/test_copy_image_3d.cl                      |   27 +-
 kernels/test_fill_image_3d.cl                      |    2 +-
 kernels/test_fill_image_3d_2.cl                    |    2 +-
 src/CMakeLists.txt                                 |   18 +-
 src/cl_api.c                                       |  705 ++++++++++--
 src/cl_command_queue.c                             |  108 +-
 src/cl_command_queue.h                             |   16 +-
 src/cl_command_queue_gen7.c                        |   86 +-
 src/cl_context.c                                   |   30 +-
 src/cl_context.h                                   |   34 +-
 src/cl_device_id.c                                 |    2 +-
 src/cl_driver.h                                    |   82 +-
 src/cl_driver_defs.c                               |   14 +-
 src/cl_driver_type.h                               |   24 +
 src/cl_enqueue.c                                   |  256 +++--
 src/cl_enqueue.h                                   |   33 +-
 src/cl_event.c                                     |  110 +-
 src/cl_event.h                                     |    8 +-
 src/cl_extensions.c                                |   23 +-
 src/cl_extensions.h                                |   25 -
 src/cl_gt_device.h                                 |   10 +-
 src/cl_image.c                                     |   26 +-
 src/cl_kernel.c                                    |   14 +-
 src/cl_khr_icd.h                                   |    4 +
 src/cl_mem.c                                       |  636 +++++++++--
 src/cl_mem.h                                       |  138 ++-
 src/cl_mem_gl.c                                    |  194 +---
 src/cl_platform_id.c                               |    2 +-
 src/cl_platform_id.h                               |    5 +-
 src/cl_program.c                                   |   37 +-
 src/cl_program.h                                   |    2 +
 src/cl_sampler.c                                   |    2 +-
 src/cl_utils.h                                     |   12 +-
 src/intel/intel_dri_resource_sharing.c             |  208 ++++
 src/intel/intel_dri_resource_sharing.h             |   39 +
 src/intel/intel_dri_resource_sharing_int.h         |  143 +++
 src/intel/intel_driver.c                           |  231 +++-
 src/intel/intel_driver.h                           |    8 +-
 src/intel/intel_gpgpu.c                            |  147 ++-
 src/intel/intel_structs.h                          |   16 +-
 src/x11/dricommon.h                                |    5 -
 src/x11/gbm_deps/backend.h                         |   36 -
 src/x11/gbm_deps/common.h                          |   42 -
 src/x11/gbm_deps/common_drm.h                      |   48 -
 src/x11/gbm_deps/gbm.h                             |  292 -----
 src/x11/gbm_deps/gbm_driint.h                      |  108 --
 src/x11/gbm_deps/gbmint.h                          |  116 --
 src/x11/gbm_dri2_x11_platform.c                    |  126 ---
 src/x11/mesa_egl_extension.c                       |  307 ++++++
 src/x11/mesa_egl_extension.h                       |   20 +
 src/x11/mesa_egl_res_share.c                       |  135 +++
 src/x11/mesa_egl_res_share.h                       |   44 +
 utests/CMakeLists.txt                              |   28 +-
 utests/builtin_atan2.cpp                           |   43 +
 utests/builtin_lgamma.cpp                          |   40 +
 utests/builtin_lgamma_r.cpp                        |   46 +
 utests/builtin_sinpi.cpp                           |  104 ++
 utests/builtin_tgamma.cpp                          |   42 +
 utests/compiler_abs_diff.cpp                       |   35 +-
 utests/compiler_bool_cross_basic_block.cpp         |   55 +
 utests/compiler_copy_image_3d.cpp                  |   36 +-
 utests/compiler_fill_image_3d.cpp                  |    6 +-
 utests/compiler_fill_image_3d_2.cpp                |   10 +-
 utests/compiler_function_constant0.cpp             |    4 +-
 utests/compiler_global_constant.cpp                |   75 ++
 utests/compiler_global_constant_2.cpp              |   29 +
 utests/compiler_group_size.cpp                     |   55 +
 utests/compiler_long.cpp                           |    4 +-
 utests/compiler_long_2.cpp                         |    4 +-
 utests/compiler_long_convert.cpp                   |   91 ++
 utests/compiler_vector_inc.cpp                     |   46 +
 utests/load_program_from_bin.cpp                   |   77 ++
 utests/runtime_event.cpp                           |    7 +-
 utests/utest.cpp                                   |   28 +-
 utests/utest.hpp                                   |   14 +-
 utests/utest_helper.cpp                            |   29 +-
 utests/utest_helper.hpp                            |    5 +-
 utests/utest_run.cpp                               |   94 +-
 140 files changed, 8625 insertions(+), 2169 deletions(-)

diff --git a/CMake/FindEGL.cmake b/CMake/FindEGL.cmake
index 69d4852..d84ef95 100644
--- a/CMake/FindEGL.cmake
+++ b/CMake/FindEGL.cmake
@@ -33,4 +33,22 @@ ELSE(EGL_INCLUDE_PATH)
   SET(EGL_FOUND 0 CACHE STRING "Set to 1 if EGL is found, 0 otherwise")
 ENDIF(EGL_INCLUDE_PATH)
 
+# Find mesa source code.
+FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
+  $ENV{MESA_SOURCE_DIR}
+  ${MAKE_CURRENT_SOURCE_DIR}/../mesa
+  ~/mesa
+  DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
+
+IF(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
+                         ${MESA_SOURCE_PREFIX}/include
+                         ${MESA_SOURCE_PREFIX}/src/mapi
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
+SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ELSE(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ENDIF(MESA_SOURCE_PREFIX)
+
 MARK_AS_ADVANCED(EGL_FOUND)
diff --git a/CMake/FindGBM.cmake b/CMake/FindGBM.cmake
deleted file mode 100644
index f20f4b2..0000000
--- a/CMake/FindGBM.cmake
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Try to find gbm library and include path.
-# Once done this will define
-#
-# GBM_FOUND
-# GBM_INCLUDE_PATH
-# GBM_LIBRARY
-#
-
-FIND_PATH(GBM_INCLUDE_PATH gbm.h
-  ~/include/
-  /usr/include/
-  /usr/local/include/
-  /sw/include/
-  /opt/local/include/
-  DOC "The directory where gen/program.h resides")
-FIND_LIBRARY(GBM_LIBRARY
-  NAMES GBM gbm
-  PATHS
-  ~/lib/
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The GBM library")
-
-IF(GBM_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${GBM_INCLUDE_PATH})
-  SET(GBM_FOUND 1 CACHE STRING "Set to 1 if GBM is found, 0 otherwise")
-ELSE(GBM_INCLUDE_PATH)
-  SET(GBM_FOUND 0 CACHE STRING "Set to 1 if GBM is found, 0 otherwise")
-ENDIF(GBM_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(GBM_FOUND)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb56567..3d18f50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,9 @@ ELSE (EMULATE_IVB)
   ADD_DEFINITIONS(-DEMULATE_GEN=0)
 ENDIF (EMULATE_HSW)
 
+# XXX: currently hard-coded to enable the clamp-to-border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
 IF (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=1)
 ELSE (USE_FULSIM)
@@ -116,15 +119,6 @@ ELSE(GBE_FOUND)
   MESSAGE(STATUS "Looking for Gen-Backend - not found")
 ENDIF(GBE_FOUND)
 
-Find_Package(GBM)
-IF(GBM_FOUND)
-  MESSAGE(STATUS "Looking for GBM - found")
-  SET(CMAKE_CXX_FLAGS "-DHAS_GBM ${CMAKE_CXX_FLAGS}")
-  SET(CMAKE_C_FLAGS "-DHAS_GBM ${CMAKE_C_FLAGS}")
-ELSE(GBM_FOUND)
-  MESSAGE(STATUS "Looking for GBM - not found")
-ENDIF(GBM_FOUND)
-
 Find_Package(EGL)
 IF(EGL_FOUND)
   MESSAGE(STATUS "Looking for EGL - found")
@@ -132,6 +126,12 @@ ELSE(EGL_FOUND)
   MESSAGE(STATUS "Looking for EGL - not found")
 ENDIF(EGL_FOUND)
 
+IF(MESA_SOURCE_FOUND)
+  MESSAGE(STATUS "Looking for mesa source code - found")
+ELSE(MESA_SOURCE_FOUND)
+  MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+ENDIF(MESA_SOURCE_FOUND)
+
 Find_Package(OCLIcd)
 IF(OCLIcd_FOUND)
   MESSAGE(STATUS "Looking for OCL ICD header file - found")
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 8622f3e..476c6f2 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -34,7 +34,7 @@ else (GBE_DEBUG_MEMORY)
 endif (GBE_DEBUG_MEMORY)
 
 # Hide all symbols and allows the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden")
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden ${CMAKE_C_CXX_FLAGS}")
 
 if (COMPILER STREQUAL "GCC")
   set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index b7b47ae..36bf688 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -3,6 +3,7 @@ set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
 set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
 set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
 set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
 set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
 set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
 set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
@@ -26,13 +27,13 @@ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h
 
 add_custom_command(
   OUTPUT ${ocl_vector_file}
-  COMMAND ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
+  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
   DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file})
 
 add_custom_command(
   OUTPUT ${ocl_blob_file}
-  COMMAND ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
-  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
+  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
+  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
 
 
 set (pch_object ${ocl_blob_file}.pch)
@@ -46,7 +47,7 @@ else (LLVM_VERSION_NODOT VERSION_GREATER 32)
         set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
     endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -fno-builtin)
+set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 
 add_custom_command(
      OUTPUT ${pch_object}
@@ -154,6 +155,10 @@ target_link_libraries(
                       ${CMAKE_THREAD_LIBS_INIT}
                       ${CMAKE_DL_LIBS})
 
+link_directories (${LLVM_LIBRARY_DIR})
+ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
+TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
+
 install (TARGETS gbe LIBRARY DESTINATION lib)
 install (FILES ${pch_object} DESTINATION lib)
 install (FILES backend/program.h DESTINATION include/gen)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5484869..cbd38f1 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -315,10 +315,10 @@ namespace gbe
       GBE_DELETE(this->kernel);
       this->kernel = NULL;
     }
-    if(this->kernel != NULL)
+    if(this->kernel != NULL) {
       this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
-    if(this->kernel != NULL)
       this->kernel->ctx = this;
+    }
     return this->kernel;
   }
 
@@ -364,7 +364,7 @@ namespace gbe
     this->kernel->stackSize = 1*KB; // XXX compute that in a better way
   }
 
-  void Context::newCurbeEntry(gbe_curbe_type value,
+  uint32_t Context::newCurbeEntry(gbe_curbe_type value,
                               uint32_t subValue,
                               uint32_t size,
                               uint32_t alignment)
@@ -374,6 +374,7 @@ namespace gbe
     GBE_ASSERT(offset >= GEN_REG_SIZE);
     kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
+    return offset;
   }
 
   uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
@@ -387,7 +388,12 @@ namespace gbe
     offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
     GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
     fn.getImageSet()->appendInfo(key, offset);
-    return offset;
+    return offset + GEN_REG_SIZE;
+  }
+
+
+  void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
+    curbeRegs.insert(std::make_pair(reg, offset));
   }
 
   void Context::buildPatchList(void) {
@@ -395,7 +401,7 @@ namespace gbe
     kernel->curbeSize = 0u;
 
     // We insert the block IP mask first
-    this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t));
+    this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -409,35 +415,55 @@ namespace gbe
           arg.type == ir::FunctionArgument::STRUCTURE ||
           arg.type == ir::FunctionArgument::IMAGE ||
           arg.type == ir::FunctionArgument::SAMPLER)
-        this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize);
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
     }
 
     // Already inserted registers go here
-    set<ir::Register> specialRegs;
-
     const size_t localIDSize = sizeof(uint32_t) * this->simdWidth;
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize);
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize);
-    this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize);
-    specialRegs.insert(ir::ocl::lid0);
-    specialRegs.insert(ir::ocl::lid1);
-    specialRegs.insert(ir::ocl::lid2);
+    insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
+    insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
+    insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
+            insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
 
     // Go over all the instructions and find the special register we need
     // to push
 #define INSERT_REG(SPECIAL_REG, PATCH, WIDTH) \
   if (reg == ir::ocl::SPECIAL_REG) { \
-    if (specialRegs.find(reg) != specialRegs.end()) continue; \
-    this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH); \
+    if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+    insertCurbeReg(reg, this->newCurbeEntry(GBE_CURBE_##PATCH, 0, ptrSize * WIDTH)); \
   } else
 
     bool useStackPtr = false;
-    fn.foreachInstruction([&](const ir::Instruction &insn) {
+    fn.foreachInstruction([&](ir::Instruction &insn) {
       const uint32_t srcNum = insn.getSrcNum();
       for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
         const ir::Register reg = insn.getSrc(srcID);
+        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+          if (srcID != 0) continue;
+          const unsigned char bti = fn.getImageSet()->getIdx(insn.getSrc(srcID));
+          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+          ir::ImageInfoKey key;
+          key.index = bti;
+          key.type = type;
+          const ir::Register imageInfo(key.data | 0x8000);
+          ir::Register realImageInfo;
+          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+            realImageInfo = insn.getSrc(1);
+            insertCurbeReg(realImageInfo, offset);
+            insertCurbeReg(imageInfo, (uint32_t)realImageInfo);
+          } else
+            realImageInfo = ir::Register(curbeRegs.find(imageInfo)->second);
+          insn.setSrc(srcID, realImageInfo);
+          continue;
+        } else if (insn.getOpcode() == ir::OP_GET_SAMPLER_INFO) {
+          /* change the src to sampler information register. */
+          if (curbeRegs.find(ir::ocl::samplerinfo) == curbeRegs.end())
+            insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
+          continue;
+        }
         if (fn.isSpecialReg(reg) == false) continue;
-        if (specialRegs.contains(reg) == true) continue;
+        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
         if (reg == ir::ocl::stackptr) useStackPtr = true;
         INSERT_REG(lsize0, LOCAL_SIZE_X, 1)
         INSERT_REG(lsize1, LOCAL_SIZE_Y, 1)
@@ -453,33 +479,22 @@ namespace gbe
         INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
         INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
         INSERT_REG(stackptr, STACK_POINTER, this->simdWidth)
-        do {} while (0);
-        specialRegs.insert(reg);
+        do {} while(0);
       }
     });
 #undef INSERT_REG
-    this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int));
-    specialRegs.insert(ir::ocl::constoffst);
-
-    // Insert serialized global constant arrays if used
-    const ir::ConstantSet& constantSet = unit.getConstantSet();
-    if (constantSet.getConstantNum()) {
-      size_t size = constantSet.getDataSize();
-      this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size);
-    }
 
     // Insert the number of threads
-    this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t));
+    insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
 
     // Insert the stack buffer if used
     if (useStackPtr)
-      this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize);
+      insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
 
     // After this point the vector is immutable. Sorting it will make
     // research faster
     std::sort(kernel->patches.begin(), kernel->patches.end());
 
-    // Align it on 32 bytes properly
     kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
   }
 
@@ -640,8 +655,7 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
-        reg == ir::ocl::workdim   ||
-        reg == ir::ocl::constoffst)
+        reg == ir::ocl::workdim)
       return true;
     return false;
   }
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 50c0e70..ca2c88d 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -93,6 +93,8 @@ namespace gbe
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
     uint32_t allocateScratchMem(uint32_t size);
+    /*! Preallocated curbe register set including special registers. */
+    map<ir::Register, uint32_t> curbeRegs;
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -115,7 +117,8 @@ namespace gbe
     /*! Insert a new entry with the given size in the Curbe. Return the offset
      *  of the entry
      */
-    void newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
+    void insertCurbeReg(ir::Register, uint32_t grfOffset);
+    uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
     /*! Provide for each branch and label the label index target */
     typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
     const ir::Unit &unit;                 //!< Unit that contains the kernel
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index bfb865a..f911e7c 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -1193,6 +1193,11 @@ int gen_disasm (FILE *file, const void *opaque_insn)
                   data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
         }
         break;
+      case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+        format (file, " (bti: %d, %s)",
+                inst->bits3.gen7_dword_rw.bti,
+                data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
+        break;
       case GEN_SFID_MESSAGE_GATEWAY:
         format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
             gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 1a012fe..858105a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -88,6 +88,18 @@ namespace gbe
     }
   }
 
+  void GenContext::clearFlagRegister(void) {
+    // When the group size is not aligned to simdWidth, the flag registers need
+    // to be cleared to make predication (any8h/any16h) work correctly.
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+    p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+    p->pop();
+  }
+
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
@@ -147,6 +159,21 @@ namespace gbe
       case SEL_OP_RNDE: p->RNDE(dst, src); break;
       case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
+      case SEL_OP_CONVI64_TO_I:
+       {
+        int execWidth = p->curr.execWidth;
+        GenRegister xsrc = src.bottom_half(), xdst = dst;
+        p->push();
+        p->curr.execWidth = 8;
+        for(int i = 0; i < execWidth/4; i ++) {
+          p->curr.chooseNib(i);
+          p->MOV(xdst, xsrc);
+          xdst = GenRegister::suboffset(xdst, 4);
+          xsrc = GenRegister::suboffset(xsrc, 8);
+        }
+        p->pop();
+        break;
+       }
       default: NOT_IMPLEMENTED;
     }
   }
@@ -417,6 +444,273 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2) {
+    addWithCarry(low1, low1, low2);
+    addWithCarry(high1, high1, high2);
+    p->ADD(high1, high1, low2);
+  }
+
+  void GenContext::I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low) {
+    GenRegister &e = dst1, &f = dst2, &g = dst3, &h = dst4,
+                &a = x_high, &b = x_low, &c = y_high, &d = y_low;
+    I32FullMult(e, h, b, d);
+    I32FullMult(f, g, a, d);
+    addWithCarry(g, g, e);
+    addWithCarry(f, f, e);
+    I32FullMult(e, d, b, c);
+    I64FullAdd(f, g, e, d);
+    I32FullMult(b, d, a, c);
+    I64FullAdd(e, f, b, d);
+  }
+
+  void GenContext::I64Neg(GenRegister high, GenRegister low, GenRegister tmp) {
+    p->NOT(high, high);
+    p->NOT(low, low);
+    p->MOV(tmp, GenRegister::immud(1));
+    addWithCarry(low, low, tmp);
+    p->ADD(high, high, tmp);
+  }
+
+  void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
+    p->SHR(sign, high, GenRegister::immud(31));
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
+    p->curr.predicate = GEN_PREDICATE_NORMAL;
+    I64Neg(high, low, tmp);
+    p->pop();
+  }
+
+  void GenContext::emitI64MULHIInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GenRegister flagReg = ra->genReg(insn.dst(10));
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    if(x.type == GEN_TYPE_UL) {
+      I64FullMult(e, f, g, h, a, b, c, d);
+    } else {
+      I64ABS(e, a, b, i, flagReg);
+      I64ABS(f, c, d, i, flagReg);
+      p->XOR(i, e, f);
+      I64FullMult(e, f, g, h, a, b, c, d);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(e, e);
+      p->NOT(f, f);
+      p->NOT(g, g);
+      p->NOT(h, h);
+      p->MOV(i, GenRegister::immud(1));
+      addWithCarry(h, h, i);
+      addWithCarry(g, g, i);
+      addWithCarry(f, f, i);
+      p->ADD(e, e, i);
+      p->pop();
+    }
+    storeTopHalf(dest, e);
+    storeBottomHalf(dest, f);
+  }
+
+  void GenContext::emitI64MADSATInstruction(const SelectionInstruction &insn) { // 64-bit mad_sat: dst(0) <- saturate(src(0) * src(1) + src(2)), worked out on 32-bit words (helper semantics assumed from their names -- confirm)
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister z = ra->genReg(insn.src(2));
+    GenRegister a = ra->genReg(insn.dst(1)); // a..i: scratch registers taken from dst(1)..dst(9)
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GenRegister flagReg = ra->genReg(insn.dst(10)); // flag register selected via useFlag() before each CMP sequence below
+    GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
+    loadTopHalf(a, x); // (a,b) <- (top, bottom) halves of x
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y); // (c,d) <- (top, bottom) halves of y
+    loadBottomHalf(d, y);
+    if(x.type == GEN_TYPE_UL) { // GEN_TYPE_UL: presumably the unsigned 64-bit type -- confirm
+      I64FullMult(e, f, g, h, a, b, c, d); // presumably (e,f,g,h) <- full four-word product of (a,b) and (c,d), e most significant -- confirm against I64FullMult
+      loadTopHalf(c, z); // (c,d) are reused for the halves of z
+      loadBottomHalf(d, z);
+      addWithCarry(h, h, d); // h += d; addWithCarry overwrites its last operand with acc0 (presumably the carry-out), so d ripples upward
+      addWithCarry(g, g, d);
+      addWithCarry(f, f, d);
+      p->ADD(e, e, d);
+      addWithCarry(g, g, c); // same ripple for c, starting one word higher
+      addWithCarry(f, f, c);
+      p->ADD(e, e, c);
+      p->OR(a, e, f); // a <- e | f: non-zero when the sum does not fit in (g,h)
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, a, zero); // flag <- (a != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immd(-1)); // flagged lanes: clamp (g,h) to all ones
+      p->MOV(h, GenRegister::immd(-1));
+      p->pop();
+    } else { // every other type of x: signed handling
+      I64ABS(e, a, b, i, flagReg); // NOTE(review): I64ABS is defined elsewhere; presumably turns (a,b) into its magnitude and records the sign in e -- confirm
+      I64ABS(f, c, d, i, flagReg);
+      p->XOR(i, e, f); // i <- e XOR f, tested below to decide whether the product is negated
+      I64FullMult(e, f, g, h, a, b, c, d);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, i, zero); // flag <- (i != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(e, e); // flagged lanes: negate (e,f,g,h) by inverting every word...
+      p->NOT(f, f);
+      p->NOT(g, g);
+      p->NOT(h, h);
+      p->MOV(i, one); // ...and adding one, rippling the carry upward through i
+      addWithCarry(h, h, i);
+      addWithCarry(g, g, i);
+      addWithCarry(f, f, i);
+      p->ADD(e, e, i);
+      p->pop();
+      loadTopHalf(c, z); // (c,d) <- halves of z
+      loadBottomHalf(d, z);
+      p->ASR(GenRegister::retype(b, GEN_TYPE_D), GenRegister::retype(c, GEN_TYPE_D), GenRegister::immd(31)); // b <- c >> 31 on signed dwords: extension word for z
+      p->MOV(a, b); // a <- second extension word, so (a,b,c,d) is z widened to four words
+      addWithCarry(h, h, d); // (e,f,g,h) += (a,b,c,d), word by word with carry ripple
+      addWithCarry(g, g, d);
+      addWithCarry(f, f, d);
+      p->ADD(e, e, d);
+      addWithCarry(g, g, c);
+      addWithCarry(f, f, c);
+      p->ADD(e, e, c);
+      addWithCarry(f, f, b);
+      p->ADD(e, e, b);
+      p->ADD(e, e, a);
+      p->MOV(b, zero); // b is reused as a marker, initially 0
+      p->push();
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, e, zero); // b <- 1 where e != 0 ...
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, f, zero); // ... or f != 0 ...
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_G, g, GenRegister::immud(0x7FFFFFFF)); // ... or g compares G against 0x7FFFFFFF
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->SHR(a, e, GenRegister::immud(31)); // a <- e >> 31 (top bit of the most significant word)
+      p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, zero); // lanes with a != 0 get the marker reset
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immud(0x7FFFFFFF)); // marked lanes: clamp (g,h) to 0x7FFFFFFF:0xFFFFFFFF
+      p->MOV(h, GenRegister::immud(0xFFFFFFFFu));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(b, zero); // second pass with a fresh marker
+      p->CMP(GEN_CONDITIONAL_NEQ, e, GenRegister::immud(0xFFFFFFFFu)); // b <- 1 where e != 0xFFFFFFFF ...
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NEQ, f, GenRegister::immud(0xFFFFFFFFu)); // ... or f != 0xFFFFFFFF ...
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, g, GenRegister::immud(0x7FFFFFFF)); // ... or g compares LE against 0x7FFFFFFF
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_Z, a, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, zero); // lanes with a == 0 get the marker reset
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immud(0x80000000u)); // marked lanes: clamp (g,h) to 0x80000000:0
+      p->MOV(h, zero);
+      p->pop();
+    }
+    storeTopHalf(dest, g); // dest <- (g,h)
+    storeBottomHalf(dest, h);
+  }
+
+  void GenContext::emitI64HADDInstruction(const SelectionInstruction &insn) { // 64-bit hadd: dst(0) <- (src(0) + src(1)) >> 1 with the bit carried out of the top word shifted back in; dst(1)..dst(4) are scratch
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    a.type = b.type = c.type = d.type = GEN_TYPE_UD; // all scratch registers are handled as unsigned dwords
+    loadBottomHalf(a, x); // a, b <- bottom halves of x, y
+    loadBottomHalf(b, y);
+    loadTopHalf(c, x); // c, d <- top halves of x, y
+    loadTopHalf(d, y);
+    addWithCarry(a, a, b); // a <- a + b; addWithCarry overwrites its last operand with acc0 (presumably the carry-out), so b now holds it
+    addWithCarry(c, c, b); // c <- top(x) + that carry; b <- next carry
+    addWithCarry(c, c, d); // c <- c + top(y); d <- carry
+    p->ADD(b, b, d); // b <- b + d: what overflowed past the top word
+    p->SHR(a, a, GenRegister::immud(1)); // a <- (a >> 1) | (c << 31): low word shifted right, taking bit 0 of the high word
+    p->SHL(d, c, GenRegister::immud(31));
+    p->OR(a, a, d);
+    p->SHR(c, c, GenRegister::immud(1)); // c <- (c >> 1) | (b << 31): high word shifted right, taking the overflow bit
+    p->SHL(d, b, GenRegister::immud(31));
+    p->OR(c, c, d);
+    storeBottomHalf(dest, a); // dest <- (c,a)
+    storeTopHalf(dest, c);
+  }
+
+  void GenContext::emitI64RHADDInstruction(const SelectionInstruction &insn) { // 64-bit rhadd: dst(0) <- (src(0) + src(1) + 1) >> 1 with the bit carried out of the top word shifted back in; dst(1)..dst(4) are scratch
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    a.type = b.type = c.type = d.type = GEN_TYPE_UD; // all scratch registers are handled as unsigned dwords
+    loadBottomHalf(a, x); // a, b <- bottom halves of x, y
+    loadBottomHalf(b, y);
+    addWithCarry(a, a, b); // a <- a + b; addWithCarry overwrites its last operand with acc0 (presumably the carry-out), so b now holds it
+    p->MOV(c, GenRegister::immud(1)); // the rounding increment
+    addWithCarry(a, a, c); // a <- a + 1; c <- carry of that increment
+    p->ADD(b, b, c); // b <- combined carry out of the low word
+    loadTopHalf(c, x); // c, d <- top halves of x, y
+    loadTopHalf(d, y);
+    addWithCarry(c, c, b); // c <- top(x) + low-word carry; b <- next carry
+    addWithCarry(c, c, d); // c <- c + top(y); d <- carry
+    p->ADD(b, b, d); // b <- b + d: what overflowed past the top word
+    p->SHR(a, a, GenRegister::immud(1)); // a <- (a >> 1) | (c << 31)
+    p->SHL(d, c, GenRegister::immud(31));
+    p->OR(a, a, d);
+    p->SHR(c, c, GenRegister::immud(1)); // c <- (c >> 1) | (b << 31)
+    p->SHL(d, b, GenRegister::immud(31));
+    p->OR(c, c, d);
+    storeBottomHalf(dest, a); // dest <- (c,a)
+    storeTopHalf(dest, c);
+  }
+
   void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
     GenRegister dest = ra->genReg(insn.dst(0));
     GenRegister x = ra->genReg(insn.src(0));
@@ -428,6 +722,7 @@ namespace gbe
     GenRegister e = ra->genReg(insn.dst(5));
     GenRegister f = ra->genReg(insn.dst(6));
     a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    GenRegister flagReg = ra->genReg(insn.dst(7));
     GenRegister zero = GenRegister::immud(0);
     switch(insn.opcode) {
       case SEL_OP_I64SHL:
@@ -440,16 +735,16 @@ namespace gbe
         p->SHL(c, e, a);
         p->SHL(d, f, a);
         p->OR(e, d, b);
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, c);
         p->SEL(c, c, zero);
@@ -467,16 +762,16 @@ namespace gbe
         p->SHR(c, f, a);
         p->SHR(d, e, a);
         p->OR(e, d, b);
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, c);
         p->SEL(c, c, zero);
@@ -495,16 +790,16 @@ namespace gbe
         p->ASR(c, f, a);
         p->SHR(d, e, a);
         p->OR(e, d, b);
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
-        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
-        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, c);
         p->SEL(c, c, GenRegister::immd(-1));
@@ -524,6 +819,46 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp) { // dst <- high * 2^32 + low; assumes dst is float-typed so the MOVs convert -- confirm against callers
+    p->MOV(dst, high);
+    p->MUL(dst, dst, GenRegister::immf(65536.f * 65536.f)); // 65536 * 65536 == 2^32: scales the top word
+    tmp.type = GEN_TYPE_F; // tmp is a by-value copy, so this retype is local to the helper
+    p->MOV(tmp, low);
+    p->ADD(dst, dst, tmp);
+  }
+
+  void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) { // converts the 64-bit integer src(0) into dst(0) via UnsignedI64ToFloat; dst(1)..dst(3) are scratch, dst(4) is a flag register
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister high = ra->genReg(insn.dst(1));
+    GenRegister low = ra->genReg(insn.dst(2));
+    GenRegister tmp = ra->genReg(insn.dst(3));
+    GenRegister flagReg = ra->genReg(insn.dst(4));
+    loadTopHalf(high, src); // (high,low) <- halves of src
+    loadBottomHalf(low, src);
+    if(!src.is_signed_int()) {
+      UnsignedI64ToFloat(dest, high, low, tmp); // unsigned source: convert directly
+    } else {
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000)); // flag <- (high >= 0x80000000); presumably an unsigned compare that selects negative inputs -- confirm high's type
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(high, high); // flagged lanes: negate (high,low) by inverting both words and adding one
+      p->NOT(low, low);
+      p->MOV(tmp, GenRegister::immud(1));
+      addWithCarry(low, low, tmp); // tmp is overwritten with acc0 by addWithCarry (presumably the carry-out)
+      p->ADD(high, high, tmp);
+      p->pop();
+      UnsignedI64ToFloat(dest, high, low, tmp); // convert the (possibly negated) value
+      p->push(); // NOTE(review): predicate is not assigned in this scope; the OR below relies on the inherited predicate mode -- confirm it ends up predicated on flagReg
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      dest.type = GEN_TYPE_UD; // retype so the OR works on the raw bit pattern
+      p->OR(dest, dest, GenRegister::immud(0x80000000)); // set bit 31 of the result (the float sign bit)
+      p->pop();
+    }
+  }
+
   void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
     GenRegister src0 = ra->genReg(insn.src(0));
     GenRegister src1 = ra->genReg(insn.src(1));
@@ -535,10 +870,11 @@ namespace gbe
     int flag = p->curr.flag, subFlag = p->curr.subFlag;
     GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
                 f2 = GenRegister::suboffset(f1, 1),
-                f3 = GenRegister::suboffset(f1, 2);
+                f3 = GenRegister::suboffset(f1, 2),
+                f4 = GenRegister::suboffset(f1, 3);
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.flag = 0, p->curr.subFlag = 1;
+    saveFlag(f4, flag, subFlag);
     loadTopHalf(tmp0, src0);
     loadTopHalf(tmp1, src1);
     switch(insn.extra.function) {
@@ -554,46 +890,147 @@ namespace gbe
             cmpTopHalf = GEN_CONDITIONAL_G;
           p->CMP(cmpTopHalf, tmp0, tmp1);
         }
-        saveFlag(f1, 0, 1);
+        saveFlag(f1, flag, subFlag);
         p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
-        saveFlag(f2, 0, 1);
+        saveFlag(f2, flag, subFlag);
         tmp0.type = tmp1.type = GEN_TYPE_UD;
         loadBottomHalf(tmp0, src0);
         loadBottomHalf(tmp1, src1);
         p->CMP(insn.extra.function, tmp0, tmp1);
-        saveFlag(f3, 0, 1);
+        saveFlag(f3, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
         p->AND(f2, f2, f3);
         p->OR(f1, f1, f2);
+        p->pop();
         break;
       case GEN_CONDITIONAL_EQ:
         p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
-        saveFlag(f1, 0, 1);
+        saveFlag(f1, flag, subFlag);
         tmp0.type = tmp1.type = GEN_TYPE_UD;
         loadBottomHalf(tmp0, src0);
         loadBottomHalf(tmp1, src1);
         p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
-        saveFlag(f2, 0, 1);
+        saveFlag(f2, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
         p->AND(f1, f1, f2);
+        p->pop();
         break;
       case GEN_CONDITIONAL_NEQ:
         p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
-        saveFlag(f1, 0, 1);
+        saveFlag(f1, flag, subFlag);
         tmp0.type = tmp1.type = GEN_TYPE_UD;
         loadBottomHalf(tmp0, src0);
         loadBottomHalf(tmp1, src1);
         p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
-        saveFlag(f2, 0, 1);
+        saveFlag(f2, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
         p->OR(f1, f1, f2);
+        p->pop();
         break;
       default:
         NOT_IMPLEMENTED;
     }
-    saveFlag(f2, flag, subFlag);
-    p->AND(f1, f1, f2);
+    p->curr.execWidth = 1;
+    p->AND(f1, f1, f4);
     p->MOV(GenRegister::flag(flag, subFlag), f1);
     p->pop();
   }
 
+  void GenContext::emitI64SATADDInstruction(const SelectionInstruction &insn) { // 64-bit saturating add: dst(0) <- src(0) + src(1), clamped in the cases tested below; dst(1)..dst(5) are scratch, dst(6) is a flag register
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister flagReg = ra->genReg(insn.dst(6));
+    loadTopHalf(a, x); // (a,b) <- halves of x
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y); // (c,d) <- halves of y
+    loadBottomHalf(d, y);
+    if(dst.is_signed_int())
+      p->SHR(e, a, GenRegister::immud(31)); // signed only: e <- a >> 31, taken from x's top word before the addition
+    addWithCarry(b, b, d); // b <- b + d; addWithCarry overwrites its last operand with acc0 (presumably the carry-out), so d now holds it
+    addWithCarry(a, a, d); // a <- a + low-word carry; d <- carry
+    addWithCarry(a, a, c); // a <- a + top(y); c <- carry
+    p->ADD(c, c, d); // c <- what was carried out of the top word
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    if(! dst.is_signed_int()) {
+      p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); // unsigned: flag <- (c != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(a, GenRegister::immud(0xFFFFFFFFu)); // flagged lanes: clamp (a,b) to all ones
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    } else { // NOTE(review): the signed path looks only at e (from x) and the result's top word a; y's sign is never examined -- confirm this is intended
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1)); // flag <- (e == 1)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u)); // predicated compare: a L 0x80000000
+      p->MOV(a, GenRegister::immud(0x80000000u)); // flagged lanes: clamp (a,b) to 0x80000000:0
+      p->MOV(b, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0)); // flag <- (e == 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); // predicated compare: a GE 0x80000000
+      p->MOV(a, GenRegister::immud(0x7FFFFFFFu)); // flagged lanes: clamp (a,b) to 0x7FFFFFFF:0xFFFFFFFF
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    }
+    p->pop();
+    storeTopHalf(dst, a); // dst <- (a,b)
+    storeBottomHalf(dst, b);
+  }
+
+  void GenContext::emitI64SATSUBInstruction(const SelectionInstruction &insn) { // 64-bit saturating subtract: dst(0) <- src(0) - src(1), clamped in the cases tested below; dst(1)..dst(5) are scratch, dst(6) is a flag register
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister flagReg = ra->genReg(insn.dst(6));
+    loadTopHalf(a, x); // (a,b) <- halves of x
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y); // (c,d) <- halves of y
+    loadBottomHalf(d, y);
+    if(dst.is_signed_int())
+      p->SHR(e, a, GenRegister::immud(31)); // signed only: e <- a >> 31, taken from x's top word before the subtraction
+    subWithBorrow(b, b, d); // b <- b - d; subWithBorrow overwrites its last operand with acc0 (presumably the borrow), so d now holds it
+    subWithBorrow(a, a, d); // a <- a - low-word borrow; d <- borrow
+    subWithBorrow(a, a, c); // a <- a - top(y); c <- borrow
+    p->ADD(c, c, d); // c <- what was borrowed beyond the top word
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    if(! dst.is_signed_int()) {
+      p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0)); // unsigned: flag <- (c != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(a, GenRegister::immud(0)); // flagged lanes: clamp (a,b) to zero
+      p->MOV(b, GenRegister::immud(0));
+    } else { // NOTE(review): the signed path looks only at e (from x) and the result's top word a; y's sign is never examined -- confirm this is intended
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1)); // flag <- (e == 1)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u)); // predicated compare: a L 0x80000000
+      p->MOV(a, GenRegister::immud(0x80000000u)); // flagged lanes: clamp (a,b) to 0x80000000:0
+      p->MOV(b, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0)); // flag <- (e == 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u)); // predicated compare: a GE 0x80000000
+      p->MOV(a, GenRegister::immud(0x7FFFFFFFu)); // flagged lanes: clamp (a,b) to 0x7FFFFFFF:0xFFFFFFFF
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    }
+    p->pop();
+    storeTopHalf(dst, a); // dst <- (a,b)
+    storeBottomHalf(dst, b);
+  }
+
   void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
     int execWidth = p->curr.execWidth;
     src = src.top_half();
@@ -664,11 +1101,11 @@ namespace gbe
     int execWidth = p->curr.execWidth;
     GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
     p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.execWidth = 8;
     p->ADDC(dest, src0, src1);
     p->MOV(src1, acc0);
     if (execWidth == 16) {
+      p->curr.quarterControl = 1;
       p->ADDC(GenRegister::suboffset(dest, 8),
               GenRegister::suboffset(src0, 8),
               GenRegister::suboffset(src1, 8));
@@ -681,11 +1118,11 @@ namespace gbe
     int execWidth = p->curr.execWidth;
     GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
     p->push();
-    p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.execWidth = 8;
     p->SUBB(dest, src0, src1);
     p->MOV(src1, acc0);
     if (execWidth == 16) {
+      p->curr.quarterControl = 1;
       p->SUBB(GenRegister::suboffset(dest, 8),
               GenRegister::suboffset(src0, 8),
               GenRegister::suboffset(src1, 8));
@@ -740,6 +1177,145 @@ namespace gbe
     storeBottomHalf(dest, a);
   }
 
+  void GenContext::emitI64DIVREMInstruction(const SelectionInstruction &insn) { // 64-bit divide (SEL_OP_I64DIV) or remainder (SEL_OP_I64REM) of src(0) by src(1), emitted as a 64-pass shift/compare/subtract loop on 32-bit words
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1)); // a..m: scratch registers taken from dst(1)..dst(13)
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GenRegister j = ra->genReg(insn.dst(10));
+    GenRegister k = ra->genReg(insn.dst(11));
+    GenRegister l = ra->genReg(insn.dst(12));
+    GenRegister m = ra->genReg(insn.dst(13));
+    GenRegister flagReg = ra->genReg(insn.dst(14)); // flag register selected via useFlag() before each CMP sequence below
+    GenRegister zero = GenRegister::immud(0),
+                one = GenRegister::immud(1),
+                imm31 = GenRegister::immud(31);
+    // (a,b) <- x
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    // (c,d) <- y
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    // k <- sign_of_result
+    if(x.is_signed_int()) {
+      GBE_ASSERT(y.is_signed_int());
+      GBE_ASSERT(dest.is_signed_int());
+      I64ABS(k, a, b, e, flagReg); // NOTE(review): I64ABS is defined elsewhere; presumably turns (a,b) into its magnitude and records x's sign in k, with e as scratch -- confirm
+      I64ABS(l, c, d, e, flagReg);
+      if(insn.opcode == SEL_OP_I64DIV)
+        p->XOR(k, k, l); // quotient only: k <- k XOR l; for the remainder k keeps what I64ABS stored for x
+    }
+    // (e,f) <- 0
+    p->MOV(e, zero);
+    p->MOV(f, zero);
+    // (g,h) <- 2**63
+    p->MOV(g, GenRegister::immud(0x80000000));
+    p->MOV(h, zero);
+    // (i,j) <- 0
+    p->MOV(i, zero);
+    p->MOV(j, zero);
+    // m <- 0
+    p->MOV(m, zero);
+    {
+      uint32_t loop_start = p->n_instruction(); // instruction index of the loop head, used below to compute the backward jump
+      // (c,d,e,f) <- (c,d,e,f) / 2
+      p->SHR(f, f, one);
+      p->SHL(l, e, imm31); // l carries the bit shifted out of the next higher word
+      p->OR(f, f, l);
+      p->SHR(e, e, one);
+      p->SHL(l, d, imm31);
+      p->OR(e, e, l);
+      p->SHR(d, d, one);
+      p->SHL(l, c, imm31);
+      p->OR(d, d, l);
+      p->SHR(c, c, one);
+      // condition <- (c,d)==0 && (a,b)>=(e,f)
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(l, zero); // l is built up as the result of (a,b) >= (e,f)
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_EQ, a, e); // flag <- (a == e)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, b, f); // predicated compare: b GE f on the lanes where a == e
+      p->MOV(l, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_G, a, e); // flag <- (a G e)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(l, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NEQ, l, zero); // flag <- (l != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_EQ, c, zero); // predicated compares add the c == 0 ...
+      p->CMP(GEN_CONDITIONAL_EQ, d, zero); // ... and d == 0 parts of the condition
+      // under condition, (a,b) <- (a,b) - (e,f)
+      p->MOV(l, f); // work on a copy: subWithBorrow overwrites its last operand with acc0 (presumably the borrow)
+      subWithBorrow(b, b, l);
+      subWithBorrow(a, a, l); // a <- a - low-word borrow
+      p->MOV(l, e);
+      subWithBorrow(a, a, l);
+      // under condition, (i,j) <- (i,j) | (g,h)
+      p->OR(i, i, g);
+      p->OR(j, j, h);
+      p->pop();
+      // (g,h) /= 2
+      p->SHR(h, h, one);
+      p->SHL(l, g, imm31);
+      p->OR(h, h, l);
+      p->SHR(g, g, one);
+      // condition: m < 64
+      p->ADD(m, m, one); // m counts the passes made so far
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      // under condition, jump back to start point
+      if (simdWidth == 8)
+        p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H; // presumably "any of the 8 lanes has its flag set" -- confirm against the predicate encoding
+      else if (simdWidth == 16)
+        p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+      else
+        NOT_IMPLEMENTED;
+      p->curr.execWidth = 1; // the jump is emitted with execution width 1 ...
+      p->curr.noMask = 1; // ... and with noMask set
+      int jip = -(int)(p->n_instruction() - loop_start + 1) * 2; // negative offset back to loop_start; NOTE(review): the "+ 1" and "* 2" presumably cover the JMPI itself and the jump-distance unit -- confirm
+      p->JMPI(zero); // emitted with a placeholder target ...
+      p->patchJMPI(p->n_instruction()-1, jip); // ... then patched with the computed offset
+      p->pop();
+      // end of loop
+    }
+    // adjust sign of result
+    if(x.is_signed_int()) {
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NEQ, k, zero); // flag <- (k != 0)
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      if(insn.opcode == SEL_OP_I64DIV)
+        I64Neg(i, j, l); // flagged lanes: negate the quotient (i,j); l is passed as the helper's tmp
+      else
+        I64Neg(a, b, l); // flagged lanes: negate the remainder (a,b)
+      p->pop();
+    }
+    // write dest
+    if(insn.opcode == SEL_OP_I64DIV) {
+      storeTopHalf(dest, i); // quotient lives in (i,j)
+      storeBottomHalf(dest, j);
+    } else {
+      GBE_ASSERT(insn.opcode == SEL_OP_I64REM);
+      storeTopHalf(dest, a); // remainder is what is left in (a,b)
+      storeBottomHalf(dest, b);
+    }
+  }
+
   void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
@@ -841,11 +1417,11 @@ namespace gbe
   }
 
   void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
-    p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
     p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.execWidth = 8;
       p->curr.noMask = 1;
+      p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0)); // the copy of g0 into g112 now runs inside the push/pop scope, with predicate NONE and noMask set
+      p->curr.execWidth = 8; // execWidth is set to 8 only after the MOV, so it applies to EOT alone
       p->EOT(112);
     p->pop();
   }
@@ -916,7 +1492,7 @@ namespace gbe
     const GenRegister data = ra->genReg(insn.src(1));
     const uint32_t bti = insn.extra.function;
     p->MOV(src, addr);
-    p->WRITE64(src, data, bti, elemNum);
+    p->WRITE64(src, data, bti, elemNum, isScalarReg(data.reg()));
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
@@ -941,6 +1517,13 @@ namespace gbe
     p->BYTE_SCATTER(src, bti, elemSize);
   }
 
+  void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) { // forwards a dword gather to the encoder: dst(0) receives the data, src(0) is handed over as the message source
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function; // the binding table index is carried in extra.function
+    p->DWORD_GATHER(dst, src, bti);
+  }
+
   void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
@@ -950,6 +1533,7 @@ namespace gbe
     const GenRegister vcoord = ra->genReg(insn.src(5));
     const GenRegister wcoord = ra->genReg(insn.src(6));
     uint32_t simdWidth = p->curr.execWidth;
+    uint32_t coord_cnt = 2;
     p->push();
     const uint32_t nr = msgPayload.nr;
     // prepare mesg desc and move to a0.0.
@@ -957,9 +1541,11 @@ namespace gbe
     /* Prepare message payload. */
     p->MOV(GenRegister::f8grf(nr , 0), ucoord);
     p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
-    if (insn.src(8).reg() != 0)
+    if (insn.src(6).reg() != 0) {
       p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
-    p->SAMPLE(dst, msgPayload, false, bti, sampler, simdWidth, -1, 0);
+      coord_cnt++;
+    }
+    p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
     p->pop();
   }
 
@@ -1041,7 +1627,7 @@ namespace gbe
         p->curr.quarterControl = GEN_COMPRESSION_Q2;
       QUARTER_MOV0(nr + 1, ucoord);
       QUARTER_MOV0(nr + 2, vcoord);
-      if (insn.src(3 + insn.extra.elem).reg() != 0)
+      if (insn.src(2 + insn.extra.elem).reg() != 0)
         QUARTER_MOV0(nr + 3, wcoord);
       QUARTER_MOV1(nr + 5, R);
       QUARTER_MOV1(nr + 6, G);
@@ -1081,6 +1667,7 @@ namespace gbe
     schedulePostRegAllocation(*this, *this->sel);
     if (OCL_OUTPUT_REG_ALLOC)
       ra->outputAllocation();
+    this->clearFlagRegister();
     this->emitStackPointer();
     this->emitInstructionStream();
     this->patchBranches();
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8b481d0..10e0603 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -61,6 +61,7 @@ namespace gbe
     INLINE const ir::Function &getFunction(void) const { return fn; }
     /*! Simd width chosen for the current function */
     INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
+    void clearFlagRegister(void);
     /*! Emit the per-lane stack pointer computation */
     void emitStackPointer(void);
     /*! Emit the instructions */
@@ -85,8 +86,13 @@ namespace gbe
 
     void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
     void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+    void I64Neg(GenRegister high, GenRegister low, GenRegister tmp);
+    void I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg);
+    void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
     void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+    void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
     void saveFlag(GenRegister dest, int flag, int subFlag);
+    void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp);
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
@@ -95,8 +101,15 @@ namespace gbe
     void emitBinaryInstruction(const SelectionInstruction &insn);
     void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
     void emitTernaryInstruction(const SelectionInstruction &insn);
+    void emitI64MULHIInstruction(const SelectionInstruction &insn);
+    void emitI64MADSATInstruction(const SelectionInstruction &insn);
+    void emitI64HADDInstruction(const SelectionInstruction &insn);
+    void emitI64RHADDInstruction(const SelectionInstruction &insn);
     void emitI64ShiftInstruction(const SelectionInstruction &insn);
     void emitI64CompareInstruction(const SelectionInstruction &insn);
+    void emitI64SATADDInstruction(const SelectionInstruction &insn);
+    void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+    void emitI64ToFloatInstruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
     void emitIndirectMoveInstruction(const SelectionInstruction &insn);
@@ -113,12 +126,14 @@ namespace gbe
     void emitAtomicInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
     void emitByteScatterInstruction(const SelectionInstruction &insn);
+    void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
     void emitI64MULInstruction(const SelectionInstruction &insn);
+    void emitI64DIVREMInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
 
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index e3959ff..27ce58c 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -343,6 +343,10 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER_DWORD  2
 #define GEN_BYTE_SCATTER_QWORD  3
 
+/* dword scattered rw */
+#define GEN_DWORD_SCATTER_8_DWORDS   2
+#define GEN_DWORD_SCATTER_16_DWORDS  3
+
 #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
 #define GEN_SAMPLER_RETURN_FORMAT_SINT32      3
@@ -805,6 +809,21 @@ struct GenInstruction
       uint32_t end_of_thread:1;
     } gen7_oblock_rw;
 
+    /*! Data port dword scatter / gather */
+    struct {
+      uint32_t bti:8; // binding table index
+      uint32_t block_size:2; // filled with GEN_DWORD_SCATTER_8_DWORDS or GEN_DWORD_SCATTER_16_DWORDS by DWORD_GATHER
+      uint32_t ignored0:3;
+      uint32_t invalidate_after_read:1;
+      uint32_t msg_type:4;
+      uint32_t ignored1:1;
+      uint32_t header_present:1;
+      uint32_t response_length:5;
+      uint32_t msg_length:4;
+      uint32_t pad2:2;
+      uint32_t end_of_thread:1;
+    } gen7_dword_rw; // the field widths above add up to 32 bits
+
     /*! Data port typed read / write messages */
     struct {
       uint32_t bti:8;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 1a459e1..b0cc931 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -198,7 +198,21 @@ namespace gbe
      insn->bits3.gen7_typed_rw.bti = bti;
      insn->bits3.gen7_typed_rw.msg_type = msg_type;
   }
-
+  static void setDWordScatterMessgae(GenEncoder *p, // fills the message descriptor of a dword scatter/gather SEND; NOTE(review): "Messgae" is a typo for "Message", and a rename must also update the call in DWORD_GATHER
+                                     GenInstruction *insn,
+                                     uint32_t bti,
+                                     uint32_t block_size,
+                                     uint32_t msg_type,
+                                     uint32_t msg_length,
+                                     uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN6_SFID_DATAPORT_CONSTANT_CACHE; // the message is always addressed to the constant cache data port
+    setMessageDescriptor(p, insn, sfid, msg_length, response_length); // common descriptor fields first ...
+    insn->bits3.gen7_dword_rw.msg_type = msg_type; // ... then the dword-specific bit fields
+    insn->bits3.gen7_dword_rw.bti = bti;
+    insn->bits3.gen7_dword_rw.block_size = block_size;
+    insn->bits3.gen7_dword_rw.invalidate_after_read = 0; // always cleared here
+  }
   //////////////////////////////////////////////////////////////////////////
   // Gen Emitter encoding class
   //////////////////////////////////////////////////////////////////////////
@@ -315,6 +329,7 @@ namespace gbe
 
   void GenEncoder::setSrc1(GenInstruction *insn, GenRegister reg) {
      assert(reg.nr < 128);
+     assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
 
      insn->bits1.da1.src1_reg_file = reg.file;
      insn->bits1.da1.src1_reg_type = reg.type;
@@ -390,8 +405,9 @@ namespace gbe
     pop();
   }
 
-  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
     GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+    GenRegister unpacked;
     msg = GenRegister::retype(msg, GEN_TYPE_UD);
     int originSimdWidth = curr.execWidth;
     int originPredicate = curr.predicate;
@@ -401,9 +417,19 @@ namespace gbe
       curr.predicate = GEN_PREDICATE_NONE;
       curr.noMask = GEN_MASK_DISABLE;
       curr.execWidth = 8;
-      MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+      if (is_scalar) {
+        unpacked = data32;
+        unpacked.subnr += half * 4;
+      } else
+        unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
+      MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
       if (originSimdWidth == 16) {
-        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+        if (is_scalar) {
+          unpacked = data32;
+          unpacked.subnr += half * 4;
+        } else
+          unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
+        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
         curr.execWidth = 16;
       }
       if (half == 1)
@@ -518,6 +544,36 @@ namespace gbe
                            response_length);
   }
 
+  void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) { // Emits a SEND with a GEN_DWORD_GATHER message: src is source 0, dst the destination
+    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    uint32_t block_size = 0;
+    if (this->curr.execWidth == 8) { // execution width 8: lengths of 1 and the 8-dword block size
+      msg_length = 1;
+      response_length = 1;
+      block_size = GEN_DWORD_SCATTER_8_DWORDS;
+    } else if (this->curr.execWidth == 16) { // execution width 16: lengths of 2 and the 16-dword block size
+      msg_length = 2;
+      response_length = 2;
+      block_size = GEN_DWORD_SCATTER_16_DWORDS;
+    } else
+      NOT_IMPLEMENTED; // any other execution width is not handled
+
+    this->setHeader(insn);
+    this->setDst(insn, dst);
+    this->setSrc0(insn, src);
+    this->setSrc1(insn, GenRegister::immud(0)); // source 1 is a zero unsigned immediate
+    setDWordScatterMessgae(this, // fill the descriptor through the file-local helper of that name
+                           insn,
+                           bti,
+                           block_size,
+                           GEN_DWORD_GATHER,
+                           msg_length,
+                           response_length);
+
+  }
+
   void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     uint32_t msg_length = 0;
@@ -1123,15 +1179,16 @@ namespace gbe
                           bool header_present,
                           unsigned char bti,
                           unsigned char sampler,
+                          unsigned int coord_cnt,
                           uint32_t simdWidth,
                           uint32_t writemask,
                           uint32_t return_format)
   {
      if (writemask == 0) return;
-     uint32_t msg_type = (simdWidth == 16) ?
+     uint32_t msg_type =  (simdWidth == 16) ?
                             GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
      uint32_t response_length = (4 * (simdWidth / 8));
-     uint32_t msg_length = (2 * (simdWidth / 8));
+     uint32_t msg_length = (coord_cnt * (simdWidth / 8));
      if (header_present)
        msg_length++;
      uint32_t simd_mode = (simdWidth == 16) ?
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index bbf240c..d518c4a 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -147,7 +147,7 @@ namespace gbe
     /*! Read 64-bits float/int arrays */
     void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Write 64-bits float/int arrays */
-    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum);
+    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
@@ -156,6 +156,8 @@ namespace gbe
     void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
     void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    /*! DWord gather (for constant cache read) */
+    void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
     /*! for scratch memory read */
     void SCRATCH_READ(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
     /*! for scratch memory write */
@@ -166,6 +168,7 @@ namespace gbe
                 bool header_present,
                 unsigned char bti,
                 unsigned char sampler,
+                unsigned int coord_cnt,
                 unsigned int simdWidth,
                 uint32_t writemask,
                 uint32_t return_format);
@@ -190,6 +193,7 @@ namespace gbe
     void setSrc0(GenInstruction *insn, GenRegister reg);
     void setSrc1(GenInstruction *insn, GenRegister reg);
     GenInstruction *next(uint32_t opcode);
+    uint32_t n_instruction(void) const { return store.size(); } //!< Current number of elements in store
     GBE_CLASS(GenEncoder); //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 2204837..b33112c 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -6,8 +6,14 @@ DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
 DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        4,        2)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(I64Shift,        20,        4,        2)
+DECL_GEN7_SCHEDULE(I64HADD,         20,        4,        2)
+DECL_GEN7_SCHEDULE(I64RHADD,        20,        4,        2)
+DECL_GEN7_SCHEDULE(I64ToFloat,      20,        4,        2)
+DECL_GEN7_SCHEDULE(I64MULHI,        20,        4,        2)
+DECL_GEN7_SCHEDULE(I64MADSAT,       20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
 DECL_GEN7_SCHEDULE(I64Compare,      20,        4,        2)
+DECL_GEN7_SCHEDULE(I64DIVREM,       20,        4,        2)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
 DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
@@ -22,6 +28,7 @@ DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
+DECL_GEN7_SCHEDULE(DWordGather,     80,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
@@ -29,3 +36,5 @@ DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
 DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
 DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
+DECL_GEN7_SCHEDULE(I64SATADD,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64SATSUB,       20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 8e4cd8f..bd52885 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -419,7 +419,7 @@ namespace gbe
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
 #define I64Shift(OP) \
-  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
+  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
     ALU1(MOV)
     ALU1WithTemp(MOV_DF)
     ALU1WithTemp(LOAD_DF_IMM)
@@ -459,6 +459,7 @@ namespace gbe
     ALU2(UPSAMPLE_INT)
     ALU2(UPSAMPLE_LONG)
     ALU1WithTemp(CONVI_TO_I64)
+    ALU1(CONVI64_TO_I)
     I64Shift(I64SHL)
     I64Shift(I64SHR)
     I64Shift(I64ASR)
@@ -468,10 +469,24 @@ namespace gbe
 #undef ALU2WithTemp
 #undef ALU3
 #undef I64Shift
+    /*! Convert 64-bit integer to 32-bit float */
+    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]);
+    /*! Saturated 64bit x*y + z */
+    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]);
+    /*! High 64bit of x*y */
+    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]);
+    /*! (x+y)>>1 without mod. overflow */
+    void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+    /*! (x+y+1)>>1 without mod. overflow */
+    void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
     /*! Shift a 64-bit integer */
-    void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
     /*! Compare 64-bit integer */
     void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
+    /*! Saturated addition of 64-bit integer */
+    void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    /*! Saturated subtraction of 64-bit integer */
+    void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Encode a barrier instruction */
     void BARRIER(GenRegister src);
     /*! Encode a barrier instruction */
@@ -506,6 +521,8 @@ namespace gbe
     void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+    /*! DWord gather (for constant cache read) */
+    void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -528,6 +545,10 @@ namespace gbe
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Multiply 64-bit integers */
     void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    /*! 64-bit integer division */
+    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
+    /*! 64-bit integer remainder of division */
+    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]);
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
@@ -993,6 +1014,14 @@ namespace gbe
     vector->reg = &insn->src(0);
   }
 
+  void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) { // Appends a SEL_OP_DWORD_GATHER instruction with 1 destination and 1 source
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
+
+    insn->src(0) = addr;
+    insn->dst(0) = dst;
+    insn->extra.function = bti; // bti is carried in the extra.function field
+  }
+
   void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
     insn->dst(0) = dst;
@@ -1017,6 +1046,24 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
+  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) { // Appends SEL_OP_I64DIV with 15 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 15, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 14; i++) // tmp[0..13] are registered as destinations 1..14
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[14]) { // Appends SEL_OP_I64REM with 15 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 15, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 14; i++) // tmp[0..13] are registered as destinations 1..14
+      insn->dst(i + 1) = tmp[i];
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
@@ -1062,12 +1109,75 @@ namespace gbe
     insn->extra.function = conditional;
   }
 
-  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
-    SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+  void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { // Appends SEL_OP_I64SATADD with 7 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 7, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
+    for(int i=0; i<6; i++) // tmp[0..5] are registered as destinations 1..6
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { // Appends SEL_OP_I64SATSUB with 7 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 7, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i=0; i<6; i++) // tmp[0..5] are registered as destinations 1..6
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]) { // Appends SEL_OP_CONVI64_TO_F with 5 destinations and 1 source
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 5, 1);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src;
+    for(int i = 0; i < 4; i ++) // tmp[0..3] are registered as destinations 1..4
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]) { // Appends SEL_OP_I64MADSAT with 11 destinations and 3 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 11, 3);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->src(2) = src2;
+    for(int i = 0; i < 10; i ++) // tmp[0..9] are registered as destinations 1..10
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[10]) { // Appends SEL_OP_I64_MUL_HI with 11 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 11, 2);
     insn->dst(0) = dst;
     insn->src(0) = src0;
     insn->src(1) = src1;
-    for(int i = 0; i < 6; i ++)
+    for(int i = 0; i < 10; i ++) // tmp[0..9] are registered as destinations 1..10
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) { // Appends SEL_OP_I64HADD with 5 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, 5, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 4; i ++) // tmp[0..3] are registered as destinations 1..4
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) { // Appends SEL_OP_I64RHADD with 5 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, 5, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 4; i ++) // tmp[0..3] are registered as destinations 1..4
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]) { // Appends the given shift opcode with 8 destinations and 2 sources
+    SelectionInstruction *insn = this->appendInsn(opcode, 8, 2);
+    insn->dst(0) = dst; // destination 0 is dst itself
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 7; i ++) // tmp[0..6] are registered as destinations 1..7
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1384,16 +1494,21 @@ namespace gbe
   /*! Unary instruction patterns */
   DECL_PATTERN(UnaryInstruction)
   {
-    static ir::Type getType(const ir::Opcode opcode) {
+    static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) { // Picks the ir::Type used to select dst/src registers for a unary instruction
+      if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
+        return insnType; // 64-bit and 8-bit integer types are returned unchanged, before the opcode is looked at
       if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
         return ir::TYPE_U32;
+      if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
+        return insnType; // 16-bit integer types are returned unchanged unless the opcode is FBH/FBL
       return ir::TYPE_FLOAT;
     }
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn) const {
       const ir::Opcode opcode = insn.getOpcode();
-      const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode));
-      const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode));
+      const ir::Type insnType = insn.getType();
+      const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType));
+      const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType));
       switch (opcode) {
         case ir::OP_ABS:
           if (insn.getType() == ir::TYPE_S32) {
@@ -1484,8 +1599,17 @@ namespace gbe
       } else if(type == TYPE_FLOAT) {
         GBE_ASSERT(op != OP_REM);
         sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
-      } else {
-        NOT_IMPLEMENTED;
+      } else if (type == TYPE_S64 || type == TYPE_U64) {
+        GenRegister tmp[14];
+        for(int i=0; i<13; i++) {
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          tmp[i].type = GEN_TYPE_UD;
+        }
+        tmp[13] = sel.selReg(sel.reg(FAMILY_BOOL));
+        if(op == OP_DIV)
+          sel.I64DIV(dst, src0, src1, tmp);
+        else
+          sel.I64REM(dst, src0, src1, tmp);
       }
       markAllChildren(dag);
       return true;
@@ -1561,6 +1685,16 @@ namespace gbe
             sel.ADD(dst, src0, src1);
           break;
         case OP_ADDSAT:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister tmp[6];
+            for(int i=0; i<5; i++) {
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              tmp[i].type = GEN_TYPE_UD;
+            }
+            tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
+            sel.I64SATADD(dst, src0, src1, tmp);
+            break;
+          }
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
             sel.ADD(dst, src0, src1);
@@ -1592,6 +1726,16 @@ namespace gbe
             sel.ADD(dst, src0, GenRegister::negate(src1));
           break;
         case OP_SUBSAT:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister tmp[6];
+            for(int i=0; i<5; i++) {
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              tmp[i].type = GEN_TYPE_UD;
+            }
+            tmp[5] = sel.selReg(sel.reg(FAMILY_BOOL));
+            sel.I64SATSUB(dst, src0, src1, tmp);
+            break;
+          }
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
             sel.ADD(dst, src0, GenRegister::negate(src1));
@@ -1599,27 +1743,30 @@ namespace gbe
           break;
         case OP_SHL:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[6];
+            GenRegister tmp[7];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
             sel.I64SHL(dst, src0, src1, tmp);
           } else
             sel.SHL(dst, src0, src1);
           break;
         case OP_SHR:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[6];
+            GenRegister tmp[7];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
             sel.I64SHR(dst, src0, src1, tmp);
           } else
             sel.SHR(dst, src0, src1);
           break;
         case OP_ASR:
           if (type == TYPE_S64 || type == TYPE_U64) {
-            GenRegister tmp[6];
+            GenRegister tmp[7];
             for(int i = 0; i < 6; i ++)
               tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL));
             sel.I64ASR(dst, src0, src1, tmp);
           } else
             sel.ASR(dst, src0, src1);
@@ -1629,6 +1776,17 @@ namespace gbe
             sel.MUL_HI(dst, src0, src1, temp);
             break;
           }
+        case OP_I64_MUL_HI:
+         {
+          GenRegister temp[10];
+          for(int i=0; i<9; i++) {
+            temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            temp[i].type = GEN_TYPE_UD;
+          }
+          temp[9] = sel.selReg(sel.reg(FAMILY_BOOL));
+          sel.I64_MUL_HI(dst, src0, src1, temp);
+          break;
+         }
         case OP_MUL:
           if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
@@ -1651,6 +1809,22 @@ namespace gbe
             sel.RHADD(dst, src0, src1, temp);
             break;
           }
+        case OP_I64HADD:
+         {
+          GenRegister tmp[4];
+          for(int i=0; i<4; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.I64HADD(dst, src0, src1, tmp);
+          break;
+         }
+        case OP_I64RHADD:
+         {
+          GenRegister tmp[4];
+          for(int i=0; i<4; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.I64RHADD(dst, src0, src1, tmp);
+          break;
+         }
         case OP_UPSAMPLE_SHORT:
           sel.UPSAMPLE_SHORT(dst, src0, src1);
           break;
@@ -1737,6 +1911,9 @@ namespace gbe
       SelectionDAG *cmp = dag.child[0];
       const SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
 
+      if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not support
+        return false;
+
       // Not in this block
       if (cmp == NULL) return false;
 
@@ -2085,6 +2262,23 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
+    void emitDWordGather(Selection::Opaque &sel, // Emits a single-value load as SHR (address conversion) followed by DWORD_GATHER
+                         const ir::LoadInstruction &insn, // load being selected; must have exactly one value
+                         GenRegister addr, // address register, retyped to UD before the shift
+                         uint32_t bti) const // forwarded unchanged to sel.DWORD_GATHER
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GBE_ASSERT(valueNum == 1); // only one loaded value is supported here
+      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F); // destination is viewed as GEN_TYPE_F
+      // get dword based address
+      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD)); // fresh DWORD-family register for the converted address
+      sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2)); // addrDW = addr >> 2
+
+      sel.DWORD_GATHER(dst, addrDW, bti);
+    }
+
     void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
@@ -2155,8 +2349,19 @@ namespace gbe
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.getAddressSpace() == MEM_CONSTANT)
-        this->emitIndirectMove(sel, insn, address);
+      if (insn.getAddressSpace() == MEM_CONSTANT) {
+        // XXX TODO read 64bit constant through constant cache
+        // Per HW Spec, constant cache messages read data in units of at least one
+        // DWORD, so byte/short data types have to be read through the data cache.
+        if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, 0x2);
+        else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitDWordGather(sel, insn, address, 0x2);
+        else {
+          const GenRegister value = sel.selReg(insn.getValue(0));
+          this->emitByteGather(sel, insn, elemSize, address, value, 0x2);
+        }
+      }
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
         this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
@@ -2334,7 +2539,7 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
+      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -2345,8 +2550,24 @@ namespace gbe
           unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
           unpacked = GenRegister::retype(unpacked, type);
         }
-        sel.MOV(unpacked, src);
+        if(srcFamily == FAMILY_QWORD) {
+          GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+          tmp.type = GEN_TYPE_D;
+          sel.CONVI64_TO_I(tmp, src);
+          sel.MOV(unpacked, tmp);
+        } else
+          sel.MOV(unpacked, src);
         sel.MOV(dst, unpacked);
+      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
+        sel.CONVI64_TO_I(dst, src);
+      } else if (dstType == ir::TYPE_FLOAT && srcFamily == FAMILY_QWORD) {
+        GenRegister tmp[4];
+        for(int i=0; i<3; i++) {
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          tmp[i].type = GEN_TYPE_UD;
+        }
+        tmp[3] = sel.selReg(sel.reg(FAMILY_BOOL));
+        sel.CONVI64_TO_F(dst, src, tmp);
       } else if (dst.isdf()) {
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r));
@@ -2450,6 +2671,36 @@ namespace gbe
     }
   };
 
+  DECL_PATTERN(TernaryInstruction) // Selection pattern for three-source IR instructions; only OP_I64MADSAT is handled
+   {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn) const {
+      using namespace ir;
+      const Type type = insn.getType(); // the same type is used for the destination and all three sources
+      const GenRegister dst = sel.selReg(insn.getDst(0), type),
+                        src0 = sel.selReg(insn.getSrc(0), type),
+                        src1 = sel.selReg(insn.getSrc(1), type),
+                        src2 = sel.selReg(insn.getSrc(2), type);
+      switch(insn.getOpcode()) {
+        case OP_I64MADSAT:
+         {
+          GenRegister tmp[10]; // temporaries handed to sel.I64MADSAT
+          for(int i=0; i<9; i++) { // tmp[0..8]: DWORD-family registers typed as UD
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp[i].type = GEN_TYPE_UD;
+          }
+          tmp[9] = sel.selReg(sel.reg(FAMILY_BOOL)); // the last temporary is a BOOL-family register
+          sel.I64MADSAT(dst, src0, src1, src2, tmp);
+          break;
+         }
+        default:
+          NOT_IMPLEMENTED; // every other ternary opcode is unsupported
+      }
+      return true;
+    }
+
+    DECL_CTOR(TernaryInstruction, 1, 1);
+   };
+
   /*! Label instruction pattern */
   DECL_PATTERN(LabelInstruction)
   {
@@ -2504,6 +2755,13 @@ namespace gbe
       using namespace ir;
       GenRegister msgPayloads[4];
       GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
+      uint32_t srcNum = insn.getSrcNum();
+      uint32_t samplerOffset = 0;
+      if (srcNum == 6) {
+      /* We have the clamp border workaround. */
+        samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
+        srcNum--;
+      }
 
       for( int i = 0; i < 4; ++i)
         msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -2511,15 +2769,15 @@ namespace gbe
       for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+      for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
         src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
 
       uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
                        (insn.getSrc(SampleInstruction::SURFACE_BTI));
       uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
-                           (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+                           (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
+      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -2542,7 +2800,7 @@ namespace gbe
         msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
 
       // u, v, w coords should use coord type.
-      for (; valueID < 1 + coordNum; ++valueID)
+      for (; valueID < coordNum; ++valueID)
         src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
 
       for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
@@ -2562,19 +2820,32 @@ namespace gbe
     INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
     {
       using namespace ir;
-      const uint32_t infoType = insn.getInfoType();
-      GenRegister dst[4];
-      uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(infoType);
-      for (uint32_t valueID = 0; valueID < dstNum; ++valueID)
-        dst[valueID] = sel.selReg(insn.getDst(valueID), TYPE_U32);
-      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
-                       (insn.getSrc(0));
-      sel.GET_IMAGE_INFO(infoType, dst, dstNum, bti);
+      GenRegister dst; // a single destination replaces the former dst[4] array
+      dst = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenRegister imageInfoReg = GenRegister::ud1grf(insn.getSrc(0)); // register built directly from source 0
+      sel.MOV(dst, imageInfoReg); // a plain MOV replaces the former sel.GET_IMAGE_INFO call
+
       return true;
     }
     DECL_CTOR(GetImageInfoInstruction, 1, 1);
   };
 
+  /*! Get sampler info instruction pattern: emits one MOV from the ocl::samplerinfo register. */
+  DECL_PATTERN(GetSamplerInfoInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetSamplerInfoInstruction &insn) const
+    {
+      using namespace ir;
+      GenRegister dst, src;
+      dst = sel.selReg(insn.getDst(0), TYPE_U16); // destination is selected as a U16 value
+      src = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0)) * 2); // last argument is 2 * sampler index; presumably a byte offset per UW entry -- TODO confirm
+      src.subphysical = 1; // subphysical is set on the source before the MOV
+      sel.MOV(dst, src);
+      return true;
+    }
+    DECL_CTOR(GetSamplerInfoInstruction, 1, 1);
+  };
+
   /*! Branch instruction pattern */
   DECL_PATTERN(BranchInstruction)
   {
@@ -2615,6 +2886,9 @@ namespace gbe
           sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
 
           // Branch to the jump target
+          // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
+          // work correctly, as flag register bits mapped to non-active lanes tend
+          // to be zero.
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
           else if (simdWidth == 16)
@@ -2737,6 +3011,7 @@ namespace gbe
     this->insert<CompareInstructionPattern>();
     this->insert<ConvertInstructionPattern>();
     this->insert<AtomicInstructionPattern>();
+    this->insert<TernaryInstructionPattern>();
     this->insert<LabelInstructionPattern>();
     this->insert<BranchInstructionPattern>();
     this->insert<Int32x32MulInstructionPattern>();
@@ -2745,6 +3020,7 @@ namespace gbe
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
+    this->insert<GetSamplerInfoInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 79b73e2..2422b2b 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -86,8 +86,8 @@ namespace gbe
     const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
     /*! No more than 17 sources (used by typed writes on simd8 mode.) */
     enum { MAX_SRC_NUM = 17 };
-    /*! No more than 4 destinations (used by samples and untyped reads) */
-    enum { MAX_DST_NUM = 4 };
+    /*! No more than 11 destinations (used by samples and untyped reads). NOTE(review): I64DIV/I64REM request 15 destinations through appendInsn -- confirm this limit */
+    enum { MAX_DST_NUM = 11 };
     /*! State of the instruction (extra fields neeed for the encoding) */
     GenInstructionState state;
     union {
@@ -115,7 +115,7 @@ namespace gbe
     /*! Gen opcode */
     uint8_t opcode;
     /*! Number of destinations */
-    uint8_t dstNum:3;
+    uint8_t dstNum:4;
     /*! Number of sources */
     uint8_t srcNum:5;
     /*! To store various indices */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 32c7a05..21b0a43 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -28,9 +28,13 @@ DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
 DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
 DECL_SELECTION_IR(ADD, BinaryInstruction)
 DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATADD, I64SATADDInstruction)
 DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATSUB, I64SATSUBInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
 DECL_SELECTION_IR(I64MUL, I64MULInstruction)
+DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
+DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
 DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
 DECL_SELECTION_IR(CMP, CompareInstruction)
@@ -51,17 +55,24 @@ DECL_SELECTION_IR(READ64, Read64Instruction)
 DECL_SELECTION_IR(WRITE64, Write64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
+DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
 DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
 DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
 DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
 DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
+DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
 DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
+DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
+DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
+DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 3d7bedd..33f07b2 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -27,12 +27,16 @@
 #include "backend/gen_program.hpp"
 #include "backend/gen_context.hpp"
 #include "backend/gen_defs.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
 #include "backend/gen_reg_allocation.hpp"
 #include "ir/unit.hpp"
 #include "llvm/llvm_to_gen.hpp"
 
 #include <cstring>
+#include <sstream>
 #include <memory>
+#include <iostream>
+#include <fstream>
 
 namespace gbe {
 
@@ -41,8 +45,32 @@ namespace gbe {
   {}
   GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
   const char *GenKernel::getCode(void) const { return (const char*) insns; }
+  const void GenKernel::setCode(const char * ins, size_t size) {
+    insns = (GenInstruction *)ins;
+    insnNum = size / sizeof(GenInstruction);
+  }
   size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
 
+  void GenKernel::printStatus(int indent, std::ostream& outs) {
+    Kernel::printStatus(indent, outs);
+
+    FILE *f = fopen("/dev/null", "w");
+    char *buf = new char[4096];
+    setbuffer(f, buf, 4096);
+
+    for (uint32_t i = 0; i < insnNum; i++) {
+      gen_disasm(f, insns+i);
+      outs << buf;
+      fflush(f);
+      setbuffer(f, NULL, 0);
+      setbuffer(f, buf, 4096);
+    }
+
+    setbuffer(f, NULL, 0);
+    delete [] buf;
+    fclose(f);
+  }
+
   GenProgram::GenProgram(void) {}
   GenProgram::~GenProgram(void) {}
 
@@ -88,8 +116,19 @@ namespace gbe {
   }
 
   static gbe_program genProgramNewFromBinary(const char *binary, size_t size) {
-    NOT_IMPLEMENTED;
-    return NULL;
+    using namespace gbe;
+    std::string binary_content;
+    binary_content.assign(binary, size);
+    GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
+    std::istringstream ifs(binary_content, std::ostringstream::binary);
+
+    if (!program->deserializeFromBin(ifs)) {
+      delete program;
+      return NULL;
+    }
+
+    //program->printStatus(0, std::cout);
+    return reinterpret_cast<gbe_program>(program);
   }
 
   static gbe_program genProgramNewFromLLVM(const char *fileName,
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 68b0427..f78e324 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -42,8 +42,12 @@ namespace gbe
     virtual ~GenKernel(void);
     /*! Implements base class */
     virtual const char *getCode(void) const;
-    /*! Implements base class */
+    /*! Implements base class: set the instruction stream */
+    virtual const void setCode(const char *, size_t size);
+    /*! Implements base class: get the code size */
     virtual size_t getCodeSize(void) const;
+    /*! Print the kernel status followed by its disassembled instructions */
+    virtual void printStatus(int indent, std::ostream& outs);
     GenInstruction *insns; //!< Instruction stream
     uint32_t insnNum;      //!< Number of instructions
     GBE_CLASS(GenKernel);  //!< Use custom allocators
@@ -59,6 +63,10 @@ namespace gbe
     virtual ~GenProgram(void);
     /*! Implements base class */
     virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name);
+    /*! Allocate an empty kernel. */
+    virtual Kernel *allocateKernel(const std::string &name) {
+      return GBE_NEW(GenKernel, name);
+    }
     /*! Use custom allocators */
     GBE_CLASS(GenProgram);
   };
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index a765917..ab8b7ee 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -65,8 +65,10 @@ namespace gbe
     void allocateFlags(Selection &selection);
     /*! Allocate the GRF registers */
     bool allocateGRFs(Selection &selection);
+    /*! Create gen registers for all preallocated curbe registers. */
+    void allocatePayloadRegs(void);
     /*! Create a Gen register from a register set in the payload */
-    void allocatePayloadReg(gbe_curbe_type, ir::Register, uint32_t subValue = 0, uint32_t subOffset = 0);
+    void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
     /*! Create the intervals for each register */
     /*! Allocate the vectors detected in the instruction selection pass */
     void allocateVector(Selection &selection);
@@ -124,19 +126,38 @@ namespace gbe
   GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
   GenRegAllocator::Opaque::~Opaque(void) {}
 
-  void GenRegAllocator::Opaque::allocatePayloadReg(gbe_curbe_type value,
-                                                   ir::Register reg,
-                                                   uint32_t subValue,
+  void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
+                                                   uint32_t offset,
                                                    uint32_t subOffset)
   {
     using namespace ir;
-    const Kernel *kernel = ctx.getKernel();
-    const int32_t curbeOffset = kernel->getCurbeOffset(value, subValue);
-    if (curbeOffset >= 0) {
-      const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset;
-      RA.insert(std::make_pair(reg, offset));
-      this->intervals[reg].minID = 0;
-      this->intervals[reg].maxID = 0;
+    assert(offset >= GEN_REG_SIZE);
+    offset += subOffset;
+    RA.insert(std::make_pair(reg, offset));
+    GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+    this->intervals[reg].minID = 0;
+    this->intervals[reg].maxID = 0;
+  }
+
+  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+    using namespace ir;
+    for(auto &it : this->ctx.curbeRegs)
+      if (it.first.value() < 0x8000)
+        allocatePayloadReg(it.first, it.second);
+
+    // Allocate all pushed registers (i.e. structure kernel arguments)
+    const Function &fn = ctx.getFunction();
+    GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (const auto &pushed : pushMap) {
+      const uint32_t argID = pushed.second.argID;
+      const FunctionArgument arg = fn.getArg(argID);
+
+      const uint32_t subOffset = pushed.second.offset;
+      const Register reg = pushed.second.getRegister();
+      auto it = this->ctx.curbeRegs.find(arg.reg);
+      assert(it != ctx.curbeRegs.end());
+      allocatePayloadReg(reg, it->second, subOffset);
     }
   }
 
@@ -535,11 +556,9 @@ namespace gbe
     }
     return true;
   }
+
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
-    const Kernel *kernel = ctx.getKernel();
-    const Function &fn = ctx.getFunction();
-    GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
     if (ctx.getSimdWidth() == 8) {
       reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
       reservedReg /= GEN_REG_SIZE;
@@ -555,25 +574,7 @@ namespace gbe
       this->intervals.push_back(ir::Register(regID));
 
     // Allocate the special registers (only those which are actually used)
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_X, ocl::lid0);
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_Y, ocl::lid1);
-    allocatePayloadReg(GBE_CURBE_LOCAL_ID_Z, ocl::lid2);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_X, ocl::lsize0);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Y, ocl::lsize1);
-    allocatePayloadReg(GBE_CURBE_LOCAL_SIZE_Z, ocl::lsize2);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_X, ocl::gsize0);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Y, ocl::gsize1);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_SIZE_Z, ocl::gsize2);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
-    allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
-    allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
-    allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr);
-    allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn);
-    allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst);
+    this->allocatePayloadRegs();
 
     // Group and barrier IDs are always allocated by the hardware in r0
     RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
@@ -582,33 +583,6 @@ namespace gbe
     RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
 
     // block IP used to handle the mask in SW is always allocated
-    const int32_t blockIPOffset = GEN_REG_SIZE + kernel->getCurbeOffset(GBE_CURBE_BLOCK_IP,0);
-    GBE_ASSERT(blockIPOffset >= 0 && blockIPOffset % GEN_REG_SIZE == 0);
-    RA.insert(std::make_pair(ocl::blockip, blockIPOffset));
-    this->intervals[ocl::blockip].minID = 0;
-
-    // Allocate all (non-structure) argument parameters
-    const uint32_t argNum = fn.argNum();
-    for (uint32_t argID = 0; argID < argNum; ++argID) {
-      const FunctionArgument &arg = fn.getArg(argID);
-      GBE_ASSERT(arg.type == FunctionArgument::GLOBAL_POINTER ||
-                 arg.type == FunctionArgument::CONSTANT_POINTER ||
-                 arg.type == FunctionArgument::LOCAL_POINTER ||
-                 arg.type == FunctionArgument::VALUE ||
-                 arg.type == FunctionArgument::STRUCTURE ||
-                 arg.type == FunctionArgument::IMAGE ||
-                 arg.type == FunctionArgument::SAMPLER);
-      allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, arg.reg, argID);
-    }
-
-    // Allocate all pushed registers (i.e. structure kernel arguments)
-    const Function::PushMap &pushMap = fn.getPushMap();
-    for (const auto &pushed : pushMap) {
-      const uint32_t argID = pushed.second.argID;
-      const uint32_t subOffset = pushed.second.offset;
-      const Register reg = pushed.second.getRegister();
-      allocatePayloadReg(GBE_CURBE_KERNEL_ARGUMENT, reg, argID, subOffset);
-    }
 
     // Compute the intervals
     int32_t insnID = 0;
@@ -645,7 +619,8 @@ namespace gbe
 
         // Flag registers can only go to src[0]
         const SelectionOpcode opcode = SelectionOpcode(insn.opcode);
-        if (opcode == SEL_OP_AND || opcode == SEL_OP_OR) {
+        if (opcode == SEL_OP_AND || opcode == SEL_OP_OR || opcode == SEL_OP_XOR
+            || opcode == SEL_OP_I64AND || opcode == SEL_OP_I64OR || opcode == SEL_OP_I64XOR) {
           if (insn.src(1).physical == 0) {
             const ir::Register reg = insn.src(1).reg();
             if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL)
@@ -753,7 +728,8 @@ namespace gbe
       }
       GBE_ASSERT(RA.contains(reg.reg()) != false);
       const uint32_t grfOffset = RA.find(reg.reg())->second;
-      const GenRegister dst = setGenReg(reg, grfOffset);
+      const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+      const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
       if (reg.quarter != 0)
         return GenRegister::Qn(dst, reg.quarter);
       else
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index ea1bc06..538f16c 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -158,6 +158,11 @@ namespace gbe
           NOT_IMPLEMENTED;
       }
     }
+    void useFlag(int nr, int subnr) {
+      flag = nr;
+      subFlag = subnr;
+      physicalFlag = 1;
+    }
   };
 
   /*! This is a book-keeping structure used to encode both virtual and physical
@@ -230,6 +235,7 @@ namespace gbe
     uint32_t nr:8;         //!< Just for some physical registers (acc, null)
     uint32_t subnr:8;      //!< Idem
     uint32_t physical:1;   //!< 1 if physical, 0 otherwise
+    uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
     uint32_t type:4;       //!< Gen type
     uint32_t file:2;       //!< Register file
     uint32_t negation:1;   //!< For source
@@ -286,6 +292,14 @@ namespace gbe
       return false;
     }
 
+    INLINE int flag_nr(void) const {
+      return nr & 15;
+    }
+
+    INLINE int flag_subnr(void) const {
+      return subnr / typeSize(type);
+    }
+
     static INLINE GenRegister h2(GenRegister reg) {
       GenRegister r = reg;
       r.hstride = GEN_HORIZONTAL_STRIDE_2;
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 35d3a7c..ffd31d9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -37,6 +37,7 @@
 #include <fstream>
 #include <dlfcn.h>
 #include <sstream>
+#include <iostream>
 #include <unistd.h>
 
 /* Not defined for LLVM 3.0 */
@@ -124,6 +125,321 @@ namespace gbe {
     return true;
   }
 
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  size_t Program::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+    size_t ker_num = kernels.size();
+    int has_constset = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    if (constantSet) {
+      has_constset = 1;
+      OUT_UPDATE_SZ(has_constset);
+      size_t sz = constantSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_constset);
+    }
+
+    OUT_UPDATE_SZ(ker_num);
+    for (auto ker : kernels) {
+      size_t sz = ker.second->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+
+    OUT_UPDATE_SZ(ret_size);
+    return ret_size;
+  }
+
+  size_t Program::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    int has_constset = 0;
+    size_t ker_num;
+    uint32_t magic;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(has_constset);
+    if(has_constset) {
+      constantSet = new ir::ConstantSet;
+      size_t sz = constantSet->deserializeFromBin(ins);
+
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+
+    IN_UPDATE_SZ(ker_num);
+
+    for (size_t i = 0; i < ker_num; i++) {
+      size_t ker_serial_sz;
+      std::string ker_name; // Just an empty name here; the real one is read by deserializeFromBin.
+      Kernel* ker = allocateKernel(ker_name);
+
+      if(!(ker_serial_sz = ker->deserializeFromBin(ins)))
+        return 0;
+
+      kernels.insert(std::make_pair(ker->getName(), ker));
+      total_size += ker_serial_sz;
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+  size_t Kernel::serializeToBin(std::ostream& outs) {
+    unsigned int i;
+    size_t ret_size = 0;
+    int has_samplerset = 0;
+    int has_imageset = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    OUT_UPDATE_SZ(name.size());
+    outs.write(name.c_str(), name.size());
+    ret_size += sizeof(char)*name.size();
+
+    OUT_UPDATE_SZ(argNum);
+    for (i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      OUT_UPDATE_SZ(arg.type);
+      OUT_UPDATE_SZ(arg.size);
+      OUT_UPDATE_SZ(arg.bufSize);
+    }
+
+    OUT_UPDATE_SZ(patches.size());
+    for (auto patch : patches) {
+      unsigned int tmp;
+      tmp = patch.type;
+      OUT_UPDATE_SZ(tmp);
+      tmp = patch.subType;
+      OUT_UPDATE_SZ(tmp);
+      tmp = patch.offset;
+      OUT_UPDATE_SZ(tmp);
+    }
+
+    OUT_UPDATE_SZ(curbeSize);
+    OUT_UPDATE_SZ(simdWidth);
+    OUT_UPDATE_SZ(stackSize);
+    OUT_UPDATE_SZ(useSLM);
+
+    /* samplers. */
+    if (samplerSet) {
+      has_samplerset = 1;
+      OUT_UPDATE_SZ(has_samplerset);
+      size_t sz = samplerSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_samplerset);
+    }
+
+    /* images. */
+    if (imageSet) {
+      has_imageset = 1;
+      OUT_UPDATE_SZ(has_imageset);
+      size_t sz = imageSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_imageset);
+    }
+
+    /* Code. */
+    const char * code = getCode();
+    OUT_UPDATE_SZ(getCodeSize());
+    outs.write(code, getCodeSize()*sizeof(char));
+    ret_size += getCodeSize()*sizeof(char);
+
+    OUT_UPDATE_SZ(magic_end);
+
+    OUT_UPDATE_SZ(ret_size);
+    return ret_size;
+  }
+
+  size_t Kernel::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    int has_samplerset = 0;
+    int has_imageset = 0;
+    size_t code_size = 0;
+    uint32_t magic = 0;
+    size_t patch_num = 0;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    size_t name_len;
+    IN_UPDATE_SZ(name_len);
+    char* c_name = new char[name_len+1];
+    ins.read(c_name, name_len*sizeof(char));
+    total_size += sizeof(char)*name_len;
+    c_name[name_len] = 0;
+    name = c_name;
+    delete[] c_name;
+
+    IN_UPDATE_SZ(argNum);
+    args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
+    for (uint32_t i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      IN_UPDATE_SZ(arg.type);
+      IN_UPDATE_SZ(arg.size);
+      IN_UPDATE_SZ(arg.bufSize);
+    }
+
+    IN_UPDATE_SZ(patch_num);
+    for (uint32_t i = 0; i < patch_num; i++) {
+      unsigned int tmp;
+      PatchInfo patch;
+      IN_UPDATE_SZ(tmp);
+      patch.type = tmp;
+      IN_UPDATE_SZ(tmp);
+      patch.subType = tmp;
+      IN_UPDATE_SZ(tmp);
+      patch.offset = tmp;
+
+      patches.push_back(patch);
+    }
+
+    IN_UPDATE_SZ(curbeSize);
+    IN_UPDATE_SZ(simdWidth);
+    IN_UPDATE_SZ(stackSize);
+    IN_UPDATE_SZ(useSLM);
+
+    IN_UPDATE_SZ(has_samplerset);
+    if (has_samplerset) {
+      samplerSet = GBE_NEW(ir::SamplerSet);
+      size_t sz = samplerSet->deserializeFromBin(ins);
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+
+    IN_UPDATE_SZ(has_imageset);
+    if (has_imageset) {
+      imageSet = GBE_NEW(ir::ImageSet);
+      size_t sz = imageSet->deserializeFromBin(ins);
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+
+    IN_UPDATE_SZ(code_size);
+    if (code_size) {
+      char* code = GBE_NEW_ARRAY_NO_ARG(char, code_size);
+      ins.read(code, code_size*sizeof(char));
+      total_size += sizeof(char)*code_size;
+      setCode(code, code_size);
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+#undef OUT_UPDATE_SZ
+#undef IN_UPDATE_SZ
+
+  void Program::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+
+    outs << spaces << "=============== Begin Program ===============" << "\n";
+
+    if (constantSet) {
+      constantSet->printStatus(indent + 4, outs);
+    }
+
+    for (auto ker : kernels) {
+      ker.second->printStatus(indent + 4, outs);
+    }
+
+    outs << spaces << "================ End Program ================" << "\n";
+  }
+
+  void Kernel::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+    string spaces_nl = indent_to_str(indent + 4);
+    int num;
+
+    outs << spaces << "+++++++++++ Begin Kernel +++++++++++" << "\n";
+    outs << spaces_nl << "Kernel Name: " << name << "\n";
+    outs << spaces_nl << "  curbeSize: " << curbeSize << "\n";
+    outs << spaces_nl << "  simdWidth: " << simdWidth << "\n";
+    outs << spaces_nl << "  stackSize: " << stackSize << "\n";
+    outs << spaces_nl << "  useSLM: " << useSLM << "\n";
+
+    outs << spaces_nl << "  Argument Number is " << argNum << "\n";
+    for (uint32_t i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      outs << spaces_nl << "  Arg " << i << ":\n";
+      outs << spaces_nl << "      type value: "<< arg.type << "\n";
+      outs << spaces_nl << "      size: "<< arg.size << "\n";
+      outs << spaces_nl << "      bufSize: "<< arg.bufSize << "\n";
+    }
+
+    outs << spaces_nl << "  Patches Number is " << patches.size() << "\n";
+    num = 0;
+    for (auto patch : patches) {
+      num++;
+      outs << spaces_nl << "  patch " << num << ":\n";
+      outs << spaces_nl << "      type value: "<< patch.type << "\n";
+      outs << spaces_nl << "      subtype value: "<< patch.subType << "\n";
+      outs << spaces_nl << "      offset: "<< patch.offset << "\n";
+    }
+
+    if (samplerSet) {
+      samplerSet->printStatus(indent + 4, outs);
+    }
+
+    if (imageSet) {
+      imageSet->printStatus(indent + 4, outs);
+    }
+
+    outs << spaces << "++++++++++++ End Kernel ++++++++++++" << "\n";
+  }
+
+  /*********************** End of Program class member function *************************/
+
   static void programDelete(gbe_program gbeProgram) {
     gbe::Program *program = (gbe::Program*)(gbeProgram);
     GBE_SAFE_DELETE(program);
@@ -152,7 +468,9 @@ namespace gbe {
       useless.push_back(str);
       args.push_back(str.c_str());
     }
-
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+    args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
     args.push_back("-emit-llvm");
     // XXX we haven't implement those builtin functions,
     // so disable it currently.
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index d20e7af..8774344 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -70,8 +70,7 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
-  GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
-  GBE_CURBE_GLOBAL_CONSTANT_DATA,
+  GBE_CURBE_SAMPLER_INFO,
   GBE_CURBE_IMAGE_INFO,
   GBE_CURBE_STACK_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 83aaab8..28a792d 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -67,7 +67,7 @@ namespace gbe {
   }
 
   /*! Describe a compiled kernel */
-  class Kernel : public NonCopyable
+  class Kernel : public NonCopyable, public Serializable
   {
   public:
     /*! Create an empty kernel with the given name */
@@ -76,6 +76,8 @@ namespace gbe {
     virtual ~Kernel(void);
     /*! Return the instruction stream (to be implemented) */
     virtual const char *getCode(void) const = 0;
+    /*! Set the instruction stream (to be implemented) */
+    virtual const void setCode(const char *, size_t size) = 0;
     /*! Return the instruction stream size (to be implemented) */
     virtual size_t getCodeSize(void) const = 0;
     /*! Get the kernel name */
@@ -128,9 +130,37 @@ namespace gbe {
     size_t getImageSize(void) const { return imageSet->getDataSize(); }
     /*! Get defined image value array */
     void getImageData(ImageInfo *images) const { imageSet->getData(images); }
+
+    static const uint32_t magic_begin = TO_MAGIC('K', 'E', 'R', 'N');
+    static const uint32_t magic_end = TO_MAGIC('N', 'R', 'E', 'K');
+
+    /* format:
+       magic_begin       |
+       name_size         |
+       name              |
+       arg_num           |
+       args              |
+       PatchInfo_num     |
+       PatchInfo         |
+       curbeSize         |
+       simdWidth         |
+       stackSize         |
+       useSLM            |
+       samplers          |
+       images            |
+       code_size         |
+       code              |
+       magic_end         | total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
   protected:
     friend class Context;      //!< Owns the kernels
-    const std::string name;    //!< Kernel name
+    std::string name;    //!< Kernel name
     KernelArgument *args;      //!< Each argument
     vector<PatchInfo> patches; //!< Indicates how to build the curbe
     uint32_t argNum;           //!< Number of function arguments
@@ -146,7 +176,7 @@ namespace gbe {
   };
 
   /*! Describe a compiled program */
-  class Program : public NonCopyable
+  class Program : public NonCopyable, public Serializable
   {
   public:
     /*! Create an empty program */
@@ -186,9 +216,32 @@ namespace gbe {
     size_t getGlobalConstantSize(void) const { return constantSet->getDataSize(); }
     /*! Get the content of global constant arrays */
     void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+
+    static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
+    static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
+
+    /* format:
+       magic_begin       |
+       constantSet_flag  |
+       constSet_data     |
+       kernel_num        |
+       kernel_1          |
+       ........          |
+       kernel_n          |
+       magic_end         |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
   protected:
     /*! Compile a kernel */
     virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name) = 0;
+    /*! Allocate an empty kernel. */
+    virtual Kernel *allocateKernel(const std::string &name) = 0;
     /*! Kernels sorted by their name */
     hash_map<std::string, Kernel*> kernels;
     /*! Global (constants) outside any kernel */
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
index 440b455..2b8f913 100644
--- a/backend/src/builtin_vector_proto.def
+++ b/backend/src/builtin_vector_proto.def
@@ -6,11 +6,10 @@ gentype asin (gentype)
 gentype asinh (gentype)
 gentype asinpi (gentype x)
 gentype atan (gentype y_over_x)
-# XXX atan2 is a builtin function
-#gentype atan2 (gentype y, gentype x)
+gentype atan2 (gentype y, gentype x)
 gentype atanh (gentype)
 gentype atanpi (gentype x)
-#gentype atan2pi (gentype y, gentype x)
+gentype atan2pi (gentype y, gentype x)
 gentype cbrt (gentype)
 gentype ceil (gentype)
 gentype copysign (gentype x, gentype y)
@@ -61,13 +60,13 @@ float ldexp (float x, int k)
 doublen ldexp (doublen x, intn k)
 doublen ldexp (doublen x, int k)
 double ldexp (double x, int k)
-#gentype lgamma (gentype x)
-#floatn lgamma_r (floatn x, __global intn *signp)
-#floatn lgamma_r (floatn x, __local intn *signp)
-#floatn lgamma_r (floatn x, __private intn *signp)
-#float lgamma_r (float x, __global int *signp)
-#float lgamma_r (float x, __local int *signp)
-#float lgamma_r (float x,   __private int *signp)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __global intn *signp)
+floatn lgamma_r (floatn x, __local intn *signp)
+floatn lgamma_r (floatn x, __private intn *signp)
+float lgamma_r (float x, __global int *signp)
+float lgamma_r (float x, __local int *signp)
+float lgamma_r (float x,   __private int *signp)
 #doublen lgamma_r (doublen x, __global intn *signp)
 #doublen lgamma_r (doublen x, __local intn *signp)
 #doublen lgamma_r (doublen x, __private intn *signp)
@@ -127,7 +126,7 @@ gentype sqrt (gentype)
 gentype tan (gentype)
 gentype tanh (gentype)
 gentype tanpi (gentype x)
-#gentype tgamma (gentype)
+gentype tgamma (gentype)
 gentype trunc (gentype)
 
 ##half_native_math
@@ -253,8 +252,8 @@ int any (igentype x)
 int all (igentype x)
 # XXX need to revisit select latter
 #gentype bitselect (gentype a, gentype b, gentype c)
-#gentype select (gentype a, gentype b, igentype c)
-#gentype select (gentype a, gentype b, ugentype c)
+gentype select (gentype a, gentype b, igentype c)
+gentype select (gentype a, gentype b, ugentype c)
 
 ##misc
 #gentypen shuffle (gentypem x, ugentypen mask)
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
new file mode 100644
index 0000000..afe86f2
--- /dev/null
+++ b/backend/src/gbe_bin_generater.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*******************************************************************************
+   This file is used to generate the gbe kernel binary.  These binaries may be
+   used by the CL API, such as enqueue memory.  We generate the binary at build
+   time to improve the performance.
+ *******************************************************************************/
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <deque>
+#include <vector>
+#include <algorithm>
+#include "backend/program.h"
+#include "backend/program.hpp"
+
+using namespace std;
+
+#define FILE_NOT_FIND_ERR 1
+#define FILE_MAP_ERR 2
+#define FILE_BUILD_FAILED 3
+#define FILE_SERIALIZATION_FAILED 4
+
+/* Owns one kernel source file: the open descriptor, its read-only mapping and
+   the gbe program built from it.  Movable (for vector storage), not copyable. */
+class program_build_instance {
+protected:
+    string prog_path;        // path of the kernel source file
+    string build_opt;        // options passed to gbe_program_new_from_source
+    static string bin_path;  // output file, shared by every instance
+    int fd;                  // descriptor of prog_path, -1 while not open
+    int file_len;            // length in bytes of the mapping
+    const char* code;        // mmap'ed source text, NULL while not mapped
+    gbe::Program* gbe_prog;  // built program, NULL until build_program()
+
+public:
+    program_build_instance (void) : fd(-1), file_len(0), code(NULL), gbe_prog(NULL) { }
+    /* A NULL option means "no build option": std::string must never be
+       constructed from a null pointer. */
+    explicit program_build_instance (const char* file_path, const char* option = NULL)
+        : prog_path(file_path), build_opt(option ? option : ""), fd(-1), file_len(0),
+          code(NULL), gbe_prog(NULL) { }
+
+    /* Releases the mapping, the descriptor and the program, when present. */
+    ~program_build_instance () {
+        if (code) {
+            munmap((void *)(code), file_len);
+            code = NULL;
+        }
+
+        if (fd >= 0)
+            close(fd);
+
+        if (gbe_prog)
+            gbe_program_delete(reinterpret_cast<gbe_program>(gbe_prog));
+    }
+
+    /* Transfers ownership and leaves `other` empty, so that its destructor
+       does not release the same resources a second time. */
+    program_build_instance(program_build_instance&& other) noexcept
+        : fd(other.fd), file_len(other.file_len),
+          code(other.code), gbe_prog(other.gbe_prog) {
+        prog_path.swap(other.prog_path);
+        build_opt.swap(other.build_opt);
+        other.fd = -1;
+        other.file_len = 0;
+        other.code = NULL;
+        other.gbe_prog = NULL;
+    }
+
+    explicit program_build_instance(const program_build_instance& other) = delete;
+    program_build_instance& operator= (const program_build_instance& other) {
+        /* Copy assignment is not wanted, but the operator is needed to instantiate
+           vector<program_build_instance>.  NOTE(review): assert(1) never fires. */
+        assert(1);
+        return *this;
+    }
+
+    /* Opens and maps prog_path; throws an int error code on failure. */
+    const char* file_map_open (void) throw (int);
+
+    const char* get_code (void) {
+        return code;
+    }
+
+    const string& get_program_path (void) {
+        return prog_path;
+    }
+
+    int get_size (void) {
+        return file_len;
+    }
+
+    void print_file (void) {
+        cout << (code ? code : "") << endl;  // streaming a NULL char* is undefined
+    }
+
+    void dump (void) {
+        cout << "program path: " << prog_path << endl;
+        cout << "Build option: " << build_opt << endl;
+        print_file();
+    }
+
+    /* Sets the shared output path once; returns 0 when it was already set. */
+    static int set_bin_path (const char* path) {
+        if (bin_path.size())
+            return 0;
+
+        bin_path = path;
+        return 1;
+    }
+
+    /* Both throw an int error code (one of the FILE_* values) on failure. */
+    void build_program(void) throw(int);
+    void serialize_program(void) throw(int);
+};
+
+string program_build_instance::bin_path;
+
+/* Appends the serialized form of the built program to bin_path.
+   Throws FILE_SERIALIZATION_FAILED when the output cannot be written. */
+void program_build_instance::serialize_program(void) throw(int)
+{
+    ofstream ofs(bin_path, ofstream::out | ofstream::app | ofstream::binary);
+    if (!ofs)   // the output file could not be opened
+        throw FILE_SERIALIZATION_FAILED;
+
+    const size_t sz = gbe_prog->serializeToBin(ofs);
+    ofs.close();   // sets failbit when the final flush fails
+    if (!sz || !ofs)
+        throw FILE_SERIALIZATION_FAILED;
+}
+
+
+/* Builds the mapped source with build_opt and keeps the resulting program.
+   Throws FILE_BUILD_FAILED when the compiler returns no program. */
+void program_build_instance::build_program(void) throw(int)
+{
+    gbe_program handle = gbe_program_new_from_source(code, 0, build_opt.c_str(), NULL, NULL);
+    if (handle == NULL)
+        throw FILE_BUILD_FAILED;
+    gbe_prog = reinterpret_cast<gbe::Program*>(handle);
+    assert(gbe_program_get_kernel_num(handle));
+}
+
+/* Opens prog_path and maps it read-only.  Throws FILE_NOT_FIND_ERR if the
+   open fails, FILE_MAP_ERR if the size cannot be read or the mapping fails. */
+const char* program_build_instance::file_map_open(void) throw(int)
+{
+    fd = ::open(prog_path.c_str(), O_RDONLY);
+    if (fd < 0)
+        throw FILE_NOT_FIND_ERR;
+
+    /* The file length is the offset of its end; lseek returns -1 on error. */
+    const off_t len = lseek(fd, 0, SEEK_END);
+    if (len < 0)
+        throw FILE_MAP_ERR;
+    lseek(fd, 0, SEEK_SET);
+    /* mmap reports failure with MAP_FAILED, never with NULL. */
+    void* address = mmap(0, len, PROT_READ, MAP_SHARED, fd, 0);
+    if (address == MAP_FAILED)
+        throw FILE_MAP_ERR;
+    file_len = static_cast<int>(len);
+    code = reinterpret_cast<const char*>(address);
+    return code;
+}
+
+typedef vector<program_build_instance> prog_vector;
+
+int main (int argc, const char **argv)
+{
+    prog_vector prog_insts;
+    vector<string> argv_saved;   // copy of argv taken before getopt runs
+    const char* build_opt;
+    const char* file_path;
+    int i;
+    int oc;
+    deque<int> used_index;       // 1 for argv slots used as a separate option argument
+    /* Pairs every kernel file with its -p option, then builds and serializes each file. */
+    if (argc < 2) {
+        cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
+        return 0;
+    }
+
+    used_index.assign(argc, 0);
+
+    /* getopt may reorder argv, so keep a copy of the original order. */
+    for (i=0; i< argc; i++) {
+        argv_saved.push_back(string(argv[i]));
+    }
+
+    while ( (oc = getopt(argc, (char * const *)argv, "o:p:")) != -1 ) {
+        switch (oc) {
+        case 'p':
+        {
+            int opt_index;
+
+            if (argv[optind-1][0] == '-') {// -pXXX like
+                opt_index = optind - 1;
+            } else { // Must be -p XXXX mode
+                opt_index = optind - 2;
+                used_index[opt_index + 1] = 1;
+            }
+
+            /* The option must directly follow the file name it applies to. */
+            if ((opt_index < 2 ) || argv[opt_index-1][0] == '-') {
+                cout << "Usage note: Building option must follow file name" << endl;
+                return 1;
+            }
+
+            file_path = argv[opt_index - 1];
+            build_opt = optarg;
+
+            prog_insts.push_back(program_build_instance(file_path, build_opt));
+            break;
+        }
+
+        case 'o':
+            if (!program_build_instance::set_bin_path(optarg)) {
+                cout << "Can not specify the bin path more than once." << endl;
+                return 1;
+            }
+            used_index[optind-1] = 1;
+            break;
+
+        case ':': // NOTE(review): presumably unreachable, optstring has no leading ':'
+            cout << "Miss the file option argument" << endl;
+            return 1;
+
+        default: // NOTE(review): no return here, so parsing goes on after an unknown option
+            cout << "Unknown opt" << endl;
+        }
+    }
+
+    for (i=1; i < argc; i++) {
+        /* Remaining non-option arguments are kernel files; add those without a -p entry. */
+        if (argv_saved[i].size() && argv_saved[i][0] != '-') {
+            if (used_index[i])
+                continue;
+
+            string file_name = argv_saved[i];
+            prog_vector::iterator result = find_if(prog_insts.begin(), prog_insts.end(),
+            [&](program_build_instance & prog_inst)-> bool {
+                bool result = false;
+                if (prog_inst.get_program_path() == file_name)
+                    result = true;
+
+                return result;
+            });
+
+            if (result == prog_insts.end()) {
+                prog_insts.push_back(program_build_instance(file_name.c_str(), ""));
+            }
+        }
+    }
+    /* Map, build and serialize every program; stop at the first failure. */
+    for (auto& inst : prog_insts) {
+        try {
+            inst.file_map_open();
+            inst.build_program();
+            inst.serialize_program();
+        }
+        catch (int & err_no) {
+            if (err_no == FILE_NOT_FIND_ERR) {
+                cout << "can not open the file " <<
+                     inst.get_program_path() << endl;
+            } else if (err_no == FILE_MAP_ERR) {
+                cout << "map the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            } else if (err_no == FILE_BUILD_FAILED) {
+                cout << "build the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            } else if (err_no == FILE_SERIALIZATION_FAILED) {
+                cout << "Serialize the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            }
+            return -1;
+        }
+    }
+
+    //for (auto& inst : prog_insts) {
+    //    inst.dump();
+    //}
+
+    return 0;
+}
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index 35e3a2a..0a30738 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -311,7 +311,10 @@ class builtinProto():
                 if (isPointer(ptype)):
                     formatStr += '({} {} *)param{} + {:2d}'.format(ptype[2], ptype[0], n, j)
                 else:
-                    formatStr += 'param{}.s{:x}'.format(n, j)
+                    if (self.functionName == 'select' and n == 2):
+                        formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+                    else:
+                        formatStr += 'param{}.s{:x}'.format(n, j)
 
             formatStr += ')'
 
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
index 056b529..6cc81f1 100755
--- a/backend/src/gen_convert.sh
+++ b/backend/src/gen_convert.sh
@@ -5,14 +5,33 @@
 # For all vector lengths and types, generate conversion functions
 for vector_length in $VECTOR_LENGTHS; do
         if test $vector_length -eq 1; then
-                continue;
-        fi
-        for ftype in $TYPES; do
+          for ftype in $TYPES; do
+            fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+            for ttype in $TYPES; do
+              tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+              if test $fbasetype = $tbasetype; then
+                continue
+              fi
+              echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+              echo "  return ($tbasetype)v;"
+              echo "}"
+              echo
+            done
+          done
+        else
+          for ftype in $TYPES; do
                 fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
                 for ttype in $TYPES; do
                         tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
                         if test $fbasetype = $tbasetype; then
-                                continue
+                          if test $vector_length -gt 1; then
+                            fvectortype=$fbasetype$vector_length
+                            tvectortype=$tbasetype$vector_length
+                            echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+                          else
+                            echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+                          fi
+                          continue
                         fi
                         fvectortype=$fbasetype$vector_length
                         tvectortype=$tbasetype$vector_length
@@ -48,5 +67,6 @@ for vector_length in $VECTOR_LENGTHS; do
                         echo "}"
                         echo
                 done
-        done
+          done
+        fi
 done
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index c9f5bfe..7a8f80f 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -40,6 +40,107 @@ namespace ir {
     for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
   }
 
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  size_t ConstantSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+    /* Order: magic_begin, data size, data, constant count, constants, magic_end, total. */
+    OUT_UPDATE_SZ(magic_begin);
+
+    /* output the const data. */
+    OUT_UPDATE_SZ((data.size()*sizeof(char)));
+    if(data.size() > 0) {
+      outs.write(data.data(), data.size()*sizeof(char));
+      ret_size += data.size()*sizeof(char);
+    }
+    /* Then one record per constant, each preceded by its own byte count. */
+    OUT_UPDATE_SZ(constants.size());
+    for (auto const &cnst : constants) {
+      size_t bytes = sizeof(cnst.getName().size())        //name length self
+                     + cnst.getName().size()*sizeof(char) //name
+                     + sizeof(cnst.getSize())             //size
+                     + sizeof(cnst.getAlignment())        //alignment
+                     + sizeof(cnst.getOffset())           //offset
+                     + sizeof(cnst.getReg());             //reg
+      OUT_UPDATE_SZ(bytes);
+
+      OUT_UPDATE_SZ(cnst.getName().size());
+      outs.write(cnst.getName().c_str(), cnst.getName().size());
+      ret_size += sizeof(char)*cnst.getName().size();
+      OUT_UPDATE_SZ(cnst.getSize());
+      OUT_UPDATE_SZ(cnst.getAlignment());
+      OUT_UPDATE_SZ(cnst.getOffset());
+      OUT_UPDATE_SZ(cnst.getReg());
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+    OUT_UPDATE_SZ(ret_size);
+    /* NOTE(review): presumably SERIALIZE_OUT also adds each field size to ret_size. */
+    return ret_size;
+  }
+
+  size_t ConstantSet::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    size_t global_data_sz = 0;
+    size_t const_num;
+    uint32_t magic;
+    /* Reads back what serializeToBin wrote; returns 0 as soon as a check fails. */
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+    /* Raw constant data, appended to data one byte at a time. */
+    IN_UPDATE_SZ(global_data_sz);
+    for (size_t i = 0; i < global_data_sz; i++) {
+      char elt;
+      IN_UPDATE_SZ(elt);
+      data.push_back(elt);
+    }
+    /* One record per constant: byte count, name length, name, size, alignment, offset, reg. */
+    IN_UPDATE_SZ(const_num);
+    for (size_t i = 0; i < const_num; i++) {
+      size_t bytes;
+      IN_UPDATE_SZ(bytes);
+
+      size_t name_len;
+      IN_UPDATE_SZ(name_len);
+      /* The name is stored without a terminator, so one is appended here. */
+      char* c_name = new char[name_len+1];
+      ins.read(c_name, name_len);
+      total_size += sizeof(char)*name_len;
+      c_name[name_len] = 0;
+
+      uint32_t size, align, offset;
+      uint16_t reg;
+      IN_UPDATE_SZ(size);
+      IN_UPDATE_SZ(align);
+      IN_UPDATE_SZ(offset);
+      IN_UPDATE_SZ(reg);
+
+      ir::Constant constant(c_name, size, align, offset);
+      constant.setReg(reg);
+      constants.push_back(constant);
+
+      delete[] c_name;
+
+      /* Sanity check: the stored byte count must match the fields just read. */
+      if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
+              + sizeof(align) + sizeof(offset) + sizeof(reg))
+        return 0;
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+    /* NOTE(review): presumably the stored total excludes its own field, hence the sizeof term. */
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index 0717391..4bb549e 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -52,6 +52,8 @@ namespace ir {
     /*! Nothing happens here */
     INLINE ~Constant(void) {}
     const std::string& getName(void) const { return name; }
+    uint32_t getSize (void) const { return size; }
+    uint32_t getAlignment (void) const { return alignment; }
     uint32_t getOffset(void) const { return offset; }
     uint16_t getReg(void) const { return reg; }
     void setReg(uint16_t reg) { this->reg = reg; }
@@ -67,7 +69,7 @@ namespace ir {
   /*! A constant set is a set of immutable data associated to a compilation
    *  unit
    */
-  class ConstantSet
+  class ConstantSet : public Serializable
   {
   public:
     /*! Append a new constant in the constant set */
@@ -93,7 +95,8 @@ namespace ir {
         mem[i] = data[i];
     }
     ConstantSet() {}
-    ConstantSet(const ConstantSet& other) : data(other.data), constants(other.constants) {}
+    ConstantSet(const ConstantSet& other) : Serializable(other),
+                data(other.data), constants(other.constants) {}
     ConstantSet & operator = (const ConstantSet& other) {
       if (&other != this) {
         data = other.data;
@@ -101,6 +104,27 @@ namespace ir {
       }
       return *this;
     }
+
+    static const uint32_t magic_begin = TO_MAGIC('C', 'N', 'S', 'T');
+    static const uint32_t magic_end = TO_MAGIC('T', 'S', 'N', 'C');
+
+    /* format:
+       magic_begin     |
+       const_data_size |
+       const_data      |
+       constant_1_size |
+       constant_1      |
+       ........        |
+       constant_n_size |
+       constant_n      |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+
   private:
     vector<char> data;         //!< The constant data serialized in one array
     vector<Constant> constants;//!< Each constant description
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index c286f1d..a7337e6 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -142,6 +142,7 @@ namespace ir {
       this->NAME(type, dst, index); \
     }
     DECL_THREE_SRC_INSN(SEL);
+    DECL_THREE_SRC_INSN(I64MADSAT);
 #undef DECL_THREE_SRC_INSN
 
     /*! For all unary functions */
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index 486fde1..b901a12 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -110,5 +110,144 @@ namespace ir {
       GBE_DELETE(it.second);
   }
 
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  /*! Writes magic_begin, regMap, indexMap, magic_end and the running size to outs. */
+  size_t ImageSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+    /* regMap: entry count, then the key and every ImageInfo field of each entry. */
+    OUT_UPDATE_SZ(regMap.size());
+    for (auto iter : regMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second->arg_idx);
+      OUT_UPDATE_SZ(iter.second->idx);
+      OUT_UPDATE_SZ(iter.second->wSlot);
+      OUT_UPDATE_SZ(iter.second->hSlot);
+      OUT_UPDATE_SZ(iter.second->depthSlot);
+      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    }
+    /* indexMap: same record layout, keyed by index instead of register. */
+    OUT_UPDATE_SZ(indexMap.size());
+    for (auto iter : indexMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second->arg_idx);
+      OUT_UPDATE_SZ(iter.second->idx);
+      OUT_UPDATE_SZ(iter.second->wSlot);
+      OUT_UPDATE_SZ(iter.second->hSlot);
+      OUT_UPDATE_SZ(iter.second->depthSlot);
+      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+    OUT_UPDATE_SZ(ret_size);
+    /* NOTE(review): presumably SERIALIZE_OUT also adds each field size to ret_size. */
+    return ret_size;
+  }
+
+  size_t ImageSet::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    uint32_t magic;
+    size_t image_map_sz = 0;
+    /* Reads back what serializeToBin wrote; returns 0 as soon as a check fails. */
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(image_map_sz); //regMap
+    for (size_t i = 0; i < image_map_sz; i++) {
+      ir::Register reg;
+      ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+      /* Fields are read in the order serializeToBin writes them. */
+      IN_UPDATE_SZ(reg);
+      IN_UPDATE_SZ(img_info->arg_idx);
+      IN_UPDATE_SZ(img_info->idx);
+      IN_UPDATE_SZ(img_info->wSlot);
+      IN_UPDATE_SZ(img_info->hSlot);
+      IN_UPDATE_SZ(img_info->depthSlot);
+      IN_UPDATE_SZ(img_info->dataTypeSlot);
+      IN_UPDATE_SZ(img_info->channelOrderSlot);
+      IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+      regMap.insert(std::make_pair(reg, img_info));
+    }
+
+    IN_UPDATE_SZ(image_map_sz); //indexMap
+    for (uint32_t i = 0; i < image_map_sz; i++) {
+      uint32_t index;
+      ImageInfo *img_info = GBE_NEW(struct ImageInfo);;
+      /* A separate ImageInfo is allocated here; it is not shared with regMap. */
+      IN_UPDATE_SZ(index);
+      IN_UPDATE_SZ(img_info->arg_idx);
+      IN_UPDATE_SZ(img_info->idx);
+      IN_UPDATE_SZ(img_info->wSlot);
+      IN_UPDATE_SZ(img_info->hSlot);
+      IN_UPDATE_SZ(img_info->depthSlot);
+      IN_UPDATE_SZ(img_info->dataTypeSlot);
+      IN_UPDATE_SZ(img_info->channelOrderSlot);
+      IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+      indexMap.insert(std::make_pair(index, img_info));
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+    /* NOTE(review): presumably the stored total excludes its own field, hence the sizeof term. */
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+  /*! Writes a textual dump of regMap and indexMap to outs. */
+  void ImageSet::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+    string spaces_nl = indent_to_str(indent + 4);
+
+    outs << spaces << "------------ Begin ImageSet ------------" << "\n";
+    outs << spaces_nl  << "  ImageSet Map: [reg, arg_idx, idx, wSlot, hSlot, depthSlot, "
+                "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+    outs << spaces_nl << "     regMap size: " << regMap.size() << "\n";
+    for (const auto &iter : regMap) {
+      outs << spaces_nl << "         [" << iter.first << ", "
+           << iter.second->arg_idx << ", "
+           << iter.second->idx << ", "
+           << iter.second->wSlot << ", "
+           << iter.second->hSlot << ", "
+           << iter.second->depthSlot << ", "
+           << iter.second->dataTypeSlot << ", "
+           << iter.second->channelOrderSlot << ", "
+           << iter.second->dimOrderSlot << "]" << "\n";
+    }
+
+    outs << spaces_nl << "  ImageSet Map: [index, arg_idx, idx, wSlot, hSlot, depthSlot, "
+            "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+    outs << spaces_nl << "     indexMap size: " << indexMap.size() << "\n";
+    for (const auto &iter : indexMap) {
+      outs << spaces_nl << "         [" << iter.first << ", "
+           << iter.second->arg_idx << ", "
+           << iter.second->idx << ", "
+           << iter.second->wSlot << ", "
+           << iter.second->hSlot << ", "
+           << iter.second->depthSlot << ", "
+           << iter.second->dataTypeSlot << ", "
+           << iter.second->channelOrderSlot << ", "
+           << iter.second->dimOrderSlot << "]" << "\n";
+    }
+
+    outs << spaces << "------------- End ImageSet -------------" << "\n";
+  }
+
+
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index 04e78e6..c084c7d 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -40,7 +40,7 @@ namespace ir {
    *  for each individual image. And that individual image could be used
    *  at backend to identify this image's location.
    */
-  class ImageSet
+  class ImageSet : public Serializable
   {
   public:
     /*! Append an image argument. */
@@ -60,6 +60,29 @@ namespace ir {
     ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
     ImageSet() {}
     ~ImageSet();
+
+    static const uint32_t magic_begin = TO_MAGIC('I', 'M', 'A', 'G');
+    static const uint32_t magic_end = TO_MAGIC('G', 'A', 'M', 'I');
+
+    /* format:
+       magic_begin     |
+       regMap_size     |
+       element_1       |
+       ........        |
+       element_n       |
+       indexMap_size   |
+       element_1       |
+       ........        |
+       element_n       |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
   private:
     map<Register, struct ImageInfo *> regMap;
     map<uint32_t, struct ImageInfo *> indexMap;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 48e83b4..7c6c6c6 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -173,6 +173,30 @@ namespace ir {
       }
     };
 
+    class ALIGNED_INSTRUCTION TernaryInstruction : // typed, one dst, three srcs in a tuple
+      public BasePolicy,
+      public NDstPolicy<TernaryInstruction, 1>,
+      public TupleSrcPolicy<TernaryInstruction>
+    {
+     public:
+      TernaryInstruction(Opcode opcode,
+                         Type type,
+                         Register dst,
+                         Tuple src) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src = src;
+      }
+      Type getType(void) const { return type; }
+      bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;                        //!< Type checked against dst and every source
+      Register dst[1];                  //!< The single destination register
+      Tuple src;                        //!< Tuple holding the three sources
+      static const uint32_t srcNum = 3; //!< Number of registers read from src
+    };
+
     /*! Three sources mean we need a tuple to encode it */
     class ALIGNED_INSTRUCTION SelectInstruction :
       public BasePolicy,
@@ -462,7 +486,7 @@ namespace ir {
       INLINE Type getSrcType(void) const { return this->srcType; }
       INLINE Type getDstType(void) const { return this->dstType; }
 
-      static const uint32_t srcNum = 5;
+      static const uint32_t srcNum = 6;
       static const uint32_t dstNum = 4;
     };
 
@@ -504,20 +528,48 @@ namespace ir {
       Register dst[0];               //!< No dest register
     };
 
+    class ALIGNED_INSTRUCTION GetSamplerInfoInstruction : // one src (sampler), one dst
+      public BasePolicy,
+      public NSrcPolicy<GetSamplerInfoInstruction, 1>,
+      public NDstPolicy<GetSamplerInfoInstruction, 1>
+    {
+    public:
+      GetSamplerInfoInstruction( Register dst,
+                                 Register src)
+      {
+        this->opcode = OP_GET_SAMPLER_INFO;
+        this->dst[0] = dst;
+        this->src[0] = src;
+      }
+
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << " sampler id %" << this->getSrc(fn, 0)
+            << " %" << this->getDst(fn, 0);
+      }
+
+      Register src[1];                  //!< Sampler id to get info about
+      Register dst[1];                  //!< return value
+      static const uint32_t dstNum = 1;
+    };
+
     class ALIGNED_INSTRUCTION GetImageInfoInstruction :
       public BasePolicy,
-      public NSrcPolicy<GetImageInfoInstruction, 1>,
-      public TupleDstPolicy<GetImageInfoInstruction>
+      public NSrcPolicy<GetImageInfoInstruction, 2>,
+      public NDstPolicy<GetImageInfoInstruction, 1>
     {
     public:
       GetImageInfoInstruction( int type,
-                               Tuple dst,
-                               Register src)
+                               Register dst,
+                               Register src,
+                               Register infoReg)
       {
         this->opcode = OP_GET_IMAGE_INFO;
         this->infoType = type;
-        this->dst = dst;
+        this->dst[0] = dst;
         this->src[0] = src;
+        this->src[1] = infoReg;
       }
 
       INLINE uint32_t getInfoType(void) const { return infoType; }
@@ -530,11 +582,9 @@ namespace ir {
       }
 
       uint8_t infoType;                 //!< Type of the requested information.
-      Register src[1];                  //!< Surface to get info
-      Tuple dst;                        //!< dest register to put the information.
-      static const uint32_t dstNum = 4; //! The maximum dst number. Not the actual number
-                                        // of destination tuple. We use the infoType to determin
-                                        // the actual num.
+      Register src[2];                  //!< Surface to get info
+      Register dst[1];                        //!< dest register to put the information.
+      static const uint32_t dstNum = 1;
     };
 
     class ALIGNED_INSTRUCTION LoadImmInstruction :
@@ -788,6 +838,25 @@ namespace ir {
       return true;
     }
 
+    INLINE bool TernaryInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    { // dst and the three sources must all match the register family of `type`
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(src + 3u > fn.tupleNum())) { // the tuple must hold three entries
+        whyNot = "Out-of-bound index for ternary instruction";
+        return false;
+      }
+      for (uint32_t srcID = 0; srcID < 3; ++srcID) {
+        const Register regID = fn.getRegister(src, srcID);
+        if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+          return false;
+      }
+      return true;
+    }
+
     /*! Loads and stores follow the same restrictions */
     template <typename T>
     INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
@@ -843,6 +912,9 @@ namespace ir {
     { return true; }
     INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
+    INLINE bool GetSamplerInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; } // no check is performed, `why` is never written
+
 
     // Ensure that types and register family match
     INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
@@ -934,6 +1006,10 @@ namespace ir {
       ternaryOrSelectOut(*this, out, fn);
     }
 
+    INLINE void TernaryInstruction::out(std::ostream &out, const Function &fn) const {
+      ternaryOrSelectOut(*this, out, fn); // same printer as SelectInstruction
+    }
+
     INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << "." << addrSpace;
@@ -1077,6 +1153,10 @@ START_INTROSPECTION(SelectInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(SelectInstruction)
 
+START_INTROSPECTION(TernaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TernaryInstruction)
+
 START_INTROSPECTION(BranchInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(BranchInstruction)
@@ -1093,6 +1173,10 @@ START_INTROSPECTION(GetImageInfoInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(GetImageInfoInstruction)
 
+START_INTROSPECTION(GetSamplerInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetSamplerInfoInstruction)
+
 START_INTROSPECTION(LoadImmInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LoadImmInstruction)
@@ -1259,6 +1343,7 @@ DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
 DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
+DECL_MEM_FN(TernaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
@@ -1333,6 +1418,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(SUB)
   DECL_EMIT_FUNCTION(SUBSAT)
   DECL_EMIT_FUNCTION(MUL_HI)
+  DECL_EMIT_FUNCTION(I64_MUL_HI)
   DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
   DECL_EMIT_FUNCTION(UPSAMPLE_INT)
   DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
@@ -1348,6 +1434,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(AND)
   DECL_EMIT_FUNCTION(HADD)
   DECL_EMIT_FUNCTION(RHADD)
+  DECL_EMIT_FUNCTION(I64HADD)
+  DECL_EMIT_FUNCTION(I64RHADD)
 
 #undef DECL_EMIT_FUNCTION
 
@@ -1356,6 +1444,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::SelectInstruction(type, dst, src).convert();
   }
 
+  Instruction I64MADSAT(Type type, Register dst, Tuple src) { // src holds the three sources
+    return internal::TernaryInstruction(OP_I64MADSAT, type, dst, src).convert();
+  }
+
   // All compare functions
 #define DECL_EMIT_FUNCTION(NAME) \
   Instruction NAME(Type type, Register dst,  Register src0, Register src1) { \
@@ -1436,8 +1528,12 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::TypedWriteInstruction(src, srcType, coordType).convert();
   }
 
-  Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src) {
-    return internal::GetImageInfoInstruction(infoType, dst, src).convert();
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg) {
+    return internal::GetImageInfoInstruction(infoType, dst, src, infoReg).convert();
+  }
+
+  Instruction GET_SAMPLER_INFO(Register dst, Register src) {
+    return internal::GetSamplerInfoInstruction(dst, src).convert();
   }
 
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 40a3d40..27a34d1 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -206,6 +206,13 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Ternary instructions are typed. dst and sources share the same type */
+  class TernaryInstruction : public Instruction {
+   public:
+    Type getType(void) const; /*! Type shared by dst and the sources */
+    static bool isClassOf(const Instruction &insn); /*! Return true if insn is an instance of this class */
+  };
+
   /*! Select instructions writes src0 to dst if cond is true. Otherwise, it
    *  writes src1
    */
@@ -353,11 +360,12 @@ namespace ir {
 
   typedef union {
     struct {
-     uint16_t index; /*! the allocated image index */
-     uint16_t type;  /*! the information type */
+     uint8_t index; /*! the allocated image index */
+     uint8_t  type;  /*! the information type */
     };
     uint32_t data;
   } ImageInfoKey;
+
   /*! Get image information */
   class GetImageInfoInstruction : public Instruction {
   public:
@@ -392,6 +400,14 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Get sampler information */
+  class GetSamplerInfoInstruction : public Instruction {
+  public:
+
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Branch instruction is the unified way to branch (with or without
    *  predicate)
    */
@@ -519,6 +535,10 @@ namespace ir {
   Instruction SIN(Type type, Register dst, Register src);
   /*! mul_hi.type dst src */
   Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
+  /*! i64_mul_hi.type dst src */
+  Instruction I64_MUL_HI(Type type, Register dst, Register src0, Register src1);
+  /*! i64madsat.type dst src */
+  Instruction I64MADSAT(Type type, Register dst, Tuple src);
   /*! upsample_short.type dst src */
   Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
   /*! upsample_int.type dst src */
@@ -533,6 +553,10 @@ namespace ir {
   Instruction HADD(Type type, Register dst, Register src0, Register src1);
   /*! rhadd.type dst src */
   Instruction RHADD(Type type, Register dst, Register src0, Register src1);
+  /*! i64hadd.type dst src */
+  Instruction I64HADD(Type type, Register dst, Register src0, Register src1);
+  /*! i64rhadd.type dst src */
+  Instruction I64RHADD(Type type, Register dst, Register src0, Register src1);
   /*! tan.type dst src */
   Instruction RCP(Type type, Register dst, Register src);
   /*! abs.type dst src */
@@ -620,7 +644,9 @@ namespace ir {
   /*! sample textures */
   Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
   /*! get image information , such as width/height/depth/... */
-  Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg);
+  /*! get sampler information  */
+  Instruction GET_SAMPLER_INFO(Register dst, Register src);
   /*! label labelIndex */
   Instruction LABEL(LabelIndex labelIndex);
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index c15e912..1a9f867 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -72,11 +72,16 @@ DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(GET_SAMPLER_INFO, GetSamplerInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
+DECL_INSN(I64_MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
 DECL_INSN(FBL, UnaryInstruction)
 DECL_INSN(HADD, BinaryInstruction)
 DECL_INSN(RHADD, BinaryInstruction)
+DECL_INSN(I64HADD, BinaryInstruction)
+DECL_INSN(I64RHADD, BinaryInstruction)
 DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
+DECL_INSN(I64MADSAT, TernaryInstruction)
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 675018a..10e0c59 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,8 +40,7 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "const_curbe_offset",
-        "work_dimension",
+        "work_dimension", "sampler_info"
     };
 
 #if GBE_DEBUG
@@ -76,8 +75,8 @@ namespace ir {
       DECL_NEW_REG(FAMILY_WORD, blockip);
       DECL_NEW_REG(FAMILY_DWORD, barrierid);
       DECL_NEW_REG(FAMILY_DWORD, threadn);
-      DECL_NEW_REG(FAMILY_DWORD, constoffst);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
+      DECL_NEW_REG(FAMILY_WORD, samplerinfo);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4b0ef5e..89dd69f 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -63,8 +63,8 @@ namespace ir {
     static const Register blockip = Register(19);  // blockip
     static const Register barrierid = Register(20);// barrierid
     static const Register threadn = Register(21);  // number of threads
-    static const Register constoffst = Register(22); // offset of global constant array's curbe
-    static const Register workdim = Register(23);  // work dimention.
+    static const Register workdim = Register(22);  // work dimension.
+    static const Register samplerinfo = Register(23); // store sampler info.
     static const uint32_t regNum = 24;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index 62bdc16..cff1012 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -74,5 +74,103 @@ namespace ir {
     appendReg(samplerReg, SAMPLER_ID(id), ctx);
   }
 
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  /*! Serialize as: magic_begin, samplerMap, regMap, magic_end, size. Returns ret_size. */
+  size_t SamplerSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0; // handed to every OUT_UPDATE_SZ; presumably the running byte count - confirm in SERIALIZE_OUT
+
+    OUT_UPDATE_SZ(magic_begin); // opening marker, checked first by deserializeFromBin
+
+    OUT_UPDATE_SZ(samplerMap.size()); // entry count, followed by one (key, reg, slot) triple per entry
+    for (auto iter : samplerMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second.reg);
+      OUT_UPDATE_SZ(iter.second.slot);
+    }
+
+    OUT_UPDATE_SZ(regMap.size()); // same count-then-triples layout for the register-keyed map
+    for (auto iter : regMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second.reg);
+      OUT_UPDATE_SZ(iter.second.slot);
+    }
+
+    OUT_UPDATE_SZ(magic_end); // closing marker
+    OUT_UPDATE_SZ(ret_size); // trailing size field; deserializeFromBin compares it with what it read
+
+    return ret_size;
+  }
+
+  size_t SamplerSet::deserializeFromBin(std::istream& ins) { // returns total_size, or 0 when a check fails
+    size_t total_size = 0; // handed to every IN_UPDATE_SZ; presumably the running count of bytes read
+    uint32_t magic;
+    size_t sampler_map_sz = 0; // entry count; reused below for regMap as well
+    // Fields are read in the same order serializeToBin writes them.
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin) // wrong opening marker: reject before touching the maps
+      return 0;
+
+    IN_UPDATE_SZ(sampler_map_sz);
+    for (size_t i = 0; i < sampler_map_sz; i++) {
+      uint32_t key;
+      ir::SamplerRegSlot reg_slot;
+
+      IN_UPDATE_SZ(key);
+      IN_UPDATE_SZ(reg_slot.reg);
+      IN_UPDATE_SZ(reg_slot.slot);
+      samplerMap.insert(std::make_pair(key, reg_slot));
+    }
+
+    IN_UPDATE_SZ(sampler_map_sz); // second count: number of regMap entries
+    for (size_t i = 0; i < sampler_map_sz; i++) {
+      ir::Register key;
+      ir::SamplerRegSlot reg_slot;
+
+      IN_UPDATE_SZ(key);
+      IN_UPDATE_SZ(reg_slot.reg);
+      IN_UPDATE_SZ(reg_slot.slot);
+      regMap.insert(std::make_pair(key, reg_slot));
+    }
+    // NOTE(review): the failure returns below leave the entries inserted above in place.
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes; // size field stored at the end of the record
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size) // stored size presumably excludes the size field itself
+      return 0;
+
+    return total_size;
+  }
+
+  void SamplerSet::printStatus(int indent, std::ostream& outs) { // dump both maps to outs as text
+    using namespace std;
+    string spaces = indent_to_str(indent); // prefix for the begin/end banner lines
+    string spaces_nl = indent_to_str(indent + 4); // prefix for the nested lines (indent + 4)
+
+    outs << spaces << "------------ Begin SamplerSet ------------" << "\n";
+
+    outs << spaces_nl << "  SamplerSet Map: [index, sampler_reg, sampler_slot]\n";
+    outs << spaces_nl << "     samplerMap size: " << samplerMap.size() << "\n";
+    // One line per samplerMap entry: key, then the reg and slot of its SamplerRegSlot.
+    for (auto iter : samplerMap) {
+      outs << spaces_nl <<  "     [" << iter.first << ", "
+           << iter.second.reg << ", " << iter.second.slot << "]\n";
+    }
+
+    outs << spaces_nl << "  SamplerSet Map: [reg, sampler_reg, sampler_slot]\n";
+    outs << spaces_nl << "     regMap size: " << regMap.size() << "\n";
+    for (auto iter : regMap) { // same per-entry format for the register-keyed map
+      outs << spaces_nl << "     [" << iter.first << ", "
+           << iter.second.reg << ", " << iter.second.slot << "]\n";
+    }
+
+    outs << spaces << "------------- End SamplerSet -------------" << "\n";
+  }
+
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index f968299..3c72e3e 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -41,7 +41,7 @@ namespace ir {
     uint32_t slot;
   };
 
-  class SamplerSet
+  class SamplerSet : public Serializable
   {
   public:
     /*! Append the specified sampler and return the allocated offset.
@@ -66,6 +66,29 @@ namespace ir {
 
     SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
     SamplerSet() {}
+
+    static const uint32_t magic_begin = TO_MAGIC('S', 'A', 'M', 'P');
+    static const uint32_t magic_end = TO_MAGIC('P', 'M', 'A', 'S');
+
+    /* format:
+       magic_begin     |
+       samplerMap_size |
+       element_1       |
+       ........        |
+       element_n       |
+       regMap_size     |
+       element_1       |
+       ........        |
+       element_n       |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
   private:
     void appendReg(const Register reg, uint32_t key, Context *ctx);
     map<uint32_t, SamplerRegSlot> samplerMap;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 12d809d..8b73ac9 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -460,7 +460,8 @@ namespace gbe
     }
 
     virtual bool doInitialization(Module &M);
-
+    /*! helper function for parsing global constant data */
+    void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
     void collectGlobalConstant(void) const;
 
     bool runOnFunction(Function &F) {
@@ -539,6 +540,8 @@ namespace gbe
     // Emit unary instructions from gen native function
     void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
 
+    ir::Register appendSampler(CallSite::arg_iterator AI);
+
     // These instructions are not supported at all
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
     void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
@@ -559,6 +562,101 @@ namespace gbe
   };
 
   char GenWriter::ID = 0;
+  void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) { // copy cda's raw bytes to ptr+offset
+    StringRef data = cda->getRawDataValues();
+    memcpy((char*)ptr+offset, data.data(), data.size()); // no bounds check here: the caller must have sized ptr
+    offset += data.size(); // advance the caller's write cursor past the copied bytes
+    return;
+  }
+
+  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+    // Recursively write the bytes of constant c into mem at offset, advancing offset.
+    GBE_ASSERT(c); // must run before c is first dereferenced on the next line
+    Type * type = c->getType();
+    Type::TypeID id = type->getTypeID();
+    if(isa<UndefValue>(c)) { // undef: skip over the bytes without writing them
+      uint32_t n = c->getNumOperands(); // NOTE(review): confirm this yields the element count for an UndefValue
+      Type * opTy = type->getArrayElementType(); // assumes an undef *array* of integers - TODO confirm
+      uint32_t size = opTy->getIntegerBitWidth()/ 8;
+      offset += size*n;
+      return;
+    }
+    switch(id) {
+      case Type::TypeID::StructTyID:
+        {
+          const StructType * strTy = cast<StructType>(c->getType());
+          uint32_t size = 0; // running struct size, in bits
+
+          for(uint32_t op=0; op < strTy->getNumElements(); op++)
+          {
+            Type* elementType = strTy->getElementType(op);
+            uint32_t align = 8 * getAlignmentByte(unit, elementType); // alignment in bits
+            uint32_t padding = getPadding(size, align); // in bits as well
+            size += padding;
+            size += getTypeBitSize(unit, elementType);
+
+            offset += padding/8; // skip the alignment gap (bytes) before this member
+            const Constant* sub = cast<Constant>(c->getOperand(op));
+            GBE_ASSERT(sub);
+            getConstantData(sub, mem, offset);
+          }
+          break;
+        }
+      case Type::TypeID::ArrayTyID:
+        {
+          const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+          if(cds)
+            getSequentialData(cds, mem, offset); // packed data array: copy the raw bytes in one go
+          else {
+            const ConstantArray *ca = dyn_cast<ConstantArray>(c); // NOTE(review): dereferenced below without a null check
+            const ArrayType *arrTy = ca->getType();
+            Type* elemTy = arrTy->getElementType();
+            uint32_t elemSize = getTypeBitSize(unit, elemTy);
+            uint32_t padding = getPadding(elemSize, 8 * getAlignmentByte(unit, elemTy));
+            padding /= 8; // bits -> bytes; added after every element
+            uint32_t ops = c->getNumOperands();
+            for(uint32_t op = 0; op < ops; ++op) {
+              Constant * ca = dyn_cast<Constant>(c->getOperand(op)); // shadows the outer ca
+              getConstantData(ca, mem, offset);
+              offset += padding;
+            }
+          }
+          break;
+        }
+      case Type::TypeID::VectorTyID:
+        {
+          const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+          GBE_ASSERT(cds); // only packed data vectors are handled
+          getSequentialData(cds, mem, offset);
+          break;
+        }
+      case Type::TypeID::IntegerTyID:
+        {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(c);
+          uint32_t size = ci->getBitWidth() / 8; // byte width of the integer
+          uint64_t data = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
+          memcpy((char*)mem+offset, &data, size); // first size bytes of data; presumably relies on a little-endian host
+          offset += size;
+          break;
+        }
+      case Type::TypeID::FloatTyID:
+        {
+          const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+          *(float *)((char*)mem + offset) = cf->getValueAPF().convertToFloat();
+          offset += sizeof(float);
+          break;
+        }
+      case Type::TypeID::DoubleTyID:
+        {
+          const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+          *(double *)((char*)mem + offset) = cf->getValueAPF().convertToDouble();
+          offset += sizeof(double);
+          break;
+        }
+      default:
+        NOT_IMPLEMENTED; // any other constant type is unsupported
+    }
+  }
 
   void GenWriter::collectGlobalConstant(void) const {
     const Module::GlobalListType &globalList = TheModule->getGlobalList();
@@ -569,69 +667,13 @@ namespace gbe
       if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
         GBE_ASSERT(v.hasInitializer());
         const Constant *c = v.getInitializer();
-        if (c->getType()->getTypeID() != Type::ArrayTyID) {
-          void *mem = malloc(sizeof(double));
-          int size = 0;
-          switch(c->getType()->getTypeID()) {
-            case Type::TypeID::IntegerTyID: {
-              const ConstantInt *ci = dyn_cast<ConstantInt>(c);
-              *(int *)mem = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
-              size = sizeof(int);
-              break;
-            }
-            case Type::TypeID::FloatTyID: {
-              const ConstantFP *cf = dyn_cast<ConstantFP>(c);
-              *(float *)mem = cf->getValueAPF().convertToFloat();
-              size = sizeof(float);
-              break;
-            }
-            case Type::TypeID::DoubleTyID: {
-              const ConstantFP *cf = dyn_cast<ConstantFP>(c);
-              *(double *)mem = cf->getValueAPF().convertToDouble();
-              size = sizeof(double);
-              break;
-            }
-            default:
-              NOT_IMPLEMENTED;
-          }
-          unit.newConstant((char *)mem, name, size, size);
-          free(mem);
-          continue;
-        }
-        GBE_ASSERT(c->getType()->getTypeID() == Type::ArrayTyID);
-        const ConstantDataArray *cda = dyn_cast<ConstantDataArray>(c);
-        GBE_ASSERT(cda);
-        unsigned len = cda->getNumElements();
-        uint64_t elementSize = cda->getElementByteSize();
-        Type::TypeID typeID = cda->getElementType()->getTypeID();
-        if(typeID == Type::TypeID::IntegerTyID)
-          elementSize = sizeof(unsigned);
-        void *mem = malloc(elementSize * len);
-        for(unsigned j = 0; j < len; j ++) {
-          switch(typeID) {
-            case Type::TypeID::FloatTyID:
-             {
-              float f = cda->getElementAsFloat(j);
-              memcpy((float *)mem + j, &f, elementSize);
-             }
-              break;
-            case Type::TypeID::DoubleTyID:
-             {
-              double d = cda->getElementAsDouble(j);
-              memcpy((double *)mem + j, &d, elementSize);
-             }
-              break;
-            case Type::TypeID::IntegerTyID:
-             {
-              unsigned u = (unsigned) cda->getElementAsInteger(j);
-              memcpy((unsigned *)mem + j, &u, elementSize);
-             }
-              break;
-            default:
-              NOT_IMPLEMENTED;
-          }
-        }
-        unit.newConstant((char *)mem, name, elementSize * len, sizeof(unsigned));
+        Type * type = c->getType();
+
+        uint32_t size = getTypeByteSize(unit, type);
+        void* mem = malloc(size);
+        uint32_t offset = 0;
+        getConstantData(c, mem, offset);
+        unit.newConstant((char *)mem, name, size, sizeof(unsigned));
         free(mem);
       }
     }
@@ -819,18 +861,38 @@ namespace gbe
         return ir::Register(reg);
       }
       if (isa<ConstantExpr>(CPV)) {
+        uint32_t TypeIndex;
+        uint32_t constantOffset = 0;
+        uint32_t offset = 0;
         ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV);
-        GBE_ASSERT(CE->isGEPWithNoNotionalOverIndexing());
-        auto pointer = CE->getOperand(0);
-        auto offset1 = dyn_cast<ConstantInt>(CE->getOperand(1));
-        GBE_ASSERT(offset1->getZExtValue() == 0);
-        auto offset2 = dyn_cast<ConstantInt>(CE->getOperand(2));
-        int type_size = pointer->getType()->getTypeID() == Type::TypeID::DoubleTyID ? sizeof(double) : sizeof(int);
-        int type_offset = offset2->getSExtValue() * type_size;
-        auto pointer_name = pointer->getName().str();
+
+        // currently only GetElementPtr is handled
+        GBE_ASSERT(CE->getOpcode() == Instruction::GetElementPtr);
+        Value *pointer = CE->getOperand(0);
+        CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+        for(uint32_t op=1; op<CE->getNumOperands(); ++op) {
+          ConstantInt* ConstOP = dyn_cast<ConstantInt>(CE->getOperand(op));
+          GBE_ASSERT(ConstOP);
+          TypeIndex = ConstOP->getZExtValue();
+          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+          {
+            Type* elementType = CompTy->getTypeAtIndex(ty_i);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            offset += getPadding(offset, align);
+            offset += getTypeByteSize(unit, elementType);
+          }
+
+          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+          offset += getPadding(offset, align);
+
+          constantOffset += offset;
+          CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+        }
+
+        const std::string &pointer_name = pointer->getName().str();
         ir::Register pointer_reg = ir::Register(unit.getConstantSet().getConstant(pointer_name).getReg());
         ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
-        ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(type_offset, ir::Type::TYPE_S32));
+        ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
         ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
         ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
         return reg;
@@ -1243,12 +1305,7 @@ namespace gbe
       ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
       ir::Constant &con = unit.getConstantSet().getConstant(j ++);
       con.setReg(reg.value());
-      if(con.getOffset() != 0) {
-        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
-        ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg);
-      } else {
-        ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst);
-      }
+      ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
     }
 
     // Visit all the instructions and emit the IR registers or the value to
@@ -1521,7 +1578,7 @@ namespace gbe
         Type *llvmSrcType = I.getOperand(0)->getType();
         const ir::Type dstType = getType(ctx, llvmDstType);
         ir::Type srcType;
-        if (I.getOpcode() == Instruction::ZExt) {
+        if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
           srcType = getUnsignedType(ctx, llvmSrcType);
         } else {
           srcType = getType(ctx, llvmSrcType);
@@ -1754,6 +1811,7 @@ namespace gbe
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
       case GEN_OCL_GET_IMAGE_DEPTH:
+      case GEN_OCL_GET_SAMPLER_INFO:
       case GEN_OCL_ATOMIC_ADD0:
       case GEN_OCL_ATOMIC_ADD1:
       case GEN_OCL_ATOMIC_SUB0:
@@ -1825,6 +1883,8 @@ namespace gbe
       }
       case GEN_OCL_MUL_HI_INT:
       case GEN_OCL_MUL_HI_UINT:
+      case GEN_OCL_MUL_HI_I64:
+      case GEN_OCL_MUL_HI_UI64:
       case GEN_OCL_UPSAMPLE_SHORT:
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
@@ -1846,6 +1906,10 @@ namespace gbe
       case GEN_OCL_USUB_SAT_LONG:
       case GEN_OCL_HADD:
       case GEN_OCL_RHADD:
+      case GEN_OCL_I64HADD:
+      case GEN_OCL_I64RHADD:
+      case GEN_OCL_I64_MAD_SAT:
+      case GEN_OCL_I64_MAD_SATU:
         this->newRegister(&I);
         break;
       default:
@@ -1891,6 +1955,25 @@ namespace gbe
     ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
   }
 
+  /* Add the sampler passed as call argument AI to the function's sampler set and
+   * return its register. Call this before any reference to a sampler_t value. */
+  ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) {
+    Constant *CPV = dyn_cast<Constant>(*AI); // non-null when the sampler argument is a constant
+    ir::Register sampler;
+    if (CPV != NULL)
+    {
+      // This is not a kernel argument sampler, we need to append it to sampler set,
+      // and allocate a sampler slot for it.
+      auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+      GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+      sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx); // append() hands back the register for this constant
+    } else {
+      sampler = this->getRegister(*AI); // non-constant sampler: reuse the register already bound to the value
+      ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+    }
+    return sampler;
+  }
+
   void GenWriter::emitCallInst(CallInst &I) {
     if (Function *F = I.getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {
@@ -2014,21 +2097,18 @@ namespace gbe
             GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
             uint32_t elemNum;
             (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
-            vector<ir::Register> dstTupleData;
-            ir::Register lastReg;
-            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-              const ir::Register reg = this->getRegister(&I, elemID);
-              dstTupleData.push_back(reg);
-              lastReg = reg;
-            }
-            // A walk around for the gen IR limitation.
-            for (uint32_t elemID = elemNum; elemID < 4; ++ elemID) {
-              dstTupleData.push_back(lastReg);
-            }
-            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], 4);
+            const ir::Register reg = this->getRegister(&I, 0);
             int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
 
-            ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
+            ctx.GET_IMAGE_INFO(infoType, reg, surface_id, ctx.reg(ir::FAMILY_DWORD));
+            break;
+          }
+          case GEN_OCL_GET_SAMPLER_INFO:
+          {
+            GBE_ASSERT(AI != AE);
+            const ir::Register sampler = this->appendSampler(AI); ++AI;
+            const ir::Register reg = this->getRegister(&I, 0);
+            ctx.GET_SAMPLER_INFO(reg, sampler);
             break;
           }
           case GEN_OCL_READ_IMAGE0:
@@ -2046,29 +2126,13 @@ namespace gbe
           {
             GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE);
-            Constant *CPV = dyn_cast<Constant>(*AI);
-            ir::Register sampler;
-            if (CPV != NULL)
-            {
-              // This is not a kernel argument sampler, we need to append it to sampler set,
-              // and allocate a sampler slot for it.
-              auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-              GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
-              sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
-            } else {
-              sampler = this->getRegister(*AI);
-              ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
-            }
+            const ir::Register sampler = this->appendSampler(AI);
             ++AI;
 
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
-            if (it->second == GEN_OCL_READ_IMAGE10 ||
-                it->second == GEN_OCL_READ_IMAGE11 ||
-                it->second == GEN_OCL_READ_IMAGE12 ||
-                it->second == GEN_OCL_READ_IMAGE13 ||
-                it->second == GEN_OCL_READ_IMAGE14) {
+            if (it->second >= GEN_OCL_READ_IMAGE10 && it->second <= GEN_OCL_READ_IMAGE15) {
               GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
             } else
               wcoord = ir::Register(0);
@@ -2084,8 +2148,19 @@ namespace gbe
             srcTupleData.push_back(ucoord);
             srcTupleData.push_back(vcoord);
             srcTupleData.push_back(wcoord);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+            GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+            assert(CPV);
+            auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+            GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+            ir::Register offsetReg(x.data.u32);
+            srcTupleData.push_back(offsetReg);
+#else
+            ir::Register offsetReg(0);
+#endif
+            srcTupleData.push_back(offsetReg);
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 5);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6);
 
             ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
 
@@ -2137,11 +2212,7 @@ namespace gbe
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
-            if(it->second == GEN_OCL_WRITE_IMAGE10 ||
-               it->second == GEN_OCL_WRITE_IMAGE11 ||
-               it->second == GEN_OCL_WRITE_IMAGE12 ||
-               it->second == GEN_OCL_WRITE_IMAGE13 ||
-               it->second == GEN_OCL_WRITE_IMAGE14) {
+            if(it->second >= GEN_OCL_WRITE_IMAGE10 && it->second <= GEN_OCL_WRITE_IMAGE15) {
               GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
             } else
               wcoord = ir::Register(0);
@@ -2208,6 +2279,22 @@ namespace gbe
             ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_MUL_HI_I64:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64_MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_MUL_HI_UI64:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64_MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_UPSAMPLE_SHORT:
           {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
@@ -2276,6 +2363,24 @@ namespace gbe
             ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_I64_MAD_SAT:
+           {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64MADSAT(getType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+           }
+          case GEN_OCL_I64_MAD_SATU:
+           {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+           }
           case GEN_OCL_HADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2283,6 +2388,16 @@ namespace gbe
             ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_I64HADD:
+           {
+            GBE_ASSERT(AI != AE);
+            const ir::Register src0 = this->getRegister(*(AI++));
+            GBE_ASSERT(AI != AE);
+            const ir::Register src1 = this->getRegister(*(AI++));
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64HADD(ir::TYPE_U64, dst, src0, src1);
+            break;
+           }
           case GEN_OCL_RHADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2290,6 +2405,16 @@ namespace gbe
             ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_I64RHADD:
+           {
+            GBE_ASSERT(AI != AE);
+            const ir::Register src0 = this->getRegister(*(AI++));
+            GBE_ASSERT(AI != AE);
+            const ir::Register src1 = this->getRegister(*(AI++));
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1);
+            break;
+           }
           default: break;
         }
       }
@@ -2407,13 +2532,19 @@ namespace gbe
       const ir::Type type = getType(ctx, elemType);
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
 
-      if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) {
+      if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
           vector<ir::Register> tupleData; // put registers here
           for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-            const ir::Register reg = this->getRegister(llvmValues, elemID);
+            ir::Register reg;
+            if(regTranslator.isUndefConst(llvmValues, elemID)) {
+              Value *v = Constant::getNullValue(elemType);
+              reg = this->getRegister(v);
+            } else
+              reg = this->getRegister(llvmValues, elemID);
+
             tupleData.push_back(reg);
           }
           const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
@@ -2433,7 +2564,13 @@ namespace gbe
             // Build the tuple data in the vector
             vector<ir::Register> tupleData; // put registers here
             for (uint32_t elemID = 0; elemID < 4; ++elemID) {
-              const ir::Register reg = this->getRegister(llvmValues, 4*msg+elemID);
+              ir::Register reg; // register holding element 4*msg+elemID of this message
+              if(regTranslator.isUndefConst(llvmValues, 4*msg+elemID)) { // test the same element that is fetched below
+                Value *v = Constant::getNullValue(elemType); // flagged element: substitute the type's null value
+                reg = this->getRegister(v);
+              } else
+                reg = this->getRegister(llvmValues, 4*msg+elemID);
+
               tupleData.push_back(reg);
             }
             const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], 4);
@@ -2468,6 +2605,9 @@ namespace gbe
         }
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+          if(regTranslator.isUndefConst(llvmValues, elemID))
+            continue;
+
           const ir::Register reg = this->getRegister(llvmValues, elemID);
           ir::Register addr;
           if (elemID == 0)
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index b712860..321fc4e 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -42,19 +42,19 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
 
 // To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj)
 
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfff)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiii)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfff)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj)
 
 // To write_image functions.
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
@@ -125,14 +125,24 @@ DECL_LLVM_GEN_FUNCTION(USUB_SAT_SHORT, _Z12ocl_usub_sattt)
 DECL_LLVM_GEN_FUNCTION(USUB_SAT_INT, _Z12ocl_usub_satjj)
 DECL_LLVM_GEN_FUNCTION(USUB_SAT_LONG, _Z12ocl_usub_satmm)
 
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SAT, _Z17__gen_ocl_mad_satlll)
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SATU, _Z17__gen_ocl_mad_satmmm)
+
 // integer built-in functions
 DECL_LLVM_GEN_FUNCTION(MUL_HI_INT, _Z16__gen_ocl_mul_hiii)
 DECL_LLVM_GEN_FUNCTION(MUL_HI_UINT, _Z16__gen_ocl_mul_hijj)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_I64, _Z16__gen_ocl_mul_hill)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UI64, _Z16__gen_ocl_mul_himm)
 DECL_LLVM_GEN_FUNCTION(FBH, __gen_ocl_fbh)
 DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
 DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
-DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
-DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
+DECL_LLVM_GEN_FUNCTION(HADD, _Z14__gen_ocl_haddjj)
+DECL_LLVM_GEN_FUNCTION(RHADD, _Z15__gen_ocl_rhaddjj)
+DECL_LLVM_GEN_FUNCTION(I64HADD, _Z14__gen_ocl_haddmm)
+DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// get sampler info
+DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 41674b6..7a40616 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -383,7 +383,6 @@ namespace gbe {
 
     if (PHINode* phi = dyn_cast<PHINode>(inst)) {
       PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
-      assert(args.size() % 2 == 0 && "Odd number of arguments for a PHI");
 
       // Loop over pairs of operands: [Value*, BasicBlock*]
       for (unsigned int i = 0; i < args.size(); i++) {
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index 1ea150b..b736a88 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -4,6 +4,7 @@
 //
 // Common defines for Image intrinsics
 // Channel order
+#define CLK_HAS_ALPHA(color) ((color) == CLK_A || (color) == CLK_RA || (color) == CLK_RGBA || (color) == CLK_BGRA || (color) == CLK_ARGB) /* true for channel orders listed here; NOTE: evaluates color up to five times */
 enum {
   CLK_R = 0x10B0,
   CLK_A = 0x10B1,
@@ -66,54 +67,52 @@ typedef enum clk_channel_type {
 
 typedef enum clk_sampler_type {
     __CLK_ADDRESS_BASE             = 0,
-    CLK_ADDRESS_NONE               = 0 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_CLAMP              = 1 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_CLAMP_TO_EDGE      = 2 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_REPEAT             = 3 << __CLK_ADDRESS_BASE,
-    CLK_ADDRESS_MIRROR             = 4 << __CLK_ADDRESS_BASE,
+    CLK_ADDRESS_NONE               = (0 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP              = (1 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP_TO_EDGE      = (2 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_REPEAT             = (3 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_MIRROR             = (4 << __CLK_ADDRESS_BASE),
 
 #if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
     CLK_ADDRESS_MIRRORED_REPEAT    = CLK_ADDRESS_MIRROR,
 #endif
-    __CLK_ADDRESS_MASK             = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+    __CLK_ADDRESS_MASK             = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
                                      CLK_ADDRESS_CLAMP_TO_EDGE |
-                                     CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+                                     CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
     __CLK_ADDRESS_BITS             = 3,        // number of bits required to
                                                // represent address info
 
     __CLK_NORMALIZED_BASE          = __CLK_ADDRESS_BITS,
     CLK_NORMALIZED_COORDS_FALSE    = 0,
-    CLK_NORMALIZED_COORDS_TRUE     = 1 << __CLK_NORMALIZED_BASE,
-    __CLK_NORMALIZED_MASK          = CLK_NORMALIZED_COORDS_FALSE |
-                                     CLK_NORMALIZED_COORDS_TRUE,
+    CLK_NORMALIZED_COORDS_TRUE     = (1 << __CLK_NORMALIZED_BASE),
+    __CLK_NORMALIZED_MASK          = (CLK_NORMALIZED_COORDS_FALSE |
+                                      CLK_NORMALIZED_COORDS_TRUE),
     __CLK_NORMALIZED_BITS          = 1,        // number of bits required to
                                                // represent normalization
-
-    __CLK_FILTER_BASE              = __CLK_NORMALIZED_BASE +
-                                     __CLK_NORMALIZED_BITS,
-    CLK_FILTER_NEAREST             = 0 << __CLK_FILTER_BASE,
-    CLK_FILTER_LINEAR              = 1 << __CLK_FILTER_BASE,
-    CLK_FILTER_ANISOTROPIC         = 2 << __CLK_FILTER_BASE,
-    __CLK_FILTER_MASK              = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
-                                     CLK_FILTER_ANISOTROPIC,
+    __CLK_FILTER_BASE              = (__CLK_NORMALIZED_BASE +  __CLK_NORMALIZED_BITS),
+    CLK_FILTER_NEAREST             = (0 << __CLK_FILTER_BASE),
+    CLK_FILTER_LINEAR              = (1 << __CLK_FILTER_BASE),
+    CLK_FILTER_ANISOTROPIC         = (2 << __CLK_FILTER_BASE),
+    __CLK_FILTER_MASK              = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+                                     CLK_FILTER_ANISOTROPIC),
     __CLK_FILTER_BITS              = 2,        // number of bits required to
                                                // represent address info
 
-    __CLK_MIP_BASE                 = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
-    CLK_MIP_NEAREST                = 0 << __CLK_MIP_BASE,
-    CLK_MIP_LINEAR                 = 1 << __CLK_MIP_BASE,
-    CLK_MIP_ANISOTROPIC            = 2 << __CLK_MIP_BASE,
-    __CLK_MIP_MASK                 = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
-                                     CLK_MIP_ANISOTROPIC,
+    __CLK_MIP_BASE                 = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+    CLK_MIP_NEAREST                = (0 << __CLK_MIP_BASE),
+    CLK_MIP_LINEAR                 = (1 << __CLK_MIP_BASE),
+    CLK_MIP_ANISOTROPIC            = (2 << __CLK_MIP_BASE),
+    __CLK_MIP_MASK                 = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+                                     CLK_MIP_ANISOTROPIC),
     __CLK_MIP_BITS                 = 2,
 
-    __CLK_SAMPLER_BITS             = __CLK_MIP_BASE + __CLK_MIP_BITS,
-    __CLK_SAMPLER_MASK             = __CLK_MIP_MASK | __CLK_FILTER_MASK |
-                                     __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+    __CLK_SAMPLER_BITS             = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+    __CLK_SAMPLER_MASK             = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+                                      __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
 
-    __CLK_SAMPLER_ARG_BASE         = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+    __CLK_SAMPLER_ARG_BASE         = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
     __CLK_SAMPLER_ARG_BITS         = 8,
-    __CLK_SAMPLER_ARG_MASK         = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+    __CLK_SAMPLER_ARG_MASK         = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
     __CLK_SAMPLER_ARG_KEY_BIT      = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
     __CLK_SAMPLER_ARG_KEY_BITS     = 1,
 
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
index 4063788..13ae5ba 100644
--- a/backend/src/ocl_convert.h
+++ b/backend/src/ocl_convert.h
@@ -1,5 +1,366 @@
 // This file is autogenerated by gen_convert.sh.
 // Don't modify it manually.
+INLINE OVERLOADABLE ulong convert_ulong(long v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(long v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(long v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(long v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(long v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(long v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(long v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(long v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(long v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ulong v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ulong v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ulong v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ulong v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ulong v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ulong v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ulong v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ulong v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ulong v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(int v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(int v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(int v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(int v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(int v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(int v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(int v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(int v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(int v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uint v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uint v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uint v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uint v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uint v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uint v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uint v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uint v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uint v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(short v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(short v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(short v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(short v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(short v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(short v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(short v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(short v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(short v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ushort v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ushort v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ushort v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ushort v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ushort v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ushort v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ushort v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ushort v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ushort v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(char v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(char v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(char v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(char v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(char v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(char v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(char v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(char v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(char v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uchar v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uchar v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uchar v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uchar v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uchar v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uchar v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uchar v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uchar v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uchar v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(double v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(double v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(double v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(double v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(double v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(double v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(double v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(double v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE float convert_float(double v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(float v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(float v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(float v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(float v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(float v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(float v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(float v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(float v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(float v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
 INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
   return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
 }
@@ -40,6 +401,7 @@ INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
   return (long2)((long)(v.s0), (long)(v.s1));
 }
 
+INLINE OVERLOADABLE ulong2 convert_ulong2(ulong2 v) { return v; }
 INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
   return (int2)((int)(v.s0), (int)(v.s1));
 }
@@ -80,6 +442,7 @@ INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
   return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
 }
 
+INLINE OVERLOADABLE int2 convert_int2(int2 v) { return v; }
 INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
   return (uint2)((uint)(v.s0), (uint)(v.s1));
 }
@@ -120,6 +483,7 @@ INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
   return (int2)((int)(v.s0), (int)(v.s1));
 }
 
+INLINE OVERLOADABLE uint2 convert_uint2(uint2 v) { return v; }
 INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
   return (short2)((short)(v.s0), (short)(v.s1));
 }
@@ -160,6 +524,7 @@ INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
   return (uint2)((uint)(v.s0), (uint)(v.s1));
 }
 
+INLINE OVERLOADABLE short2 convert_short2(short2 v) { return v; }
 INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
   return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
 }
@@ -200,6 +565,7 @@ INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
   return (short2)((short)(v.s0), (short)(v.s1));
 }
 
+INLINE OVERLOADABLE ushort2 convert_ushort2(ushort2 v) { return v; }
 INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
   return (char2)((char)(v.s0), (char)(v.s1));
 }
@@ -240,6 +606,7 @@ INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
   return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
 }
 
+INLINE OVERLOADABLE char2 convert_char2(char2 v) { return v; }
 INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
   return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
 }
@@ -280,6 +647,7 @@ INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
   return (char2)((char)(v.s0), (char)(v.s1));
 }
 
+INLINE OVERLOADABLE uchar2 convert_uchar2(uchar2 v) { return v; }
 INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
   return (double2)((double)(v.s0), (double)(v.s1));
 }
@@ -320,6 +688,7 @@ INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
   return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
 }
 
+INLINE OVERLOADABLE double2 convert_double2(double2 v) { return v; }
 INLINE OVERLOADABLE float2 convert_float2(double2 v) {
   return (float2)((float)(v.s0), (float)(v.s1));
 }
@@ -360,6 +729,8 @@ INLINE OVERLOADABLE double2 convert_double2(float2 v) {
   return (double2)((double)(v.s0), (double)(v.s1));
 }
 
+INLINE OVERLOADABLE float2 convert_float2(float2 v) { return v; }
+INLINE OVERLOADABLE long3 convert_long3(long3 v) { return v; }
 INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
   return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
 }
@@ -400,6 +771,7 @@ INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
   return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
 }
 
+INLINE OVERLOADABLE ulong3 convert_ulong3(ulong3 v) { return v; }
 INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
   return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
 }
@@ -440,6 +812,7 @@ INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
   return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
 }
 
+INLINE OVERLOADABLE int3 convert_int3(int3 v) { return v; }
 INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
   return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
 }
@@ -480,6 +853,7 @@ INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
   return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
 }
 
+INLINE OVERLOADABLE uint3 convert_uint3(uint3 v) { return v; }
 INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
   return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
 }
@@ -520,6 +894,7 @@ INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
   return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
 }
 
+INLINE OVERLOADABLE short3 convert_short3(short3 v) { return v; }
 INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
   return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
 }
@@ -560,6 +935,7 @@ INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
   return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
 }
 
+INLINE OVERLOADABLE ushort3 convert_ushort3(ushort3 v) { return v; }
 INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
   return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
 }
@@ -600,6 +976,7 @@ INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
   return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
 }
 
+INLINE OVERLOADABLE char3 convert_char3(char3 v) { return v; }
 INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
   return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
 }
@@ -640,6 +1017,7 @@ INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
   return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
 }
 
+INLINE OVERLOADABLE uchar3 convert_uchar3(uchar3 v) { return v; }
 INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
   return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
 }
@@ -680,6 +1058,7 @@ INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
   return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
 }
 
+INLINE OVERLOADABLE double3 convert_double3(double3 v) { return v; }
 INLINE OVERLOADABLE float3 convert_float3(double3 v) {
   return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
 }
@@ -720,6 +1099,8 @@ INLINE OVERLOADABLE double3 convert_double3(float3 v) {
   return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
 }
 
+INLINE OVERLOADABLE float3 convert_float3(float3 v) { return v; }
+INLINE OVERLOADABLE long4 convert_long4(long4 v) { return v; }
 INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
   return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
 }
@@ -760,6 +1141,7 @@ INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
   return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
 }
 
+INLINE OVERLOADABLE ulong4 convert_ulong4(ulong4 v) { return v; }
 INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
   return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
 }
@@ -800,6 +1182,7 @@ INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
   return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
 }
 
+INLINE OVERLOADABLE int4 convert_int4(int4 v) { return v; }
 INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
   return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
 }
@@ -840,6 +1223,7 @@ INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
   return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
 }
 
+INLINE OVERLOADABLE uint4 convert_uint4(uint4 v) { return v; }
 INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
   return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
 }
@@ -880,6 +1264,7 @@ INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
   return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
 }
 
+INLINE OVERLOADABLE short4 convert_short4(short4 v) { return v; }
 INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
   return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
 }
@@ -920,6 +1305,7 @@ INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
   return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
 }
 
+INLINE OVERLOADABLE ushort4 convert_ushort4(ushort4 v) { return v; }
 INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
   return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
 }
@@ -960,6 +1346,7 @@ INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
   return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
 }
 
+INLINE OVERLOADABLE char4 convert_char4(char4 v) { return v; }
 INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
   return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
 }
@@ -1000,6 +1387,7 @@ INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
   return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
 }
 
+INLINE OVERLOADABLE uchar4 convert_uchar4(uchar4 v) { return v; }
 INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
   return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
 }
@@ -1040,6 +1428,7 @@ INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
   return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
 }
 
+INLINE OVERLOADABLE double4 convert_double4(double4 v) { return v; }
 INLINE OVERLOADABLE float4 convert_float4(double4 v) {
   return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
 }
@@ -1080,6 +1469,8 @@ INLINE OVERLOADABLE double4 convert_double4(float4 v) {
   return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
 }
 
+INLINE OVERLOADABLE float4 convert_float4(float4 v) { return v; }
+INLINE OVERLOADABLE long8 convert_long8(long8 v) { return v; }
 INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
   return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
 }
@@ -1120,6 +1511,7 @@ INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
   return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
 }
 
+INLINE OVERLOADABLE ulong8 convert_ulong8(ulong8 v) { return v; }
 INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
   return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
 }
@@ -1160,6 +1552,7 @@ INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
   return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
 }
 
+INLINE OVERLOADABLE int8 convert_int8(int8 v) { return v; }
 INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
   return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
 }
@@ -1200,6 +1593,7 @@ INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
   return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
 }
 
+INLINE OVERLOADABLE uint8 convert_uint8(uint8 v) { return v; }
 INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
   return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
 }
@@ -1240,6 +1634,7 @@ INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
   return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
 }
 
+INLINE OVERLOADABLE short8 convert_short8(short8 v) { return v; }
 INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
   return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
 }
@@ -1280,6 +1675,7 @@ INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
   return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
 }
 
+INLINE OVERLOADABLE ushort8 convert_ushort8(ushort8 v) { return v; }
 INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
   return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
 }
@@ -1320,6 +1716,7 @@ INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
   return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
 }
 
+INLINE OVERLOADABLE char8 convert_char8(char8 v) { return v; }
 INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
   return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
 }
@@ -1360,6 +1757,7 @@ INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
   return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
 }
 
+INLINE OVERLOADABLE uchar8 convert_uchar8(uchar8 v) { return v; }
 INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
   return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
 }
@@ -1400,6 +1798,7 @@ INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
   return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
 }
 
+INLINE OVERLOADABLE double8 convert_double8(double8 v) { return v; }
 INLINE OVERLOADABLE float8 convert_float8(double8 v) {
   return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
 }
@@ -1440,6 +1839,8 @@ INLINE OVERLOADABLE double8 convert_double8(float8 v) {
   return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
 }
 
+INLINE OVERLOADABLE float8 convert_float8(float8 v) { return v; }
+INLINE OVERLOADABLE long16 convert_long16(long16 v) { return v; }
 INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
   return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
 }
@@ -1480,6 +1881,7 @@ INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
   return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
 }
 
+INLINE OVERLOADABLE ulong16 convert_ulong16(ulong16 v) { return v; }
 INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
   return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
 }
@@ -1520,6 +1922,7 @@ INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
   return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
 }
 
+INLINE OVERLOADABLE int16 convert_int16(int16 v) { return v; }
 INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
   return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
 }
@@ -1560,6 +1963,7 @@ INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
   return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
 }
 
+INLINE OVERLOADABLE uint16 convert_uint16(uint16 v) { return v; }
 INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
   return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
 }
@@ -1600,6 +2004,7 @@ INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
   return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
 }
 
+INLINE OVERLOADABLE short16 convert_short16(short16 v) { return v; }
 INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
   return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
 }
@@ -1640,6 +2045,7 @@ INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
   return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
 }
 
+INLINE OVERLOADABLE ushort16 convert_ushort16(ushort16 v) { return v; }
 INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
   return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
 }
@@ -1680,6 +2086,7 @@ INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
   return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
 }
 
+INLINE OVERLOADABLE char16 convert_char16(char16 v) { return v; }
 INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
   return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
 }
@@ -1720,6 +2127,7 @@ INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
   return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
 }
 
+INLINE OVERLOADABLE uchar16 convert_uchar16(uchar16 v) { return v; }
 INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
   return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
 }
@@ -1760,6 +2168,7 @@ INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
   return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
 }
 
+INLINE OVERLOADABLE double16 convert_double16(double16 v) { return v; }
 INLINE OVERLOADABLE float16 convert_float16(double16 v) {
   return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
 }
@@ -1799,3 +2208,5 @@ INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
 INLINE OVERLOADABLE double16 convert_double16(float16 v) {
   return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
 }
+
+INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 8d4220c..170ec70 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -182,9 +182,31 @@ INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); }
 INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
 SDEF(char);
 SDEF(short);
-SDEF(int);
-SDEF(long);
 #undef SDEF
+OVERLOADABLE int ocl_sadd_sat(int x, int y);
+INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
+OVERLOADABLE int ocl_ssub_sat(int x, int y);
+INLINE_OVERLOADABLE int sub_sat(int x, int y) {  /* saturating x - y; y == INT_MIN is handled here instead of in ocl_ssub_sat */
+  return (y == 0x80000000u) ? ((x < 0) ? (x & 0x7FFFFFFF) : 0x7FFFFFFF) : ocl_ssub_sat(x, y);  /* x - INT_MIN == x + 2^31: fits only for x < 0, otherwise clamp to INT_MAX */
+}
+OVERLOADABLE long ocl_sadd_sat(long x, long y);
+INLINE_OVERLOADABLE long add_sat(long x, long y) {  /* saturating add for long; uses a plain x + y when the operand signs differ */
+  union {long l; uint i[2];} ux, uy;  /* view each 64-bit operand as two 32-bit words */
+  ux.l = x;
+  uy.l = y;
+  if((ux.i[1] ^ uy.i[1]) & 0x80000000u)  /* bit 31 of word 1 differs; NOTE(review): assumes i[1] is the high word (little-endian layout) - confirm */
+    return x + y;  /* operands of opposite sign cannot overflow when added */
+  return ocl_sadd_sat(x, y);  /* same-sign case is delegated to the ocl_sadd_sat declared above */
+}
+OVERLOADABLE long ocl_ssub_sat(long x, long y);
+INLINE_OVERLOADABLE long sub_sat(long x, long y) {  /* saturating subtract for long; only opposite-sign operands go through ocl_ssub_sat */
+  union {long l; uint i[2];} ux, uy;  /* view each 64-bit operand as two 32-bit words */
+  ux.l = x;
+  uy.l = y;
+  if((ux.i[1] ^ uy.i[1]) & 0x80000000u)  /* bit 31 of word 1 differs; NOTE(review): assumes i[1] is the high word (little-endian layout) - confirm */
+    return ocl_ssub_sat(x, y);  /* opposite signs: x - y may overflow, so use the saturating helper */
+  return x - y;  /* same sign: the difference always fits in a long */
+}
 #define UDEF(TYPE)                                                              \
 OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
 OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
@@ -196,7 +218,6 @@ UDEF(uint);
 UDEF(ulong);
 #undef UDEF
 
-
 uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
     return add_sat((uchar)x, (uchar)0);
 }
@@ -258,6 +279,7 @@ DEC(16);
 
 #define DEF(type) INLINE_OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
 DEF(char); DEF(uchar); DEF(short); DEF(ushort); DEF(int); DEF(uint)
+DEF(long); DEF(ulong)
 #undef DEF
 INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
   return as_float(bitselect(as_int(a), as_int(b), as_int(c)));
@@ -274,13 +296,13 @@ INLINE_OVERLOADABLE char clz(char x) {
     return 0;
   if (x == 0)
     return 8;
-  return __gen_ocl_fbl(x) - 24;
+  return __gen_ocl_fbh(x) - 24;
 }
 
 INLINE_OVERLOADABLE uchar clz(uchar x) {
   if (x == 0)
     return 8;
-  return __gen_ocl_fbl(x) - 24;
+  return __gen_ocl_fbh(x) - 24;
 }
 
 INLINE_OVERLOADABLE short clz(short x) {
@@ -312,15 +334,33 @@ INLINE_OVERLOADABLE uint clz(uint x) {
 }
 
 INLINE_OVERLOADABLE long clz(long x) {
-  return 0;
+  union { int i[2]; long x; } u;  /* split the 64-bit value into two 32-bit words */
+  u.x = x;
+  if (u.i[1] & 0x80000000u)  /* bit 31 of word 1 set; NOTE(review): assumes i[1] is the high word - confirm */
+    return 0;
+  if (u.i[1] == 0 && u.i[0] == 0)  /* all 64 bits clear */
+    return 64;
+  uint v = clz(u.i[1]);  /* count within word 1 via the int overload */
+  if(v == 32)  /* word 1 is entirely zero: continue counting in word 0 */
+    v += clz(u.i[0]);
+  return v;
 }
 
 INLINE_OVERLOADABLE ulong clz(ulong x) {
-  return 0;
+  if (x == 0)  /* all 64 bits clear */
+    return 64;
+  union { uint i[2]; ulong x; } u;  /* split the 64-bit value into two 32-bit words */
+  u.x = x;
+  uint v = clz(u.i[1]);  /* count within word 1 via the uint overload; NOTE(review): assumes i[1] is the high word - confirm */
+  if(v == 32)  /* word 1 is entirely zero: continue counting in word 0 */
+    v += clz(u.i[0]);
+  return v;
 }
 
 OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
 OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
+OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
 INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
 INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
 INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
@@ -328,10 +368,10 @@ INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
 INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
 INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
 INLINE_OVERLOADABLE long mul_hi(long x, long y) {
-  return 0;
+  return __gen_ocl_mul_hi(x, y);
 }
 INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
-  return 0;
+  return __gen_ocl_mul_hi(x, y);
 }
 
 #define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
@@ -399,12 +439,15 @@ INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
   return (uint)x;
 }
 
+OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
+OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
+
 INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
-  return 0;
+  return __gen_ocl_mad_sat(a, b, c);
 }
 
 INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
-  return 0;
+  return __gen_ocl_mad_sat(a, b, c);
 }
 
 INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
@@ -413,6 +456,8 @@ INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) |
 INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
 INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
 INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+INLINE_OVERLOADABLE ulong __rotate_left(ulong x, ulong y) { return (x << y) | (x >> (64 - y)); }
+INLINE_OVERLOADABLE long __rotate_left(long x, long y) { return __rotate_left((ulong)x, (ulong)y); }
 #define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
 DEF(char, 7)
 DEF(uchar, 7)
@@ -420,13 +465,9 @@ DEF(short, 15)
 DEF(ushort, 15)
 DEF(int, 31)
 DEF(uint, 31)
+DEF(long, 63)
+DEF(ulong, 63)
 #undef DEF
-INLINE_OVERLOADABLE long rotate(long x, long y) {
-  return 0;
-}
-INLINE_OVERLOADABLE ulong rotate(ulong x, ulong y) {
-  return 0;
-}
 
 OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
 OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
@@ -442,8 +483,8 @@ INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
   return __gen_ocl_upsample((long)hi, (long)lo);
 }
 
-PURE CONST uint __gen_ocl_hadd(uint x, uint y);
-PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
 #define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
 #define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
 DEC
@@ -452,21 +493,35 @@ DEC
 DEC
 #undef DEF
 #undef DEC
-INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y) >> 1) : __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int hadd(int x, int y) {  /* (x + y) >> 1 computed without overflowing the intermediate sum */
+  return ((x < 0) != (y < 0)) ?  /* signs differ, zero counted as non-negative (a zero next to a negative must not reach the uint helper) */
+         ((x + y) >> 1) :  /* opposite-sign operands cannot overflow when added */
+         __gen_ocl_hadd((uint)x, (uint)y);  /* same sign: delegate to the uint helper */
+}
 INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
-INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) {  /* (x + y + 1) >> 1 computed without overflowing the intermediate sum */
+  return ((x < 0) != (y < 0)) ?  /* signs differ, zero counted as non-negative (a zero next to a negative must not reach the uint helper) */
+         ((x + y + 1) >> 1) :  /* opposite-sign operands: x + y + 1 stays within int range */
+         __gen_ocl_rhadd((uint)x, (uint)y);  /* same sign: delegate to the uint helper */
+ }
 INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
+OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
 INLINE_OVERLOADABLE long hadd(long x, long y) {
-  return 0;
+  return ((x < 0) != (y < 0)) ?  /* signs differ, zero counted as non-negative (a zero next to a negative must not reach the ulong helper) */
+         ((x + y) >> 1) :  /* opposite-sign operands cannot overflow when added */
+         __gen_ocl_hadd((ulong)x, (ulong)y);  /* same sign: delegate to the ulong helper */
 }
 INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
-  return 0;
+  return __gen_ocl_hadd(x, y);
 }
 INLINE_OVERLOADABLE long rhadd(long x, long y) {
-  return 0;
+  return ((x < 0) != (y < 0)) ?  /* signs differ, zero counted as non-negative (a zero next to a negative must not reach the ulong helper) */
+         ((x + y + 1) >> 1) :  /* opposite-sign operands: x + y + 1 stays within long range */
+         __gen_ocl_rhadd((ulong)x, (ulong)y);  /* same sign: delegate to the ulong helper */
 }
 INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
-  return 0;
+  return __gen_ocl_rhadd(x, y);
 }
 
 int __gen_ocl_abs(int x);
@@ -508,10 +563,12 @@ INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
 }
 
 INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
-  return 0;
+  if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))  /* neither operand has the opposite strict sign of the other */
+    return abs(x - y);  /* x - y cannot overflow in this case */
+  return abs(x) + abs(y);  /* opposite signs: the distance is the sum of the magnitudes */
 }
 INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
-  return 0;
+  return y > x ? (y - x) : (x - y);
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -552,7 +609,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
 INLINE uint get_global_id(uint dim) {
-  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
+  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -577,7 +634,61 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
 }
 INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
-  return __gen_ocl_sin(x * M_PI_F);
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float y, z;
+  int n, ix;
+  ix = *(int *) (&x) & 0x7fffffff;  /* bit pattern of |x| */
+  if (ix < 0x3e800000)  /* |x| < 0.25: no argument reduction */
+    return __gen_ocl_sin(M_PI_F * x);
+  y = -x;  /* NOTE(review): the tgamma/lgamma callers in this file only pass x with the sign bit set, so y = |x| - confirm for other callers */
+  z = __gen_ocl_rndd(y);  /* presumably rounds toward -infinity (floor) - confirm */
+  if (z != y) {  /* y is not an integer */
+    y *= 0.5f;
+    y = 2.f * (y - __gen_ocl_rndd(y));  /* reduce y to [0, 2) */
+    n = y * 4.f;  /* index used by the switch below */
+  } else {
+    if (ix >= 0x4b800000) {  /* |x| >= 2^24 */
+      y = 0;
+      n = 0;
+    } else {
+      if (ix < 0x4b000000)  /* |x| < 2^23 */
+        z = y + 8.3886080000e+06f;  /* add 2^23 so that bit 0 of the float encoding holds the parity of y */
+      n = *(int *) (&z);  /* assigns the function-level n; a local 'int n' here left switch (n) reading an uninitialized value */
+      n &= 1;
+      y = n;
+      n <<= 2;  /* n becomes 0 or 4 */
+    }
+  }
+  switch (n) {
+  case 0:
+    y = __gen_ocl_sin(M_PI_F * y);
+    break;
+  case 1:
+  case 2:
+    y = __gen_ocl_cos(M_PI_F * (0.5f - y));
+    break;
+  case 3:
+  case 4:
+    y = __gen_ocl_sin(M_PI_F * (1.f - y));
+    break;
+  case 5:
+  case 6:
+    y = -__gen_ocl_cos(M_PI_F * (y - 1.5f));
+    break;
+  default:
+    y = __gen_ocl_sin(M_PI_F * (y - 2.f));
+    break;
+  }
+  return -y;  /* undo the y = -x negation above */
 }
 INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
 INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
@@ -585,6 +696,572 @@ INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
 INLINE_OVERLOADABLE float native_log(float x) {
   return native_log2(x) * 0.6931472002f;
 }
+INLINE_OVERLOADABLE float tgamma(float x) {  /* NOTE(review): this body follows the same steps as lgamma() below and returns r = 0 for x == 1.0f, so it appears to yield ln|Gamma(x)| rather than Gamma(x) - confirm and exponentiate if so */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float pi = 3.1415927410e+00,  /* NOTE(review): pi is never read in this function; M_PI_F is used instead */
+    a0 = 7.7215664089e-02,
+    a1 = 3.2246702909e-01,
+    a2 = 6.7352302372e-02,
+    a3 = 2.0580807701e-02,
+    a4 = 7.3855509982e-03,
+    a5 = 2.8905137442e-03,
+    a6 = 1.1927076848e-03,
+    a7 = 5.1006977446e-04,
+    a8 = 2.2086278477e-04,
+    a9 = 1.0801156895e-04,
+    a10 = 2.5214456400e-05,
+    a11 = 4.4864096708e-05,
+    tc = 1.4616321325e+00,
+    tf = -1.2148628384e-01,
+    tt = 6.6971006518e-09,
+    t0 = 4.8383611441e-01,
+    t1 = -1.4758771658e-01,
+    t2 = 6.4624942839e-02,
+    t3 = -3.2788541168e-02,
+    t4 = 1.7970675603e-02,
+    t5 = -1.0314224288e-02,
+    t6 = 6.1005386524e-03,
+    t7 = -3.6845202558e-03,
+    t8 = 2.2596477065e-03,
+    t9 = -1.4034647029e-03,
+    t10 = 8.8108185446e-04,
+    t11 = -5.3859531181e-04,
+    t12 = 3.1563205994e-04,
+    t13 = -3.1275415677e-04,
+    t14 = 3.3552918467e-04,
+    u0 = -7.7215664089e-02,
+    u1 = 6.3282704353e-01,
+    u2 = 1.4549225569e+00,
+    u3 = 9.7771751881e-01,
+    u4 = 2.2896373272e-01,
+    u5 = 1.3381091878e-02,
+    v1 = 2.4559779167e+00,
+    v2 = 2.1284897327e+00,
+    v3 = 7.6928514242e-01,
+    v4 = 1.0422264785e-01,
+    v5 = 3.2170924824e-03,
+    s0 = -7.7215664089e-02,
+    s1 = 2.1498242021e-01,
+    s2 = 3.2577878237e-01,
+    s3 = 1.4635047317e-01,
+    s4 = 2.6642270386e-02,
+    s5 = 1.8402845599e-03,
+    s6 = 3.1947532989e-05,
+    r1 = 1.3920053244e+00,
+    r2 = 7.2193557024e-01,
+    r3 = 1.7193385959e-01,
+    r4 = 1.8645919859e-02,
+    r5 = 7.7794247773e-04,
+    r6 = 7.3266842264e-06,
+    w0 = 4.1893854737e-01,
+    w1 = 8.3333335817e-02,
+    w2 = -2.7777778450e-03,
+    w3 = 7.9365057172e-04,
+    w4 = -5.9518753551e-04,
+    w5 = 8.3633989561e-04,
+    w6 = -1.6309292987e-03;
+  float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+  int i, hx, ix;
+  nadj = 0;
+  hx = *(int *) (&x);  /* raw bits of x; hx < 0 means the sign bit is set */
+  ix = hx & 0x7fffffff;  /* raw bits of |x| */
+  if (ix >= 0x7f800000)  /* x is Inf or NaN */
+    return x * x;
+  if (ix == 0)  /* x is +0 or -0 */
+    return INFINITY;
+  if (ix < 0x1c800000) {  /* |x| < 2^-70 */
+    if (hx < 0) {
+      return - native_log(-x);
+    } else
+      return - native_log(x);
+  }
+  if (hx < 0) {  /* negative x: compute the adjustment term nadj, then continue with |x| */
+    if (ix >= 0x4b000000)  /* |x| >= 2^23 */
+      return INFINITY;
+    t = __gen_ocl_internal_sinpi(x);
+    if (__gen_ocl_fabs(t) < 1e-8f)
+      return INFINITY;
+    nadj = native_log(M_PI_F / __gen_ocl_fabs(t * x));
+    x = -x;
+  }
+
+  if (ix == 0x3f800000 || ix == 0x40000000)  /* |x| is exactly 1 or 2 */
+    r = 0;
+  else if (ix < 0x40000000) {  /* |x| < 2 */
+    if (ix <= 0x3f666666) {
+      r = - native_log(x);
+      if (ix >= 0x3f3b4a20) {
+        y = 1 - x;
+        i = 0;
+      } else if (ix >= 0x3e6d3308) {
+        y = x - (tc - 1);
+        i = 1;
+      } else {
+        y = x;
+        i = 2;
+      }
+    } else {
+      r = 0;
+      if (ix >= 0x3fdda618) {
+        y = 2 - x;
+        i = 0;
+      } else if (ix >= 0x3F9da620) {
+        y = x - tc;
+        i = 1;
+      } else {
+        y = x - 1;
+        i = 2;
+      }
+    }
+    switch (i) {  /* i selects which polynomial / rational approximation is evaluated at y */
+    case 0:
+      z = y * y;
+      p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+      p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+      p = y * p1 + p2;
+      r += (p - .5f * y);
+      break;
+    case 1:
+      z = y * y;
+      w = z * y;
+      p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+      p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+      p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+      p = z * p1 - (tt - w * (p2 + y * p3));
+      r += (tf + p);
+      break;
+    case 2:
+      p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+      p2 = 1 + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+      r += (-.5f * y + p1 / p2);
+    }
+  } else if (ix < 0x41000000) {  /* 2 <= |x| < 8 */
+    i = x;  /* integer part of x */
+    t = 0;
+    y = x - i;  /* fractional part of x */
+    p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
+    q = 1 + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+    r = .5f * y + p / q;
+    z = 1;
+    switch (i) {  /* cases fall through on purpose, accumulating the product (y+2)...(y+i-1) in z */
+    case 7:
+      z *= (y + 6.f);
+    case 6:
+      z *= (y + 5.f);
+    case 5:
+      z *= (y + 4.f);
+    case 4:
+      z *= (y + 3.f);
+    case 3:
+      z *= (y + 2.f);
+      r += native_log(z);
+      break;
+    }
+  } else if (ix < 0x5c800000) {  /* 8 <= |x| < 2^58 */
+    t = native_log(x);
+    z = 1 / x;
+    y = z * z;
+    w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+    r = (x - .5f) * (t - 1) + w;
+  } else
+    r = x * (native_log(x) - 1);
+  if (hx < 0)  /* original argument was negative: apply the adjustment computed above */
+    r = nadj - r;
+  return r;
+}
+
+INLINE_OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+    const float
+        zero=  0.,
+        one =  1.0000000000e+00,
+        pi  =  3.1415927410e+00,
+        a0  =  7.7215664089e-02,
+        a1  =  3.2246702909e-01,
+        a2  =  6.7352302372e-02,
+        a3  =  2.0580807701e-02,
+        a4  =  7.3855509982e-03,
+        a5  =  2.8905137442e-03,
+        a6  =  1.1927076848e-03,
+        a7  =  5.1006977446e-04,
+        a8  =  2.2086278477e-04,
+        a9  =  1.0801156895e-04,
+        a10 =  2.5214456400e-05,
+        a11 =  4.4864096708e-05,
+        tc  =  1.4616321325e+00,
+        tf  = -1.2148628384e-01,
+        tt  =  6.6971006518e-09,
+        t0  =  4.8383611441e-01,
+        t1  = -1.4758771658e-01,
+        t2  =  6.4624942839e-02,
+        t3  = -3.2788541168e-02,
+        t4  =  1.7970675603e-02,
+        t5  = -1.0314224288e-02,
+        t6  =  6.1005386524e-03,
+        t7  = -3.6845202558e-03,
+        t8  =  2.2596477065e-03,
+        t9  = -1.4034647029e-03,
+        t10 =  8.8108185446e-04,
+        t11 = -5.3859531181e-04,
+        t12 =  3.1563205994e-04,
+        t13 = -3.1275415677e-04,
+        t14 =  3.3552918467e-04,
+        u0  = -7.7215664089e-02,
+        u1  =  6.3282704353e-01,
+        u2  =  1.4549225569e+00,
+        u3  =  9.7771751881e-01,
+        u4  =  2.2896373272e-01,
+        u5  =  1.3381091878e-02,
+        v1  =  2.4559779167e+00,
+        v2  =  2.1284897327e+00,
+        v3  =  7.6928514242e-01,
+        v4  =  1.0422264785e-01,
+        v5  =  3.2170924824e-03,
+        s0  = -7.7215664089e-02,
+        s1  =  2.1498242021e-01,
+        s2  =  3.2577878237e-01,
+        s3  =  1.4635047317e-01,
+        s4  =  2.6642270386e-02,
+        s5  =  1.8402845599e-03,
+        s6  =  3.1947532989e-05,
+        r1  =  1.3920053244e+00,
+        r2  =  7.2193557024e-01,
+        r3  =  1.7193385959e-01,
+        r4  =  1.8645919859e-02,
+        r5  =  7.7794247773e-04,
+        r6  =  7.3266842264e-06,
+        w0  =  4.1893854737e-01,
+        w1  =  8.3333335817e-02,
+        w2  = -2.7777778450e-03,
+        w3  =  7.9365057172e-04,
+        w4  = -5.9518753551e-04,
+        w5  =  8.3633989561e-04,
+        w6  = -1.6309292987e-03;
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+	int i, hx, ix;
+	nadj = 0;
+	hx = *(int *)&x;
+	ix = hx & 0x7fffffff;
+	if (ix >= 0x7f800000)
+		return x * x;
+	if (ix == 0)
+		return ((x + one) / zero);
+	if (ix < 0x1c800000) {
+		if (hx < 0) {
+			return -native_log(-x);
+		} else
+			return -native_log(x);
+	}
+	if (hx < 0) {
+		if (ix >= 0x4b000000)
+			return ((-x) / zero);
+		t = __gen_ocl_internal_sinpi(x);
+		if (t == zero)
+			return ((-x) / zero);
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));
+		x = -x;
+	}
+	if (ix == 0x3f800000 || ix == 0x40000000)
+		r = 0;
+	else if (ix < 0x40000000) {
+		if (ix <= 0x3f666666) {
+			r = -native_log(x);
+			if (ix >= 0x3f3b4a20) {
+				y = one - x;
+				i = 0;
+			} else if (ix >= 0x3e6d3308) {
+				y = x - (tc - one);
+				i = 1;
+			} else {
+				y = x;
+				i = 2;
+			}
+		} else {
+			r = zero;
+			if (ix >= 0x3fdda618) {
+				y = (float) 2.0 - x;
+				i = 0;
+			}
+			else if (ix >= 0x3F9da620) {
+				y = x - tc;
+				i = 1;
+			}
+			else {
+				y = x - one;
+				i = 2;
+			}
+		}
+		switch (i) {
+		case 0:
+			z = y * y;
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+			p = y * p1 + p2;
+			r += (p - (float) 0.5 * y);
+			break;
+		case 1:
+			z = y * y;
+			w = z * y;
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+			p = z * p1 - (tt - w * (p2 + y * p3));
+			r += (tf + p);
+			break;
+		case 2:
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+			r += (-(float) 0.5 * y + p1 / p2);
+		}
+	} else if (ix < 0x41000000) {
+		i = (int) x;
+		t = zero;
+		y = x - (float) i;
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+		r = .5f * y + p / q;
+		z = one;
+		switch (i) {
+		case 7:
+			z *= (y + (float) 6.0);
+		case 6:
+			z *= (y + (float) 5.0);
+		case 5:
+			z *= (y + (float) 4.0);
+		case 4:
+			z *= (y + (float) 3.0);
+		case 3:
+			z *= (y + (float) 2.0);
+			r += native_log(z);
+			break;
+		}
+
+	} else if (ix < 0x5c800000) {
+		t = native_log(x);
+		z = one / x;
+		y = z * z;
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+		r = (x - .5f) * (t - one) + w;
+	} else
+		r = x * (native_log(x) - one);
+	if (hx < 0)
+		r = nadj - r;
+	return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#define BODY \
+    const float  \
+        zero=  0.,  \
+        one =  1.0000000000e+00,  \
+        pi  =  3.1415927410e+00,  \
+        a0  =  7.7215664089e-02,  \
+        a1  =  3.2246702909e-01,  \
+        a2  =  6.7352302372e-02,  \
+        a3  =  2.0580807701e-02,  \
+        a4  =  7.3855509982e-03,  \
+        a5  =  2.8905137442e-03,  \
+        a6  =  1.1927076848e-03,  \
+        a7  =  5.1006977446e-04,  \
+        a8  =  2.2086278477e-04,  \
+        a9  =  1.0801156895e-04,  \
+        a10 =  2.5214456400e-05,  \
+        a11 =  4.4864096708e-05,  \
+        tc  =  1.4616321325e+00,  \
+        tf  = -1.2148628384e-01,  \
+        tt  =  6.6971006518e-09,  \
+        t0  =  4.8383611441e-01,  \
+        t1  = -1.4758771658e-01,  \
+        t2  =  6.4624942839e-02,  \
+        t3  = -3.2788541168e-02,  \
+        t4  =  1.7970675603e-02,  \
+        t5  = -1.0314224288e-02,  \
+        t6  =  6.1005386524e-03,  \
+        t7  = -3.6845202558e-03,  \
+        t8  =  2.2596477065e-03,  \
+        t9  = -1.4034647029e-03,  \
+        t10 =  8.8108185446e-04,  \
+        t11 = -5.3859531181e-04,  \
+        t12 =  3.1563205994e-04,  \
+        t13 = -3.1275415677e-04,  \
+        t14 =  3.3552918467e-04,  \
+        u0  = -7.7215664089e-02,  \
+        u1  =  6.3282704353e-01,  \
+        u2  =  1.4549225569e+00,  \
+        u3  =  9.7771751881e-01,  \
+        u4  =  2.2896373272e-01,  \
+        u5  =  1.3381091878e-02,  \
+        v1  =  2.4559779167e+00,  \
+        v2  =  2.1284897327e+00,  \
+        v3  =  7.6928514242e-01,  \
+        v4  =  1.0422264785e-01,  \
+        v5  =  3.2170924824e-03,  \
+        s0  = -7.7215664089e-02,  \
+        s1  =  2.1498242021e-01,  \
+        s2  =  3.2577878237e-01,  \
+        s3  =  1.4635047317e-01,  \
+        s4  =  2.6642270386e-02,  \
+        s5  =  1.8402845599e-03,  \
+        s6  =  3.1947532989e-05,  \
+        r1  =  1.3920053244e+00,  \
+        r2  =  7.2193557024e-01,  \
+        r3  =  1.7193385959e-01,  \
+        r4  =  1.8645919859e-02,  \
+        r5  =  7.7794247773e-04,  \
+        r6  =  7.3266842264e-06,  \
+        w0  =  4.1893854737e-01,  \
+        w1  =  8.3333335817e-02,  \
+        w2  = -2.7777778450e-03,  \
+        w3  =  7.9365057172e-04,  \
+        w4  = -5.9518753551e-04,  \
+        w5  =  8.3633989561e-04,  \
+        w6  = -1.6309292987e-03;  \
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;  \
+	int i, hx, ix;  \
+	nadj = 0;  \
+	hx = *(int *)&x;  \
+	*signgamp = 1;  \
+	ix = hx & 0x7fffffff;  \
+	if (ix >= 0x7f800000)  \
+		return x * x;  \
+	if (ix == 0)  \
+		return ((x + one) / zero);  \
+	if (ix < 0x1c800000) {  \
+		if (hx < 0) {  \
+			*signgamp = -1;  \
+			return -native_log(-x);  \
+		} else  \
+			return -native_log(x);  \
+	}  \
+	if (hx < 0) {  \
+		if (ix >= 0x4b000000)  \
+			return ((-x) / zero);  \
+		t = __gen_ocl_internal_sinpi(x);  \
+		if (t == zero)  \
+			return ((-x) / zero);  \
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
+		if (t < zero)  \
+			*signgamp = -1;  \
+		x = -x;  \
+	}  \
+	if (ix == 0x3f800000 || ix == 0x40000000)  \
+		r = 0;  \
+	else if (ix < 0x40000000) {  \
+		if (ix <= 0x3f666666) {  \
+			r = -native_log(x);  \
+			if (ix >= 0x3f3b4a20) {  \
+				y = one - x;  \
+				i = 0;  \
+			} else if (ix >= 0x3e6d3308) {  \
+				y = x - (tc - one);  \
+				i = 1;  \
+			} else {  \
+				y = x;  \
+				i = 2;  \
+			}  \
+		} else {  \
+			r = zero;  \
+			if (ix >= 0x3fdda618) {  \
+				y = (float) 2.0 - x;  \
+				i = 0;  \
+			}  \
+			else if (ix >= 0x3F9da620) {  \
+				y = x - tc;  \
+				i = 1;  \
+			}  \
+			else {  \
+				y = x - one;  \
+				i = 2;  \
+			}  \
+		}  \
+		switch (i) {  \
+		case 0:  \
+			z = y * y;  \
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));  \
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));  \
+			p = y * p1 + p2;  \
+			r += (p - (float) 0.5 * y);  \
+			break;  \
+		case 1:  \
+			z = y * y;  \
+			w = z * y;  \
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));  \
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));  \
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));  \
+			p = z * p1 - (tt - w * (p2 + y * p3));  \
+			r += (tf + p);  \
+			break;  \
+		case 2:  \
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));  \
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));  \
+			r += (-(float) 0.5 * y + p1 / p2);  \
+		}  \
+	} else if (ix < 0x41000000) {  \
+		i = (int) x;  \
+		t = zero;  \
+		y = x - (float) i;  \
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));  \
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));  \
+		r = .5f * y + p / q;  \
+		z = one;  \
+		switch (i) {  \
+		case 7:  \
+			z *= (y + (float) 6.0);  \
+		case 6:  \
+			z *= (y + (float) 5.0);  \
+		case 5:  \
+			z *= (y + (float) 4.0);  \
+		case 4:  \
+			z *= (y + (float) 3.0);  \
+		case 3:  \
+			z *= (y + (float) 2.0);  \
+			r += native_log(z);  \
+			break;  \
+		}  \
+		  \
+	} else if (ix < 0x5c800000) {  \
+		t = native_log(x);  \
+		z = one / x;  \
+		y = z * z;  \
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));  \
+		r = (x - .5f) * (t - one) + w;  \
+	} else  \
+		r = x * (native_log(x) - one);  \
+	if (hx < 0)  \
+		r = nadj - r;  \
+	return r;
+INLINE_OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
+#undef BODY
+
 INLINE_OVERLOADABLE float native_log10(float x) {
   return native_log2(x) * 0.3010299956f;
 }
@@ -689,7 +1366,13 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
     x = 1 / x;
     c = -1;
   }
-  return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
+  a += c*x;
+  int i;
+  int sign;
+  for(i=3, sign=-1; i<63; i+=2, sign=-sign) {
+    a += c*sign*__gen_ocl_pow(x,i)/i;
+  }
+  return a;
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
@@ -716,6 +1399,86 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
 // XXX work-around PTX profile
 #define sqrt native_sqrt
 INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+  /* Zero and NaN inputs follow C99 Annex F, like __gen_ocl_internal_atan2pi. */
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
+  uint hy = as_uint(y), iy = hy & 0x7FFFFFFF;
+  if (ix > 0x7F800000 || iy > 0x7F800000)  /* either argument is NaN */
+    return nan(0u);
+  if (ix == 0) {
+    if (iy != 0)  /* x is +-0, y is not: +-pi/2 carrying the sign of y */
+      return (hy >> 31) ? - M_PI_2_F : M_PI_2_F;
+    /* both zero: +-0 when x is +0, +-pi when x is -0; the sign comes from y */
+    return hx ? as_float(hy | as_uint(M_PI_F)) : y;
+  } else {
+    float z = __gen_ocl_internal_atan(y / x);
+    if (x > 0)
+      return z;
+    if ((hy >> 31) == 0)  /* test the sign bit so that y == -0 yields -pi */
+      return M_PI_F + z;
+    return - M_PI_F + z;
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) { /* atan2(y, x) / pi; zero and infinite inputs are answered from their bit patterns first */
+  uint ix = as_uint(x), iy = as_uint(y),
+       pos_zero = 0, neg_zero = 0x80000000u, /* bit patterns of +0.0f and -0.0f */
+       pos_inf = 0x7f800000, neg_inf = 0xff800000u; /* bit patterns of +inf and -inf */
+  if(iy == pos_zero) { /* y is +0: 0 when x is +0 or positive, 1 when x is -0 or negative */
+    if(ix == pos_zero)
+      return 0;
+    if(ix == neg_zero)
+      return 1;
+    if(x < 0)
+      return 1;
+    if(x > 0)
+      return 0;
+  } /* a NaN x matches none of the tests above and falls through */
+  if(iy == neg_zero) { /* y is -0: same table as for +0 with the sign flipped */
+    if(ix == pos_zero)
+      return -0.f;
+    if(ix == neg_zero)
+      return -1;
+    if(x < 0)
+      return -1;
+    if(x > 0)
+      return -0.f;
+  }
+  if((ix & 0x7fffffff) == 0) { /* x is +-0; a zero y has already returned, so only the sign of y matters */
+    if(y < 0)
+      return -.5f;
+    if(y > 0)
+      return .5f;
+  }
+  if(ix == pos_inf) { /* x is +inf and y is finite: +-0 with the sign of y */
+    if(y > 0 && iy != pos_inf)
+      return 0;
+    if(y < 0 && iy != neg_inf)
+      return -0.f;
+  }
+  if(ix == neg_inf) { /* x is -inf and y is finite: +-1 with the sign of y */
+    if(y > 0 && iy != pos_inf)
+      return 1;
+    if(y < 0 && iy != neg_inf)
+      return -1;
+  }
+  if(iy == pos_inf) { /* y is +inf: 0.25 / 0.75 for x = +inf / -inf, otherwise 0.5 */
+    if(ix == pos_inf)
+      return 0.25f;
+    if(ix == neg_inf)
+      return 0.75f;
+    if(x >= 0 || x <= 0) /* true for every x except NaN */
+      return 0.5f;
+  }
+  if(iy == neg_inf) { /* y is -inf: mirror image of the +inf table */
+    if(ix == pos_inf)
+      return -0.25f;
+    if(ix == neg_inf)
+      return -0.75f;
+    if(x >= 0 || x <= 0) /* true for every x except NaN */
+      return -0.5f;
+  }
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F; /* everything else, including NaN inputs */
+}
 INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
@@ -748,6 +1511,8 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 #define tanpi __gen_ocl_internal_tanpi
 #define tanh __gen_ocl_internal_tanh
 #define atan __gen_ocl_internal_atan
+#define atan2 __gen_ocl_internal_atan2
+#define atan2pi __gen_ocl_internal_atan2pi
 #define atanpi __gen_ocl_internal_atanpi
 #define atanh __gen_ocl_internal_atanh
 #define pow powr
@@ -761,48 +1526,29 @@ INLINE_OVERLOADABLE float mad(float a, float b, float c) {
   return a*b+c;
 }
 
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
-  return cond ? src1 : src0;
-}
-
-// This will be optimized out by LLVM and will output LLVM select instructions
-#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
-  TYPE4 dst; \
-  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
-  const TYPE x1 = src1.x; \
-  const TYPE y0 = src0.y; \
-  const TYPE y1 = src1.y; \
-  const TYPE z0 = src0.z; \
-  const TYPE z1 = src1.z; \
-  const TYPE w0 = src0.w; \
-  const TYPE w1 = src1.w; \
-  dst.x = (cond.x & MASK) ? x1 : x0; \
-  dst.y = (cond.y & MASK) ? y1 : y0; \
-  dst.z = (cond.z & MASK) ? z1 : z0; \
-  dst.w = (cond.w & MASK) ? w1 : w0; \
-  return dst; \
-}
-DECL_SELECT4(int4, int, int4, 0x80000000)
-DECL_SELECT4(int4, int, uint4, 0x80000000)
-DECL_SELECT4(float4, float, int4, 0x80000000)
-DECL_SELECT4(float4, float, uint4, 0x80000000)
-#undef DECL_SELECT4
+#define DEF(VALUE_T, COND_T) /* scalar select(): src1 when cond is non-zero, else src0 */ \
+  INLINE_OVERLOADABLE VALUE_T select(VALUE_T src0, VALUE_T src1, COND_T cond) { \
+    return (cond != 0) ? src1 : src0; \
+  }
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
 
 /////////////////////////////////////////////////////////////////////////////
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
@@ -1034,9 +1780,19 @@ INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p)
   *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
 }
 
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) /* packed, element-wise access: a 3-vector object is 4 elements wide */ \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p + 3 * offset), *(p + 3 * offset + 1), *(p + 3 * offset + 2)); \
+}
+
 #define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
+  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
@@ -1151,6 +1907,8 @@ DEF(ushort)
 DEF(int)
 DEF(uint)
 DEF(float)
+DEF(long)
+DEF(ulong)
 #undef DEF
 #undef DEC2
 #undef DEC4
@@ -1256,6 +2014,8 @@ DEF(ushort)
 DEF(int)
 DEF(uint)
 DEF(float)
+DEF(long)
+DEF(ulong)
 #undef DEF
 #undef DEC2
 #undef DEC2X
@@ -1395,10 +2155,10 @@ OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
 
 #define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
   DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
 
 #define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
   DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
 
 DECL_ATOMIC_OP(add)
@@ -1407,12 +2167,20 @@ DECL_ATOMIC_OP(and)
 DECL_ATOMIC_OP(or)
 DECL_ATOMIC_OP(xor)
 DECL_ATOMIC_OP(xchg)
-DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
 DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
 DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
 DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
 DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
 
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        /* redefined for float: the value travels through the uint intrinsic */\
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); /* bits are reinterpreted, not converted; the result is always as_float, so TYPE must be float */\
+  }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
 #undef DECL_ATOMIC_OP
 #undef DECL_ATOMIC_OP_TYPE
 #undef DECL_ATOMIC_OP_SPACE
@@ -1456,6 +2224,21 @@ DECL_ATOMIC_OP(cmpxchg)
 #undef DECL_ATOMIC_OP_TYPE
 #undef DECL_ATOMIC_OP_SPACE
 
+// XXX for the conformance test
+// The following atom_xxx APIs come from the OpenCL 1.0 spec.
+// The conformance test suite still tests them, so map them to atomic_xxx.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
 /////////////////////////////////////////////////////////////////////////////
 // Force the compilation to SIMD8 or SIMD16
 /////////////////////////////////////////////////////////////////////////////
@@ -1472,19 +2255,19 @@ int __gen_ocl_force_simd16(void);
 // Image access functions
 /////////////////////////////////////////////////////////////////////////////
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
@@ -1504,46 +2287,145 @@ int __gen_ocl_get_image_height(uint surface_id);
 int __gen_ocl_get_image_channel_data_type(uint surface_id);
 int __gen_ocl_get_image_channel_order(uint surface_id);
 int __gen_ocl_get_image_depth(uint surface_id);
+ushort __gen_ocl_get_sampler_info(uint sampler_id);
 
 #define GET_IMAGE(cl_image, surface_id) \
     uint surface_id = (uint)cl_image
 
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(surface_id, sampler, coord.s0, coord.s1);\
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+/* Defines read_image<suffix>(image, sampler, coord). The optional fixes only apply to
+ * CLK_ADDRESS_CLAMP + CLK_FILTER_NEAREST samplers; every other sampler takes the plain read. */
+#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          \
+                        image_type, type, suffix, coord_type)                \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               sampler_t sampler,            \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    coord_type tmpCoord = coord;                                             \
+    ushort samplerValue;                                                     \
+    /* Both fix flags are compile-time constants chosen by DECL_IMAGE. */    \
+    if (float_coord_rounding_fix | int_clamping_fix) {                       \
+      samplerValue = __gen_ocl_get_sampler_info(sampler);                    \
+      if (((samplerValue & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
+          && ((samplerValue & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
+        if (float_coord_rounding_fix                                         \
+            && ((samplerValue & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
+          FIXUP_FLOAT_COORD(tmpCoord);                                       \
+        }                                                                    \
+        if (int_clamping_fix) {                                              \
+          if (OUT_OF_BOX(tmpCoord, surface_id,                               \
+                         (samplerValue & CLK_NORMALIZED_COORDS_TRUE))) {     \
+            /* Border color: alpha is 1 only when the order has no alpha. */ \
+            int order = __gen_ocl_get_image_channel_order(surface_id);       \
+            unsigned int border_alpha = CLK_HAS_ALPHA(order) ? 0 : 1;        \
+            return (type)(0, 0, 0, border_alpha);                            \
+          }                                                                  \
+          return __gen_ocl_read_image ##suffix(                              \
+                     EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 1);   \
+        }                                                                    \
+      }                                                                      \
+    }                                                                        \
+    return __gen_ocl_read_image ##suffix(                                    \
+                     EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 0);   \
   }
 
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type)      \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    return __gen_ocl_read_image ##suffix(                                    \
+           EXPEND_READ_COORD(surface_id,                                     \
+                             CLK_NORMALIZED_COORDS_FALSE                     \
+                             | CLK_ADDRESS_NONE                              \
+                             | CLK_FILTER_NEAREST, coord), 0);               \
   }
 
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE void write_image ##suffix(image2d_t cl_image, coord_type coord, type color)\
+#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
+  INLINE_OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
   {\
     GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ##suffix(surface_id, coord.s0, coord.s1, color);\
+    __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
   }
 
-#define DECL_IMAGE(type, suffix)        \
-  DECL_READ_IMAGE(type, suffix, int2)   \
-  DECL_READ_IMAGE(type, suffix, float2) \
-  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
-  DECL_WRITE_IMAGE(type, suffix, int2)   \
-  DECL_WRITE_IMAGE(type, suffix, float2)
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+
+#define OUT_OF_BOX(coord, surface, normalized)                   \
+  (coord.s0 < 0 || coord.s1 < 0 ||                               \
+   ((normalized == 0)                                            \
+     && (coord.s0 >= __gen_ocl_get_image_width(surface)          \
+         || coord.s1 >= __gen_ocl_get_image_height(surface)))    \
+   || ((normalized != 0) && (coord.s0 > 0x1p0 || coord.s1 > 0x1p0)))
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  { /* nudge coordinates in (-2^-20, 0) down by 2^-9 */        \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9f;                                 \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
+      tmpCoord.s1 += -0x1p-9f;                                 \
+  }
 
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                   \
+  DECL_READ_IMAGE(0, int_clamping_fix, image_type, type, suffix, int ##n)           \
+  DECL_READ_IMAGE(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n) \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n)                      \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                              \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
+#define OUT_OF_BOX(coord, surface, normalized)                  \
+  (coord.s0 < 0 || coord.s1 < 0 || coord.s2 < 0 ||              \
+   ((normalized == 0)                                           \
+     && (coord.s0 >= __gen_ocl_get_image_width(surface)         \
+         || coord.s1 >= __gen_ocl_get_image_height(surface)     \
+         || coord.s2 >= __gen_ocl_get_image_depth(surface)))    \
+   || ((normalized != 0)                                        \
+        &&(coord.s0 > 1 || coord.s1 > 1 || coord.s2 > 1)))
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                             \
+  { /* nudge coordinates in (-2^-20, 0) down by 2^-9 */         \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)             \
+      tmpCoord.s0 += -0x1p-9f;                                  \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)             \
+      tmpCoord.s1 += -0x1p-9f;                                  \
+    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)             \
+      tmpCoord.s2 += -0x1p-9f;                                  \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
+#undef EXPEND_READ_COORD
+#undef EXPEND_WRITE_COORD
+#undef OUT_OF_BOX
+#undef FIXUP_FLOAT_COORD
 
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
 #undef DECL_READ_IMAGE_NOSAMPLER
 #undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
 
 #define DECL_IMAGE_INFO(image_type)    \
   INLINE_OVERLOADABLE  int get_image_width(image_type image) \
@@ -1601,40 +2483,6 @@ INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
   { return __gen_ocl_get_image_array_size(image); }
 #endif
 
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
-  }
-
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
-  }
-
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ## suffix(surface_id, coord.s0, coord.s1, coord.s2, color);\
-  }
-
-#define DECL_IMAGE(type, suffix)        \
-  DECL_READ_IMAGE(type, suffix, int4)   \
-  DECL_READ_IMAGE(type, suffix, float4) \
-  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
-  DECL_WRITE_IMAGE(type, suffix, int4)   \
-  DECL_WRITE_IMAGE(type, suffix, float4)
-
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
-
-
-
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 
 #undef DECL_IMAGE
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
index a665356..b8a2841 100644
--- a/backend/src/sys/platform.hpp
+++ b/backend/src/sys/platform.hpp
@@ -24,6 +24,9 @@
 #include <cstdlib>
 #include <cstdio>
 #include <iostream>
+#include <ostream>
+#include <istream>
+#include <string>
 #include <cassert>
 #include <new>
 
@@ -323,6 +326,47 @@ private:
   INLINE NonCopyable& operator= (const NonCopyable&) {return *this;}
 };
 
+#define TO_MAGIC(A, B, C, D)  ((A)<<24 | (B)<<16 | (C)<<8 | (D)) /* packs four values into one tag, A in bits 24 and up; arguments are parenthesized so expressions are safe */
+
+class Serializable /* abstract interface: binary stream (de)serialization plus optional LLVM and status hooks */
+{
+public:
+  INLINE Serializable(void) = default;
+  INLINE Serializable(const Serializable&) = default;
+  INLINE Serializable& operator= (const Serializable&) = default;
+
+  virtual size_t serializeToBin(std::ostream& outs) = 0; /* must be implemented; the size_t is presumably the byte count written -- TODO confirm */
+  virtual size_t deserializeFromBin(std::istream& ins) = 0; /* must be implemented; presumably returns the byte count read -- TODO confirm */
+
+  /* These two will follow LLVM's ABI. */
+  virtual size_t serializeToLLVM(void) { return 0;/* not implemented now. */}
+  virtual size_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
+
+  virtual void printStatus(int indent = 0, std::ostream& outs = std::cout) { } /* default prints nothing; overriders should keep the same default arguments */
+
+  virtual ~Serializable(void) { }
+
+protected:
+  static std::string indent_to_str(int indent) { /* returns a string made of `indent` spaces */
+    std::string ind(indent, ' ');
+    return ind;
+  }
+};
+
+/* Helper macros for serialization: copy the raw bytes of elt and add sizeof(elt) to sz. elt must not be an array (tmp_val would decay to a pointer). */
+#define SERIALIZE_OUT(elt, out, sz)                        \
+     do {                                                  \
+       auto tmp_val = (elt);                               \
+       (out).write((char *)(&tmp_val), sizeof(elt));       \
+       (sz) += sizeof(elt);                                \
+     } while(0)
+
+#define DESERIALIZE_IN(elt, in, sz)                        \
+     do {                                                  \
+       (in).read((char *)(&(elt)), sizeof(elt));           \
+       (sz) += sizeof(elt);                                \
+     } while(0)
+
 ////////////////////////////////////////////////////////////////////////////////
 /// Disable some compiler warnings
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/kernels/builtin_atan2.cl b/kernels/builtin_atan2.cl
new file mode 100644
index 0000000..aba73be
--- /dev/null
+++ b/kernels/builtin_atan2.cl
@@ -0,0 +1,4 @@
+kernel void builtin_atan2(global float *y, global float *x, global float *dst) {
+  int i = get_global_id(0);  /* one work-item per element */
+  dst[i] = atan2(y[i], x[i]);
+}
diff --git a/kernels/builtin_lgamma.cl b/kernels/builtin_lgamma.cl
new file mode 100644
index 0000000..85bf859
--- /dev/null
+++ b/kernels/builtin_lgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma(global float *src, global float *dst) {
+  int i = get_global_id(0);  /* one work-item per element */
+  dst[i] = lgamma(src[i]);
+}
diff --git a/kernels/builtin_lgamma_r.cl b/kernels/builtin_lgamma_r.cl
new file mode 100644
index 0000000..71fcc36
--- /dev/null
+++ b/kernels/builtin_lgamma_r.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma_r(global float *src, global float *dst, global int *signp) {
+  int i = get_global_id(0);  /* one work-item per element */
+  dst[i] = lgamma_r(src[i], signp+i);  /* the sign output goes to signp[i] */
+}
diff --git a/kernels/builtin_sinpi.cl b/kernels/builtin_sinpi.cl
new file mode 100644
index 0000000..134152d
--- /dev/null
+++ b/kernels/builtin_sinpi.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sinpi(global float *src, global float *dst) {
+  int i = get_global_id(0);  /* one work-item per element */
+  dst[i] = sinpi(src[i]);
+}
diff --git a/kernels/builtin_tgamma.cl b/kernels/builtin_tgamma.cl
new file mode 100644
index 0000000..1f7abc3
--- /dev/null
+++ b/kernels/builtin_tgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_tgamma(global float *src, global float *dst) {
+  int i = get_global_id(0);  /* one work-item per element */
+  dst[i] = tgamma(src[i]);
+}
diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
index 583ba2b..1f30df4 100644
--- a/kernels/compiler_abs_diff.cl
+++ b/kernels/compiler_abs_diff.cl
@@ -26,3 +26,5 @@ COMPILER_ABS(char, uchar)
 COMPILER_ABS(uchar, uchar)
 COMPILER_ABS(short, ushort)
 COMPILER_ABS(ushort, ushort)
+COMPILER_ABS(long, ulong)
+COMPILER_ABS(ulong, ulong)
diff --git a/kernels/compiler_bool_cross_basic_block.cl b/kernels/compiler_bool_cross_basic_block.cl
new file mode 100644
index 0000000..9aeb16d
--- /dev/null
+++ b/kernels/compiler_bool_cross_basic_block.cl
@@ -0,0 +1,21 @@
+__kernel
+void compiler_bool_cross_basic_block(__global int *src,
+				     __global int *dst,
+				     int scale){
+  int id = (int)get_global_id(0);
+
+  bool isRedRow = false;
+  bool isRed;
+  int val = src[id];
+  for (unsigned int i=0; i<scale; i++, isRedRow = !isRedRow) {
+    if (isRedRow) {
+      isRed= false;
+      for (unsigned int j=0; j < scale; j++, isRed=!isRed) {
+        if (isRed) {
+	  val++;
+        }
+      }
+    }
+  }
+  dst[id] = val;
+}
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
index 7bcbdeb..42f463b 100644
--- a/kernels/compiler_box_blur_image.cl
+++ b/kernels/compiler_box_blur_image.cl
@@ -10,7 +10,7 @@ __kernel void compiler_box_blur_image(__read_only image2d_t src,
 
   for (offset.y = -1; offset.y <= 1; offset.y++) {
     for (offset.x = -1; offset.x <= 1; offset.x++) {
-      sum += read_imagef(src, sampler, coord + offset);
+      sum +=  read_imagef(src, sampler, coord + offset);
     }
   }
 
diff --git a/kernels/compiler_function_constant0.cl b/kernels/compiler_function_constant0.cl
index 363d84e..5340352 100644
--- a/kernels/compiler_function_constant0.cl
+++ b/kernels/compiler_function_constant0.cl
@@ -1,5 +1,5 @@
 __kernel void
-compiler_function_constant0(__constant short *c0, __constant char *c1, __global int *dst, int value)
+compiler_function_constant0(__constant int *c0, __constant char *c1, __global int *dst, int value)
 {
   int id = (int)get_global_id(0);
   dst[id] = value + c0[id%69] + c1[0];
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
index 5db58d6..71fe86c 100644
--- a/kernels/compiler_global_constant.cl
+++ b/kernels/compiler_global_constant.cl
@@ -1,10 +1,65 @@
 constant int m[3] = {71,72,73};
 constant int n = 1;
-constant int o[3] = {1, 1, 1};
+constant int o[3] = {3, 2, 1};
+
+constant int4 a= {1, 2, 3, 4};
+constant int4 b = {0, -1, -2, -3};
+
+struct Person {
+  char name[7];
+  int3 idNumber;
+};
+
+struct Test1 {
+  int a0;
+  char a1;
+};
+
+struct Test2 {
+  char a0;
+  int a1;
+};
+
+constant struct Person james= {{"james"}, (int3)(1, 2, 3)};
+
+constant struct Test1 t0 = {1, 2};
+constant struct Test2 t1 = {1, 2};
+
+constant int3 c[3] = {(int3)(0, 1, 2), (int3)(3, 4, 5), (int3)(6,7,8) };
+constant char4 d[3] = {(char4)(0, 1, 2, 3), (char4)(4, 5, 6, 7), (char4)(8, 9, 10, 11)};
+
+constant struct Person members[3] = {{{"abc"}, (int3)(1, 2, 3)}, { {"defg"}, (int3)(4,5,6)}, { {"hijk"}, (int3)(7,8,9)} };
 
 __kernel void
 compiler_global_constant(__global int *dst, int e, int r)
 {
   int id = (int)get_global_id(0);
-  dst[id] = m[id%3] * n * o[2] + e + r;
+
+  int4 x = a + b;
+  dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x;
+}
+// array of vectors
+__kernel void
+compiler_global_constant1(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = c[id%3].y + d[id%3].w;
+}
+
+// structure
+__kernel void
+compiler_global_constant2(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+
+  dst[id] = james.idNumber.y + t0.a1 + t1.a1;
+}
+
+//array of structure
+__kernel void
+compiler_global_constant3(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+
+  dst[id] = members[id%3].idNumber.z + members[id%3].name[2];
 }
diff --git a/kernels/compiler_global_constant_2.cl b/kernels/compiler_global_constant_2.cl
index 353ebd7..04536c7 100644
--- a/kernels/compiler_global_constant_2.cl
+++ b/kernels/compiler_global_constant_2.cl
@@ -1,5 +1,9 @@
 constant int m[3] = {0x15b,0x25b,0x35b};
-constant int t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant short t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant long n[3] = {0x15b,0x25b,0xFFFFFFFFF};
+constant long p[3] = {1,1,1};
+constant long s = 1;
+
 
 __kernel void
 compiler_global_constant_2(__global int *dst, int e, int r)
@@ -7,3 +11,10 @@ compiler_global_constant_2(__global int *dst, int e, int r)
   int id = (int)get_global_id(0);
   dst[id] = m[id%3] + t[id%5] + e + r;
 }
+
+__kernel void
+compiler_global_constant_2_long(__global long *dst, int e, int r)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = n[id%3]*p[1] + e*s + r;
+}
diff --git a/kernels/compiler_group_size.cl b/kernels/compiler_group_size.cl
index 9dba236..4e2c333 100644
--- a/kernels/compiler_group_size.cl
+++ b/kernels/compiler_group_size.cl
@@ -10,3 +10,20 @@ compiler_group_size(__global unsigned int *dst)
   dst[idz*size_x*size_y + idy*size_x + idx] = idz*size_x*size_y + idy*size_x +idx;
 }
 
+struct xyz{
+  unsigned short b;
+  unsigned short e;
+  unsigned int o;
+};
+
+__kernel void
+compiler_group_size4(__global struct xyz *src, __global unsigned int *dst, unsigned int num, unsigned int c)
+{
+  uint idx = (uint)get_global_id(0);
+  if(idx>=num) /* work-items at or beyond num do nothing */
+    return;
+  struct xyz td = src[idx]; /* private copy of this work-item's descriptor */
+  for(unsigned x = td.b;x<=td.e;x++) /* inclusive range b..e */
+    dst[td.o+x] = c; /* fill dst[o+b] .. dst[o+e] with c */
+}
+
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
index f22914f..e5f7939 100644
--- a/kernels/compiler_long_convert.cl
+++ b/kernels/compiler_long_convert.cl
@@ -5,3 +5,15 @@ kernel void compiler_long_convert(global char *src1, global short *src2, global
   dst2[i] = src2[i];
   dst3[i] = src3[i];
 }
+
+kernel void compiler_long_convert_2(global char *dst1, global short *dst2, global int *dst3, global long *src) {
+  int i = get_global_id(0);
+  dst1[i] = src[i];
+  dst2[i] = src[i];
+  dst3[i] = src[i];
+}
+
+kernel void compiler_long_convert_to_float(global float *dst, global long *src) {
+  int i = get_global_id(0);
+  dst[i] = src[i];
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
index 16f806b..8f914e4 100644
--- a/kernels/compiler_upsample_long.cl
+++ b/kernels/compiler_upsample_long.cl
@@ -1,4 +1,4 @@
-kernel void compiler_upsample_int(global int *src1, global uint *src2, global long *dst) {
+kernel void compiler_upsample_long(global int *src1, global uint *src2, global long *dst) { /* dst[i] = upsample(src1[i], src2[i]) */
   int i = get_global_id(0);
   dst[i] = upsample(src1[i], src2[i]);
 }
diff --git a/kernels/compiler_vector_inc.cl b/kernels/compiler_vector_inc.cl
new file mode 100644
index 0000000..548dcb4
--- /dev/null
+++ b/kernels/compiler_vector_inc.cl
@@ -0,0 +1,13 @@
+kernel void compiler_vector_inc(global char *dst, global char *src) { /* applies ++/-- to a char2 loaded from dst; src[i] selects the operator form */
+    size_t i = get_global_id(0);
+    char2 dst2 = vload2(i, dst); /* char2 element i of dst */
+    if (src[i] == 0)
+      dst2++; /* post-increment */
+    else if(src[i] == 1)
+      ++dst2; /* pre-increment */
+    else if(src[i] == 2)
+      dst2--; /* post-decrement */
+    else
+      --dst2; /* pre-decrement for every other selector value */
+    vstore2(dst2, i, dst); /* write the updated pair back to the same position */
+}
diff --git a/kernels/test_copy_image_3d.cl b/kernels/test_copy_image_3d.cl
index 766227a..103fb69 100644
--- a/kernels/test_copy_image_3d.cl
+++ b/kernels/test_copy_image_3d.cl
@@ -1,11 +1,28 @@
 __kernel void
-test_copy_image_3d(__read_only image3d_t src, __write_only image3d_t dst, sampler_t sampler)
+test_copy_image_3d(__read_only image3d_t src,
+                   __write_only image3d_t dst,
+                   sampler_t sampler,
+                   __write_only image2d_t buf0, /* buf0..buf3 receive the texels whose z is 0..3 */
+                   __write_only image2d_t buf1,
+                   __write_only image2d_t buf2,
+                   __write_only image2d_t buf3)
 {
   int4 coord;
-  int4 color;
+  int2 coord2; /* (x, y) of coord, used for the 2D image writes */
+  float4 color;
   coord.x = (int)get_global_id(0);
   coord.y = (int)get_global_id(1);
-  coord.z = 0;
-  color = read_imagei(src, sampler, coord);
-  write_imagei(dst, coord, color);
+  coord.z = (int)get_global_id(2); /* z now comes from the third global id */
+  coord2.x = coord.x;
+  coord2.y = coord.y;
+  color = read_imagef(src, sampler, coord);
+  write_imagef(dst, coord, color); /* 3D copy of the texel */
+  if (coord.z == 0) /* additionally write the texel to the 2D image matching its z; z > 3 is not written */
+    write_imagef(buf0, coord2, color);
+  else if (coord.z == 1)
+    write_imagef(buf1, coord2, color);
+  else if (coord.z == 2)
+    write_imagef(buf2, coord2, color);
+  else if (coord.z == 3)
+    write_imagef(buf3, coord2, color);
 }
diff --git a/kernels/test_fill_image_3d.cl b/kernels/test_fill_image_3d.cl
index 0f0c6fd..4988f69 100644
--- a/kernels/test_fill_image_3d.cl
+++ b/kernels/test_fill_image_3d.cl
@@ -9,6 +9,6 @@ test_fill_image_3d(__write_only image3d_t dst, uint color)
   color4.s3  = color & 0xFF;
   coord.x = (int)get_global_id(0);
   coord.y = (int)get_global_id(1);
-  coord.z = 0;
+  coord.z = (int)get_global_id(2);
   write_imagei(dst, coord, color4);
 }
diff --git a/kernels/test_fill_image_3d_2.cl b/kernels/test_fill_image_3d_2.cl
index 22b6452..1f9eaa1 100644
--- a/kernels/test_fill_image_3d_2.cl
+++ b/kernels/test_fill_image_3d_2.cl
@@ -5,6 +5,6 @@ test_fill_image_3d_2(__write_only image3d_t dst)
   int4 color4 = {0x12, 0x34, 0x56, 0x78};
   coord.x = (int)get_global_id(0);
   coord.y = (int)get_global_id(1);
-  coord.z = 0;
+  coord.z = (int)get_global_id(2);
   write_imagei(dst, coord, color4);
 }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58d23cb..3fc8689 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,8 @@
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}
                     ${DRM_INCLUDE_PATH}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                    ${MESA_SOURCE_INCLUDES})
 
 set(OPENCL_SRC
     cl_api.c
@@ -29,16 +30,14 @@ set(OPENCL_SRC
     x11/dricommon.c 
     x11/va_dri2.c)
 
-if (EGL_FOUND AND GBM_FOUND)
-set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/gbm_dri2_x11_platform.c)
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
 SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
 SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
-SET(OPTIONAL_GBM_LIBRARY "${GBM_LIBRARY}")
-else(EGL_FOUND AND GBM_FOUND)
+else(EGL_FOUND AND MESA_SOURCE_FOUND)
 SET(OPTIONAL_EGL_LIBRARY "")
-SET(OPTIONAL_GBM_LIBRARY "")
-endif (EGL_FOUND AND GBM_FOUND)
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
 
 if (OCLIcd_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -46,7 +45,7 @@ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
 endif (OCLIcd_FOUND)
 
-SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic")
+SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
 
 link_directories (${LLVM_LIBRARY_DIR})
 add_library(cl SHARED ${OPENCL_SRC})
@@ -59,6 +58,5 @@ target_link_libraries(
                       ${DRM_INTEL_LIBRARY}
                       ${DRM_LIBRARY}
                       ${OPENGL_LIBRARIES}
-                      ${OPTIONAL_EGL_LIBRARY}
-                      ${OPTIONAL_GBM_LIBRARY})
+                      ${OPTIONAL_EGL_LIBRARY})
 install (TARGETS cl LIBRARY DESTINATION lib)
diff --git a/src/cl_api.c b/src/cl_api.c
index 4f048ee..ded0e0c 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -66,7 +66,7 @@ inline cl_int
 handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
               cl_event* event, enqueue_data* data, cl_command_type type)
 {
-  cl_int status = cl_event_wait_events(num, wait_list);
+  cl_int status = cl_event_wait_events(num, wait_list, queue);
   cl_event e;
   if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
     e = cl_event_new(queue->ctx, queue, type, event!=NULL);
@@ -79,6 +79,66 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
   return status;
 }
 
+/* Overlap check taken from the Appendix of the OpenCL 1.1 spec; returns CL_TRUE when the source and destination regions overlap. */
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+                                  const size_t dst_offset[3],
+                                  const size_t region[3],
+                                  size_t row_pitch, size_t slice_pitch) /* the same pitches are applied to both regions */
+{
+  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]}; /* per-axis lower bound of the source box */
+  const size_t src_max[] = {src_offset[0] + region[0], /* per-axis exclusive upper bound of the source box */
+                            src_offset[1] + region[1],
+                            src_offset[2] + region[2]};
+  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]}; /* same bounds for the destination box */
+  const size_t dst_max[] = {dst_offset[0] + region[0],
+                            dst_offset[1] + region[1],
+                            dst_offset[2] + region[2]};
+  // Check for overlap
+  cl_bool overlap = CL_TRUE;
+  unsigned i;
+  size_t dst_start = dst_offset[2] * slice_pitch + /* linearised offset of the destination origin */
+                     dst_offset[1] * row_pitch + dst_offset[0];
+  size_t dst_end = dst_start + (region[2] * slice_pitch + /* dst_start plus the linearised region extent */
+                   region[1] * row_pitch + region[0]);
+  size_t src_start = src_offset[2] * slice_pitch + /* linearised offset of the source origin */
+                     src_offset[1] * row_pitch + src_offset[0];
+  size_t src_end = src_start + (region[2] * slice_pitch + /* src_start plus the linearised region extent */
+                   region[1] * row_pitch + region[0]);
+
+  for (i=0; i != 3; ++i) { /* the boxes intersect only if their intervals intersect on all three axes */
+    overlap = overlap && (src_min[i] < dst_max[i])
+                      && (src_max[i] > dst_min[i]);
+  }
+
+  if (!overlap) { /* boxes are disjoint: look for a region that runs past the row or slice end into the other one */
+    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ? /* amount by which the source row extent exceeds row_pitch, else 0 */
+                          src_offset[0] + region[0] - row_pitch : 0;
+    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ? /* same for the destination */
+                          dst_offset[0] + region[0] - row_pitch : 0;
+    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+         (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+      if ( (src_start <= dst_start && dst_start < src_end) || /* linear ranges intersect */
+           (dst_start <= src_start && src_start < dst_end) )
+        overlap = CL_TRUE;
+    }
+    if (region[2] > 1) { /* the y check is only done for regions spanning more than one slice */
+      size_t src_height = slice_pitch / row_pitch; /* rows per slice */
+      size_t dst_height = slice_pitch / row_pitch; /* identical to src_height because the pitches are shared */
+      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ? /* amount by which the source rows exceed the slice height, else 0 */
+                            src_offset[1] + region[1] - src_height : 0;
+      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+                            dst_offset[1] + region[1] - dst_height : 0;
+      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+           (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+        if ( (src_start <= dst_start && dst_start < src_end) ||
+             (dst_start <= src_start && src_start < dst_end) )
+          overlap = CL_TRUE;
+      }
+    }
+  }
+  return overlap;
+}
+
 static cl_int
 cl_check_device_type(cl_device_type device_type)
 {
@@ -408,7 +468,7 @@ clCreateBuffer(cl_context    context,
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
 
-  mem = cl_mem_new(context, flags, size, host_ptr, &err);
+  mem = cl_mem_new_buffer(context, flags, size, host_ptr, &err);
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -469,6 +529,7 @@ clCreateImage2D(cl_context              context,
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
   cl_image_desc image_desc;
+  memset(&image_desc, 0, sizeof(image_desc));
 
   image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
   image_desc.image_width = image_width;
@@ -592,13 +653,13 @@ error:
 }
 
 cl_int
-clGetImageInfo(cl_mem         image,
+clGetImageInfo(cl_mem         mem,
                cl_image_info  param_name,
                size_t         param_value_size,
                void *         param_value,
                size_t *       param_value_size_ret)
 {
-  return cl_get_image_info(image,
+  return cl_get_image_info(mem,
                            param_name,
                            param_value_size,
                            param_value,
@@ -777,7 +838,8 @@ clBuildProgram(cl_program            program,
 
   /* TODO support create program from binary */
   assert(program->source_type == FROM_LLVM ||
-         program->source_type == FROM_SOURCE);
+         program->source_type == FROM_SOURCE ||
+         program->source_type == FROM_BINARY);
   if((err = cl_program_build(program, options)) != CL_SUCCESS) {
     goto error;
   }
@@ -1015,7 +1077,7 @@ clWaitForEvents(cl_uint          num_events,
 
   TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
 
-  while(cl_event_wait_events(num_events, event_list) == CL_ENQUEUE_EXECUTE_DEFER) {
+  while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
     usleep(8000);       //sleep 8ms to wait other thread
   }
 
@@ -1034,11 +1096,6 @@ clGetEventInfo(cl_event      event,
   CHECK_EVENT(event);
 
   if (param_name == CL_EVENT_COMMAND_QUEUE) {
-    if(event->queue == NULL) {
-      param_value_size_ret = 0;
-      param_value = NULL;
-      return err;
-    }
     FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
   } else if (param_name == CL_EVENT_CONTEXT) {
     FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
@@ -1243,8 +1300,74 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
                         const cl_event * event_wait_list,
                         cl_event *       event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  /* Validate the arguments, then hand a rectangular buffer-to-host read to handle_events/cl_enqueue_handle. */
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (blocking_read != CL_TRUE)
+    NOT_IMPLEMENTED;
+  /* Both origin arrays are dereferenced below, so NULL is rejected for them together with ptr and region. */
+  if (!ptr || !region || !buffer_origin || !host_origin || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(buffer_row_pitch == 0)
+    buffer_row_pitch = region[0];
+  if(buffer_slice_pitch == 0)
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+  if(host_row_pitch == 0)
+    host_row_pitch = region[0];
+  if(host_slice_pitch == 0)
+    host_slice_pitch = region[1] * host_row_pitch;
+
+  if (buffer_row_pitch < region[0] ||
+      host_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* The last row/slice touched is origin+region-1 (region[i] >= 1 here); using origin+region rejected valid full-buffer reads. */
+  if ((buffer_origin[2]+region[2]-1)*buffer_slice_pitch + (buffer_origin[1]+region[1]-1)*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueReadBufferRect;
+  data->mem_obj     = buffer;
+  data->ptr         = ptr;
+  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = buffer_row_pitch;
+  data->slice_pitch = buffer_slice_pitch;
+  data->host_row_pitch   = host_row_pitch;
+  data->host_slice_pitch = host_slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+ error:
+  return err;
 }
 
 cl_int
@@ -1291,7 +1414,7 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
   data->size      = size;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1316,8 +1439,75 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
                          const cl_event *     event_wait_list,
                          cl_event *           event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  /* Validate the arguments, then hand a rectangular host-to-buffer write to handle_events/cl_enqueue_handle. */
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (blocking_write != CL_TRUE)
+    NOT_IMPLEMENTED;
+
+  /* Both origin arrays are dereferenced below, so NULL is rejected for them together with ptr and region. */
+  if (!ptr || !region || !buffer_origin || !host_origin || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(buffer_row_pitch == 0)
+    buffer_row_pitch = region[0];
+  if(buffer_slice_pitch == 0)
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+  if(host_row_pitch == 0)
+    host_row_pitch = region[0];
+  if(host_slice_pitch == 0)
+    host_slice_pitch = region[1] * host_row_pitch;
+
+  if (buffer_row_pitch < region[0] ||
+      host_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* The last row/slice touched is origin+region-1 (region[i] >= 1 here); using origin+region rejected valid full-buffer writes. */
+  if ((buffer_origin[2]+region[2]-1)*buffer_slice_pitch + (buffer_origin[1]+region[1]-1)*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueWriteBufferRect;
+  data->mem_obj     = buffer;
+  data->const_ptr   = ptr;
+  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = buffer_row_pitch;
+  data->slice_pitch = buffer_slice_pitch;
+  data->host_row_pitch   = host_row_pitch;
+  data->host_slice_pitch = host_slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
 }
 
 cl_int
@@ -1350,13 +1540,84 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
                         const cl_event *     event_wait_list,
                         cl_event *           event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  /* Validate the arguments, issue the rectangular buffer-to-buffer copy, then flush the queue unless deferred. */
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  if ((command_queue->ctx != src_buffer->ctx) ||
+      (command_queue->ctx != dst_buffer->ctx)) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!region || !src_origin || !dst_origin || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(src_row_pitch == 0)
+    src_row_pitch = region[0];
+  if(src_slice_pitch == 0)
+    src_slice_pitch = region[1] * src_row_pitch;
+
+  if(dst_row_pitch == 0)
+    dst_row_pitch = region[0];
+  if(dst_slice_pitch == 0)
+    dst_slice_pitch = region[1] * dst_row_pitch;
+
+  if (src_row_pitch < region[0] ||
+      dst_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+      (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* The last row/slice touched is origin+region-1 (region[i] >= 1 here); using origin+region rejected valid full-buffer copies. */
+  if ((src_origin[2]+region[2]-1)*src_slice_pitch + (src_origin[1]+region[1]-1)*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
+      (dst_origin[2]+region[2]-1)*dst_slice_pitch + (dst_origin[1]+region[1]-1)*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer &&
+      check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+  /* Check the wait list first so an invalid list is rejected before cl_mem_copy_buffer_rect runs. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+                          src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBufferRect;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
+
+error:
+  return err;
 }
 
 cl_int
 clEnqueueReadImage(cl_command_queue      command_queue,
-                   cl_mem                image,
+                   cl_mem                mem,
                    cl_bool               blocking_read,
                    const size_t *        origin,
                    const size_t *        region,
@@ -1371,8 +1632,8 @@ clEnqueueReadImage(cl_command_queue      command_queue,
   enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(image);
-  if (command_queue->ctx != image->ctx) {
+  CHECK_IMAGE(mem, image);
+  if (command_queue->ctx != mem->ctx) {
      err = CL_INVALID_CONTEXT;
      goto error;
   }
@@ -1410,16 +1671,16 @@ clEnqueueReadImage(cl_command_queue      command_queue,
      goto error;
   }
 
-  if (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+  if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
      err = CL_INVALID_OPERATION;
      goto error;
   }
 
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
 
   data = &no_wait_data;
   data->type        = EnqueueReadImage;
-  data->mem_obj     = image;
+  data->mem_obj     = mem;
   data->ptr         = ptr;
   data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
   data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
@@ -1427,7 +1688,7 @@ clEnqueueReadImage(cl_command_queue      command_queue,
   data->slice_pitch = slice_pitch;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1438,7 +1699,7 @@ error:
 
 cl_int
 clEnqueueWriteImage(cl_command_queue     command_queue,
-                    cl_mem               image,
+                    cl_mem               mem,
                     cl_bool              blocking_write,
                     const size_t *       origin,
                     const size_t *       region,
@@ -1453,8 +1714,8 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
   enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(image);
-  if (command_queue->ctx != image->ctx) {
+  CHECK_IMAGE(mem, image);
+  if (command_queue->ctx != mem->ctx) {
     err = CL_INVALID_CONTEXT;
     goto error;
   }
@@ -1492,16 +1753,16 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
     goto error;
   }
 
-  if (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+  if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
     err = CL_INVALID_OPERATION;
     goto error;
   }
 
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
 
   data = &no_wait_data;
   data->type        = EnqueueWriteImage;
-  data->mem_obj     = image;
+  data->mem_obj     = mem;
   data->const_ptr   = ptr;
   data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
   data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
@@ -1509,7 +1770,7 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
   data->slice_pitch = slice_pitch;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1520,8 +1781,8 @@ error:
 
 cl_int
 clEnqueueCopyImage(cl_command_queue      command_queue,
-                   cl_mem                src_image,
-                   cl_mem                dst_image,
+                   cl_mem                src_mem,
+                   cl_mem                dst_mem,
                    const size_t *        src_origin,
                    const size_t *        dst_origin,
                    const size_t *        region,
@@ -1529,13 +1790,74 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
                    const cl_event *      event_wait_list,
                    cl_event *            event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  cl_bool overlap = CL_TRUE;
+  cl_int i = 0;
+  /* Validate the arguments, issue the image-to-image copy, then flush the queue unless deferred. */
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(src_mem, src_image);
+  CHECK_IMAGE(dst_mem, dst_image);
+  if (command_queue->ctx != src_mem->ctx ||
+      command_queue->ctx != dst_mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+      src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
+    err = CL_IMAGE_FORMAT_MISMATCH;
+    goto error;
+  }
+
+  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+      (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image == dst_image) {
+    for(i = 0; i < 3; i++)
+      overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
+                        && (dst_origin[i] < src_origin[i] + region[i]);
+    if(overlap == CL_TRUE) {
+      err = CL_MEM_COPY_OVERLAP;
+      goto error;
+    }
+  }
+  /* Check the wait list first so an invalid list is rejected before cl_mem_kernel_copy_image runs. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+  cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
+
+error:
+  return err;
 }
 
 cl_int
 clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
-                           cl_mem            src_image,
+                           cl_mem            src_mem,
                            cl_mem            dst_buffer,
                            const size_t *    src_origin,
                            const size_t *    region,
@@ -1544,14 +1866,55 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
                            const cl_event *  event_wait_list,
                            cl_event *        event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(src_mem, src_image);
+  CHECK_MEM(dst_buffer);
+  if (command_queue->ctx != src_mem->ctx ||
+      command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+  /* NULL-check src_origin/region before anything reads region[]; the size check below dereferences it. */
+  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* Check the wait list first so an invalid list is rejected before cl_mem_copy_image_to_buffer runs. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+  cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyImageToBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
+
+error:
+  return err;
 }
 
 cl_int
 clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
                            cl_mem            src_buffer,
-                           cl_mem            dst_image,
+                           cl_mem            dst_mem,
                            size_t            src_offset,
                            const size_t *    dst_origin,
                            const size_t *    region,
@@ -1559,8 +1922,113 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
                            const cl_event *  event_wait_list,
                            cl_event *        event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_IMAGE(dst_mem, dst_image);
+  if (command_queue->ctx != src_buffer->ctx ||
+      command_queue->ctx != dst_mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+  /* NULL-check dst_origin/region before anything reads region[]; the size check below dereferences it. */
+  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* Check the wait list first so an invalid list is rejected before cl_mem_copy_buffer_to_image runs. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
+
+  cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBufferToImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
+
+error:
+  return err;
+}
+
+/* Map mem with cl_mem_map_gtt_unsync and record the mapping in mem->mapped_ptr. *ptr receives the mapped address + offset,
+ * *mem_ptr the host_ptr + offset under CL_MEM_USE_HOST_PTR (otherwise *ptr), or NULL on failure. Returns a CL status. */
+static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset, size_t size)
+{
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  if (!(*ptr = cl_mem_map_gtt_unsync(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+  *ptr = (char*)(*ptr) + offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    //only calc ptr here, will do memcpy in enqueue
+    *mem_ptr = mem->host_ptr + offset;
+  } else {
+    *mem_ptr = *ptr;
+  }
+  /* Record the mapped address. */
+  if (!mem->mapped_ptr_sz) {
+    /* Set the table size only after the allocation succeeds, so an OOM cannot leave a non-zero size with a NULL table. */
+    cl_mapped_ptr *table = (cl_mapped_ptr *)calloc(16, sizeof(cl_mapped_ptr));
+    if (!table) {
+      cl_mem_unmap_gtt(mem);
+      err = CL_OUT_OF_HOST_MEMORY;
+      goto error;
+    }
+    mem->mapped_ptr = table;
+    mem->mapped_ptr_sz = 16;
+    slot = 0;
+  } else {
+    int i = 0;
+    for (; i < mem->mapped_ptr_sz; i++) {
+      if (mem->mapped_ptr[i].ptr == NULL) {
+        slot = i;
+        break;
+      }
+    }
+    if (i == mem->mapped_ptr_sz) {
+      /* No free slot: double the table. calloc zeroes it, so the new upper half starts out empty. */
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)calloc(mem->mapped_ptr_sz * 2, sizeof(cl_mapped_ptr));
+      if (!new_ptr) {
+        cl_mem_unmap_gtt (mem);
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto error;
+      }
+      memcpy(new_ptr, mem->mapped_ptr, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      slot = mem->mapped_ptr_sz;
+      mem->mapped_ptr_sz *= 2;
+      free(mem->mapped_ptr);
+      mem->mapped_ptr = new_ptr;
+    }
+  }
+  assert(slot != -1);
+  mem->mapped_ptr[slot].ptr = *mem_ptr;
+  mem->mapped_ptr[slot].v_ptr = *ptr;
+  mem->mapped_ptr[slot].size = size;
+  mem->map_ref++;
+error:
+  if (err != CL_SUCCESS)
+    *mem_ptr = NULL;
+  return err;
 }
 
 void *
@@ -1576,6 +2044,8 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
                    cl_int *          errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
   enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
@@ -1602,6 +2072,10 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     goto error;
   }
 
+  err = _cl_map_mem(buffer, &ptr, &mem_ptr, offset, size);
+  if (err != CL_SUCCESS)
+    goto error;
+
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
   data = &no_wait_data;
@@ -1609,10 +2083,10 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
   data->mem_obj     = buffer;
   data->offset      = offset;
   data->size        = size;
-  data->map_flags   = map_flags;
+  data->ptr         = ptr;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1620,12 +2094,12 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  return data->ptr;
+  return mem_ptr;
 }
 
 void *
 clEnqueueMapImage(cl_command_queue   command_queue,
-                  cl_mem             image,
+                  cl_mem             mem,
                   cl_bool            blocking_map,
                   cl_map_flags       map_flags,
                   const size_t *     origin,
@@ -1638,11 +2112,13 @@ clEnqueueMapImage(cl_command_queue   command_queue,
                   cl_int *           errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  void *ptr  = NULL;
+  void *mem_ptr = NULL;
   enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
-  CHECK_IMAGE(image);
-  if (command_queue->ctx != image->ctx) {
+  CHECK_IMAGE(mem, image);
+  if (command_queue->ctx != mem->ctx) {
     err = CL_INVALID_CONTEXT;
     goto error;
   }
@@ -1665,27 +2141,51 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     *image_slice_pitch = image->slice_pitch;
 
   if ((map_flags & CL_MAP_READ &&
-       image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+       mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
       (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
-       image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+       mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
   {
     err = CL_INVALID_OPERATION;
     goto error;
   }
 
-  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+  if (!(ptr = cl_mem_map_gtt_unsync(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  size_t size;
+  if(region[2] == 1) {
+    if(region[1] == 1)
+      size = image->bpp * region[0];
+    else
+      size = image->row_pitch * (region[1] - 1) + (image->bpp * (origin[0] + region[0]));
+  } else {
+    size = image->slice_pitch * (region[2] - 1);
+    size += image->row_pitch * (origin[1] + region[1]);
+    size += image->bpp * (origin[0] + region[0]);
+  }
+
+  err = _cl_map_mem(mem, &ptr, &mem_ptr, offset, size);
+  if (err != CL_SUCCESS)
+    goto error;
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
 
   data = &no_wait_data;
   data->type        = EnqueueMapImage;
-  data->mem_obj     = image;
+  data->mem_obj     = mem;
   data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
   data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
   data->row_pitch   = *image_row_pitch;
-  data->slice_pitch = *image_slice_pitch;
-  data->map_flags   = map_flags;
+  if (image_slice_pitch)
+    data->slice_pitch = *image_slice_pitch;
+  data->ptr         = ptr;
+  data->offset      = offset;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1693,7 +2193,7 @@ clEnqueueMapImage(cl_command_queue   command_queue,
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  return data->ptr; //TODO: map and unmap first
+  return mem_ptr; //TODO: map and unmap first
 }
 
 cl_int
@@ -1722,7 +2222,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
   data->ptr         = mapped_ptr;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_enqueue_handle(data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
@@ -1764,19 +2264,12 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
-  /* Check offset values. We add a non standard restriction. The offsets must
-   * also be evenly divided by the local sizes
-   */
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i) {
       if (UNLIKELY(~0LL - global_work_offset[i] > global_work_size[i])) {
         err = CL_INVALID_GLOBAL_OFFSET;
         goto error;
       }
-      if (UNLIKELY(local_work_size != NULL && global_work_offset[i] % local_work_size[i])) {
-        err = CL_INVALID_GLOBAL_OFFSET;
-        goto error;
-      }
     }
 
   /* Local sizes must be non-null and divide global sizes */
@@ -1824,7 +2317,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   data->queue = command_queue;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
-                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+                   event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -1839,8 +2332,11 @@ clEnqueueTask(cl_command_queue   command_queue,
               const cl_event *   event_wait_list,
               cl_event *         event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  const size_t global_size[3] = {1, 0, 0};
+  const size_t local_size[3]  = {1, 0, 0};
+
+  return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
+                                num_events_in_wait_list, event_wait_list, event);
 }
 
 cl_int
@@ -1855,16 +2351,74 @@ clEnqueueNativeKernel(cl_command_queue   command_queue,
                       const cl_event *   event_wait_list,
                       cl_event *         event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  void *new_args = NULL;
+  enqueue_data *data, no_wait_data = { 0 };
+  cl_int i;
+
+  if(user_func == NULL ||
+    (args == NULL && cb_args > 0) ||
+    (args == NULL && num_mem_objects ==0) ||
+    (args != NULL && cb_args == 0) ||
+    (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+    (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  //Per spec, need copy args
+  if (cb_args)
+  {
+    new_args = malloc(cb_args);
+    if (!new_args)
+    {
+      err = CL_OUT_OF_HOST_MEMORY;
+      goto error;
+    }
+    memcpy(new_args, args, cb_args);
+
+    for (i=0; i<num_mem_objects; ++i)
+    {
+      CHECK_MEM(mem_list[i]);
+      args_mem_loc[i] = new_args + (args_mem_loc[i] - args);  //change to new args
+    }
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueNativeKernel;
+  data->mem_list    = mem_list;
+  data->ptr         = new_args;
+  data->size        = cb_args;
+  data->offset      = (size_t)num_mem_objects;
+  data->const_ptr   = args_mem_loc;
+  data->user_func   = user_func;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
 }
 
 cl_int
 clEnqueueMarker(cl_command_queue     command_queue,
                 cl_event *           event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);   /* macro not shown here; presumably sets err and jumps to error on a bad queue */
+  if(event == NULL) {           /* a marker needs an output event to report through */
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* Hand the request to the event layer. */
+  cl_event_marker(command_queue, event);
+error:
+  return err;
 }
 
 cl_int
@@ -1883,9 +2437,12 @@ error:
 cl_int
 clEnqueueBarrier(cl_command_queue  command_queue)
 {
-  NOT_IMPLEMENTED;
-  return 0;
-  //return clFinish(command_queue);
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+  /* Record the current number of wait_events as the queue's barrier index. */
+  cl_command_queue_set_barrier(command_queue);
+error:
+  return err;
 }
 
 #define EXTFUNC(x)                      \
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index e82f75c..ff78770 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -87,6 +87,7 @@ cl_command_queue_delete(cl_command_queue queue)
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
   cl_gpgpu_delete(queue->gpgpu);
+  cl_free(queue->wait_events);
   queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(queue);
 }
@@ -98,7 +99,9 @@ cl_command_queue_add_ref(cl_command_queue queue)
 }
 
 static void
-set_image_info(char *curbe, struct ImageInfo * image_info, cl_mem image)
+set_image_info(char *curbe,
+               struct ImageInfo * image_info,
+               struct _cl_mem_image *image)
 {
   if (image_info->wSlot >= 0)
     *(uint32_t*)(curbe + image_info->wSlot) = image->w;
@@ -118,12 +121,14 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
   uint32_t i;
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
+    struct _cl_mem_image *image;
     assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
-    set_image_info(k->curbe, &k->images[i], k->args[id].mem);
-    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, k->args[id].mem->bo,
-                        k->args[id].mem->intel_fmt, k->args[id].mem->type,
-                        k->args[id].mem->w, k->args[id].mem->h,
-                        k->args[id].mem->row_pitch, k->args[id].mem->tiling);
+    image = cl_mem_image(k->args[id].mem);
+    set_image_info(k->curbe, &k->images[i], image);
+    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo,
+                        image->intel_fmt, image->image_type,
+                        image->w, image->h, image->depth,
+                        image->row_pitch, image->tiling);
   }
   return CL_SUCCESS;
 }
@@ -146,24 +151,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
 
-LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
-                                                       char * dst)
-{
-  int i;
-  for(i = 0; i < k->arg_n; i++) {
-    enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-
-    if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) {
-      uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
-      cl_mem mem = k->args[i].mem;
-      cl_buffer_map(mem->bo, 1);
-      void * addr = cl_buffer_get_virtual(mem->bo);
-      memcpy(dst + offset, addr, mem->size);
-      cl_buffer_unmap(mem->bo);
-    }
-  }
-  return CL_SUCCESS;
-}
 
 #if USE_FULSIM
 extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
@@ -426,3 +413,76 @@ cl_command_queue_finish(cl_command_queue queue)
   return CL_SUCCESS;
 }
 
+#define DEFAULT_WAIT_EVENTS_SIZE  16
+LOCAL void
+cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+  cl_event *new_list;
+  /* Append `event` to queue->wait_events unless it is already listed, growing the array when it is full. */
+  assert(queue != NULL);
+  if(queue->wait_events == NULL) {
+    queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+    TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+  }
+
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      return;   //already in wait_events, no need to insert
+  }
+
+  if(queue->wait_events_num < queue->wait_events_size) {
+    queue->wait_events[queue->wait_events_num++] = event;
+    return;
+  }
+
+  //wait_events_num == wait_events_size: the array is full, move to one twice as large
+  queue->wait_events_size *= 2;
+  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+  memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
+  cl_free(queue->wait_events);
+  queue->wait_events = new_list;
+  queue->wait_events[queue->wait_events_num++] = event;
+  return;
+  /* NOTE(review): TRY_ALLOC_NO_ERR presumably jumps to `error` on failure; that path drops the whole list. */
+exit:
+  return;
+error:
+  if(queue->wait_events)
+    cl_free(queue->wait_events);
+  queue->wait_events = NULL;
+  queue->wait_events_size = 0;
+  queue->wait_events_num = 0;
+  goto exit;
+
+}
+
+LOCAL void
+cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+  /* Remove `event` from queue->wait_events (no-op when it is not listed), keeping the array packed. */
+  assert(queue->wait_events);
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      break;
+  }
+
+  if(i == queue->wait_events_num)
+    return;
+  /* barrier_index counts the leading barrier events [0, barrier_index): shrink it only when the
+   * removed slot is one of them, so it neither drops a remaining barrier nor goes negative. */
+  if(queue->barrier_index > i)
+    queue->barrier_index -= 1;
+  for(; i<queue->wait_events_num-1; i++) {
+    queue->wait_events[i] = queue->wait_events[i+1];
+  }
+  queue->wait_events_num -= 1;
+}
+
+LOCAL void
+cl_command_queue_set_barrier(cl_command_queue queue)
+{
+    queue->barrier_index = queue->wait_events_num;   /* all events currently in wait_events now count as barrier events */
+}
+
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 135d659..9396fd7 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -33,6 +33,11 @@ struct _cl_command_queue {
   uint64_t magic;                      /* To identify it as a command queue */
   volatile int ref_n;                  /* We reference count this object */
   cl_context ctx;                      /* Its parent context */
+  cl_event* wait_events;               /* Point to array of non-complete user events that block this command queue */
+  cl_int    wait_events_num;           /* Number of Non-complete user events */
+  cl_int    wait_events_size;          /* The size of array that wait_events point to */
+  cl_int    barrier_index;             /* Indicate event count in wait_events as barrier events */
+  cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
   cl_gpgpu gpgpu;                      /* Setup all GEN commands */
@@ -77,7 +82,14 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
 /* Bind all the image surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
 
-/*update constant buffer to final curbe */
-extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
+/* Insert a user event to command's wait_events */
+extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
+
+/* Remove a user event from command's wait_events */
+extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
+
+/* Set the barrier index */
+extern void cl_command_queue_set_barrier(cl_command_queue);
+
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 1d415d4..b85c0cd 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -76,7 +76,7 @@ cl_set_varying_payload(const cl_kernel ker,
     block_ips[curr] = 0;
   }
 
-  /* Copy them to the constant buffer */
+  /* Copy them to the curbe buffer */
   curr = 0;
   for (i = 0; i < thread_n; ++i, data += cst_sz) {
     uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
@@ -95,6 +95,62 @@ error:
   return err;
 }
 
+static void
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+  /* Pack the program's global constant data and every non-NULL GBE_ARG_CONSTANT_PTR argument into one buffer; first compute its size */
+  int32_t arg;
+  size_t offset;
+  gbe_program prog = ker->program->opaque;
+  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
+  size_t global_const_size = gbe_program_get_global_constant_size(prog);
+  uint32_t constant_buf_size = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      cl_mem mem = ker->args[arg].mem;
+      constant_buf_size += ALIGN(mem->size, 4);
+    }
+  }
+  if(global_const_size == 0 && constant_buf_size == 0)
+     return;   /* nothing to upload */
+  /* The extra 4 bytes leave room for the reserved word / alignment padding added to `offset` below. */
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+  cl_buffer_map(bo, 1);
+  char * cst_addr = cl_buffer_get_virtual(bo);
+  offset = 0;
+  if (global_const_size > 0) {
+    /* Write the global constant arrays */
+    gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+  }
+  offset += ALIGN(global_const_size, 4);
+
+  if(global_const_size == 0) {
+    /* reserve 4 bytes so that no constant argument is placed at offset 0 */
+    offset += 4;
+  }
+
+  /* upload each constant buffer argument right after the global constants */
+  int32_t curbe_offset = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      cl_mem mem = ker->args[arg].mem;
+      /* Store this argument's offset inside the constant buffer in the argument's curbe slot. */
+      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+      assert(curbe_offset >= 0);
+      *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+      /* Copy the argument's contents into the constant buffer, then advance by the 4-aligned size. */
+      cl_buffer_map(mem->bo, 1);
+      void * addr = cl_buffer_get_virtual(mem->bo);
+      memcpy(cst_addr + offset, addr, mem->size);
+      cl_buffer_unmap(mem->bo);
+      offset += ALIGN(mem->size, 4);
+    }
+  }
+  cl_buffer_unmap(bo);
+}
+
 /* Will return the total amount of slm used */
 static int32_t
 cl_curbe_fill(cl_kernel ker,
@@ -122,9 +178,17 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
-  UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
 #undef UPLOAD
 
+  /* Upload sampler information. */
+  offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
+  if (offset >= 0) {
+    uint32_t i;
+    for(i = 0; i < ker->sampler_sz; i++, offset += 2) {
+      *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
+    }
+  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
@@ -134,14 +198,6 @@ cl_curbe_fill(cl_kernel ker,
     int32_t i;
     for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
   }
-
-  /* Write global constant arrays */
-  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) {
-    /* Write the global constant arrays */
-    gbe_program prog = ker->program->opaque;
-    gbe_program_get_global_constant_data(prog, ker->curbe + offset);
-  }
-
   /* Handle the various offsets to SLM */
   const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
   int32_t arg, slm_offset = 0;
@@ -220,9 +276,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Compute the number of HW threads we need */
   TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
-  kernel.cst_sz = cst_sz;
+  kernel.curbe_sz = cst_sz;
 
-  /* Curbe step 1: fill the constant buffer data shared by all threads */
+  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
     kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
     if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
@@ -242,6 +298,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
+
+  cl_upload_constant_buffer(queue, ker);
+
   cl_gpgpu_states_setup(gpgpu, &kernel);
 
   /* Curbe step 2. Give the localID and upload it to video memory */
@@ -250,10 +309,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
     TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
     for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
-        cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
     }
     TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
-    cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
+    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
   }
 
   /* Start a new batch buffer */
diff --git a/src/cl_context.c b/src/cl_context.c
index a48436c..4f1c611 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -26,6 +26,8 @@
 #include "cl_utils.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_program.h"
 
 #include "CL/cl.h"
 #include "CL/cl_gl.h"
@@ -123,7 +125,6 @@ cl_create_context(const cl_context_properties *  properties,
   cl_int err = CL_SUCCESS;
   cl_uint prop_len = 0;
   /* XXX */
-  FATAL_IF (pfn_notify != NULL || user_data != NULL, "Unsupported call back");
   FATAL_IF (num_devices != 1, "Only one device is supported");
 
   /* Check that we are getting the right platform */
@@ -144,6 +145,10 @@ cl_create_context(const cl_context_properties *  properties,
   /* Attach the device to the context */
   ctx->device = *devices;
 
+  /* Save the user callback and user data*/
+  ctx->pfn_notify = pfn_notify;
+  ctx->user_data = user_data;
+
 exit:
   if (errcode_ret != NULL)
     *errcode_ret = err;
@@ -240,3 +245,26 @@ cl_context_get_bufmgr(cl_context ctx)
   return cl_driver_get_bufmgr(ctx->drv);
 }
 
+cl_kernel
+cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
+{
+  cl_int ret;
+  if (!ctx->internal_prgs[index])   /* no program cached in this slot yet: create, build and cache it */
+  {
+    size_t length = strlen(str_kernel) + 1;   /* the length passed on includes the terminating NUL */
+    ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
+
+    if (!ctx->internal_prgs[index])
+      return NULL;
+    /* NOTE(review): if the build fails the program stays in internal_prgs[index], so later calls skip this branch and never retry. */
+    ret = cl_program_build(ctx->internal_prgs[index], str_option);
+    if (ret != CL_SUCCESS)
+      return NULL;
+
+    ctx->internal_prgs[index]->is_built = 1;
+    /* Only the program's first kernel (ker[0]) is duplicated and cached. */
+    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+  }
+
+  return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h
index 718d589..7016733 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -39,18 +39,35 @@ enum _cl_gl_context_type {
   CL_GL_CGL_SHAREGROUP
 };
 
+enum _cl_internal_ker_type {              //slot index for the context's internal program/kernel arrays
+  CL_ENQUEUE_COPY_BUFFER = 0,
+  CL_ENQUEUE_COPY_BUFFER_RECT = 1,
+  CL_ENQUEUE_COPY_IMAGE_0 = 2,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_1 = 3,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2 = 4,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3 = 5,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0 = 6,   //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1 = 7,   //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0 = 8,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1 = 9,   //copy buffer to image 3d
+  CL_INTERNAL_KERNEL_MAX = 10              //number of slots, used as the array size
+};
+
 struct _cl_context_prop {
   cl_context_properties platform_id;
   enum _cl_gl_context_type gl_type;
   cl_context_properties gl_context;
   union {
-    cl_context_properties egl_display; 
-    cl_context_properties glx_display; 
+    cl_context_properties egl_display;
+    cl_context_properties glx_display;
     cl_context_properties wgl_hdc;
     cl_context_properties cgl_sharegroup;
   };
 };
 
+#define IS_EGL_CONTEXT(ctx)  (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
+#define EGL_DISP(ctx)   (EGLDisplay)(ctx->props.egl_display)
+#define EGL_CTX(ctx)    (EGLContext)(ctx->props.gl_context)
 /* Encapsulate the whole device */
 struct _cl_context {
   DEFINE_ICD(dispatch)
@@ -68,10 +85,18 @@ struct _cl_context {
   pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
   pthread_mutex_t event_lock;       /* To allocate and deallocate events */
+  cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
+                                    /* Programs used internally, e.g. by the clEnqueueXXX APIs */
+  cl_kernel  internel_kernels[CL_INTERNAL_KERNEL_MAX];
+                                    /* Kernels used internally, e.g. by the clEnqueueXXX APIs */
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
   cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
   cl_uint                 prop_len;  /* count of the properties */
+  void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
+                                     /* User callback for when an error occurs in the context */
+  void *user_data;                   /* A pointer to user supplied data */
+
 };
 
 /* Implement OpenCL function */
@@ -109,5 +134,8 @@ extern cl_int cl_context_ND_kernel(cl_context,
 /* Used for allocation */
 extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 
+/* Get the internal used kernel */
+extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
+
 #endif /* __CL_CONTEXT_H__ */
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index a2c3ed2..16b343d 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -39,7 +39,7 @@ static struct _cl_device_id intel_ivb_gt2_device = {
   .max_compute_unit = 128,
   .max_thread_per_unit = 8,
   .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
+  .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
   .wg_sz = 1024,
   .compile_wg_sz = {0},	
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1a0ec38..100b38d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -22,7 +22,7 @@
 
 #include <stdint.h>
 #include <stdlib.h>
-
+#include "cl_driver_type.h"
 /* Various limitations we should remove actually */
 #define GEN_MAX_SURFACES 128
 #define GEN_MAX_SAMPLERS 16
@@ -33,28 +33,6 @@
  * will allow us to make the use of a software performance simulator easier and
  * to minimize the code specific for the HW and for the simulator
  **************************************************************************/
-
-/* Encapsulates command buffer / data buffer / kernels */
-typedef struct _cl_buffer *cl_buffer;
-
-/* Encapsulates buffer manager */
-typedef struct _cl_buffer_mgr *cl_buffer_mgr;
-
-/* Encapsulates the driver backend functionalities */
-typedef struct _cl_driver *cl_driver;
-
-/* Encapsulates the gpgpu stream of commands */
-typedef struct _cl_gpgpu *cl_gpgpu;
-
-/* Encapsulates the event  of a command stream */
-typedef struct _cl_gpgpu_event *cl_gpgpu_event;
-
-typedef struct _cl_context_prop *cl_context_prop;
-typedef struct _cl_sampler *cl_sampler;
-
-/**************************************************************************
- * Driver
- **************************************************************************/
 /* Create a new driver */
 typedef cl_driver (cl_driver_new_cb)(cl_context_prop);
 extern cl_driver_new_cb *cl_driver_new;
@@ -100,7 +78,7 @@ typedef enum gpu_command_status {
 typedef struct cl_gpgpu_kernel {
   const char *name;        /* kernel name and bo name */
   uint32_t grf_blocks;     /* register blocks kernel wants (in 8 reg blocks) */
-  uint32_t cst_sz;         /* total size of all constants */
+  uint32_t curbe_sz;         /* total size of all curbes */
   cl_buffer bo;            /* kernel code in the proper addr space */
   int32_t barrierID;       /* barrierID for _this_ kernel */
   uint32_t use_slm:1;      /* For gen7 (automatic barrier management) */
@@ -136,6 +114,7 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
                                       uint32_t type,
                                       int32_t w,
                                       int32_t h,
+                                      int32_t depth,
                                       int pitch,
                                       cl_gpgpu_tiling tiling);
 
@@ -157,9 +136,12 @@ extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
 typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
 extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
 
-/* Fills current constant buffer with data */
-typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t size);
-extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants;
+/* Fills current curbe buffer with data */
+typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
 
 /* Setup all indirect states */
 typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
@@ -231,14 +213,18 @@ typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
 
 #include "cl_context.h"
+#include "cl_mem.h"
 typedef struct _cl_context *cl_context;
 
-typedef cl_buffer (cl_buffer_alloc_from_eglimage_cb)(cl_context, void*, unsigned int *,
-                                                     int *, int *, int *, int *);
-extern cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage;
+typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, int, unsigned int,
+                                                    struct _cl_mem_image *gl_image);
+extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
+
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
 
 /* Unref a buffer and destroy it if no more ref */
-typedef void (cl_buffer_unreference_cb)(cl_buffer);
+typedef int (cl_buffer_unreference_cb)(cl_buffer);
 extern cl_buffer_unreference_cb *cl_buffer_unreference;
 
 /* Add one more ref on a buffer */
@@ -257,6 +243,10 @@ extern cl_buffer_unmap_cb *cl_buffer_unmap;
 typedef int (cl_buffer_map_gtt_cb)(cl_buffer);
 extern cl_buffer_map_gtt_cb *cl_buffer_map_gtt;
 
+/* Map a buffer in the GTT domain without waiting for GPU reads or writes */
+typedef int (cl_buffer_map_gtt_unsync_cb)(cl_buffer);
+extern cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync;
+
 /* Unmap a buffer in the GTT domain */
 typedef int (cl_buffer_unmap_gtt_cb)(cl_buffer);
 extern cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt;
@@ -289,5 +279,35 @@ extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
 typedef int (cl_driver_get_device_id_cb)(void);
 extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
 
+/**************************************************************************
+ * cl_khr_gl_sharing.
+ **************************************************************************/
+typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
+                                       int level, int texture, void*user_data);
+extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
+
+typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
+                                       int level, int texture);
+extern cl_gl_release_texture_cb *cl_gl_release_texture;
+
+typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
+                                             int bufobj, void* user_data);
+extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
+
+typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
+extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
+
+typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
+                                             int rb, void* user_data);
+extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
+
+typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
+extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
+
+#ifndef DEFAULT_DRIVER_DIR
+/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
+#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
+#endif
+
 #endif /* __CL_DRIVER_H__ */
 
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index e7412de..ac4ff7a 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -30,12 +30,14 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
-LOCAL cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage = NULL;
+LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
+LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
 LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
 LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
 LOCAL cl_buffer_map_cb *cl_buffer_map = NULL;
 LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
 LOCAL cl_buffer_map_gtt_cb *cl_buffer_map_gtt = NULL;
+LOCAL cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync = NULL;
 LOCAL cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt = NULL;
 LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL;
 LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
@@ -44,6 +46,13 @@ LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
 LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
 LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 
+/* cl_khr_gl_sharing */
+LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
+LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
+LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
+LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
+LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
+LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
 /* GPGPU */
 LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
 LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
@@ -53,8 +62,9 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
-LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
 LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
 LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
 LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
diff --git a/src/cl_driver_type.h b/src/cl_driver_type.h
new file mode 100644
index 0000000..891a33c
--- /dev/null
+++ b/src/cl_driver_type.h
@@ -0,0 +1,24 @@
+/**************************************************************************
+ * cl_driver:
+ * Hide behind some call backs the buffer allocation / deallocation ... This
+ * will allow us to make the use of a software performance simulator easier and
+ * to minimize the code specific for the HW and for the simulator
+ **************************************************************************/
+
+/* Encapsulates command buffer / data buffer / kernels */
+typedef struct _cl_buffer *cl_buffer;
+
+/* Encapsulates buffer manager */
+typedef struct _cl_buffer_mgr *cl_buffer_mgr;
+
+/* Encapsulates the driver backend functionalities */
+typedef struct _cl_driver *cl_driver;
+
+/* Encapsulates the gpgpu stream of commands */
+typedef struct _cl_gpgpu *cl_gpgpu;
+
+/* Encapsulates the event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
+typedef struct _cl_context_prop *cl_context_prop;
+typedef struct _cl_sampler *cl_sampler;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index a112cc4..0330691 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -45,6 +45,53 @@ error:
   return err;
 }
 
+cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+  void* dst_ptr;
+
+  const size_t* origin = data->origin;
+  const size_t* host_origin = data->host_origin;
+  const size_t* region = data->region;
+
+  if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+   size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+   src_ptr = (char*)src_ptr + offset;
+
+   offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+   dst_ptr = (char *)data->ptr + offset;
+
+   if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+       (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
+   {
+     memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+   }
+   else {
+     cl_uint y, z;
+     for (z = 0; z < region[2]; z++) {
+       const char* src = src_ptr;
+       char* dst = dst_ptr;
+       for (y = 0; y < region[1]; y++) {
+         memcpy(dst, src, region[0]);
+         src += data->row_pitch;
+         dst += data->host_row_pitch;
+       }
+       src_ptr = (char*)src_ptr + data->slice_pitch;
+       dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
+     }
+   }
+
+  err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+  return err;
+}
+
 cl_int cl_enqueue_write_buffer(enqueue_data *data)
 {
   cl_int err = CL_SUCCESS;
@@ -63,186 +110,166 @@ error:
   return err;
 }
 
-cl_int cl_enqueue_read_image(enqueue_data *data)
+cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
 {
   cl_int err = CL_SUCCESS;
   void* src_ptr;
+  void* dst_ptr;
 
-  cl_mem image = data->mem_obj;
   const size_t* origin = data->origin;
+  const size_t* host_origin = data->host_origin;
   const size_t* region = data->region;
 
-  if (!(src_ptr = cl_mem_map_auto(image))) {
+  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  src_ptr = (char*)src_ptr + offset;
+  size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+  dst_ptr = (char *)dst_ptr + offset;
 
-  if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
-      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+  offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+  src_ptr = (char*)data->const_ptr + offset;
+
+  if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+      (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
   {
-    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
   }
   else {
     cl_uint y, z;
     for (z = 0; z < region[2]; z++) {
       const char* src = src_ptr;
-      char* dst = data->ptr;
+      char* dst = dst_ptr;
       for (y = 0; y < region[1]; y++) {
-        memcpy(dst, src, image->bpp*region[0]);
-        src += image->row_pitch;
+        memcpy(dst, src, region[0]);
+        src += data->host_row_pitch;
         dst += data->row_pitch;
       }
-      src_ptr = (char*)src_ptr + image->slice_pitch;
-      data->ptr = (char*)data->ptr + data->slice_pitch;
+      src_ptr = (char*)src_ptr + data->host_slice_pitch;
+      dst_ptr = (char*)dst_ptr + data->slice_pitch;
     }
   }
 
- err = cl_mem_unmap_auto(image);
+  err = cl_mem_unmap_auto(data->mem_obj);
 
 error:
   return err;
-
 }
 
-cl_int cl_enqueue_write_image(enqueue_data *data)
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
 {
   cl_int err = CL_SUCCESS;
-  void* dst_ptr;
+  void* src_ptr;
 
-  cl_mem image = data->mem_obj;
-  const size_t *origin = data->origin;
-  const size_t *region = data->region;
+  cl_mem mem = data->mem_obj;
+  CHECK_IMAGE(mem, image);
+  const size_t* origin = data->origin;
+  const size_t* region = data->region;
 
-  if (!(dst_ptr = cl_mem_map_auto(image))) {
+  if (!(src_ptr = cl_mem_map_auto(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
   size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  dst_ptr = (char*)dst_ptr + offset;
+  src_ptr = (char*)src_ptr + offset;
 
   if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
       (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
   {
-    memcpy(dst_ptr, data->ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
   }
   else {
     cl_uint y, z;
     for (z = 0; z < region[2]; z++) {
-      const char* src = data->const_ptr;
-      char* dst = dst_ptr;
+      const char* src = src_ptr;
+      char* dst = data->ptr;
       for (y = 0; y < region[1]; y++) {
         memcpy(dst, src, image->bpp*region[0]);
-        src += data->row_pitch;
-        dst += image->row_pitch;
+        src += image->row_pitch;
+        dst += data->row_pitch;
       }
+      src_ptr = (char*)src_ptr + image->slice_pitch;
       data->ptr = (char*)data->ptr + data->slice_pitch;
-      dst_ptr = (char*)dst_ptr + image->slice_pitch;
     }
   }
 
-  err = cl_mem_unmap_auto(image);
+ err = cl_mem_unmap_auto(mem);
 
 error:
   return err;
 
 }
 
-cl_int cl_enqueue_map_buffer(enqueue_data *data)
+cl_int cl_enqueue_write_image(enqueue_data *data)
 {
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  cl_mem mem = data->mem_obj;
+  CHECK_IMAGE(mem, image);
+
+  if (!(dst_ptr = cl_mem_map_auto(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
+                           image->row_pitch, image->slice_pitch,
+                           data->const_ptr, data->row_pitch,
+                           data->slice_pitch, image);
+  err = cl_mem_unmap_auto(mem);
+
+error:
+  return err;
 
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
   void *ptr = NULL;
   cl_int err = CL_SUCCESS;
-  void *mem_ptr = NULL;
-  cl_int slot = -1;
   cl_mem buffer = data->mem_obj;
-
-  if (!(ptr = cl_mem_map_auto(buffer))) {
+  //clEnqueueMapBuffer uses an unsync map, so force the use of map_gtt here
+  if (!(ptr = cl_mem_map_gtt(buffer))) {
     err = CL_MAP_FAILURE;
+    goto error;
   }
 
   ptr = (char*)ptr + data->offset;
+  assert(data->ptr == ptr);
 
   if(buffer->flags & CL_MEM_USE_HOST_PTR) {
     assert(buffer->host_ptr);
     memcpy(buffer->host_ptr + data->offset, ptr, data->size);
-    mem_ptr = buffer->host_ptr + data->offset;
-  } else {
-    mem_ptr = ptr;
-  }
-
-  /* Record the mapped address. */
-  if (!buffer->mapped_ptr_sz) {
-    buffer->mapped_ptr_sz = 16;
-    buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
-    if (!buffer->mapped_ptr) {
-      cl_mem_unmap_auto (buffer);
-      err = CL_OUT_OF_HOST_MEMORY;
-      ptr = NULL;
-      goto error;
-    }
-
-    memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-    slot = 0;
-  } else {
-   int i = 0;
-    for (; i < buffer->mapped_ptr_sz; i++) {
-      if (buffer->mapped_ptr[i].ptr == NULL) {
-        slot = i;
-        break;
-      }
-   }
-
-    if (i == buffer->mapped_ptr_sz) {
-      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
-      if (!new_ptr) {
-       cl_mem_unmap_auto (buffer);
-        err = CL_OUT_OF_HOST_MEMORY;
-        ptr = NULL;
-        goto error;
-      }
-      memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      memcpy(new_ptr, buffer->mapped_ptr,
-             buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      slot = buffer->mapped_ptr_sz;
-      buffer->mapped_ptr_sz *= 2;
-      free(buffer->mapped_ptr);
-      buffer->mapped_ptr = new_ptr;
-    }
   }
 
-  assert(slot != -1);
-  buffer->mapped_ptr[slot].ptr = mem_ptr;
-  buffer->mapped_ptr[slot].v_ptr = ptr;
-  buffer->mapped_ptr[slot].size = data->size;
-  buffer->map_ref++;
-
-  data->ptr = mem_ptr;
-
 error:
   return err;
 }
 
 cl_int cl_enqueue_map_image(enqueue_data *data)
 {
-  void *ptr = NULL;
   cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  void *ptr = NULL;
+  CHECK_IMAGE(mem, image);
 
-  cl_mem image = data->mem_obj;
-  const size_t *origin = data->origin;
-
-  if (!(ptr = cl_mem_map_auto(image))) {
+  if (!(ptr = cl_mem_map_gtt(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  data->ptr = (char*)ptr + offset;
+  assert(data->ptr == (char*)ptr + data->offset);
+
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    cl_mem_copy_image_region(data->origin, data->region,
+                             mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                             data->ptr, data->row_pitch, data->slice_pitch, image);
+  }
 
 error:
   return err;
@@ -282,7 +309,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
     assert(v_ptr == mapped_ptr);
   }
 
-  cl_mem_unmap_auto(memobj);
+  cl_mem_unmap_gtt(memobj);
 
   /* shrink the mapped slot. */
   if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
@@ -311,13 +338,43 @@ error:
   return err;
 }
 
+cl_int cl_enqueue_native_kernel(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_uint num_mem_objects = (cl_uint)data->offset;
+  const cl_mem *mem_list = data->mem_list;
+  const void **args_mem_loc = (const void **)data->const_ptr;
+  cl_uint i;
+
+  for (i=0; i<num_mem_objects; ++i)
+  {
+      const cl_mem buffer = mem_list[i];
+      CHECK_MEM(buffer);
+
+      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer);
+  }
+  data->user_func(data->ptr);
+
+  for (i=0; i<num_mem_objects; ++i)
+  {
+      cl_mem_unmap_auto(mem_list[i]);
+  }
+
+  free(data->ptr);
+error:
+  return err;
+}
 cl_int cl_enqueue_handle(enqueue_data* data)
 {
   switch(data->type) {
     case EnqueueReadBuffer:
       return cl_enqueue_read_buffer(data);
+    case EnqueueReadBufferRect:
+      return cl_enqueue_read_buffer_rect(data);
     case EnqueueWriteBuffer:
       return cl_enqueue_write_buffer(data);
+    case EnqueueWriteBufferRect:
+      return cl_enqueue_write_buffer_rect(data);
     case EnqueueReadImage:
       return cl_enqueue_read_image(data);
     case EnqueueWriteImage:
@@ -328,8 +385,15 @@ cl_int cl_enqueue_handle(enqueue_data* data)
       return cl_enqueue_map_image(data);
     case EnqueueUnmapMemObject:
       return cl_enqueue_unmap_mem_object(data);
+    case EnqueueCopyBufferRect:
+    case EnqueueCopyImage:
+    case EnqueueCopyBufferToImage:
+    case EnqueueCopyImageToBuffer:
     case EnqueueNDRangeKernel:
-      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);   //goto default
+      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
+      return CL_SUCCESS;
+    case EnqueueNativeKernel:
+      return cl_enqueue_native_kernel(data);
     default:
       return CL_SUCCESS;
   }
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 7dc8ceb..b412d58 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -19,9 +19,8 @@
 #ifndef __CL_ENQUEUE_H__
 #define __CL_ENQUEUE_H__
 
-#include "cl_mem.h"
-#include "cl_command_queue.h"
 #include "cl_internals.h"
+#include "cl_driver.h"
 #include "CL/cl.h"
 
 typedef enum {
@@ -40,22 +39,28 @@ typedef enum {
   EnqueueMapImage,
   EnqueueUnmapMemObject,
   EnqueueNDRangeKernel,
+  EnqueueNativeKernel,
+  EnqueueMarker,
   EnqueueInvalid
 } enqueue_type;
 
 typedef struct _enqueue_data {
-  enqueue_type      type;          /* Command type */
-  cl_mem            mem_obj;       /* Enqueue's cl_mem */
-  cl_command_queue  queue;         /* Command queue */
-  size_t            offset;        /* Mem object's offset */
-  size_t            size;          /* Size */
-  size_t            origin[3];     /* Origin */
-  size_t            region[3];     /* Region */
-  size_t            row_pitch;     /* Row pitch */
-  size_t            slice_pitch;   /* Slice pitch */
-  cl_map_flags      map_flags;     /* Map flags */
-  const void *      const_ptr;     /* Const ptr for memory read */
-  void *            ptr;           /* ptr for write and return value */
+  enqueue_type      type;             /* Command type */
+  cl_mem            mem_obj;          /* Enqueue's cl_mem */
+  cl_command_queue  queue;            /* Command queue */
+  size_t            offset;           /* Mem object's offset */
+  size_t            size;             /* Size */
+  size_t            origin[3];        /* Origin */
+  size_t            host_origin[3];   /* Origin */
+  size_t            region[3];        /* Region */
+  size_t            row_pitch;        /* Row pitch */
+  size_t            slice_pitch;      /* Slice pitch */
+  size_t            host_row_pitch;   /* Host row pitch, used in read/write buffer rect */
+  size_t            host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
+  const void *      const_ptr;        /* Const ptr for memory read */
+  void *            ptr;              /* Ptr for write and return value */
+  const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
+  void (*user_func)(void *);          /* pointer to a host-callable user function */
 } enqueue_data;
 
 /* Do real enqueue commands */
diff --git a/src/cl_event.c b/src/cl_event.c
index e882c7c..918e245 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -23,10 +23,28 @@
 #include "cl_alloc.h"
 #include "cl_khr_icd.h"
 #include "cl_kernel.h"
+#include "cl_command_queue.h"
 
 #include <assert.h>
 #include <stdio.h>
 
+inline cl_bool
+cl_event_is_gpu_command_type(cl_command_type type)
+{
+  switch(type) {
+    case CL_COMMAND_COPY_BUFFER:
+    case CL_COMMAND_COPY_IMAGE:
+    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+    case CL_COMMAND_COPY_BUFFER_RECT:
+    case CL_COMMAND_TASK:
+    case CL_COMMAND_NDRANGE_KERNEL:
+      return CL_TRUE;
+    default:
+      return CL_FALSE;
+  }
+}
+
 cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
 {
   cl_event event = NULL;
@@ -56,13 +74,16 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
   }
   else {
     event->status = CL_QUEUED;
-    event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+    if(cl_event_is_gpu_command_type(event->type))
+      event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
   }
   cl_event_add_ref(event);       //dec when complete
   event->user_cb = NULL;
   event->enqueue_cb = NULL;
   event->waits_head = NULL;
   event->emplict = emplict;
+  if(queue && event->gpgpu_event)
+    queue->last_event = event;
 
 exit:
   return event;
@@ -77,9 +98,14 @@ void cl_event_delete(cl_event event)
   if (UNLIKELY(event == NULL))
     return;
 
+  cl_event_update_status(event);
+
   if (atomic_dec(&event->ref_n) > 1)
     return;
 
+  if(event->queue && event->queue->last_event == event)
+    event->queue->last_event = NULL;
+
   /* Call all user's callback if haven't execute */
   user_callback *cb = event->user_cb;
   while(event->user_cb) {
@@ -153,7 +179,7 @@ cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
   /* check the event_wait_list and num_events_in_wait_list */
   if((event_wait_list == NULL) &&
      (num_events_in_wait_list > 0))
-    goto exit;
+    goto error;
 
   if ((event_wait_list != NULL) &&
       (num_events_in_wait_list == 0)){
@@ -180,10 +206,11 @@ error:
   goto exit;
 }
 
-cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
-                          const cl_event *event_wait_list)
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+                            cl_command_queue queue)
 {
   cl_int i, j;
+
   /* Check whether wait user events */
   for(i=0; i<num_events_in_wait_list; i++) {
     if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -199,6 +226,10 @@ cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
     }
   }
 
+  if(queue && queue->barrier_index > 0) {
+    return CL_ENQUEUE_EXECUTE_DEFER;
+  }
+
   /* Non user events or all user event finished, wait all enqueue events finish */
   for(i=0; i<num_events_in_wait_list; i++) {
     if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -207,7 +238,8 @@ cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
     //enqueue callback haven't finish, in another thread, wait
     if(event_wait_list[i]->enqueue_cb != NULL)
       return CL_ENQUEUE_EXECUTE_DEFER;
-    cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+    if(event_wait_list[i]->gpgpu_event)
+      cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
     cl_event_set_status(event_wait_list[i], CL_COMPLETE);  //Execute user's callback
   }
   return CL_ENQUEUE_EXECUTE_IMM;
@@ -220,16 +252,40 @@ void cl_event_new_enqueue_callback(cl_event event,
 {
   enqueue_callback *cb, *node;
   user_event *user_events, *u_ev;
+  cl_command_queue queue = event->queue;
   cl_int i;
 
   /* Allocate and inialize the structure itself */
   TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
   cb->num_events = num_events_in_wait_list;
-  cb->wait_list = event_wait_list;
+  TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
+  for(i=0; i<num_events_in_wait_list; i++)
+    cb->wait_list[i] = event_wait_list[i];
   cb->event = event;
   cb->next = NULL;
   cb->wait_user_events = NULL;
 
+  if(queue && queue->barrier_index > 0) {
+    for(i=0; i<queue->barrier_index; i++) {
+      /* Insert the enqueue_callback to user event list */
+      node = queue->wait_events[i]->waits_head;
+      if(node == NULL)
+        queue->wait_events[i]->waits_head = cb;
+      else {   /* braces required: with an empty list node is NULL, so skip node->next */
+        while((node != cb) && node->next)
+          node = node->next;
+        if(node == cb)   //wait on dup user event
+          continue;
+        node->next = cb;
+      }
+      /* Insert the user event to enqueue_callback's wait_user_events */
+      TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+      u_ev->event = queue->wait_events[i];
+      u_ev->next = cb->wait_user_events;
+      cb->wait_user_events = u_ev;
+    }
+  }
+
   /* Find out all user events that events in event_wait_list wait */
   for(i=0; i<num_events_in_wait_list; i++) {
     if(event_wait_list[i]->status <= CL_COMPLETE)
@@ -252,6 +308,7 @@ void cl_event_new_enqueue_callback(cl_event event,
       u_ev->event = event_wait_list[i];
       u_ev->next = cb->wait_user_events;
       cb->wait_user_events = u_ev;
+      cl_command_queue_insert_event(event->queue, event_wait_list[i]);
     } else if(event_wait_list[i]->enqueue_cb != NULL) {
       user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
       while(user_events != NULL) {
@@ -271,11 +328,11 @@ void cl_event_new_enqueue_callback(cl_event event,
         u_ev->next = cb->wait_user_events;
         cb->wait_user_events = u_ev;
         user_events = user_events->next;
+        cl_command_queue_insert_event(event->queue, event_wait_list[i]);
       }
     }
   }
-  if(data->queue != NULL) {
-    assert(event->gpgpu_event);
+  if(data->queue != NULL && event->gpgpu_event != NULL) {
     cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
     data->ptr = (void *)event->gpgpu_event;
   }
@@ -291,6 +348,8 @@ error:
       cb->wait_user_events = cb->wait_user_events->next;
       cl_free(u_ev);
     }
+    if(cb->wait_list)
+      cl_free(cb->wait_list);
     cl_free(cb);
   }
   goto exit;
@@ -317,6 +376,8 @@ void cl_event_set_status(cl_event event, cl_int status)
   if(status <= CL_COMPLETE) {
     if(event->enqueue_cb) {
       cl_enqueue_handle(&event->enqueue_cb->data);
+      if(event->gpgpu_event)
+        cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
       event->status = status;  //Change the event status after enqueue and befor unlock
 
       pthread_mutex_unlock(&event->ctx->event_lock);
@@ -324,6 +385,8 @@ void cl_event_set_status(cl_event event, cl_int status)
         cl_event_delete(event->enqueue_cb->wait_list[i]);
       pthread_mutex_lock(&event->ctx->event_lock);
 
+      if(event->enqueue_cb->wait_list)
+        cl_free(event->enqueue_cb->wait_list);
       cl_free(event->enqueue_cb);
       event->enqueue_cb = NULL;
     }
@@ -375,8 +438,12 @@ void cl_event_set_status(cl_event event, cl_int status)
       continue;
     }
 
+    //remove the user event from the command queue of enqueue_cb's event
+    cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+
     /* All user events complete, now wait enqueue events */
-    ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list);
+    ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
+                               enqueue_cb->event->queue);
     assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
 
     cb = enqueue_cb;
@@ -385,7 +452,7 @@ void cl_event_set_status(cl_event event, cl_int status)
     /* Call the pending operation */
     evt = cb->event;
     cl_event_set_status(cb->event, CL_COMPLETE);
-    if(cb->event->emplict == CL_FALSE) {
+    if(evt->emplict == CL_FALSE) {
       cl_event_delete(evt);
     }
   }
@@ -400,3 +467,26 @@ void cl_event_update_status(cl_event event)
      (cl_gpgpu_event_update_status(event->gpgpu_event, 0) == command_complete))
     cl_event_set_status(event, CL_COMPLETE);
 }
+
+cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
+{
+  /* Zeroed: cl_event_new_enqueue_callback reads data.queue, not only data.type */
+  enqueue_data data = { 0 };
+  *event = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
+  if(*event == NULL)   /* test the newly created event, not the out-pointer itself */
+    return CL_OUT_OF_HOST_MEMORY;
+
+  /* If wait_events_num > 0, the marker event must wait on queue->wait_events */
+  if(queue->wait_events_num > 0) {
+    data.type = EnqueueMarker;
+    cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+    return CL_SUCCESS;
+  }
+
+  if(queue->last_event && queue->last_event->gpgpu_event) {
+    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+  }
+
+  cl_event_set_status(*event, CL_COMPLETE);
+  return CL_SUCCESS;
+}
diff --git a/src/cl_event.h b/src/cl_event.h
index c921cb2..7dde24b 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -22,9 +22,9 @@
 
 #include <semaphore.h>
 
-#include "cl_enqueue.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
+#include "cl_enqueue.h"
 #include "CL/cl.h"
 
 #define CL_ENQUEUE_EXECUTE_IMM   0
@@ -39,7 +39,7 @@ typedef struct _enqueue_callback {
   cl_event           event;            /* The event relative this enqueue callback */
   enqueue_data       data;             /* Hold all enqueue callback's infomation */
   cl_uint            num_events;       /* num events in wait list */
-  const cl_event*    wait_list;        /* All event wait list this callback wait on */
+  cl_event*          wait_list;        /* All event wait list this callback wait on */
   user_event*        wait_user_events; /* The head of user event list the callback wait on */
   struct _enqueue_callback*  next;     /* The  next enqueue callback in wait list */
 } enqueue_callback;
@@ -81,12 +81,14 @@ cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
 /* Check events wait list for enqueue commonds */
 cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
 /* Wait the all events in wait list complete */
-cl_int cl_event_wait_events(cl_uint, const cl_event *);
+cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
 /* New a enqueue suspend task */
 void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
 /* Set the event status and call all callbacks */
 void cl_event_set_status(cl_event, cl_int);
 /* Check and update event status */
 void cl_event_update_status(cl_event);
+/* Create the marker event */
+cl_int cl_event_marker(cl_command_queue, cl_event*);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 1ff81c1..d07a525 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -26,6 +26,7 @@ void check_basic_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++)
+    if (id != EXT_ID(khr_fp64))
       extensions->extensions[id].base.ext_enabled = 1;
 }
 
@@ -39,26 +40,12 @@ void check_opt1_extension(cl_extensions_t *extensions)
 
 void
 check_gl_extension(cl_extensions_t *extensions) {
-#ifdef HAS_EGL
-static struct cl_gl_ext_deps egl_funcs;
+#if defined(HAS_EGL)
   int id;
-#if defined(EGL_KHR_image) && defined(EGL_KHR_gl_texture_2D_image) && defined(HAS_GBM)
-  egl_funcs.eglCreateImageKHR_func = (PFNEGLCREATEIMAGEKHRPROC) eglGetProcAddress("eglCreateImageKHR");
-  egl_funcs.eglDestroyImageKHR_func = (PFNEGLDESTROYIMAGEKHRPROC) eglGetProcAddress("eglDestroyImageKHR");
-#else
-  egl_funcs.eglCreateImageKHR_func = NULL;
-  egl_funcs.eglDestroyImageKHR_func = NULL;
-#endif
-
-  if (egl_funcs.eglCreateImageKHR_func != NULL
-      && egl_funcs.eglDestroyImageKHR_func != NULL) {
       /* For now, we only support cl_khr_gl_sharing. */
-    for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
-      if (id == EXT_ID(khr_gl_sharing)) {
-        extensions->extensions[id].base.ext_enabled = 1;
-        extensions->extensions[id].EXT_STRUCT_NAME(khr_gl_sharing).gl_ext_deps = &egl_funcs;
-      }
-  }
+  for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
+    if (id == EXT_ID(khr_gl_sharing))
+      extensions->extensions[id].base.ext_enabled = 1;
 #endif
 }
 
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 51eb8e0..52ee0a4 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -76,29 +76,6 @@ struct EXT_STRUCT_NAME(name) { \
 DECL_BASE_EXTENSIONS
 DECL_OPT1_EXTENSIONS
 DECL_D3D_EXTENSIONS
-#undef DECL_EXT
-
-#define DECL_EXT(name) \
-struct EXT_STRUCT_NAME(name) { \
-  struct cl_extension_base base; \
-  struct cl_gl_ext_deps *gl_ext_deps; \
-};
-
-struct cl_gl_ext_deps {
-#ifdef HAS_EGL
-#ifndef EGL_KHR_image
-#define PFNEGLCREATEIMAGEKHRPROC void*
-#define PFNEGLDESTROYIMAGEKHRPROC void*
-#endif
-  PFNEGLCREATEIMAGEKHRPROC eglCreateImageKHR_func;
-  PFNEGLDESTROYIMAGEKHRPROC eglDestroyImageKHR_func;
-#ifndef EGL_KHR_image
-#undef PFNEGLCREATEIMAGEKHRPROC
-#undef PFNEGLDESTROYIMAGEKHRPROC
-#endif
-#endif
-};
-
 DECL_GL_EXTENSIONS
 #undef DECL_EXT
 
@@ -117,8 +94,6 @@ typedef struct cl_extensions {
 
 struct _cl_platform_id;
 typedef struct _cl_platform_id * cl_platform_id;
-#define CL_EXTENSION_GET_FUNCS(ctx, name, funcs) \
-  ctx->device->platform->internal_extensions->extensions[EXT_ID(name)].EXT_STRUCT_NAME(name).funcs
 
 extern void
 cl_intel_platform_extension_init(cl_platform_id intel_platform);
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f58e1fd..6bfc453 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -45,13 +45,13 @@
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
 .image3d_max_depth = 8192,
-.max_samplers = 0,
+.max_samplers = 8,
 .mem_base_addr_align = sizeof(cl_uint) * 8,
 .min_data_type_align_size = sizeof(cl_uint),
 .single_fp_config = 0, /* XXX */
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
-.global_mem_size = 4,
-.max_constant_buffer_size = 64 << 10,
+.global_mem_size = 128 * 1024 * 1024,
+.max_constant_buffer_size = 512 << 10,
 .max_constant_args = 8,
 .error_correction_support = CL_FALSE,
 .host_unified_memory = CL_FALSE,
@@ -59,9 +59,11 @@
 .endian_little = CL_TRUE,
 .available = CL_TRUE,
 .compiler_available = CL_FALSE, /* XXX */
-.execution_capabilities = CL_EXEC_KERNEL,
+.execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
 .queue_properties = CL_QUEUE_PROFILING_ENABLE,
 .platform = NULL, /* == intel_platform (set when requested) */
+/* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
+.single_fp_config = CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
 
 #define DECL_INFO_STRING(FIELD, STRING) \
     .FIELD = STRING,                    \
diff --git a/src/cl_image.c b/src/cl_image.c
index 6ea104b..f89bcae 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -61,11 +61,11 @@ cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
   };
 
   switch (order) {
+    case CL_Rx: break;
     case CL_R: break;
     case CL_A: break;
     case CL_RA: *bpp *= 2; break;
     case CL_RG: *bpp *= 2; break;
-    case CL_Rx: *bpp *= 2; break;
     case CL_INTENSITY:
     case CL_LUMINANCE:
       if (type != CL_UNORM_INT8 && type != CL_UNORM_INT16 &&
@@ -101,14 +101,26 @@ cl_image_get_intel_format(const cl_image_format *fmt)
   const uint32_t order = fmt->image_channel_order;
   switch (order) {
     case CL_R:
+#if 0
+    case CL_Rx:
     case CL_A:
     case CL_INTENSITY:
     case CL_LUMINANCE:
+      if ((order == CL_INTENSITY || order == CL_LUMINANCE)
+          && (type != CL_UNORM_INT8 && type != CL_UNORM_INT16
+              && type != CL_SNORM_INT8 && type != CL_SNORM_INT16
+              && type != CL_HALF_FLOAT && type != CL_FLOAT))
+        return INTEL_UNSUPPORTED_FORMAT;
+#endif
+
+/* XXX it seems we have some accuracy compatibility issues with snorm_int8/16,
+ * so we have to disable those formats for now. */
+
       switch (type) {
         case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16_FLOAT;
         case CL_FLOAT:          return I965_SURFACEFORMAT_R32_FLOAT;
-        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16_SNORM;
-        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8_SNORM;
+//        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16_SNORM;
+//        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8_SNORM;
         case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8_UNORM;
         case CL_UNORM_INT16:    return I965_SURFACEFORMAT_R16_UNORM;
         case CL_SIGNED_INT8:    return I965_SURFACEFORMAT_R8_SINT;
@@ -119,9 +131,9 @@ cl_image_get_intel_format(const cl_image_format *fmt)
         case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32_UINT;
         default: return INTEL_UNSUPPORTED_FORMAT;
       };
+#if 0
     case CL_RG:
     case CL_RA:
-    case CL_Rx:
       switch (type) {
         case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16G16_FLOAT;
         case CL_FLOAT:          return I965_SURFACEFORMAT_R32G32_FLOAT;
@@ -145,12 +157,13 @@ cl_image_get_intel_format(const cl_image_format *fmt)
         case CL_UNORM_SHORT_555:
         default: return INTEL_UNSUPPORTED_FORMAT;
       };
+#endif
     case CL_RGBA:
       switch (type) {
         case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16G16B16A16_FLOAT;
         case CL_FLOAT:          return I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
-        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
-        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
+//        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
+//        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
         case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_UNORM;
         case CL_UNORM_INT16:    return I965_SURFACEFORMAT_R16G16B16A16_UNORM;
         case CL_SIGNED_INT8:    return I965_SURFACEFORMAT_R8G8B8A8_SINT;
@@ -195,7 +208,6 @@ cl_image_get_supported_fmt(cl_context ctx,
                            cl_uint *num_image_formats)
 {
   size_t i, j, n = 0;
-  assert(image_formats);
   for (i = 0; i < cl_image_order_n; ++i)
   for (j = 0; j < cl_image_type_n; ++j) {
     const cl_image_format fmt = {
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 41e6a8a..4ba1c11 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -133,8 +133,8 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
       if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
         return CL_INVALID_MEM_OBJECT;
 
-      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
-         || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
+      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
+         || (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
           return CL_INVALID_ARG_VALUE;
     }
   }
@@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
 
   mem = *(cl_mem*) value;
 
-  if(arg_type == GBE_ARG_CONSTANT_PTR) {
-    int32_t cbOffset;
-    cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
-    //constant ptr's curbe offset changed, update it
-    if(cbOffset >= 0) {
-      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-      *((uint32_t *)(k->curbe + offset)) = cbOffset;  //cb offset in curbe
-    }
-  }
-
   cl_mem_add_ref(mem);
   if (k->args[index].mem)
     cl_mem_delete(k->args[index].mem);
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
index 6c8b9f4..1e206b4 100644
--- a/src/cl_khr_icd.h
+++ b/src/cl_khr_icd.h
@@ -14,6 +14,8 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  */
+#ifndef __CL_KHR_ICD_H__
+#define __CL_KHR_ICD_H__
 
 #ifdef HAS_OCLIcd
 
@@ -28,3 +30,5 @@ extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
 #define INIT_ICD(member)
 #define DEFINE_ICD(member)
 #endif
+
+#endif
diff --git a/src/cl_mem.c b/src/cl_mem.c
index f794ce7..68753f1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -25,6 +25,8 @@
 #include "cl_device_id.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
 
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
@@ -42,6 +44,31 @@
       return CL_INVALID_VALUE;              \
     break;
 
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+
+static cl_mem_object_type
+cl_get_mem_object_type(cl_mem mem)
+{
+  switch (mem->type) {
+    case CL_MEM_BUFFER_TYPE: return CL_MEM_OBJECT_BUFFER;
+    case CL_MEM_IMAGE_TYPE:
+    case CL_MEM_GL_IMAGE_TYPE:
+    {
+      struct _cl_mem_image *image = cl_mem_image(mem);
+      if (image->depth == 1)
+        return CL_MEM_OBJECT_IMAGE1D;
+      else if (image->depth == 2)
+        return CL_MEM_OBJECT_IMAGE2D;
+      else if (image->depth == 3)
+        return CL_MEM_OBJECT_IMAGE3D;
+    }
+    default:
+      return CL_MEM_OBJECT_BUFFER;
+  }
+}
+
 LOCAL cl_int
 cl_get_mem_object_info(cl_mem mem,
                 cl_mem_info param_name,
@@ -67,7 +94,7 @@ cl_get_mem_object_info(cl_mem mem,
   switch(param_name)
   {
   case CL_MEM_TYPE:
-    *((cl_mem_object_type *)param_value) = mem->type;
+    *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
     break;
   case CL_MEM_FLAGS:
     *((cl_mem_flags *)param_value) = mem->flags;
@@ -106,8 +133,8 @@ cl_get_image_info(cl_mem mem,
                   void *param_value,
                   size_t *param_value_size_ret)
 {
-  if(!mem || !mem->is_image)
-    return CL_INVALID_MEM_OBJECT;
+  int err;
+  CHECK_IMAGE(mem, image);
 
   switch(param_name)
   {
@@ -125,35 +152,39 @@ cl_get_image_info(cl_mem mem,
   switch(param_name)
   {
   case CL_IMAGE_FORMAT:
-    *(cl_image_format *)param_value = mem->fmt;
+    *(cl_image_format *)param_value = image->fmt;
     break;
   case CL_IMAGE_ELEMENT_SIZE:
-    *(size_t *)param_value = mem->bpp;
+    *(size_t *)param_value = image->bpp;
     break;
   case CL_IMAGE_ROW_PITCH:
-    *(size_t *)param_value = mem->row_pitch;
+    *(size_t *)param_value = image->row_pitch;
     break;
   case CL_IMAGE_SLICE_PITCH:
-    *(size_t *)param_value = mem->slice_pitch;
+    *(size_t *)param_value = image->slice_pitch;
     break;
   case CL_IMAGE_WIDTH:
-    *(size_t *)param_value = mem->w;
+    *(size_t *)param_value = image->w;
     break;
   case CL_IMAGE_HEIGHT:
-    *(size_t *)param_value = mem->h;
+    *(size_t *)param_value = image->h;
     break;
   case CL_IMAGE_DEPTH:
-    *(size_t *)param_value = mem->depth;
+    *(size_t *)param_value = image->depth;
     break;
   }
 
   return CL_SUCCESS;
+
+error:
+    return err;
 }
 
 #undef FIELD_SIZE
 
-static cl_mem
-cl_mem_allocate(cl_context ctx,
+LOCAL cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+                cl_context ctx,
                 cl_mem_flags flags,
                 size_t sz,
                 cl_int is_tiled,
@@ -174,41 +205,56 @@ cl_mem_allocate(cl_context ctx,
                                 NULL)) != CL_SUCCESS) {
     goto error;
   }
-  if (UNLIKELY(sz == 0 || sz > max_mem_size)) {
+  if (UNLIKELY(sz > max_mem_size)) {
     err = CL_INVALID_BUFFER_SIZE;
     goto error;
   }
 
   /* Allocate and inialize the structure itself */
-  TRY_ALLOC (mem, CALLOC(struct _cl_mem));
+  if (type == CL_MEM_IMAGE_TYPE) {
+    struct _cl_mem_image *image = NULL;
+    TRY_ALLOC (image, CALLOC(struct _cl_mem_image));
+    mem = &image->base;
+  } else if (type == CL_MEM_GL_IMAGE_TYPE ) {
+    struct _cl_mem_gl_image *gl_image = NULL;
+    TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image));
+    mem = &gl_image->base.base;
+  } else {
+    struct _cl_mem_buffer *buffer = NULL;
+    TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
+    mem = &buffer->base;
+  }
+  mem->type = type;
   SET_ICD(mem->dispatch)
   mem->ref_n = 1;
   mem->magic = CL_MAGIC_MEM_HEADER;
   mem->flags = flags;
 
-  /* Pinning will require stricter alignment rules */
-  if ((flags & CL_MEM_PINNABLE) || is_tiled)
-    alignment = 4096;
-
-  /* Allocate space in memory */
-  bufmgr = cl_context_get_bufmgr(ctx);
-  assert(bufmgr);
-  mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
-  if (UNLIKELY(mem->bo == NULL)) {
-    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-    goto error;
+  if (sz != 0) {
+    /* Pinning will require stricter alignment rules */
+    if ((flags & CL_MEM_PINNABLE) || is_tiled)
+      alignment = 4096;
+
+    /* Allocate space in memory */
+    bufmgr = cl_context_get_bufmgr(ctx);
+    assert(bufmgr);
+    mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+    if (UNLIKELY(mem->bo == NULL)) {
+      err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+      goto error;
+    }
+    mem->size = sz;
   }
-  mem->size = sz;
 
-  /* Append the buffer in the context buffer list */
+  cl_context_add_ref(ctx);
+  mem->ctx = ctx;
+  /* Append the buffer to the context buffer list */
   pthread_mutex_lock(&ctx->buffer_lock);
-    mem->next = ctx->buffers;
-    if (ctx->buffers != NULL)
-      ctx->buffers->prev = mem;
-    ctx->buffers = mem;
+  mem->next = ctx->buffers;
+  if (ctx->buffers != NULL)
+    ctx->buffers->prev = mem;
+  ctx->buffers = mem;
   pthread_mutex_unlock(&ctx->buffer_lock);
-  mem->ctx = ctx;
-  cl_context_add_ref(ctx);
 
 exit:
   if (errcode)
@@ -222,11 +268,11 @@ error:
 }
 
 LOCAL cl_mem
-cl_mem_new(cl_context ctx,
-           cl_mem_flags flags,
-           size_t sz,
-           void *data,
-           cl_int *errcode_ret)
+cl_mem_new_buffer(cl_context ctx,
+                  cl_mem_flags flags,
+                  size_t sz,
+                  void *data,
+                  cl_int *errcode_ret)
 {
   /* Possible mem type combination:
        CL_MEM_ALLOC_HOST_PTR
@@ -262,12 +308,10 @@ cl_mem_new(cl_context ctx,
   }
 
   /* Create the buffer in video memory */
-  mem = cl_mem_allocate(ctx, flags, sz, CL_FALSE, &err);
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
-  mem->type = CL_MEM_OBJECT_BUFFER;
-
   /* Copy the data if required */
   if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
     cl_buffer_subdata(mem->bo, 0, sz, data);
@@ -285,35 +329,48 @@ error:
   goto exit;
 }
 
-static void
-cl_mem_copy_image(cl_mem image,
-		  size_t row_pitch,
-		  size_t slice_pitch,
-		  void* host_ptr)
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+                         void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+                         const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+                         const struct _cl_mem_image *image)
 {
-  char* dst_ptr = cl_mem_map_auto(image);
-
-  if (row_pitch == image->row_pitch &&
-      (image->depth == 1 || slice_pitch == image->slice_pitch))
+  size_t offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
+  dst = (char*)dst + offset;
+  if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch)))
   {
-    memcpy(dst_ptr, host_ptr, image->depth == 1 ? row_pitch*image->h : slice_pitch*image->depth);
+    memcpy(dst, src, region[2] == 1 ? src_row_pitch*region[1] : src_slice_pitch*region[2]);
   }
   else {
-    size_t y, z;
-    for (z = 0; z < image->depth; z++) {
-      const char* src = host_ptr;
-      char* dst = dst_ptr;
-      for (y = 0; y < image->h; y++) {
-	memcpy(dst, src, image->bpp*image->w);
-	src += row_pitch;
-	dst += image->row_pitch;
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src_ptr = src;
+      char* dst_ptr = dst;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst_ptr, src_ptr, image->bpp*region[0]);
+        src_ptr += src_row_pitch;
+        dst_ptr += dst_row_pitch;
       }
-      host_ptr = (char*)host_ptr + slice_pitch;
-      dst_ptr = (char*)dst_ptr + image->slice_pitch;
+      src = (char*)src + src_slice_pitch;
+      dst = (char*)dst + dst_slice_pitch;
     }
   }
+}
 
-  cl_mem_unmap_auto(image);
+static void
+cl_mem_copy_image(struct _cl_mem_image *image,
+		  size_t row_pitch,
+		  size_t slice_pitch,
+		  void* host_ptr)
+{
+  char* dst_ptr = cl_mem_map_auto((cl_mem)image);
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {image->w, image->h, image->depth};
+
+  cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch,
+                           host_ptr, row_pitch, slice_pitch, image);
+  cl_mem_unmap_auto((cl_mem)image);
 }
 
 static const uint32_t tile_sz = 4096; /* 4KB per tile */
@@ -338,11 +395,11 @@ _cl_mem_new_image(cl_context ctx,
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
-  size_t sz = 0, aligned_pitch = 0, aligned_h;
+  size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h;
   cl_image_tiling_t tiling = CL_NO_TILE;
 
   /* Check flags consistency */
-  if (UNLIKELY((flags & CL_MEM_COPY_HOST_PTR) && data == NULL)) {
+  if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) {
     err = CL_INVALID_HOST_PTR;
     goto error;
   }
@@ -416,27 +473,27 @@ _cl_mem_new_image(cl_context ctx,
   }
 
   sz = aligned_pitch * aligned_h * depth;
-  mem = cl_mem_allocate(ctx, flags, sz, tiling != CL_NO_TILE, &err);
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
-  mem->w = w;
-  mem->h = h;
-  mem->depth = depth;
-  mem->fmt = *fmt;
-  mem->intel_fmt = intel_fmt;
-  mem->bpp = bpp;
-  mem->is_image = 1;
-  mem->row_pitch = aligned_pitch;
-  mem->slice_pitch = image_type == CL_MEM_OBJECT_IMAGE1D || image_type == CL_MEM_OBJECT_IMAGE2D ? 0 : aligned_pitch*aligned_h;
-  mem->tiling = tiling;
-  mem->type = image_type;
-
   cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+  aligned_slice_pitch = (image_type == CL_MEM_OBJECT_IMAGE1D
+                         || image_type == CL_MEM_OBJECT_IMAGE2D) ? 0 : aligned_pitch * ALIGN(h, 2);
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+                    intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+                    0, 0, 0);
 
   /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR)
-    cl_mem_copy_image(mem, pitch, slice_pitch, data);
+  if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+    cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
+    if (flags & CL_MEM_USE_HOST_PTR) {
+      mem->host_ptr = data;
+      cl_mem_image(mem)->host_row_pitch = pitch;
+      cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+    }
+  }
 
 exit:
   if (errcode_ret)
@@ -479,17 +536,18 @@ cl_mem_new_image(cl_context context,
 LOCAL void
 cl_mem_delete(cl_mem mem)
 {
+  cl_int i;
   if (UNLIKELY(mem == NULL))
     return;
   if (atomic_dec(&mem->ref_n) > 1)
     return;
-  if (LIKELY(mem->bo != NULL))
-    cl_buffer_unreference(mem->bo);
 #ifdef HAS_EGL
-  if (UNLIKELY(mem->egl_image != NULL)) {
-     cl_mem_gl_delete(mem);
+  if (UNLIKELY(IS_GL_IMAGE(mem))) {
+     cl_mem_gl_delete(cl_mem_gl_image(mem));
   }
 #endif
+  if (LIKELY(mem->bo != NULL))
+    cl_buffer_unreference(mem->bo);
 
   /* Remove it from the list */
   assert(mem->ctx);
@@ -503,8 +561,17 @@ cl_mem_delete(cl_mem mem)
   pthread_mutex_unlock(&mem->ctx->buffer_lock);
   cl_context_delete(mem->ctx);
 
-  /* Someone still mapped? */
-  assert(!mem->map_ref);
+  /* Someone still mapped, unmap */
+  if(mem->map_ref > 0) {
+    assert(mem->mapped_ptr);
+    for(i=0; i<mem->mapped_ptr_sz; i++) {
+      if(mem->mapped_ptr[i].ptr != NULL) {
+        mem->map_ref--;
+        cl_mem_unmap_gtt(mem);
+      }
+    }
+    assert(mem->map_ref == 0);
+  }
 
   if (mem->mapped_ptr)
     free(mem->mapped_ptr);
@@ -529,6 +596,390 @@ cl_mem_add_ref(cl_mem mem)
   atomic_inc(&mem->ref_n);
 }
 
+#define LOCAL_SZ_0   16
+#define LOCAL_SZ_1   4
+#define LOCAL_SZ_2   4
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+                       const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+                       size_t src_row_pitch, size_t src_slice_pitch,
+                       size_t dst_row_pitch, size_t dst_slice_pitch) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+  cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
+  cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+  cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+  static const char *str_kernel =
+      "kernel void __cl_cpy_buffer_rect ( \n"
+      "       global char* src, global char* dst, \n"
+      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+      "       unsigned int src_offset, unsigned int dst_offset, \n"
+      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+      "  int i = get_global_id(0); \n"
+      "  int j = get_global_id(1); \n"
+      "  int k = get_global_id(2); \n"
+      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+      "    return; \n"
+      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+      "  dst[dst_offset] = src[src_offset]; \n"
+      "}";
+
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  /* setup the kernel and run. */
+  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
+                         const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  cl_int index = CL_ENQUEUE_COPY_IMAGE_0;
+  char option[40] = "";
+  uint32_t fixupDataType;
+  uint32_t savedIntelFmt;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    strcat(option, "-D SRC_IMAGE_3D");
+    index += 1;
+  }
+  if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    strcat(option, " -D DST_IMAGE_3D");
+    index += 2;
+  }
+
+  switch (src_image->fmt.image_channel_data_type) {
+    case CL_SNORM_INT8:
+    case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
+    case CL_HALF_FLOAT:
+    case CL_SNORM_INT16:
+    case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
+    case CL_FLOAT:       fixupDataType = CL_UNSIGNED_INT32; break;
+    default:
+      fixupDataType = 0;
+  }
+
+  if (fixupDataType) {
+    cl_image_format fmt;
+    if (src_image->fmt.image_channel_order != CL_BGRA)
+      fmt.image_channel_order = src_image->fmt.image_channel_order;
+    else
+      fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = fixupDataType;
+    savedIntelFmt = src_image->intel_fmt;
+    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+    dst_image->intel_fmt = src_image->intel_fmt;
+  }
+  static const char *str_kernel =
+      "#ifdef SRC_IMAGE_3D \n"
+      "  #define SRC_IMAGE_TYPE image3d_t \n"
+      "  #define SRC_COORD_TYPE int4 \n"
+      "#else \n"
+      "  #define SRC_IMAGE_TYPE image2d_t \n"
+      "  #define SRC_COORD_TYPE int2 \n"
+      "#endif \n"
+      "#ifdef DST_IMAGE_3D \n"
+      "  #define DST_IMAGE_TYPE image3d_t \n"
+      "  #define DST_COORD_TYPE int4 \n"
+      "#else \n"
+      "  #define DST_IMAGE_TYPE image2d_t \n"
+      "  #define DST_COORD_TYPE int2 \n"
+      "#endif \n"
+      "kernel void __cl_copy_image ( \n"
+      "       __read_only SRC_IMAGE_TYPE src_image, __write_only DST_IMAGE_TYPE dst_image, \n"
+      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
+      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2) { \n"
+      "  int i = get_global_id(0); \n"
+      "  int j = get_global_id(1); \n"
+      "  int k = get_global_id(2); \n"
+      "  int4 color; \n"
+      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
+      "  SRC_COORD_TYPE src_coord; \n"
+      "  DST_COORD_TYPE dst_coord; \n"
+      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+      "    return; \n"
+      "  src_coord.x = src_origin0 + i; \n"
+      "  src_coord.y = src_origin1 + j; \n"
+      "#ifdef SRC_IMAGE_3D \n"
+      "  src_coord.z = src_origin2 + k; \n"
+      "#endif \n"
+      "  dst_coord.x = dst_origin0 + i; \n"
+      "  dst_coord.y = dst_origin1 + j; \n"
+      "#ifdef DST_IMAGE_3D \n"
+      "  dst_coord.z = dst_origin2 + k; \n"
+      "#endif \n"
+      "  color = read_imagei(src_image, sampler, src_coord); \n"
+      "  write_imagei(dst_image, dst_coord, color); \n"
+      "}";
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_image->base.ctx == dst_image->base.ctx);
+
+  /* setup the kernel and run. */
+  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if (!ker) {
+    ret = CL_OUT_OF_RESOURCES;
+    goto fail;
+  }
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+  if (fixupDataType) {
+    src_image->intel_fmt = savedIntelFmt;
+    dst_image->intel_fmt = savedIntelFmt;
+  }
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+                         const size_t *src_origin, const size_t dst_offset, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0;
+  char option[40] = "";
+  uint32_t intel_fmt, bpp;
+  cl_image_format fmt;
+  size_t origin0, region0;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    strcat(option, "-D IMAGE_3D");
+    index += 1;
+  }
+
+  static const char *str_kernel =
+      "#ifdef IMAGE_3D \n"
+      "  #define IMAGE_TYPE image3d_t \n"
+      "  #define COORD_TYPE int4 \n"
+      "#else \n"
+      "  #define IMAGE_TYPE image2d_t \n"
+      "  #define COORD_TYPE int2 \n"
+      "#endif \n"
+      "kernel void __cl_copy_image_to_buffer ( \n"
+      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
+      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
+      "       unsigned int dst_offset) { \n"
+      "  int i = get_global_id(0); \n"
+      "  int j = get_global_id(1); \n"
+      "  int k = get_global_id(2); \n"
+      "  uint4 color; \n"
+      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
+      "  COORD_TYPE src_coord; \n"
+      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+      "    return; \n"
+      "  src_coord.x = src_origin0 + i; \n"
+      "  src_coord.y = src_origin1 + j; \n"
+      "#ifdef IMAGE_3D \n"
+      "  src_coord.z = src_origin2 + k; \n"
+      "#endif \n"
+      "  color = read_imageui(image, sampler, src_coord); \n"
+      "  dst_offset += (k * region1 + j) * region0 + i; \n"
+      "  buffer[dst_offset] = color.x; \n"
+      "}";
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(image->base.ctx == buffer->ctx);
+
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+  intel_fmt = image->intel_fmt;
+  bpp = image->bpp;
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = image->w * image->bpp;
+  image->bpp = 1;
+  region0 = region[0] * bpp;
+  origin0 = src_origin[0] * bpp;
+  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+  /* setup the kernel and run. */
+  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if (!ker) {
+    ret = CL_OUT_OF_RESOURCES;
+    goto fail;
+  }
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->w = image->w / bpp;
+
+  return ret;
+}
+
+
+LOCAL cl_int
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+                         const size_t src_offset, const size_t *dst_origin, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0;
+  char option[40] = "";
+  uint32_t intel_fmt, bpp;
+  cl_image_format fmt;
+  size_t origin0, region0;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    strcat(option, "-D IMAGE_3D");
+    index += 1;
+  }
+
+  static const char *str_kernel =
+      "#ifdef IMAGE_3D \n"
+      "  #define IMAGE_TYPE image3d_t \n"
+      "  #define COORD_TYPE int4 \n"
+      "#else \n"
+      "  #define IMAGE_TYPE image2d_t \n"
+      "  #define COORD_TYPE int2 \n"
+      "#endif \n"
+      "kernel void __cl_copy_image_to_buffer ( \n"
+      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
+      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, \n"
+      "       unsigned int src_offset) { \n"
+      "  int i = get_global_id(0); \n"
+      "  int j = get_global_id(1); \n"
+      "  int k = get_global_id(2); \n"
+      "  uint4 color = (uint4)(0); \n"
+      "  COORD_TYPE dst_coord; \n"
+      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+      "    return; \n"
+      "  dst_coord.x = dst_origin0 + i; \n"
+      "  dst_coord.y = dst_origin1 + j; \n"
+      "#ifdef IMAGE_3D \n"
+      "  dst_coord.z = dst_origin2 + k; \n"
+      "#endif \n"
+      "  src_offset += (k * region1 + j) * region0 + i; \n"
+      "  color.x = buffer[src_offset]; \n"
+      "  write_imageui(image, dst_coord, color); \n"
+      "}";
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(image->base.ctx == buffer->ctx);
+
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+  intel_fmt = image->intel_fmt;
+  bpp = image->bpp;
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = image->w * image->bpp;
+  image->bpp = 1;
+  region0 = region[0] * bpp;
+  origin0 = dst_origin[0] * bpp;
+  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+  /* setup the kernel and run. */
+  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->w = image->w / bpp;
+
+  return ret;
+}
+
+
 LOCAL void*
 cl_mem_map(cl_mem mem)
 {
@@ -552,6 +1003,14 @@ cl_mem_map_gtt(cl_mem mem)
   return cl_buffer_get_virtual(mem->bo);
 }
 
+LOCAL void *
+cl_mem_map_gtt_unsync(cl_mem mem)
+{
+  cl_buffer_map_gtt_unsync(mem->bo);
+  assert(cl_buffer_get_virtual(mem->bo));
+  return cl_buffer_get_virtual(mem->bo);
+}
+
 LOCAL cl_int
 cl_mem_unmap_gtt(cl_mem mem)
 {
@@ -562,7 +1021,7 @@ cl_mem_unmap_gtt(cl_mem mem)
 LOCAL void*
 cl_mem_map_auto(cl_mem mem)
 {
-  if (mem->is_image && mem->tiling != CL_NO_TILE)
+  if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
     return cl_mem_map_gtt(mem);
   else
     return cl_mem_map(mem);
@@ -571,7 +1030,7 @@ cl_mem_map_auto(cl_mem mem)
 LOCAL cl_int
 cl_mem_unmap_auto(cl_mem mem)
 {
-  if (mem->is_image && mem->tiling != CL_NO_TILE)
+  if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
     cl_buffer_unmap_gtt(mem->bo);
   else
     cl_buffer_unmap(mem->bo);
@@ -597,4 +1056,3 @@ cl_mem_unpin(cl_mem mem)
   cl_buffer_unpin(mem->bo);
   return CL_SUCCESS;
 }
-
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 1b1709a..ca601f9 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -21,8 +21,10 @@
 #define __CL_MEM_H__
 
 #include "cl_internals.h"
-#include "cl_driver.h"
+#include "cl_driver_type.h"
 #include "CL/cl.h"
+#include "cl_khr_icd.h"
+#include <assert.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
@@ -62,31 +64,103 @@ typedef struct _cl_mem_dstr_cb {
 }cl_mem_dstr_cb;
 
 /* Used for buffers and images */
-struct _cl_mem {
-  DEFINE_ICD(dispatch)
+enum cl_mem_type {
+  CL_MEM_BUFFER_TYPE,
+  CL_MEM_IMAGE_TYPE,
+  CL_MEM_GL_IMAGE_TYPE,
+};
+#define IS_IMAGE(mem) ((mem)->type >= CL_MEM_IMAGE_TYPE)       /* image or GL image */
+#define IS_GL_IMAGE(mem) ((mem)->type == CL_MEM_GL_IMAGE_TYPE) /* GL image only */
+
+typedef  struct _cl_mem {
   uint64_t magic;           /* To identify it as a memory object */
+  DEFINE_ICD(dispatch)
+  cl_mem prev, next;        /* We chain the memory buffers together */
+  enum cl_mem_type type;
   volatile int ref_n;       /* This object is reference counted */
   cl_buffer bo;             /* Data in GPU memory */
-  void *egl_image;          /* created from external egl image*/
   size_t size;              /* original request size, not alignment size, used in constant buffer */
-  cl_mem prev, next;        /* We chain the memory buffers together */
   cl_context ctx;           /* Context it belongs to */
   cl_mem_flags flags;       /* Flags specified at the creation time */
-  uint32_t is_image;        /* Indicate if this is an image or not */
-  cl_image_format fmt;      /* only for images */
-  cl_mem_object_type type;  /* only for images 1D/2D...*/
-  size_t w,h,depth;         /* only for images (depth is only for 3D images) */
-  size_t row_pitch,slice_pitch;
-  uint32_t intel_fmt;       /* format to provide in the surface state */
-  uint32_t bpp;             /* number of bytes per pixel */
-  cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
   void * host_ptr;          /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR */
   cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
   int mapped_ptr_sz;        /* The array size of mapped_ptr. */
   int map_ref;              /* The mapped count. */
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
+} _cl_mem;
+
+struct _cl_mem_image {
+  _cl_mem base;
+  cl_image_format fmt;            /* only for images */
+  uint32_t intel_fmt;             /* format to provide in the surface state */
+  uint32_t bpp;                   /* number of bytes per pixel */
+  cl_mem_object_type image_type;  /* only for images 1D/2D...*/
+  size_t w, h, depth;             /* only for images (depth is only for 3D images) */
+  size_t row_pitch, slice_pitch;
+  size_t host_row_pitch, host_slice_pitch;
+  cl_image_tiling_t tiling;       /* only IVB+ supports TILE_[X,Y] (image only) */
+  size_t tile_x, tile_y;          /* tile offset, used for mipmap images.  */
+  size_t offset;
+};
+
+struct _cl_mem_gl_image {
+  struct _cl_mem_image base;
+  uint32_t target;
+  int      miplevel;
+  uint32_t texture;
+};
+
+inline static void
+cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
+                  cl_mem_object_type image_type,
+                  size_t depth, cl_image_format fmt,
+                  uint32_t intel_fmt, uint32_t bpp,
+                  size_t row_pitch, size_t slice_pitch,
+                  cl_image_tiling_t tiling,
+                  size_t tile_x, size_t tile_y,
+                  size_t offset)
+{
+  image->w = w;
+  image->h = h;
+  image->image_type = image_type;
+  image->depth = depth;
+  image->fmt = fmt;
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->row_pitch = row_pitch;
+  image->slice_pitch = slice_pitch;
+  image->tiling = tiling;
+  image->tile_x = tile_x;
+  image->tile_y = tile_y;
+  image->offset = offset;
+}
+
+struct _cl_mem_buffer {
+  _cl_mem base;
+  size_t offset;
 };
 
+inline static struct _cl_mem_image *
+cl_mem_image(cl_mem mem)
+{
+  assert(IS_IMAGE(mem));
+  return (struct _cl_mem_image *)mem;
+}
+
+inline static struct _cl_mem_gl_image *
+cl_mem_gl_image(cl_mem mem)
+{
+  assert(IS_GL_IMAGE(mem));
+  return (struct _cl_mem_gl_image*)mem;
+}
+
+inline static struct _cl_mem_buffer *
+cl_mem_buffer(cl_mem mem)
+{
+  assert(!IS_IMAGE(mem));
+  return (struct _cl_mem_buffer *)mem;
+}
+
 /* Query information about a memory object */
 extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
 
@@ -94,7 +168,7 @@ extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t
 extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
 
 /* Create a new memory object and initialize it with possible user data */
-extern cl_mem cl_mem_new(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
 
 /* Idem but this is an image */
 extern cl_mem
@@ -109,11 +183,28 @@ cl_mem_new_image(cl_context context,
 extern void cl_mem_delete(cl_mem);
 
 /* Destroy egl image. */
-extern void cl_mem_gl_delete(cl_mem);
+extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
 
 /* Add one more reference to this object */
 extern void cl_mem_add_ref(cl_mem);
 
+/* Helper for the clEnqueueCopyBufferRect API */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+                                     const size_t *, const size_t *, const size_t *,
+                                     size_t, size_t, size_t, size_t);
+
+/* Helper for the clEnqueueCopyImage API */
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
+                                       const size_t *, const size_t *, const size_t *);
+
+/* Helper for the clEnqueueCopyImageToBuffer API */
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+                                          const size_t *, const size_t, const size_t *);
+
+/* Helper for the clEnqueueCopyBufferToImage API */
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+                                          const size_t, const size_t *, const size_t *);
+
 /* Directly map a memory object */
 extern void *cl_mem_map(cl_mem);
 
@@ -123,6 +214,9 @@ extern cl_int cl_mem_unmap(cl_mem);
 /* Directly map a memory object in GTT mode */
 extern void *cl_mem_map_gtt(cl_mem);
 
+/* Directly map a memory object in GTT mode, without waiting for the GPU to be idle */
+extern void *cl_mem_map_gtt_unsync(cl_mem);
+
 /* Unmap a memory object in GTT mode */
 extern cl_int cl_mem_unmap_gtt(cl_mem);
 
@@ -136,5 +230,19 @@ extern cl_int cl_mem_unmap_auto(cl_mem);
 extern cl_int cl_mem_pin(cl_mem);
 extern cl_int cl_mem_unpin(cl_mem);
 
+extern cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+                cl_context ctx,
+                cl_mem_flags flags,
+                size_t sz,
+                cl_int is_tiled,
+                cl_int *errcode);
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+                         void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+                         const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+                         const struct _cl_mem_image *image);
+
 #endif /* __CL_MEM_H__ */
 
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index f247171..28d2ac6 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -37,191 +37,46 @@
 #include "CL/cl_intel.h"
 #include "CL/cl_gl.h"
 
-#ifndef CL_VERSION_1_2
-#define CL_INVALID_IMAGE_DESCRIPTOR -65
-#endif
 
-static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
-{
-  cl_int ret = CL_SUCCESS;
-
-  switch (tex_format) {
-  case GL_RGBA8:
-  case GL_RGBA:
-  case GL_RGBA16:
-  case GL_RGBA8I:
-  case GL_RGBA16I:
-  case GL_RGBA32I:
-  case GL_RGBA8UI:
-  case GL_RGBA16UI:
-  case GL_RGBA32UI:
-  case GL_RGBA16F:
-  case GL_RGBA32F:
-    cl_format->image_channel_order = CL_RGBA;
-    break;
-  case GL_BGRA:
-    cl_format->image_channel_order = CL_BGRA;
-    break;
-  default:
-    ret = CL_INVALID_IMAGE_DESCRIPTOR;
-    goto error;
-  }
-
-  switch (tex_format) {
-  case GL_RGBA8:
-  case GL_RGBA:
-  case GL_BGRA:
-    cl_format->image_channel_data_type = CL_UNORM_INT8;
-    break;
-  case GL_RGBA16:
-    cl_format->image_channel_data_type = CL_UNORM_INT16;
-    break;
-  case GL_RGBA8I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT8;
-    break;
-  case GL_RGBA16I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT16;
-    break;
-  case GL_RGBA32I:
-    cl_format->image_channel_data_type = CL_SIGNED_INT32;
-    break;
-  case GL_RGBA8UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
-    break;
-  case GL_RGBA16UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
-    break;
-  case GL_RGBA32UI:
-    cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
-    break;
-  case GL_RGBA16F:
-    cl_format->image_channel_data_type = CL_HALF_FLOAT;
-    break;
-  case GL_RGBA32F:
-    cl_format->image_channel_order = CL_FLOAT;
-    break;
-  default:
-    ret = CL_INVALID_IMAGE_DESCRIPTOR;
-    goto error;
-  }
-
-error:
-  return ret;
-}
-
-static cl_mem_object_type
-get_mem_type_from_target(GLenum texture_target)
-{
-  switch(texture_target) {
-  case GL_TEXTURE_1D: return CL_MEM_OBJECT_IMAGE1D;
-  case GL_TEXTURE_2D: return CL_MEM_OBJECT_IMAGE2D;
-  case GL_TEXTURE_3D: return CL_MEM_OBJECT_IMAGE3D;
-  case GL_TEXTURE_1D_ARRAY: return CL_MEM_OBJECT_IMAGE1D_ARRAY;
-  case GL_TEXTURE_2D_ARRAY: return CL_MEM_OBJECT_IMAGE2D_ARRAY;
-  default:
-    assert(0);
-  }
-  return 0;
-}
-
-LOCAL cl_mem cl_mem_new_gl_buffer(cl_context ctx,
-                                  cl_mem_flags flags,
-                                  GLuint buf_obj, 
-                                  cl_int *errcode_ret)
+LOCAL cl_mem
+cl_mem_new_gl_buffer(cl_context ctx,
+                     cl_mem_flags flags,
+                     GLuint buf_obj,
+                     cl_int *errcode_ret)
 {
   NOT_IMPLEMENTED;
 }
 
-EGLImageKHR cl_create_textured_egl_image(cl_context ctx,
-                                         GLenum texture_target,
-                                         GLint miplevel,
-                                         GLuint texture)
-{
-  struct cl_gl_ext_deps *egl_funcs;
-  EGLDisplay egl_display;
-  EGLContext egl_context;
-  EGLint egl_attribs[] = { EGL_GL_TEXTURE_LEVEL_KHR, miplevel, EGL_NONE};
-
-  assert(ctx->props.gl_type == CL_GL_EGL_DISPLAY);
-  egl_funcs =  CL_EXTENSION_GET_FUNCS(ctx, khr_gl_sharing, gl_ext_deps);
-  assert(egl_funcs != NULL);
-  egl_display = (EGLDisplay)ctx->props.egl_display;
-  egl_context = (EGLDisplay)ctx->props.gl_context;
-  return egl_funcs->eglCreateImageKHR_func(egl_display, egl_context,
-                                           EGL_GL_TEXTURE_2D_KHR,
-                                           (EGLClientBuffer)(uintptr_t)texture,
-                                           &egl_attribs[0]);
-}
-
-LOCAL cl_mem cl_mem_new_gl_texture(cl_context ctx,
-                                   cl_mem_flags flags,
-                                   GLenum texture_target,
-                                   GLint miplevel,
-                                   GLuint texture,
-                                   cl_int *errcode_ret)
+LOCAL cl_mem
+cl_mem_new_gl_texture(cl_context ctx,
+                      cl_mem_flags flags,
+                      GLenum texture_target,
+                      GLint miplevel,
+                      GLuint texture,
+                      cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
-  EGLImageKHR egl_image;
-  int w, h, pitch, tiling;
-  unsigned int bpp, intel_fmt;
-  cl_image_format cl_format;
-  unsigned int gl_format;
   /* Check flags consistency */
   if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR)) {
     err = CL_INVALID_ARG_VALUE;
     goto error;
   }
 
-  TRY_ALLOC (mem, CALLOC(struct _cl_mem));
-  mem->ctx = ctx;
-  cl_context_add_ref(ctx);
-
-  egl_image = cl_create_textured_egl_image(ctx, texture_target, miplevel, texture);
-
-  if (egl_image == NULL) {
-    err = CL_INVALID_GL_OBJECT;
+  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+  if (mem == NULL || err != CL_SUCCESS)
     goto error;
-  }
-  mem->egl_image = egl_image;
-  mem->bo = cl_buffer_alloc_from_eglimage(ctx, (void*)egl_image, &gl_format, &w, &h, &pitch, &tiling);
+
+  mem->bo = cl_buffer_alloc_from_texture(ctx, texture_target, miplevel,
+                                         texture, cl_mem_image(mem));
   if (UNLIKELY(mem->bo == NULL)) {
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
     goto error;
   }
-  cl_get_clformat_from_texture(gl_format, &cl_format);
-
-  /* XXX Maybe we'd better to check the hw format in driver? */
-  intel_fmt = cl_image_get_intel_format(&cl_format);
-
-  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
-    err = CL_INVALID_IMAGE_DESCRIPTOR;
-    goto error;
-  }
-  cl_image_byte_per_pixel(&cl_format, &bpp);
-
-  mem->type = get_mem_type_from_target(texture_target);
-  mem->w = w;
-  mem->h = h;
-  mem->depth = 1;
-  mem->fmt = cl_format;
-  mem->intel_fmt = intel_fmt;
-  mem->bpp = bpp;
-  mem->is_image = 1;
-  mem->row_pitch = pitch;
-  mem->slice_pitch = 0;
-  mem->tiling = tiling;
-  mem->ref_n = 1;
-  mem->magic = CL_MAGIC_MEM_HEADER;
-  mem->flags = flags;
 
-  /* Append the buffer in the context buffer list */
-  pthread_mutex_lock(&ctx->buffer_lock);
-    mem->next = ctx->buffers;
-    if (ctx->buffers != NULL)
-      ctx->buffers->prev = mem;
-    ctx->buffers = mem;
-  pthread_mutex_unlock(&ctx->buffer_lock);
+  cl_mem_gl_image(mem)->target = texture_target;
+  cl_mem_gl_image(mem)->miplevel = miplevel;
+  cl_mem_gl_image(mem)->texture = texture;
 
 exit:
   if (errcode_ret)
@@ -234,10 +89,9 @@ error:
 
 }
 
-LOCAL void cl_mem_gl_delete(cl_mem mem)
+LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
 {
-  struct cl_gl_ext_deps *egl_funcs;
-  EGLDisplay egl_display = (EGLDisplay)mem->ctx->props.egl_display;
-  egl_funcs =  CL_EXTENSION_GET_FUNCS(mem->ctx, khr_gl_sharing, gl_ext_deps);
-  egl_funcs->eglDestroyImageKHR_func(egl_display, mem->egl_image);
+  if (gl_image->base.base.bo != NULL)
+    cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
+                                   gl_image->miplevel, gl_image->texture);
 }
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index 33915ce..fdf0d78 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -28,7 +28,7 @@
 
 #define DECL_INFO_STRING(FIELD, STRING) \
     .FIELD = STRING,                    \
-    .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
+    .JOIN(FIELD,_sz) = sizeof(STRING),
 
 static struct _cl_platform_id intel_platform_data = {
   INIT_ICD(dispatch)
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index b8f7d61..6b70aee 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -62,9 +62,10 @@ extern cl_int cl_get_platform_info(cl_platform_id    platform,
 #define _STR(x) #x
 #define _JOINT(x, y) _STR(x) "." _STR(y)
 
-#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
-#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
+
 #define LIBCL_DRIVER_VERSION_STRING _JOINT(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR)
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
 
 #endif /* __CL_PLATFORM_ID_H__ */
 
diff --git a/src/cl_program.c b/src/cl_program.c
index 7870514..a0e0104 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -42,6 +42,15 @@ cl_program_release_sources(cl_program p)
   }
 }
 
+static void
+cl_program_release_binary(cl_program p)
+{
+  if (p->binary) {
+    cl_free(p->binary);
+    p->binary = NULL;
+  }
+}
+
 LOCAL void
 cl_program_delete(cl_program p)
 {
@@ -53,8 +62,9 @@ cl_program_delete(cl_program p)
   /* We are not done with it yet */
   if ((ref = atomic_dec(&p->ref_n)) > 1) return;
 
-  /* Destroy the sources if still allocated */
+  /* Destroy the sources and binary if still allocated */
   cl_program_release_sources(p);
+  cl_program_release_binary(p);
 
   /* Release the build options. */
   if (p->build_opts) {
@@ -149,7 +159,6 @@ cl_program_create_from_binary(cl_context             ctx,
                               cl_int *               binary_status,
                               cl_int *               errcode_ret)
 {
-#if 0
   cl_program program = NULL;
   cl_int err = CL_SUCCESS;
 
@@ -174,7 +183,16 @@ cl_program_create_from_binary(cl_context             ctx,
     goto error;
   }
 
-  // TRY_ALLOC (program, cl_program_new(ctx, (const char *) binaries[0], lengths[0]));
+  program = cl_program_new(ctx);
+
+  // TODO:  Need to check the binary format here to return CL_INVALID_BINARY.
+  TRY_ALLOC(program->binary, cl_calloc(lengths[0], sizeof(char)));
+  memcpy(program->binary, binaries[0], lengths[0]);
+  program->binary_sz = lengths[0];
+  program->source_type = FROM_BINARY;
+
+  if (binary_status)
+    binary_status[0] = CL_SUCCESS;
 
 exit:
   if (errcode_ret)
@@ -184,8 +202,7 @@ error:
   cl_program_delete(program);
   program = NULL;
   goto exit;
-#endif
-  NOT_IMPLEMENTED;
+
   return CL_SUCCESS;
 }
 
@@ -303,6 +320,16 @@ cl_program_build(cl_program p, const char *options)
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
     p->source_type = FROM_LLVM;
+  } else if (p->source_type == FROM_BINARY) {
+    p->opaque = gbe_program_new_from_binary(p->binary, p->binary_sz);
+    if (UNLIKELY(p->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, p);
+    p->source_type = FROM_LLVM;
   }
 
   for (i = 0; i < p->ker_n; i ++) {
diff --git a/src/cl_program.h b/src/cl_program.h
index 996a496..de82fd5 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -48,6 +48,8 @@ struct _cl_program {
   char *bin;              /* The program copied verbatim */
   size_t bin_sz;          /* Its size in memory */
   char *source;           /* Program sources */
+  char *binary;           /* Program binary. */
+  size_t binary_sz;       /* The binary size. */
   uint32_t ker_n;         /* Number of declared kernels */
   uint32_t source_type:2; /* Built from binary, source or LLVM */
   uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index 7e0b7b0..b3f7045 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -49,7 +49,7 @@ uint32_t cl_to_clk(cl_bool normalized_coords,
   }
   return (clk_address << __CLK_ADDRESS_BASE)
          | (normalized_coords << __CLK_NORMALIZED_BASE)
-         | (clk_filter << __CLK_FILTER_BASE);
+         | (clk_filter);
 }
 
 #define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
diff --git a/src/cl_utils.h b/src/cl_utils.h
index bfe418d..fa900a7 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -138,19 +138,21 @@ do {                                                        \
   }                                                         \
 } while (0)
 
-#define CHECK_IMAGE(IMAGE)                                  \
-CHECK_MEM(image);                                           \
+#define CHECK_IMAGE(MEM, IMAGE)                             \
+CHECK_MEM(MEM);                                             \
 do {                                                        \
-  if (UNLIKELY(!IMAGE->is_image)) {                         \
+  if (UNLIKELY(!IS_IMAGE(MEM))) {                           \
     err = CL_INVALID_MEM_OBJECT;                            \
     goto error;                                             \
   }                                                         \
-} while (0)
+} while (0);                                                \
+struct _cl_mem_image *IMAGE;                                \
+IMAGE = cl_mem_image(MEM);                                  \
 
 #define CHECK_EVENT(EVENT)                                    \
   do {                                                        \
     if (UNLIKELY(EVENT == NULL)) {                            \
-      err = CL_INVALID_EVENT;                            \
+      err = CL_INVALID_EVENT;                                 \
       goto error;                                             \
     }                                                         \
     if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) {    \
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
new file mode 100644
index 0000000..b31844e
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#define HAVE_PTHREAD 1
+#include <errno.h>
+#include <time.h>
+#include "main/context.h"
+#include "main/renderbuffer.h"
+#include "main/texobj.h"
+#include <stdbool.h>
+#include <string.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <GL/internal/dri_interface.h>
+#include "intel_mipmap_tree.h"
+#include "intel_regions.h"
+#include "intel_context.h"
+
+#include "intel_dri_resource_sharing.h"
+#include "intel_dri_resource_sharing_int.h"
+
+#include <dlfcn.h>
+/**
+ * Fills in a shared image region (size, tiling, offsets, flink name) from one mipmap level
+ */
+static bool
+intel_setup_cl_region_from_mipmap_tree(void *driver,
+                                       struct intel_context *intel,
+                                       struct intel_mipmap_tree *mt,
+                                       GLuint level, GLuint zoffset,
+                                       struct _intel_dri_share_image_region *region)
+{
+   unsigned int draw_x, draw_y;
+   uint32_t mask_x, mask_y;
+   struct intel_region *null_region = (struct intel_region *)NULL;
+
+   intel_miptree_check_level_layer(mt, level, zoffset);
+
+   _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
+   _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
+
+   region->w = mt->level[level].width;
+   region->h = mt->level[level].height;
+   region->tile_x = draw_x & mask_x;
+   region->tile_y = draw_y & mask_y;
+   region->tiling = mt->region->tiling;
+   /* XXX hard code to 1 right now. */
+   region->depth = 1;
+   region->row_pitch = mt->region->pitch;
+
+   region->offset = _intel_region_get_aligned_offset(mt->region,
+                                                     draw_x & ~mask_x,
+                                                     draw_y & ~mask_y,
+                                                     false);
+   if (!_intel_region_flink(mt->region, &region->name))
+      return false;
+   _intel_region_reference(&null_region, mt->region);
+   return true;
+}
+
+typedef void
+_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
+                                struct gl_texture_object *t );
+_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
+
+typedef struct gl_texture_object *
+_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
+_mesa_lookup_texture_t *__mesa_lookup_texture;
+
+/* Looks up GL texture `texture`; returns NULL unless it matches `target` and `level` is usable. */
+static struct gl_texture_object *
+intel_get_gl_obj_from_texture(void *driver,
+                              struct intel_context *intel,
+                              GLenum target, GLint level,
+                              GLuint texture, GLuint face)
+{
+   struct gl_texture_object *obj;
+   /* Resolve both Mesa entry points first; dlsym() yields NULL on failure. */
+   __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
+   __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
+   if (!__mesa_lookup_texture || !__mesa_test_texobj_completeness)
+      return NULL;
+
+   obj = __mesa_lookup_texture(&intel->ctx, texture);
+   if (!obj || obj->Target != target)
+      return NULL;
+   /* Presumably refreshes the completeness flags tested below. */
+   __mesa_test_texobj_completeness(&intel->ctx, obj);
+   if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete))
+      return NULL;
+   if (level < obj->BaseLevel || level > obj->_MaxLevel)
+      return NULL;
+   return obj;
+}
+
+static GLenum
+get_cl_gl_format(gl_format format)
+{
+   /* Only RGBA8888 maps to GL_RGBA; every other Mesa format, including
+    * ARGB8888, is reported as GL_BGRA. */
+   switch (format) {
+   case MESA_FORMAT_RGBA8888:
+      return GL_RGBA;
+   default:
+      return GL_BGRA;
+   }
+}
+
+static bool
+intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture, void *user_data)
+{
+   struct _intel_dri_share_image_region *region = intel_dri_share_image_region(user_data);
+   struct intel_context *intel = context->driverPrivate;
+   struct gl_texture_object *obj;
+   struct intel_texture_object *iobj;
+   /* XXX Always be face 0? */
+   GLuint face = 0;
+
+   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+   if (obj == NULL)
+     return false;
+   iobj = intel_texture_object(obj);
+   region->gl_format = get_cl_gl_format(obj->Image[face][level]->TexFormat);
+   return intel_setup_cl_region_from_mipmap_tree(driver, intel, iobj->mt, level, 0, region);
+}
+
+static bool
+intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture)
+{
+   struct intel_context *intel = context->driverPrivate;
+   struct gl_texture_object *obj;
+   struct intel_region *region;
+   /* XXX Always be face 0? */
+   GLuint face = 0;
+
+   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+   if (obj == NULL)
+     return false;
+   /* Release via a local handle: _intel_region_release() NULLs its argument. */
+   region = intel_texture_object(obj)->mt->region;
+   _intel_region_release(&region);
+   return true;
+}
+
+static bool
+intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
+                      GLuint bufobj, void *user_data)
+{
+  return false;
+}
+
+static bool
+intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+  return false;
+}
+
+static bool
+intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
+                         GLuint bufobj, void *user_data)
+{
+  return false;
+}
+
+static bool
+intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+  return false;
+}
+
+#include "cl_driver.h"
+void
+intel_set_cl_gl_callbacks(void)
+{
+  cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
+  cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
+  cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
+  cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
+  cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
+  cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
+}
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
new file mode 100644
index 0000000..6d2ce4d
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.h
@@ -0,0 +1,39 @@
+#ifndef __INTEL_DRI_RESOURCE_SHARING_H__
+#define __INTEL_DRI_RESOURCE_SHARING_H__
+
+struct _intel_dri_share_image_region {
+  unsigned int name;
+  size_t w;
+  size_t h;
+  size_t depth;
+  size_t pitch;
+  int tiling;
+  size_t offset;
+  size_t tile_x;
+  size_t tile_y;
+  unsigned int gl_format;
+  size_t row_pitch, slice_pitch;
+};
+
+struct _intel_dri_share_buffer_object {
+  unsigned int name;
+  size_t sz;
+  size_t offset;
+};
+
+inline static struct _intel_dri_share_image_region *
+intel_dri_share_image_region(void *user_data)
+{
+   return (struct _intel_dri_share_image_region *)user_data;
+}
+
+inline static struct _intel_dri_share_buffer_object *
+intel_dri_share_buffer_object(void *user_data)
+{
+   return (struct _intel_dri_share_buffer_object *)user_data;
+}
+
+extern void intel_set_cl_gl_callbacks(void);
+
+
+#endif
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
new file mode 100644
index 0000000..c7b283a
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing_int.h
@@ -0,0 +1,143 @@
+/*****************************************************************
+ * The following functions are copied from the i965 driver, commit
+ * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. They need to be kept
+ * consistent with the dri driver installed on the current system.
+ *****************************************************************/
+static bool
+_intel_region_flink(struct intel_region *region, uint32_t *name)
+{
+   /* Only request a flink name when the region has none yet (name == 0). */
+   const bool needs_flink = (region->name == 0);
+
+   if (needs_flink && drm_intel_bo_flink(region->bo, &region->name) != 0)
+      return false;   /* a non-zero flink result is reported as failure */
+
+   *name = region->name;
+   return true;
+}
+
+#define _DBG(...)
+static void
+_intel_region_release(struct intel_region **region_handle)
+{
+   struct intel_region *region = *region_handle;
+
+   if (region == NULL) { /* nothing to release; the handle already holds NULL */
+      _DBG("%s NULL\n", __FUNCTION__);
+      return;
+   }
+
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+
+   ASSERT(region->refcount > 0);
+   region->refcount--; /* drop the reference held through this handle */
+
+   if (region->refcount == 0) { /* last reference: unreference the bo and free the region */
+      drm_intel_bo_unreference(region->bo);
+
+      free(region);
+   }
+   *region_handle = NULL; /* the caller's handle is cleared whether or not the region was freed */
+}
+
+static void
+_intel_region_reference(struct intel_region **dst, struct intel_region *src)
+{
+   _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
+        *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
+
+   if (src != *dst) { /* pointing *dst at the region it already holds changes nothing */
+      if (*dst)
+         _intel_region_release(dst); /* drop the reference *dst held before */
+
+      if (src)
+         src->refcount++; /* *dst becomes an additional holder of src */
+      *dst = src;
+   }
+}
+
+/**
+ * This function computes masks that may be used to select the bits of the X
+ * and Y coordinates that indicate the offset within a tile.  If the region is
+ * untiled, the masks are set to 0.
+ */
+static void
+_intel_region_get_tile_masks(struct intel_region *region,
+                             uint32_t *mask_x, uint32_t *mask_y,
+                             bool map_stencil_as_y_tiled)
+{
+   int cpp = region->cpp;
+   uint32_t tiling = region->tiling;
+
+   if (map_stencil_as_y_tiled)
+      tiling = I915_TILING_Y; /* the flag overrides the region's own tiling mode */
+
+   switch (tiling) {
+   default:
+      assert(false); /* unknown tiling; with asserts compiled out this falls through to the untiled case */
+   case I915_TILING_NONE:
+      *mask_x = *mask_y = 0;
+      break;
+   case I915_TILING_X:
+      *mask_x = 512 / cpp - 1;
+      *mask_y = 7;
+      break;
+   case I915_TILING_Y:
+      *mask_x = 128 / cpp - 1;
+      *mask_y = 31;
+      break;
+   }
+}
+
+/**
+ * Compute the offset (in bytes) from the start of the region to the given x
+ * and y coordinate.  For tiled regions, caller must ensure that x and y are
+ * multiples of the tile size.
+ */
+static uint32_t
+_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+                                 uint32_t y, bool map_stencil_as_y_tiled)
+{
+   int cpp = region->cpp;
+   uint32_t pitch = region->pitch;
+   uint32_t tiling = region->tiling;
+
+   if (map_stencil_as_y_tiled) {
+      tiling = I915_TILING_Y;
+
+      /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
+       * gets transformed into a 32-high Y-tile.  Accordingly, the pitch of
+       * the resulting region is twice the pitch of the original region, since
+       * each row in the Y-tiled view corresponds to two rows in the actual
+       * W-tiled surface.  So we need to correct the pitch before computing
+       * the offsets.
+       */
+      pitch *= 2;
+   }
+
+   switch (tiling) {
+   default:
+      assert(false); /* unknown tiling; with asserts compiled out this falls through to the untiled formula */
+   case I915_TILING_NONE:
+      return y * pitch + x * cpp;
+   case I915_TILING_X:
+      assert((x % (512 / cpp)) == 0); /* x must be a multiple of 512 / cpp */
+      assert((y % 8) == 0);           /* y must be a multiple of 8 */
+      return y * pitch + x / (512 / cpp) * 4096;
+   case I915_TILING_Y:
+      assert((x % (128 / cpp)) == 0); /* x must be a multiple of 128 / cpp */
+      assert((y % 32) == 0);          /* y must be a multiple of 32 */
+      return y * pitch + x / (128 / cpp) * 4096;
+   }
+}
+
+static void
+_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
+                                GLuint level, GLuint slice,
+                                GLuint *x, GLuint *y)
+{
+   assert(slice < mt->level[level].depth); /* only slice is range-checked; level is used as given */
+
+   *x = mt->level[level].slice[slice].x_offset; /* copy out the stored per-slice x offset */
+   *y = mt->level[level].slice[slice].y_offset; /* copy out the stored per-slice y offset */
+}
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 6c6b9fb..cc33914 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -45,6 +45,13 @@
  *    Zou Nan hai <nanhai.zou at intel.com>
  *
  */
+
+#if defined(HAS_EGL)
+#include "GL/gl.h"
+#include "EGL/egl.h"
+#include "x11/mesa_egl_extension.h"
+#endif
+
 #include "intel_driver.h"
 #include "intel_gpgpu.h"
 #include "intel_batchbuffer.h"
@@ -65,6 +72,8 @@
 #include "cl_alloc.h"
 #include "cl_context.h"
 #include "cl_driver.h"
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
 
 #define SET_BLOCKED_SIGSET(DRIVER)   do {                     \
   sigset_t bl_mask;                                           \
@@ -169,6 +178,7 @@ static void
 intel_driver_open(intel_driver_t *intel, cl_context_prop props)
 {
   int cardi;
+  char *driver_name;
   if (props != NULL
       && props->gl_type != CL_GL_NOSHARE
       && props->gl_type != CL_GL_GLX_DISPLAY
@@ -182,7 +192,7 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
   if(intel->x11_display) {
     if((intel->dri_ctx = getDRI2State(intel->x11_display,
                                      DefaultScreen(intel->x11_display),
-                                     NULL)))
+                                     &driver_name)))
       intel_driver_init_shared(intel, intel->dri_ctx);
     else
       printf("X server found. dri2 connection failed! \n");
@@ -206,15 +216,9 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
     exit(-1);
   }
 
-#if defined(HAS_GBM) && defined(HAS_EGL)
+#ifdef HAS_EGL
   if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
     assert(props->egl_display);
-    intel->gbm = gbm_create_device(intel->fd);
-    if (intel->gbm == NULL) {
-      printf("GBM device create failed.\n");
-      exit(-1);
-    }
-    cl_gbm_set_image_extension(intel->gbm, (void*)props->egl_display);
   }
 #endif
 }
@@ -222,9 +226,6 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
 static void
 intel_driver_close(intel_driver_t *intel)
 {
-#ifdef HAS_GBM
-  if(intel->gbm) gbm_device_destroy(intel->gbm);
-#endif
   if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
   if(intel->x11_display) XCloseDisplay(intel->x11_display);
   if(intel->fd) close(intel->fd);
@@ -325,11 +326,11 @@ intel_driver_unlock_hardware(intel_driver_t *driver)
 }
 
 LOCAL dri_bo*
-intel_driver_share_buffer(intel_driver_t *driver, uint32_t name)
+intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
 {
   assert(!driver->master);
   dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
-                                             "rendering buffer",
+                                             sname,
                                              name);
   return bo;
 }
@@ -380,7 +381,7 @@ cl_intel_driver_new(cl_context_prop props)
   /* We use the first 2 slots(0,1) for all the bufs.
    * Notify the gbe this base index, thus gbe can avoid conflicts
    * when it allocates slots for images*/
-  gbe_set_image_base_index(2);
+  gbe_set_image_base_index(3);
 exit:
   return driver;
 error:
@@ -404,11 +405,9 @@ intel_driver_get_ver(struct intel_driver *drv)
 static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
 static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 
-#if defined(HAS_EGL) && defined(HAS_GBM)
-#include "gbm.h"
-#include "GL/gl.h"
-#include "EGL/egl.h"
-#include "EGL/eglext.h"
+#if defined(HAS_EGL)
+#include "intel_dri_resource_sharing.h"
+#include "cl_image.h"
 static int get_cl_tiling(uint32_t drm_tiling)
 {
   switch(drm_tiling) {
@@ -421,50 +420,166 @@ static int get_cl_tiling(uint32_t drm_tiling)
   return CL_NO_TILE;
 }
 
-static unsigned int get_gl_format(uint32_t gbm_format)
+static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
 {
-  switch(gbm_format) {
-  case GBM_FORMAT_ARGB8888: return GL_BGRA;
-  case GBM_FORMAT_ABGR8888: return GL_RGBA;
+  cl_int ret = CL_SUCCESS; /* returned as-is; becomes -1 for a GL format not listed below */
+
+  switch (tex_format) { /* first pass: channel order */
+  case GL_RGBA8:
+  case GL_RGBA:
+  case GL_RGBA16:
+  case GL_RGBA8I:
+  case GL_RGBA16I:
+  case GL_RGBA32I:
+  case GL_RGBA8UI:
+  case GL_RGBA16UI:
+  case GL_RGBA32UI:
+  case GL_RGBA16F:
+  case GL_RGBA32F:
+    cl_format->image_channel_order = CL_RGBA;
+    break;
+  case GL_BGRA:
+    cl_format->image_channel_order = CL_BGRA;
+    break;
   default:
-    NOT_IMPLEMENTED;
+    ret = -1;
+    goto error;
   }
-  return 0;
+
+  switch (tex_format) { /* second pass: channel data type */
+  case GL_RGBA8:
+  case GL_RGBA:
+  case GL_BGRA:
+    cl_format->image_channel_data_type = CL_UNORM_INT8;
+    break;
+  case GL_RGBA16:
+    cl_format->image_channel_data_type = CL_UNORM_INT16;
+    break;
+  case GL_RGBA8I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT8;
+    break;
+  case GL_RGBA16I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT16;
+    break;
+  case GL_RGBA32I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT32;
+    break;
+  case GL_RGBA8UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+    break;
+  case GL_RGBA16UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+    break;
+  case GL_RGBA32UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+    break;
+  case GL_RGBA16F:
+    cl_format->image_channel_data_type = CL_HALF_FLOAT;
+    break;
+  case GL_RGBA32F:
+    cl_format->image_channel_data_type = CL_FLOAT; /* CL_FLOAT is a data type, not a channel order */
+    break;
+  default:
+    ret = -1;
+    goto error;
+  }
+
+error: /* shared exit for success and failure; ret tells them apart */
+  return ret;
 }
 
-cl_buffer intel_alloc_buffer_from_eglimage(cl_context ctx,
-                                           void* image,
-                                           unsigned int *gl_format,
-                                           int *w, int *h, int *pitch,
-                                           int *tiling)
+static int
+get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
 {
-  struct gbm_bo *bo;
-  uint32_t gbm_format;
-  drm_intel_bo *intel_bo;
-  int32_t name;
-  uint32_t drm_tiling, swizzle;
-  EGLImageKHR egl_image = (EGLImageKHR)image;
-  intel_driver_t *intel = (intel_driver_t*)ctx->drv;
-
-  bo = gbm_bo_import(intel->gbm, GBM_BO_IMPORT_EGL_IMAGE, (void*)egl_image, 0);
-
-  *w = gbm_bo_get_width(bo);
-  *h = gbm_bo_get_height(bo);
-  *pitch = gbm_bo_get_stride(bo);
-  gbm_format = gbm_bo_get_format(bo);
-  *gl_format = get_gl_format(gbm_format);
-  name = cl_gbm_bo_get_name(bo);
-
-  intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, name);
-
-  if (drm_intel_bo_get_tiling(intel_bo, &drm_tiling, &swizzle)!= 0)
-    assert(0);
-  *tiling = get_cl_tiling(drm_tiling);
+  switch(texture_target) { /* map the GL texture target onto the CL image object type */
+  case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+  case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+  case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+  case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
+  case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
+  default:
+    return -1; /* any other target: *type is left unwritten */
+  }
+  return CL_SUCCESS;
+}
 
-  gbm_bo_destroy(bo);
+static cl_buffer
+intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
+                                    int miplevel, unsigned int texture,
+                                    struct _cl_mem_image *image)
+{
+  cl_buffer bo = (cl_buffer) NULL;
+  struct _intel_dri_share_image_region region; /* handed to the acquire call below as its last argument */
+  unsigned int bpp, intel_fmt;
+  cl_image_format cl_format;
+  EGLBoolean ret;
+  EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+                           EGL_GL_TEXTURE_TARGET_MESA, target,
+                           EGL_NONE};
+  ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
+                               EGL_GL_TEXTURE_MESA,
+                               &attrib_list[0], &region);
+  if (!ret) /* acquire failed: nothing to undo, return the NULL bo */
+      goto out;
+
+  bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
+
+  if (bo == NULL) { /* import failed: give the texture back before returning NULL */
+    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+    goto out;
+  }
+  region.tiling = get_cl_tiling(region.tiling); /* converted in place via get_cl_tiling() */
+  if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
+    goto error;
+  intel_fmt = cl_image_get_intel_format(&cl_format);
+  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+    goto error;
+  cl_image_byte_per_pixel(&cl_format, &bpp); /* NOTE(review): return value is not checked */
+  cl_mem_object_type image_type;
+  if (get_mem_type_from_target(target, &image_type) != 0)
+    goto error;
+
+  cl_mem_image_init(image, region.w, region.h,
+                    image_type, region.depth, cl_format,
+                    intel_fmt, bpp, region.row_pitch,
+                    region.slice_pitch, region.tiling,
+                    region.tile_x, region.tile_y, region.offset);
+out: /* reached on success and on the two early failures, where bo is still NULL */
+  return bo;
 
-  return (cl_buffer)intel_bo;
+error: /* a bo was obtained but the image could not be described: undo the import and the acquire */
+  cl_buffer_unreference(bo);
+  eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+  return NULL;
+}
 
+static cl_buffer
+intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
+                                int miplevel, unsigned int texture,
+                                struct _cl_mem_image *image)
+{
+  cl_buffer bo = (cl_buffer) NULL; /* stays NULL unless ctx is an EGL context */
+  if (IS_EGL_CONTEXT(ctx))
+    bo = intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+
+  return bo;
+}
+
+static int
+intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
+                                  int miplevel, unsigned int texture)
+{
+  int status = -1; /* reported when ctx is not an EGL context */
+  if (IS_EGL_CONTEXT(ctx)) {
+    EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+                             EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+                             EGL_GL_TEXTURE_TARGET_MESA, target,
+                             EGL_NONE };
+    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+    status = CL_SUCCESS; /* the release call's own result is not inspected */
+  }
+  return status;
 }
 #endif
 
@@ -510,8 +625,10 @@ intel_setup_callbacks(void)
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
-#ifdef HAS_EGL
-  cl_buffer_alloc_from_eglimage = (cl_buffer_alloc_from_eglimage_cb *) intel_alloc_buffer_from_eglimage;
+#if defined(HAS_EGL)
+  cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
+  cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
+  intel_set_cl_gl_callbacks();
 #endif
   cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
   cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
@@ -519,6 +636,7 @@ intel_setup_callbacks(void)
   cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap;
   cl_buffer_map_gtt = (cl_buffer_map_gtt_cb *) drm_intel_gem_bo_map_gtt;
   cl_buffer_unmap_gtt = (cl_buffer_unmap_gtt_cb *) drm_intel_gem_bo_unmap_gtt;
+  cl_buffer_map_gtt_unsync = (cl_buffer_map_gtt_unsync_cb *) drm_intel_gem_bo_map_unsynchronized;
   cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual;
   cl_buffer_get_size = (cl_buffer_get_size_cb *) drm_intel_bo_get_size;
   cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
@@ -527,4 +645,3 @@ intel_setup_callbacks(void)
   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
   intel_set_gpgpu_callbacks();
 }
-
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index f70f96a..a01d881 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -54,9 +54,6 @@
 #include <drm.h>
 #include <i915_drm.h>
 #include <intel_bufmgr.h>
-#ifdef HAS_GBM
-#include <gbm.h>
-#endif
 
 #define CMD_MI                                  (0x0 << 29)
 #define CMD_2D                                  (0x2 << 29)
@@ -90,9 +87,6 @@ typedef struct intel_driver
   int master;
   Display *x11_display;
   struct dri_state *dri_ctx;
-#ifdef HAS_GBM
-  struct gbm_device *gbm;
-#endif
 } intel_driver_t;
 
 /* device control */
@@ -100,7 +94,7 @@ extern void intel_driver_lock_hardware(intel_driver_t*);
 extern void intel_driver_unlock_hardware(intel_driver_t*);
 
 /* methods working in shared mode */
-extern dri_bo* intel_driver_share_buffer(intel_driver_t*, uint32_t name);
+extern dri_bo* intel_driver_share_buffer(intel_driver_t*, const char *sname, uint32_t name);
 extern uint32_t intel_driver_shared_name(intel_driver_t*, dri_bo*);
 
 /* init driver shared with X using dri state, acquired from X Display */
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 1301b66..5d93a67 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -79,7 +79,7 @@ struct intel_gpgpu
   intel_batchbuffer_t *batch;
   cl_gpgpu_kernel *ker;
   drm_intel_bo *binded_buf[max_buf_n];  /* all buffers binded for the call */
-  uint32_t binded_offset[max_buf_n];    /* their offsets in the constant buffer */
+  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
   uint32_t binded_n;                    /* number of buffers binded */
 
   unsigned long img_bitmap;              /* image usage bitmap. */
@@ -94,8 +94,10 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } vfe_state_b;
   struct { drm_intel_bo *bo; } curbe_b;
   struct { drm_intel_bo *bo; } sampler_state_b;
+  struct { drm_intel_bo *bo; } sampler_border_color_state_b;
   struct { drm_intel_bo *bo; } perf_b;
   struct { drm_intel_bo *bo; } scratch_b;
+  struct { drm_intel_bo *bo; } constant_b;
 
   uint32_t per_thread_scratch;
   struct {
@@ -131,6 +133,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
     drm_intel_bo_unreference(gpgpu->curbe_b.bo);
   if (gpgpu->sampler_state_b.bo)
     drm_intel_bo_unreference(gpgpu->sampler_state_b.bo);
+  if (gpgpu->sampler_border_color_state_b.bo)
+    drm_intel_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
   if (gpgpu->perf_b.bo)
     drm_intel_bo_unreference(gpgpu->perf_b.bo);
   if (gpgpu->stack_b.bo)
@@ -138,6 +142,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
   if (gpgpu->scratch_b.bo)
     drm_intel_bo_unreference(gpgpu->scratch_b.bo);
 
+  if(gpgpu->constant_b.bo)
+    drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
   intel_batchbuffer_delete(gpgpu->batch);
   cl_free(gpgpu);
 }
@@ -197,7 +204,10 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
 #else
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
-  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+   * to a valid bound value, otherwise, the border color pointer may be rejected and you
+   * may get incorrect border color. This is a known hardware bug. */
+  OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
   OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
 #endif /* USE_FULSIM */
@@ -231,7 +241,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
@@ -284,6 +294,7 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
   pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
   pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
   pc->dw1.render_target_cache_flush_enable = 1;
+  pc->dw1.texture_cache_invalidation_enable = 1;
   pc->dw1.cs_stall = 1;
   pc->dw1.dc_flush_enable = 1;
   ADVANCE_BATCH(gpgpu->batch);
@@ -319,7 +330,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
   intel_gpgpu_select_pipeline(gpgpu);
   intel_gpgpu_set_base_address(gpgpu);
   intel_gpgpu_load_vfe_state(gpgpu);
-  intel_gpgpu_load_constant_buffer(gpgpu);
+  intel_gpgpu_load_curbe_buffer(gpgpu);
   intel_gpgpu_load_idrt(gpgpu);
 
   if (gpgpu->perf_b.bo) {
@@ -372,6 +383,7 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
   for (i = 0; i < gpgpu->binded_n; ++i)
     assert(gpgpu->binded_buf[i]->offset != 0);
 }
+
 static void
 intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
@@ -391,7 +403,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   /* Binded buffers */
   gpgpu->binded_n = 0;
   gpgpu->img_bitmap = 0;
-  gpgpu->img_index_base = 2;
+  gpgpu->img_index_base = 3;
   gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
 
   /* URB */
@@ -399,12 +411,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->urb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
-  /* Constant buffer */
+  /* Constant URB  buffer */
   if(gpgpu->curbe_b.bo)
     dri_bo_unreference(gpgpu->curbe_b.bo);
   uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
   size_cb = ALIGN(size_cb, 4096);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64);
   assert(bo);
   gpgpu->curbe_b.bo = bo;
 
@@ -447,6 +459,18 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   memset(bo->virtual, 0, sizeof(gen6_sampler_state_t) * GEN_MAX_SAMPLERS);
   gpgpu->sampler_state_b.bo = bo;
 
+  /* sampler border color state */
+  if (gpgpu->sampler_border_color_state_b.bo)
+    dri_bo_unreference(gpgpu->sampler_border_color_state_b.bo);
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr,
+                    "SAMPLER_BORDER_COLOR_STATE",
+                    sizeof(gen7_sampler_border_color_t),
+                    32);
+  assert(bo);
+  dri_bo_map(bo, 1);
+  memset(bo->virtual, 0, sizeof(gen7_sampler_border_color_t));
+  gpgpu->sampler_border_color_state_b.bo = bo;
+
   /* stack */
   if (gpgpu->stack_b.bo)
     dri_bo_unreference(gpgpu->stack_b.bo);
@@ -468,6 +492,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
                     obj_bo);
 }
 
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size)
+{ /* (Re)allocate the constant bo and describe it as raw buffer surface 2; returns the bo. */
+  uint32_t s = size - 1; /* the surface state fields below encode size - 1 */
+  assert(size != 0);
+
+  surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2];
+  memset(ss2, 0, sizeof(gen7_surface_state_t));
+  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
+  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
+  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
+  ss2->ss5.cache_control = cc_llc_l3;
+  heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t);
+
+  if(gpgpu->constant_b.bo) /* drop the buffer left over from a previous call */
+    dri_bo_unreference(gpgpu->constant_b.bo);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64); /* full size, not size - 1 */
+  assert(gpgpu->constant_b.bo);
+  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset; /* the relocation below is emitted for this same ss1 field */
+  dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      heap->binding_table[2] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      gpgpu->constant_b.bo);
+  return gpgpu->constant_b.bo;
+}
+
+
 /* Map address space with two 2GB surfaces. One surface for untyped message and
  * one surface for byte scatters / gathers. Actually the HW does not require two
  * surfaces but Fulsim complains
@@ -517,6 +574,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               cl_mem_object_type type,
                               int32_t w,
                               int32_t h,
+                              int32_t depth,
                               int32_t pitch,
                               int32_t tiling)
 {
@@ -530,6 +588,9 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   ss->ss1.base_addr = obj_bo->offset;
   ss->ss2.width = w - 1;
   ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+  ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+  ss->ss4.not_str_buf.min_array_element = 0;
   ss->ss3.pitch = pitch - 1;
   ss->ss5.cache_control = cc_llc_l3;
   if (tiling == GPGPU_TILE_X) {
@@ -586,10 +647,11 @@ intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
                        cl_mem_object_type type,
                        int32_t w,
                        int32_t h,
+                       int32_t depth,
                        int32_t pitch,
                        cl_gpgpu_tiling tiling)
 {
-  intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, pitch, tiling);
+  intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, depth, pitch, tiling);
   assert(index < GEN_MAX_SURFACES);
 }
 
@@ -608,10 +670,12 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   ker_bo = (drm_intel_bo *) kernel->bo;
   desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
   desc->desc1.single_program_flow = 1;
+  desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
+  desc->desc5.rounding_mode = 0; /* round to nearest even */
   desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
   desc->desc3.binding_table_entry_count = 0; /* no prefetch */
   desc->desc3.binding_table_pointer = 0;
-  desc->desc4.curbe_read_len = kernel->cst_sz / 32;
+  desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
   desc->desc4.curbe_read_offset = 0;
 
   /* Barriers / SLM are automatically handled on Gen7+ */
@@ -642,7 +706,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
                     ker_bo);
 
   dri_bo_emit_reloc(bo,
-                    I915_GEM_DOMAIN_INSTRUCTION, 0,
+                    I915_GEM_DOMAIN_SAMPLER, 0,
                     0,
                     offsetof(gen6_interface_descriptor_t, desc2),
                     gpgpu->sampler_state_b.bo);
@@ -650,7 +714,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 }
 
 static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
   unsigned char *curbe = NULL;
   cl_gpgpu_kernel *k = gpgpu->ker;
@@ -665,9 +729,9 @@ intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t si
   /* Now put all the relocations for our flat address space */
   for (i = 0; i < k->thread_n; ++i)
     for (j = 0; j < gpgpu->binded_n; ++j) {
-      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = gpgpu->binded_buf[j]->offset;
+      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
       drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
-                              gpgpu->binded_offset[j]+i*k->cst_sz,
+                              gpgpu->binded_offset[j]+i*k->curbe_sz,
                               gpgpu->binded_buf[j],
                               0,
                               I915_GEM_DOMAIN_RENDER,
@@ -692,19 +756,7 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
    case CLK_ADDRESS_REPEAT:
       return GEN_TEXCOORDMODE_WRAP;
    case CLK_ADDRESS_CLAMP:
-      /* GL_CLAMP is the weird mode where coordinates are clamped to
-       * [0.0, 1.0], so linear filtering of coordinates outside of
-       * [0.0, 1.0] give you half edge texel value and half border
-       * color.  The fragment shader will clamp the coordinates, and
-       * we set clamp_border here, which gets the result desired.  We
-       * just use clamp(_to_edge) for nearest, because for nearest
-       * clamping to 1.0 gives border color instead of the desired
-       * edge texels.
-       */
-      if (using_nearest)
-         return GEN_TEXCOORDMODE_CLAMP;
-      else
-         return GEN_TEXCOORDMODE_CLAMP_BORDER;
+      return GEN_TEXCOORDMODE_CLAMP_BORDER;
    case CLK_ADDRESS_CLAMP_TO_EDGE:
       return GEN_TEXCOORDMODE_CLAMP;
    case CLK_ADDRESS_MIRRORED_REPEAT:
@@ -721,7 +773,9 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
   uint32_t wrap_mode;
   gen7_sampler_state_t *sampler;
 
-  sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index;
+  sampler = (gen7_sampler_state_t *)(gpgpu->sampler_state_b.bo->virtual)  + index;
+  memset(sampler, 0, sizeof(*sampler));
+  sampler->ss2.default_color_pointer = (gpgpu->sampler_border_color_state_b.bo->offset) >> 5;
   if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
     sampler->ss3.non_normalized_coord = 1;
   else
@@ -742,9 +796,11 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
   }
 
   wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
-  sampler->ss3.r_wrap_mode = wrap_mode;
   sampler->ss3.s_wrap_mode = wrap_mode;
+  /* XXX mesa i965 driver code points out that if the surface is a 1D surface, we may need
+   * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
   sampler->ss3.t_wrap_mode = wrap_mode;
+  sampler->ss3.r_wrap_mode = wrap_mode;
 
   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
@@ -762,15 +818,36 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
      sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+  dri_bo_emit_reloc(gpgpu->sampler_state_b.bo,
+                    I915_GEM_DOMAIN_SAMPLER, 0,
+                    0,
+                    index * sizeof(gen7_sampler_state_t) +
+                    offsetof(gen7_sampler_state_t, ss2),
+                    gpgpu->sampler_border_color_state_b.bo);
+
 }
 
 static void
 intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
 {
   int index;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+  assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
+#else
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
-  for(index = 0; index < sampler_sz; index++)
-    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
+#endif
+  for(index = 0; index < sampler_sz; index++) {
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+    /* Duplicate the sampler at slot index + 8 with the address mode
+     * changed from CLAMP to CLAMP_TO_EDGE. */
+    if ((samplers[index] & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) {
+      intel_gpgpu_insert_sampler(gpgpu, index + 8,
+                                 (samplers[index] & ~__CLK_ADDRESS_MASK) | CLK_ADDRESS_CLAMP_TO_EDGE);
+    }
+#endif
+  }
 }
 
 static void
@@ -781,6 +858,7 @@ intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
   intel_gpgpu_map_address_space(gpgpu);
   dri_bo_unmap(gpgpu->surface_heap_b.bo);
   dri_bo_unmap(gpgpu->sampler_state_b.bo);
+  dri_bo_unmap(gpgpu->sampler_border_color_state_b.bo);
 }
 
 static void
@@ -821,11 +899,11 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
     OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
   else
     OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
-  OUT_BATCH(gpgpu->batch, global_wk_off[0]);
+  OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
-  OUT_BATCH(gpgpu->batch, global_wk_off[1]);
+  OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
-  OUT_BATCH(gpgpu->batch, global_wk_off[2]);
+  OUT_BATCH(gpgpu->batch, 0);
   OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
   OUT_BATCH(gpgpu->batch, right_mask);
   OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set height as 1, so set bottom mask as all 1*/
@@ -925,7 +1003,8 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
-  cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
+  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
   cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
   cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
   cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index ff339c5..36b5971 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -209,7 +209,17 @@ typedef struct gen7_surface_state
     uint32_t depth:11;
   } ss3;
 
-  uint32_t ss4;
+  union {
+    struct {
+      uint32_t mulsample_pal_idx:3;
+      uint32_t numer_mulsample:3;
+      uint32_t mss_fmt:1;
+      uint32_t rt_view_extent:11;
+      uint32_t min_array_element:11;
+      uint32_t rt_rotate:2;
+      uint32_t pad0:1;
+    } not_str_buf;
+  } ss4;
 
   struct {
     uint32_t mip_count:4;
@@ -380,6 +390,10 @@ typedef struct gen6_sampler_state
   } ss3;
 } gen6_sampler_state_t;
 
+typedef struct gen7_sampler_border_color {
+    float r,g,b,a;
+} gen7_sampler_border_color_t;
+
 typedef struct gen7_sampler_state
 {
   struct {
diff --git a/src/x11/dricommon.h b/src/x11/dricommon.h
index 08e66a5..5a950b4 100644
--- a/src/x11/dricommon.h
+++ b/src/x11/dricommon.h
@@ -94,11 +94,6 @@ void dri_state_release(dri_state_t*);
 
 // Create a dri2 state from dpy and screen
 dri_state_t *getDRI2State(Display* dpy, int screen, char **driver_name);
-#ifdef HAS_GBM
-#include<gbm.h>
-void cl_gbm_set_image_extension(struct gbm_device *gbm, void *display);
-int cl_gbm_bo_get_name(struct gbm_bo *bo);
-#endif
 
 #endif /* _VA_DRICOMMON_H_ */
 
diff --git a/src/x11/gbm_deps/backend.h b/src/x11/gbm_deps/backend.h
deleted file mode 100644
index 4a64375..0000000
--- a/src/x11/gbm_deps/backend.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef MODULE_H_
-#define MODULE_H_
-
-#include "gbmint.h"
-
-struct gbm_device *
-_gbm_create_device(int fd);
-
-#endif
diff --git a/src/x11/gbm_deps/common.h b/src/x11/gbm_deps/common.h
deleted file mode 100644
index 1fcdfca..0000000
--- a/src/x11/gbm_deps/common.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-#include <libudev.h>
-
-struct udev_device *
-_gbm_udev_device_new_from_fd(struct udev *udev, int fd);
-
-char *
-_gbm_fd_get_device_name(int fd);
-
-void
-_gbm_log(const char *fmt_str, ...);
-
-#endif
diff --git a/src/x11/gbm_deps/common_drm.h b/src/x11/gbm_deps/common_drm.h
deleted file mode 100644
index d28c3f0..0000000
--- a/src/x11/gbm_deps/common_drm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _COMMON_DRM_H_
-#define _COMMON_DRM_H_
-
-#include "gbmint.h"
-
-enum gbm_drm_driver_type {
-   GBM_DRM_DRIVER_TYPE_DRI,
-   GBM_DRM_DRIVER_TYPE_GALLIUM,
-};
-
-struct gbm_drm_device {
-   struct gbm_device base;
-   enum gbm_drm_driver_type type;
-   char *driver_name;
-};
-
-struct gbm_drm_bo {
-   struct gbm_bo base;
-};
-
-#endif
diff --git a/src/x11/gbm_deps/gbm.h b/src/x11/gbm_deps/gbm.h
deleted file mode 100644
index e516df2..0000000
--- a/src/x11/gbm_deps/gbm.h
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _GBM_H_
-#define _GBM_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#define __GBM__ 1
-
-#include <stdint.h>
-
-/**
- * \file gbm.h
- * \brief Generic Buffer Manager
- */
-
-struct gbm_device;
-struct gbm_bo;
-struct gbm_surface;
-
-/**
- * \mainpage The Generic Buffer Manager
- *
- * This module provides an abstraction that the caller can use to request a
- * buffer from the underlying memory management system for the platform.
- *
- * This allows the creation of portable code whilst still allowing access to
- * the underlying memory manager.
- */
-
-/**
- * Abstraction representing the handle to a buffer allocated by the
- * manager
- */
-union gbm_bo_handle {
-   void *ptr;
-   int32_t s32;
-   uint32_t u32;
-   int64_t s64;
-   uint64_t u64;
-};
-
-/** Format of the allocated buffer */
-enum gbm_bo_format {
-   /** RGB with 8 bits per channel in a 32 bit value */
-   GBM_BO_FORMAT_XRGB8888,
-   /** ARGB with 8 bits per channel in a 32 bit value */
-   GBM_BO_FORMAT_ARGB8888
-};
-
-#define __gbm_fourcc_code(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
-			      ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
-
-#define GBM_FORMAT_BIG_ENDIAN (1<<31) /* format is big endian instead of little endian */
-
-/* color index */
-#define GBM_FORMAT_C8		__gbm_fourcc_code('C', '8', ' ', ' ') /* [7:0] C */
-
-/* 8 bpp RGB */
-#define GBM_FORMAT_RGB332	__gbm_fourcc_code('R', 'G', 'B', '8') /* [7:0] R:G:B 3:3:2 */
-#define GBM_FORMAT_BGR233	__gbm_fourcc_code('B', 'G', 'R', '8') /* [7:0] B:G:R 2:3:3 */
-
-/* 16 bpp RGB */
-#define GBM_FORMAT_XRGB4444	__gbm_fourcc_code('X', 'R', '1', '2') /* [15:0] x:R:G:B 4:4:4:4 little endian */
-#define GBM_FORMAT_XBGR4444	__gbm_fourcc_code('X', 'B', '1', '2') /* [15:0] x:B:G:R 4:4:4:4 little endian */
-#define GBM_FORMAT_RGBX4444	__gbm_fourcc_code('R', 'X', '1', '2') /* [15:0] R:G:B:x 4:4:4:4 little endian */
-#define GBM_FORMAT_BGRX4444	__gbm_fourcc_code('B', 'X', '1', '2') /* [15:0] B:G:R:x 4:4:4:4 little endian */
-
-#define GBM_FORMAT_ARGB4444	__gbm_fourcc_code('A', 'R', '1', '2') /* [15:0] A:R:G:B 4:4:4:4 little endian */
-#define GBM_FORMAT_ABGR4444	__gbm_fourcc_code('A', 'B', '1', '2') /* [15:0] A:B:G:R 4:4:4:4 little endian */
-#define GBM_FORMAT_RGBA4444	__gbm_fourcc_code('R', 'A', '1', '2') /* [15:0] R:G:B:A 4:4:4:4 little endian */
-#define GBM_FORMAT_BGRA4444	__gbm_fourcc_code('B', 'A', '1', '2') /* [15:0] B:G:R:A 4:4:4:4 little endian */
-
-#define GBM_FORMAT_XRGB1555	__gbm_fourcc_code('X', 'R', '1', '5') /* [15:0] x:R:G:B 1:5:5:5 little endian */
-#define GBM_FORMAT_XBGR1555	__gbm_fourcc_code('X', 'B', '1', '5') /* [15:0] x:B:G:R 1:5:5:5 little endian */
-#define GBM_FORMAT_RGBX5551	__gbm_fourcc_code('R', 'X', '1', '5') /* [15:0] R:G:B:x 5:5:5:1 little endian */
-#define GBM_FORMAT_BGRX5551	__gbm_fourcc_code('B', 'X', '1', '5') /* [15:0] B:G:R:x 5:5:5:1 little endian */
-
-#define GBM_FORMAT_ARGB1555	__gbm_fourcc_code('A', 'R', '1', '5') /* [15:0] A:R:G:B 1:5:5:5 little endian */
-#define GBM_FORMAT_ABGR1555	__gbm_fourcc_code('A', 'B', '1', '5') /* [15:0] A:B:G:R 1:5:5:5 little endian */
-#define GBM_FORMAT_RGBA5551	__gbm_fourcc_code('R', 'A', '1', '5') /* [15:0] R:G:B:A 5:5:5:1 little endian */
-#define GBM_FORMAT_BGRA5551	__gbm_fourcc_code('B', 'A', '1', '5') /* [15:0] B:G:R:A 5:5:5:1 little endian */
-
-#define GBM_FORMAT_RGB565	__gbm_fourcc_code('R', 'G', '1', '6') /* [15:0] R:G:B 5:6:5 little endian */
-#define GBM_FORMAT_BGR565	__gbm_fourcc_code('B', 'G', '1', '6') /* [15:0] B:G:R 5:6:5 little endian */
-
-/* 24 bpp RGB */
-#define GBM_FORMAT_RGB888	__gbm_fourcc_code('R', 'G', '2', '4') /* [23:0] R:G:B little endian */
-#define GBM_FORMAT_BGR888	__gbm_fourcc_code('B', 'G', '2', '4') /* [23:0] B:G:R little endian */
-
-/* 32 bpp RGB */
-#define GBM_FORMAT_XRGB8888	__gbm_fourcc_code('X', 'R', '2', '4') /* [31:0] x:R:G:B 8:8:8:8 little endian */
-#define GBM_FORMAT_XBGR8888	__gbm_fourcc_code('X', 'B', '2', '4') /* [31:0] x:B:G:R 8:8:8:8 little endian */
-#define GBM_FORMAT_RGBX8888	__gbm_fourcc_code('R', 'X', '2', '4') /* [31:0] R:G:B:x 8:8:8:8 little endian */
-#define GBM_FORMAT_BGRX8888	__gbm_fourcc_code('B', 'X', '2', '4') /* [31:0] B:G:R:x 8:8:8:8 little endian */
-
-#define GBM_FORMAT_ARGB8888	__gbm_fourcc_code('A', 'R', '2', '4') /* [31:0] A:R:G:B 8:8:8:8 little endian */
-#define GBM_FORMAT_ABGR8888	__gbm_fourcc_code('A', 'B', '2', '4') /* [31:0] A:B:G:R 8:8:8:8 little endian */
-#define GBM_FORMAT_RGBA8888	__gbm_fourcc_code('R', 'A', '2', '4') /* [31:0] R:G:B:A 8:8:8:8 little endian */
-#define GBM_FORMAT_BGRA8888	__gbm_fourcc_code('B', 'A', '2', '4') /* [31:0] B:G:R:A 8:8:8:8 little endian */
-
-#define GBM_FORMAT_XRGB2101010	__gbm_fourcc_code('X', 'R', '3', '0') /* [31:0] x:R:G:B 2:10:10:10 little endian */
-#define GBM_FORMAT_XBGR2101010	__gbm_fourcc_code('X', 'B', '3', '0') /* [31:0] x:B:G:R 2:10:10:10 little endian */
-#define GBM_FORMAT_RGBX1010102	__gbm_fourcc_code('R', 'X', '3', '0') /* [31:0] R:G:B:x 10:10:10:2 little endian */
-#define GBM_FORMAT_BGRX1010102	__gbm_fourcc_code('B', 'X', '3', '0') /* [31:0] B:G:R:x 10:10:10:2 little endian */
-
-#define GBM_FORMAT_ARGB2101010	__gbm_fourcc_code('A', 'R', '3', '0') /* [31:0] A:R:G:B 2:10:10:10 little endian */
-#define GBM_FORMAT_ABGR2101010	__gbm_fourcc_code('A', 'B', '3', '0') /* [31:0] A:B:G:R 2:10:10:10 little endian */
-#define GBM_FORMAT_RGBA1010102	__gbm_fourcc_code('R', 'A', '3', '0') /* [31:0] R:G:B:A 10:10:10:2 little endian */
-#define GBM_FORMAT_BGRA1010102	__gbm_fourcc_code('B', 'A', '3', '0') /* [31:0] B:G:R:A 10:10:10:2 little endian */
-
-/* packed YCbCr */
-#define GBM_FORMAT_YUYV		__gbm_fourcc_code('Y', 'U', 'Y', 'V') /* [31:0] Cr0:Y1:Cb0:Y0 8:8:8:8 little endian */
-#define GBM_FORMAT_YVYU		__gbm_fourcc_code('Y', 'V', 'Y', 'U') /* [31:0] Cb0:Y1:Cr0:Y0 8:8:8:8 little endian */
-#define GBM_FORMAT_UYVY		__gbm_fourcc_code('U', 'Y', 'V', 'Y') /* [31:0] Y1:Cr0:Y0:Cb0 8:8:8:8 little endian */
-#define GBM_FORMAT_VYUY		__gbm_fourcc_code('V', 'Y', 'U', 'Y') /* [31:0] Y1:Cb0:Y0:Cr0 8:8:8:8 little endian */
-
-#define GBM_FORMAT_AYUV		__gbm_fourcc_code('A', 'Y', 'U', 'V') /* [31:0] A:Y:Cb:Cr 8:8:8:8 little endian */
-
-/*
- * 2 plane YCbCr
- * index 0 = Y plane, [7:0] Y
- * index 1 = Cr:Cb plane, [15:0] Cr:Cb little endian
- * or
- * index 1 = Cb:Cr plane, [15:0] Cb:Cr little endian
- */
-#define GBM_FORMAT_NV12		__gbm_fourcc_code('N', 'V', '1', '2') /* 2x2 subsampled Cr:Cb plane */
-#define GBM_FORMAT_NV21		__gbm_fourcc_code('N', 'V', '2', '1') /* 2x2 subsampled Cb:Cr plane */
-#define GBM_FORMAT_NV16		__gbm_fourcc_code('N', 'V', '1', '6') /* 2x1 subsampled Cr:Cb plane */
-#define GBM_FORMAT_NV61		__gbm_fourcc_code('N', 'V', '6', '1') /* 2x1 subsampled Cb:Cr plane */
-
-/*
- * 3 plane YCbCr
- * index 0: Y plane, [7:0] Y
- * index 1: Cb plane, [7:0] Cb
- * index 2: Cr plane, [7:0] Cr
- * or
- * index 1: Cr plane, [7:0] Cr
- * index 2: Cb plane, [7:0] Cb
- */
-#define GBM_FORMAT_YUV410	__gbm_fourcc_code('Y', 'U', 'V', '9') /* 4x4 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU410	__gbm_fourcc_code('Y', 'V', 'U', '9') /* 4x4 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV411	__gbm_fourcc_code('Y', 'U', '1', '1') /* 4x1 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU411	__gbm_fourcc_code('Y', 'V', '1', '1') /* 4x1 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV420	__gbm_fourcc_code('Y', 'U', '1', '2') /* 2x2 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU420	__gbm_fourcc_code('Y', 'V', '1', '2') /* 2x2 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV422	__gbm_fourcc_code('Y', 'U', '1', '6') /* 2x1 subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU422	__gbm_fourcc_code('Y', 'V', '1', '6') /* 2x1 subsampled Cr (1) and Cb (2) planes */
-#define GBM_FORMAT_YUV444	__gbm_fourcc_code('Y', 'U', '2', '4') /* non-subsampled Cb (1) and Cr (2) planes */
-#define GBM_FORMAT_YVU444	__gbm_fourcc_code('Y', 'V', '2', '4') /* non-subsampled Cr (1) and Cb (2) planes */
-
-
-/**
- * Flags to indicate the intended use for the buffer - these are passed into
- * gbm_bo_create(). The caller must set the union of all the flags that are
- * appropriate
- *
- * \sa Use gbm_device_is_format_supported() to check if the combination of format
- * and use flags are supported
- */
-enum gbm_bo_flags {
-   /**
-    * Buffer is going to be presented to the screen using an API such as KMS
-    */
-   GBM_BO_USE_SCANOUT      = (1 << 0),
-   /**
-    * Buffer is going to be used as cursor - the dimensions for the buffer
-    * must be 64x64 if this flag is passed.
-    */
-   GBM_BO_USE_CURSOR_64X64 = (1 << 1),
-   /**
-    * Buffer is to be used for rendering - for example it is going to be used
-    * as the storage for a color buffer
-    */
-   GBM_BO_USE_RENDERING    = (1 << 2),
-   /**
-    * Buffer can be used for gbm_bo_write.  This is guaranteed to work
-    * with GBM_BO_USE_CURSOR_64X64. but may not work for other
-    * combinations.
-    */
-   GBM_BO_USE_WRITE    = (1 << 3),
-};
-
-int
-gbm_device_get_fd(struct gbm_device *gbm);
-
-const char *
-gbm_device_get_backend_name(struct gbm_device *gbm);
-
-int
-gbm_device_is_format_supported(struct gbm_device *gbm,
-                               uint32_t format, uint32_t usage);
-
-void
-gbm_device_destroy(struct gbm_device *gbm);
-
-struct gbm_device *
-gbm_create_device(int fd);
-
-struct gbm_bo *
-gbm_bo_create(struct gbm_device *gbm,
-              uint32_t width, uint32_t height,
-              uint32_t format, uint32_t flags);
-
-#define GBM_BO_IMPORT_WL_BUFFER         0x5501
-#define GBM_BO_IMPORT_EGL_IMAGE         0x5502
-
-struct gbm_bo *
-gbm_bo_import(struct gbm_device *gbm, uint32_t type,
-              void *buffer, uint32_t usage);
-
-uint32_t
-gbm_bo_get_width(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_height(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_stride(struct gbm_bo *bo);
-
-uint32_t
-gbm_bo_get_format(struct gbm_bo *bo);
-
-struct gbm_device *
-gbm_bo_get_device(struct gbm_bo *bo);
-
-union gbm_bo_handle
-gbm_bo_get_handle(struct gbm_bo *bo);
-
-int
-gbm_bo_write(struct gbm_bo *bo, const void *buf, size_t count);
-
-void
-gbm_bo_set_user_data(struct gbm_bo *bo, void *data,
-		     void (*destroy_user_data)(struct gbm_bo *, void *));
-
-void *
-gbm_bo_get_user_data(struct gbm_bo *bo);
-
-void
-gbm_bo_destroy(struct gbm_bo *bo);
-
-struct gbm_surface *
-gbm_surface_create(struct gbm_device *gbm,
-                   uint32_t width, uint32_t height,
-		   uint32_t format, uint32_t flags);
-
-struct gbm_bo *
-gbm_surface_lock_front_buffer(struct gbm_surface *surface);
-
-void
-gbm_surface_release_buffer(struct gbm_surface *surface, struct gbm_bo *bo);
-
-int
-gbm_surface_has_free_buffers(struct gbm_surface *surface);
-
-void
-gbm_surface_destroy(struct gbm_surface *surface);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/x11/gbm_deps/gbm_driint.h b/src/x11/gbm_deps/gbm_driint.h
deleted file mode 100644
index 18fc3c0..0000000
--- a/src/x11/gbm_deps/gbm_driint.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef _GBM_DRI_INTERNAL_H_
-#define _GBM_DRI_INTERNAL_H_
-
-#include "gbmint.h"
-
-#include "common.h"
-#include "common_drm.h"
-
-#include <GL/gl.h> /* dri_interface needs GL types */
-#include "GL/internal/dri_interface.h"
-
-struct gbm_dri_surface;
-
-struct gbm_dri_device {
-   struct gbm_drm_device base;
-
-   void *driver;
-
-   __DRIscreen *screen;
-
-   __DRIcoreExtension   *core;
-   __DRIdri2Extension   *dri2;
-   __DRIimageExtension  *image;
-   __DRI2flushExtension *flush;
-   __DRIdri2LoaderExtension *loader;
-
-   const __DRIconfig   **driver_configs;
-   const __DRIextension *extensions[4];
-
-   __DRIimage *(*lookup_image)(__DRIscreen *screen, void *image, void *data);
-   void *lookup_user_data;
-
-   __DRIbuffer *(*get_buffers)(__DRIdrawable * driDrawable,
-                               int *width, int *height,
-                               unsigned int *attachments, int count,
-                               int *out_count, void *data);
-   void (*flush_front_buffer)(__DRIdrawable * driDrawable, void *data);
-   __DRIbuffer *(*get_buffers_with_format)(__DRIdrawable * driDrawable,
-			     int *width, int *height,
-			     unsigned int *attachments, int count,
-			     int *out_count, void *data);
-};
-
-struct gbm_dri_bo {
-   struct gbm_drm_bo base;
-
-   __DRIimage *image;
-
-   /* Only used for cursors */
-   uint32_t handle, size;
-   void *map;
-};
-
-struct gbm_dri_surface {
-   struct gbm_surface base;
-
-   void *dri_private;
-};
-
-static inline struct gbm_dri_device *
-gbm_dri_device(struct gbm_device *gbm)
-{
-   return (struct gbm_dri_device *) gbm;
-}
-
-static inline struct gbm_dri_bo *
-gbm_dri_bo(struct gbm_bo *bo)
-{
-   return (struct gbm_dri_bo *) bo;
-}
-
-static inline struct gbm_dri_surface *
-gbm_dri_surface(struct gbm_surface *surface)
-{
-   return (struct gbm_dri_surface *) surface;
-}
-
-char *
-dri_fd_get_driver_name(int fd);
-
-#endif
diff --git a/src/x11/gbm_deps/gbmint.h b/src/x11/gbm_deps/gbmint.h
deleted file mode 100644
index a467bea..0000000
--- a/src/x11/gbm_deps/gbmint.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke at googlemail.com>
- */
-
-#ifndef INTERNAL_H_
-#define INTERNAL_H_
-
-#include "gbm.h"
-#include <sys/stat.h>
-
-/* GCC visibility */
-#if defined(__GNUC__) && __GNUC__ >= 4
-#define GBM_EXPORT __attribute__ ((visibility("default")))
-#else
-#define GBM_EXPORT
-#endif
-
-/**
- * \file gbmint.h
- * \brief Internal implementation details of gbm
- */
-
-/**
- * The device used for the memory allocation.
- *
- * The members of this structure should be not accessed directly
- */
-struct gbm_device {
-   /* Hack to make a gbm_device detectable by its first element. */
-   struct gbm_device *(*dummy)(int);
-
-   int fd;
-   const char *name;
-   unsigned int refcount;
-   struct stat stat;
-
-   void (*destroy)(struct gbm_device *gbm);
-   int (*is_format_supported)(struct gbm_device *gbm,
-                              uint32_t format,
-                              uint32_t usage);
-
-   struct gbm_bo *(*bo_create)(struct gbm_device *gbm,
-                               uint32_t width, uint32_t height,
-                               uint32_t format,
-                               uint32_t usage);
-   struct gbm_bo *(*bo_import)(struct gbm_device *gbm, uint32_t type,
-                               void *buffer, uint32_t usage);
-   int (*bo_write)(struct gbm_bo *bo, const void *buf, size_t data);
-   void (*bo_destroy)(struct gbm_bo *bo);
-
-   struct gbm_surface *(*surface_create)(struct gbm_device *gbm,
-                                         uint32_t width, uint32_t height,
-                                         uint32_t format, uint32_t flags);
-   struct gbm_bo *(*surface_lock_front_buffer)(struct gbm_surface *surface);
-   void (*surface_release_buffer)(struct gbm_surface *surface,
-                                  struct gbm_bo *bo);
-   int (*surface_has_free_buffers)(struct gbm_surface *surface);
-   void (*surface_destroy)(struct gbm_surface *surface);
-};
-
-/**
- * The allocated buffer object.
- *
- * The members in this structure should not be accessed directly.
- */
-struct gbm_bo {
-   struct gbm_device *gbm;
-   uint32_t width;
-   uint32_t height;
-   uint32_t stride;
-   uint32_t format;
-   union gbm_bo_handle  handle;
-   void *user_data;
-   void (*destroy_user_data)(struct gbm_bo *, void *);
-};
-
-struct gbm_surface {
-   struct gbm_device *gbm;
-   uint32_t width;
-   uint32_t height;
-   uint32_t format;
-   uint32_t flags;
-};
-
-struct gbm_backend {
-   const char *backend_name;
-   struct gbm_device *(*create_device)(int fd);
-};
-
-GBM_EXPORT struct gbm_device *
-_gbm_mesa_get_device(int fd);
-
-#endif
diff --git a/src/x11/gbm_dri2_x11_platform.c b/src/x11/gbm_dri2_x11_platform.c
deleted file mode 100644
index 481f407..0000000
--- a/src/x11/gbm_dri2_x11_platform.c
+++ /dev/null
@@ -1,126 +0,0 @@
-#include <string.h>
-#include "GL/gl.h" /* dri_interface need gl types definitions. */
-#include "GL/internal/dri_interface.h"
-#include "gbm_deps/gbm_driint.h"
-#include "gbm_deps/gbmint.h"
-#include "dricommon.h"
-
-typedef struct EGLDisplay _EGLDisplay;
-typedef struct EGLDriver  _EGLDriver;
-/* XXX should check whether we support pthread.*/
-typedef pthread_mutex_t _EGLMutex;
-
-enum _egl_platform_type {
-   _EGL_PLATFORM_WINDOWS,
-   _EGL_PLATFORM_X11,
-   _EGL_PLATFORM_WAYLAND,
-   _EGL_PLATFORM_DRM,
-   _EGL_PLATFORM_FBDEV,
-   _EGL_PLATFORM_NULL,
-   _EGL_PLATFORM_ANDROID,
-
-   _EGL_NUM_PLATFORMS,
-   _EGL_INVALID_PLATFORM = -1
-};
-typedef enum _egl_platform_type _EGLPlatformType;
-typedef unsigned int EGLBoolean;
-typedef int32_t EGLint;
-
-struct _hack_egl_display
-{
-   /* used to link displays */
-   _EGLDisplay *Next;
-
-   _EGLMutex Mutex;
-
-   _EGLPlatformType Platform; /**< The type of the platform display */
-   void *PlatformDisplay;     /**< A pointer to the platform display */
-
-   _EGLDriver *Driver;        /**< Matched driver of the display */
-
-   EGLBoolean Initialized;    /**< True if the display is initialized */
-
-   /* options that affect how the driver initializes the display */
-   struct {
-      EGLBoolean TestOnly;    /**< Driver should not set fields when true */
-      EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
-   } Options;
-
-   /* these fields are set by the driver during init */
-   void *DriverData;          /**< Driver private data */
-   EGLint VersionMajor;       /**< EGL major version */
-   EGLint VersionMinor;       /**< EGL minor version */
-   EGLint ClientAPIs;         /**< Bitmask of APIs supported (EGL_xxx_BIT) */
-};
-
-struct _hack_dri2_egl_display
-{
-   int                       dri2_major;
-   int                       dri2_minor;
-   __DRIscreen              *dri_screen;
-   int                       own_dri_screen;
-   const __DRIconfig       **driver_configs;
-   void                     *driver;
-   __DRIcoreExtension       *core;
-   __DRIdri2Extension       *dri2;
-   __DRIswrastExtension     *swrast;
-   __DRI2flushExtension     *flush;
-   __DRItexBufferExtension  *tex_buffer;
-   __DRIimageExtension      *image;
-   __DRIrobustnessExtension *robustness;
-   __DRI2configQueryExtension *config;
-   int                       fd;
-
-   int                       own_device;
-   int                       swap_available;
-   int                       invalidate_available;
-   int                       min_swap_interval;
-   int                       max_swap_interval;
-   int                       default_swap_interval;
-   struct gbm_dri_device    *gbm_dri;
-
-   char                     *device_name;
-   char                     *driver_name;
-
-   __DRIdri2LoaderExtension    dri2_loader_extension;
-   __DRIswrastLoaderExtension  swrast_loader_extension;
-   const __DRIextension     *extensions[4];
-};
-
-static __DRIimageLookupExtension *image_lookup_extension;
-
-/* We are use DRI2 x11 platform, and by default, gbm doesn't register
- * a valid image extension, and actually, it doesn't know how to register
- * it based on current interface. We have to hack it here. */
-void cl_gbm_set_image_extension(struct gbm_device *gbm, void *display)
-{
-  struct gbm_dri_device *gbm_dri = gbm_dri_device(gbm);
-  struct _hack_egl_display *egl_dpy = (struct _hack_egl_display*)display;
-  struct _hack_dri2_egl_display *dri2_dpy = (struct _hack_dri2_egl_display*)egl_dpy->DriverData;
-  int i;
-
-  if (gbm_dri->lookup_image == NULL
-      && egl_dpy->Platform == _EGL_PLATFORM_X11) {
-    for(i = 0; i < 4; i++)
-     if (dri2_dpy->extensions[i]
-         && ((strncmp(dri2_dpy->extensions[i]->name,
-                      __DRI_IMAGE_LOOKUP,
-                      sizeof(__DRI_IMAGE_LOOKUP))) == 0))
-       break;
-    if (i >= 4) return;
-    image_lookup_extension = (__DRIimageLookupExtension*)dri2_dpy->extensions[i];
-    gbm_dri->lookup_image = image_lookup_extension->lookupEGLImage;
-    gbm_dri->lookup_user_data = display;
-  }
-}
-
-int cl_gbm_bo_get_name(struct gbm_bo *bo)
-{
-  int name;
-  struct gbm_dri_device *gbm_dri = gbm_dri_device(bo->gbm);
-  struct gbm_dri_bo *bo_dri = gbm_dri_bo(bo);
-
-  gbm_dri->image->queryImage(bo_dri->image, __DRI_IMAGE_ATTRIB_NAME,
-                             &name);
-  return name;
-}
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
new file mode 100644
index 0000000..a7fc8cb
--- /dev/null
+++ b/src/x11/mesa_egl_extension.c
@@ -0,0 +1,307 @@
+#include <stdio.h>
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+#include "src/cl_driver.h"
+
+struct _egl_display;
+struct _egl_resource;
+struct _egl_thread_info;
+struct _egl_config;
+struct _egl_surface;
+struct _egl_driver;
+
+typedef struct _egl_display _EGLDisplay;
+typedef struct _egl_resource _EGLResource;
+typedef struct _egl_thread_info _EGLThreadInfo;
+typedef struct _egl_config _EGLConfig;
+typedef struct _egl_surface _EGLSurface;
+typedef struct _egl_driver _EGLDriver;
+
+/**
+ * A resource of a display.
+ */
+struct _egl_resource
+{
+   /* which display the resource belongs to */
+   _EGLDisplay *Display;
+   EGLBoolean IsLinked;
+   EGLint RefCount;
+
+   /* used to link resources of the same type */
+   _EGLResource *Next;
+};
+
+/**
+ * "Base" class for device driver contexts.
+ */
+struct _egl_context
+{
+   /* A context is a display resource */
+   _EGLResource Resource;
+
+   /* The bound status of the context */
+   _EGLThreadInfo *Binding;
+   _EGLSurface *DrawSurface;
+   _EGLSurface *ReadSurface;
+
+   _EGLConfig *Config;
+
+   EGLint ClientAPI; /**< EGL_OPENGL_ES_API, EGL_OPENGL_API, EGL_OPENVG_API */
+   EGLint ClientMajorVersion;
+   EGLint ClientMinorVersion;
+   EGLint Flags;
+   EGLint Profile;
+   EGLint ResetNotificationStrategy;
+
+   /* The real render buffer when a window surface is bound */
+   EGLint WindowRenderBuffer;
+};
+
+typedef struct _egl_context _EGLContext;
+
+struct dri2_egl_display
+{
+   int                       dri2_major;
+   int                       dri2_minor;
+   __DRIscreen              *dri_screen;
+   int                       own_dri_screen;
+   const __DRIconfig       **driver_configs;
+   void                     *driver;
+};
+
+enum _egl_platform_type {
+   _EGL_PLATFORM_WINDOWS,
+   _EGL_PLATFORM_X11,
+   _EGL_PLATFORM_WAYLAND,
+   _EGL_PLATFORM_DRM,
+   _EGL_PLATFORM_FBDEV,
+   _EGL_PLATFORM_NULL,
+   _EGL_PLATFORM_ANDROID,
+
+   _EGL_NUM_PLATFORMS,
+   _EGL_INVALID_PLATFORM = -1
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+
+typedef pthread_mutex_t _EGLMutex;
+
+struct _egl_display
+{
+   /* used to link displays */
+   _EGLDisplay *Next;
+
+   _EGLMutex Mutex;
+
+   _EGLPlatformType Platform; /**< The type of the platform display */
+   void *PlatformDisplay;     /**< A pointer to the platform display */
+
+   _EGLDriver *Driver;        /**< Matched driver of the display */
+   EGLBoolean Initialized;    /**< True if the display is initialized */
+
+   /* options that affect how the driver initializes the display */
+   struct {
+      EGLBoolean TestOnly;    /**< Driver should not set fields when true */
+      EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+   } Options;
+
+   /* these fields are set by the driver during init */
+   void *DriverData;          /**< Driver private data */
+};
+
+static struct dri2_egl_display *
+dri2_egl_display(_EGLDisplay *dpy)
+{
+  return (struct dri2_egl_display *)dpy->DriverData;
+}
+
+static _EGLDisplay *
+_eglLockDisplay(EGLDisplay dpy)
+{
+  return (_EGLDisplay *)dpy; /* no lock is taken here: the opaque handle is only cast to the display struct */
+}
+
+static _EGLContext *
+_eglLookupContext(EGLContext ctx, EGLDisplay disp)
+{
+  disp = disp; /* self-assignment; disp is otherwise unused (presumably silences an unused-parameter warning) */
+  return (_EGLContext *) ctx; /* no lookup table is consulted: the handle is cast directly */
+}
+
+struct dri2_egl_context
+{
+   _EGLContext   base;
+   __DRIcontext *dri_context;
+};
+
+static struct dri2_egl_context *
+dri2_egl_context(_EGLContext *ctx)
+{
+  return (struct dri2_egl_context *)ctx;
+}
+
+static EGLBoolean
+dri2_acquire_texture(_EGLDisplay *disp,
+                     _EGLContext *ctx,
+                     const EGLint *attr_list,
+                     void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint texture = 0;
+   GLenum gl_target = 0;
+   GLint level = 0;
+   GLboolean ret;
+
+   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_texture(dri2_dpy->driver,
+                               dri2_ctx->dri_context,
+                               gl_target, level, texture,
+                               user_data);
+   return ret;
+}
+
+static EGLBoolean
+dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint texture = 0;
+   GLenum gl_target = 0;
+   GLint level = 0;
+   GLboolean ret;
+
+   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
+                               gl_target, level, texture);
+   return ret;
+}
+
+static EGLBoolean
+dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
+                           void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint bufobj = 0;
+   GLboolean ret;
+
+   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     bufobj, user_data);
+   return ret;
+}
+
+static EGLBoolean
+dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint bufobj = 0;
+   GLboolean ret;
+
+   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_buffer_object(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     bufobj);
+   return ret;
+}
+
+static EGLBoolean /* EGL_FALSE on a bad attribute list, else the result of cl_gl_acquire_render_buffer() */
+dri2_acquire_render_buffer(_EGLDisplay *disp,
+                           _EGLContext *ctx,
+                           const EGLint *attr_list,
+                           void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint rb = 0;
+   GLboolean ret;
+   /* Render buffers are named by EGL_GL_RENDER_BUFFER_ID_MESA; the buffer-object parser ignores that attribute. */
+   if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     rb, user_data);
+   return ret;
+}
+
+static EGLBoolean /* EGL_FALSE on a bad attribute list, else the result of cl_gl_release_render_buffer() */
+dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint rb = 0;
+   GLboolean ret;
+   /* Render buffers are named by EGL_GL_RENDER_BUFFER_ID_MESA; the buffer-object parser ignores that attribute. */
+   if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_render_buffer(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     rb);
+   return ret;
+}
+
+static EGLBoolean /* result of the per-target acquire helper, or EGL_FALSE for an unknown target */
+dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+                           const EGLint *attrib_list, void *user_data)
+{
+   switch (target) { /* dispatch on the kind of GL object being acquired */
+   case EGL_GL_TEXTURE_MESA:
+     return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
+   case EGL_GL_BUFFER_OBJECT_MESA:
+     return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
+   case EGL_GL_RENDER_BUFFER_MESA:
+     return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
+   default:
+      fprintf(stderr, "bad resource target value 0x%04x\n", /* newline-terminated so diagnostics do not run together */
+              target);
+   }
+   return EGL_FALSE;
+}
+
+static EGLBoolean /* result of the per-target release helper, or EGL_FALSE for an unknown target */
+dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+                           const EGLint *attrib_list)
+{
+   switch (target) { /* dispatch on the kind of GL object being released */
+   case EGL_GL_TEXTURE_MESA:
+     return dri2_release_texture(disp, ctx, attrib_list);
+   case EGL_GL_BUFFER_OBJECT_MESA:
+     return dri2_release_buffer_object(disp, ctx, attrib_list);
+   case EGL_GL_RENDER_BUFFER_MESA:
+     return dri2_release_render_buffer(disp, ctx, attrib_list);
+   default:
+      fprintf(stderr, "bad resource target value 0x%04x\n", /* newline-terminated so diagnostics do not run together */
+              target);
+   }
+   return EGL_FALSE;
+}
+
+EGLBoolean
+eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLContext *context = _eglLookupContext(ctx, disp);
+
+   return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
+}
+
+EGLBoolean
+eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLContext *context = _eglLookupContext(ctx, disp);
+
+   return dri2_release_resource_mesa(disp, context, target, attrib_list);
+}
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
new file mode 100644
index 0000000..39ea134
--- /dev/null
+++ b/src/x11/mesa_egl_extension.h
@@ -0,0 +1,20 @@
+#ifndef __MESA_EGL_EXTENSION_H__
+#define __MESA_EGL_EXTENSION_H__
+
+#include <EGL/egl.h>
+#include <GL/gl.h>
+#include <GL/internal/dri_interface.h>
+
+#define EGL_GL_TEXTURE_MESA             0x3300  /* eglAcquireResourceMESA target */
+#define EGL_GL_BUFFER_OBJECT_MESA       0x3301  /* eglAcquireResourceMESA target */
+#define EGL_GL_RENDER_BUFFER_MESA       0x3302  /* eglAcquireResourceMESA target */
+#define EGL_GL_TEXTURE_ID_MESA          0x3303  /* eglAcquireResourceMESA attribute */
+#define EGL_GL_TEXTURE_LEVEL_MESA       0x3304  /* eglAcquireResourceMESA attribute */
+#define EGL_GL_TEXTURE_TARGET_MESA      0x3305  /* eglAcquireResourceMESA attribute */
+#define EGL_GL_BUFFER_OBJECT_ID_MESA    0x3306  /* eglAcquireResourceMESA attribute */
+#define EGL_GL_RENDER_BUFFER_ID_MESA    0x3307  /* eglAcquireResourceMESA attribute */
+
+EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
+EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
+
+#endif
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
new file mode 100644
index 0000000..93e9454
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.c
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <assert.h>
+#include <string.h>
+
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
+                           const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+
+   *texture = 0;
+   *gl_target = 0;
+   *level = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++];
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_TEXTURE_LEVEL_MESA:
+         *level = val;
+         break;
+      case EGL_GL_TEXTURE_ID_MESA:
+         *texture = val;
+         break;
+      case EGL_GL_TEXTURE_TARGET_MESA:
+         *gl_target = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+
+   return err;
+}
+
+/**
+ * Parse a buffer-object attribute list; EGL_BAD_ATTRIBUTE if the list is NULL or no non-zero EGL_GL_BUFFER_OBJECT_ID_MESA is given.
+ */
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+   *bufobj = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++]; /* the list is walked as (attribute, value) pairs up to EGL_NONE */
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_BUFFER_OBJECT_ID_MESA:
+         *bufobj = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+   if (*bufobj == 0) /* an id of 0 (including "attribute never supplied") is rejected */
+      err = EGL_BAD_ATTRIBUTE;
+
+   return err;
+}
+
+/**
+ * Parse a render-buffer attribute list; EGL_BAD_ATTRIBUTE if the list is NULL or no non-zero EGL_GL_RENDER_BUFFER_ID_MESA is given.
+ */
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+   *rb = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++]; /* the list is walked as (attribute, value) pairs up to EGL_NONE */
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_RENDER_BUFFER_ID_MESA:
+         *rb = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+   if (*rb == 0) /* an id of 0 (including "attribute never supplied") is rejected */
+      err = EGL_BAD_ATTRIBUTE;
+
+   return err;
+}
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
new file mode 100644
index 0000000..43e746e
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.h
@@ -0,0 +1,44 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef EGLRESSHARE_INCLUDED
+#define EGLRESSHARE_INCLUDED
+
+#include <EGL/egl.h>
+
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
+                           EGLint *level, const EGLint *attrib_list);
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj,
+                             const EGLint *attrib_list);
+
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
+#endif
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 97b7519..f18bd46 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -90,7 +90,7 @@ set (utests_sources
   compiler_insn_selection_min.cpp
   compiler_insn_selection_max.cpp
   compiler_insn_selection_masked_min_max.cpp
-	compiler_load_bool_imm.cpp
+  compiler_load_bool_imm.cpp
   compiler_global_memory_barrier.cpp
   compiler_local_memory_two_ptr.cpp
   compiler_local_memory_barrier.cpp
@@ -102,8 +102,10 @@ set (utests_sources
   compiler_get_image_info.cpp
   compiler_vect_compare.cpp
   compiler_vector_load_store.cpp
+  compiler_vector_inc.cpp
   compiler_cl_finish.cpp
   get_cl_info.cpp
+  builtin_atan2.cpp
   builtin_bitselect.cpp
   builtin_frexp.cpp
   builtin_mad_sat.cpp
@@ -113,6 +115,10 @@ set (utests_sources
   builtin_shuffle.cpp
   builtin_shuffle2.cpp
   builtin_sign.cpp
+  builtin_sinpi.cpp
+  builtin_lgamma.cpp
+  builtin_lgamma_r.cpp
+  builtin_tgamma.cpp
   buildin_work_dim.cpp
   builtin_global_size.cpp
   builtin_local_size.cpp
@@ -122,7 +128,7 @@ set (utests_sources
   builtin_acos_asin.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
-	runtime_event.cpp
+  runtime_event.cpp
   compiler_double.cpp
   compiler_double_2.cpp
   compiler_double_3.cpp
@@ -135,16 +141,28 @@ set (utests_sources
   compiler_long_asr.cpp
   compiler_long_mult.cpp
   compiler_long_cmp.cpp
+  compiler_bool_cross_basic_block.cpp
+  load_program_from_bin.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
   utest_helper.cpp)
 
-if (EGL_FOUND)
+SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
+ADD_CUSTOM_COMMAND(
+    OUTPUT ${kernel_bin}.bin
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl -o${kernel_bin}.bin
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl
+    )
+
+ADD_CUSTOM_TARGET(kernel_bin.bin
+    DEPENDS ${kernel_bin}.bin)
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
 SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
 SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
-endif (EGL_FOUND)
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
 
 ADD_LIBRARY(utests SHARED ${utests_sources})
 
@@ -152,7 +170,7 @@ TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 
 ADD_EXECUTABLE(utest_run utest_run.cpp)
 TARGET_LINK_LIBRARIES(utest_run utests)
+ADD_DEPENDENCIES (utest_run kernel_bin.bin)
 
 ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
 TARGET_LINK_LIBRARIES(flat_address_space utests)
-
diff --git a/utests/builtin_atan2.cpp b/utests/builtin_atan2.cpp
new file mode 100644
index 0000000..29dd7b4
--- /dev/null
+++ b/utests/builtin_atan2.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_atan2(void) {
+	const int n = 1024;
+	float y[n], x[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_atan2");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	OCL_MAP_BUFFER(0);
+	OCL_MAP_BUFFER(1);
+	for (int i = 0; i < n; ++i) {
+		y[i] = ((float*) buf_data[0])[i] = (rand()&255) * 0.01f;
+		x[i] = ((float*) buf_data[1])[i] = (rand()&255) * 0.01f;
+	}
+	OCL_UNMAP_BUFFER(0);
+	OCL_UNMAP_BUFFER(1);
+
+	OCL_NDRANGE(1);
+
+	OCL_MAP_BUFFER(2);
+	float *dst = (float*) buf_data[2];
+	for (int i = 0; i < n; ++i) {
+		float cpu = atan2f(y[i], x[i]);
+		float gpu = dst[i];
+		if (fabsf(cpu - gpu) >= 1e-2) {
+			printf("%f %f %f %f\n", y[i], x[i], cpu, gpu);
+			OCL_ASSERT(0);
+		}
+	}
+	OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_atan2);
diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
new file mode 100644
index 0000000..876699a
--- /dev/null
+++ b/utests/builtin_lgamma.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma(void) {
+	const int n = 1024;
+	float src[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_lgamma");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	for (int j = 0; j < 1024; j++) {
+		OCL_MAP_BUFFER(0);
+		for (int i = 0; i < n; ++i) {
+			src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+		}
+		OCL_UNMAP_BUFFER(0);
+
+		OCL_NDRANGE(1);
+
+		OCL_MAP_BUFFER(1);
+		float *dst = (float*) buf_data[1];
+		for (int i = 0; i < n; ++i) {
+			float cpu = lgamma(src[i]);
+			float gpu = dst[i];
+			if (fabsf(cpu - gpu) >= 1e-3) {
+				printf("%f %f %f\n", src[i], cpu, gpu);
+				OCL_ASSERT(0);
+			}
+		}
+		OCL_UNMAP_BUFFER(1);
+	}
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma);
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
new file mode 100644
index 0000000..b6e5d0e
--- /dev/null
+++ b/utests/builtin_lgamma_r.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma_r(void) {
+	const int n = 1024;
+	float src[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_lgamma_r");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	for (int j = 0; j < 1024; j++) {
+		OCL_MAP_BUFFER(0);
+		for (int i = 0; i < n; ++i) {
+			src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+		}
+		OCL_UNMAP_BUFFER(0);
+
+		OCL_NDRANGE(1);
+
+		OCL_MAP_BUFFER(1);
+		OCL_MAP_BUFFER(2);
+		float *dst = (float*) buf_data[1];
+		for (int i = 0; i < n; ++i) {
+			int cpu_signp;
+			float cpu = lgamma_r(src[i], &cpu_signp);
+			int gpu_signp = ((int*)buf_data[2])[i];
+			float gpu = dst[i];
+			if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) {
+				printf("%f %f %f\n", src[i], cpu, gpu);
+				OCL_ASSERT(0);
+			}
+		}
+		OCL_UNMAP_BUFFER(1);
+		OCL_UNMAP_BUFFER(2);
+	}
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma_r);
diff --git a/utests/builtin_sinpi.cpp b/utests/builtin_sinpi.cpp
new file mode 100644
index 0000000..0e11a0d
--- /dev/null
+++ b/utests/builtin_sinpi.cpp
@@ -0,0 +1,104 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static int as_int(float x) {
+  union {float f; int i;} u;
+  u.f = x;
+  return u.i;
+}
+
+static float sinpi(float x) { /* CPU reference for the sinpi builtin; the value returned is -y as computed below */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float y, z;
+  int n = 0, ix;
+  const float pi = 3.1415927410e+00f;
+
+  ix = as_int(x) & 0x7fffffff; /* bit pattern of |x| */
+
+  if (ix < 0x3e800000) /* |x| < 0.25: no range reduction needed */
+    return sinf(pi * x);
+  y = -x;
+  z = floorf(y);
+  if (z != y) { /* non-integer argument: reduce y into [0, 2) and pick the octant */
+    y *= 0.5f;
+    y = 2.f * (y - floorf(y));
+    n = y * 4.f;
+  } else {
+    if (ix >= 0x4b800000) {
+      y = 0;
+      n = 0;
+    } else {
+      if (ix < 0x4b000000)
+        z = y + 8.3886080000e+06f;
+      n = as_int(z); /* assign the outer n (was a shadowing 'int n'), so the switch below sees this value */
+      n &= 1;
+      y = n;
+      n <<= 2;
+    }
+  }
+  switch (n) {
+  case 0:
+    y = sinf(pi * y);
+    break;
+  case 1:
+  case 2:
+    y = cosf(pi * ((float) 0.5 - y));
+    break;
+  case 3:
+  case 4:
+    y = sinf(pi * (1.f - y));
+    break;
+  case 5:
+  case 6:
+    y = -cosf(pi * (y - (float) 1.5));
+    break;
+  default:
+    y = sinf(pi * (y - (float) 2.0));
+    break;
+  }
+  return -y;
+}
+
+void builtin_sinpi(void)
+{
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_sinpi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float*)buf_data[0])[i] = (j*n + i) * 0.01f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float*)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = sinpi(src[i]);
+      OCL_ASSERT (fabsf(cpu - dst[i]) < 1e-4);
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sinpi);
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
new file mode 100644
index 0000000..4c824d0
--- /dev/null
+++ b/utests/builtin_tgamma.cpp
@@ -0,0 +1,42 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_tgamma(void) /* compares the "builtin_tgamma" kernel output against the host's true gamma function */
+{
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_tgamma");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1024; j ++) { // each pass covers the next n inputs, stepping by 0.001
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float*)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = tgammaf(src[i]); // true gamma; the deprecated gammaf() is lgammaf() on glibc
+      if (isinf(cpu)) { // reference overflowed: the GPU result must be infinite too
+        OCL_ASSERT(isinf(dst[i]));
+      } else if (fabsf(cpu - dst[i]) >= 1e-3) {
+        printf("%f %f %f\n", src[i], cpu, dst[i]);
+        OCL_ASSERT(0);
+      }
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
index 384a654..71881b1 100644
--- a/utests/compiler_abs_diff.cpp
+++ b/utests/compiler_abs_diff.cpp
@@ -182,25 +182,30 @@ template <typename T, typename U> static void compiler_abs_diff_with_type(void)
     }
 }
 
-#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) \
-	static void compiler_abs_diff_##TYPE (void) \
+
+#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE) \
+	static void compiler_abs_diff_##CLTYPE (void) \
         { \
-           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#TYPE, SOURCE, NULL);  \
+           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#CLTYPE, SOURCE, NULL);  \
            compiler_abs_diff_with_type<TYPE, UTYPE>(); \
         } \
-	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##TYPE);
+	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##CLTYPE);
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE)
 
 typedef unsigned char uchar;
 typedef unsigned short ushort;
 typedef unsigned int uint;
+typedef uint64_t ulong64;
 ABS_TEST_DIFF_TYPE(int, uint)
+ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64)
 ABS_TEST_DIFF_TYPE(short, ushort)
 ABS_TEST_DIFF_TYPE(char, uchar)
 ABS_TEST_DIFF_TYPE(uint, uint)
+ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64)
 ABS_TEST_DIFF_TYPE(ushort, ushort)
 ABS_TEST_DIFF_TYPE(uchar, uchar)
 
-
 typedef cl_vec<int, 2> int2;
 typedef cl_vec<int, 3> int3;
 typedef cl_vec<int, 4> int4;
@@ -222,6 +227,26 @@ ABS_TEST_DIFF_TYPE(uint4, uint4)
 ABS_TEST_DIFF_TYPE(uint8, uint8)
 ABS_TEST_DIFF_TYPE(uint16, uint16)
 
+typedef cl_vec<int64_t, 2> long2;
+typedef cl_vec<int64_t, 3> long3;
+typedef cl_vec<int64_t, 4> long4;
+typedef cl_vec<int64_t, 8> long8;
+typedef cl_vec<int64_t, 16> long16;
+typedef cl_vec<uint64_t, 2> ulong2;
+typedef cl_vec<uint64_t, 3> ulong3;
+typedef cl_vec<uint64_t, 4> ulong4;
+typedef cl_vec<uint64_t, 8> ulong8;
+typedef cl_vec<uint64_t, 16> ulong16;
+ABS_TEST_DIFF_TYPE(long2, ulong2)
+ABS_TEST_DIFF_TYPE(long3, ulong3)
+ABS_TEST_DIFF_TYPE(long4, ulong4)
+ABS_TEST_DIFF_TYPE(long8, ulong8)
+ABS_TEST_DIFF_TYPE(long16, ulong16)
+ABS_TEST_DIFF_TYPE(ulong2, ulong2)
+ABS_TEST_DIFF_TYPE(ulong3, ulong3)
+ABS_TEST_DIFF_TYPE(ulong4, ulong4)
+ABS_TEST_DIFF_TYPE(ulong8, ulong8)
+ABS_TEST_DIFF_TYPE(ulong16, ulong16)
 
 typedef cl_vec<char, 2> char2;
 typedef cl_vec<char, 3> char3;
diff --git a/utests/compiler_bool_cross_basic_block.cpp b/utests/compiler_bool_cross_basic_block.cpp
new file mode 100644
index 0000000..4dd5bc7
--- /dev/null
+++ b/utests/compiler_bool_cross_basic_block.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int scale) {
+  bool isRedRow = false;
+  bool isRed;
+  int val = src[global_id];
+  for (int i=0; i<scale; i++, isRedRow = !isRedRow) {
+    if (isRedRow) {
+      isRed= false;
+      for (int j=0; j < scale; j++, isRed=!isRed) {
+        if (isRed) {
+	  val++;
+        }
+      }
+    }
+  }
+  dst[global_id] = val;
+}
+
+void compiler_bool_cross_basic_block(void){
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+  int scale = 4;
+	
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_bool_cross_basic_block");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &scale);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu_src[i] = ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu(i, cpu_src, cpu_dst, scale);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(1);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_bool_cross_basic_block)
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
index 5290090..ff493e7 100644
--- a/utests/compiler_copy_image_3d.cpp
+++ b/utests/compiler_copy_image_3d.cpp
@@ -1,10 +1,11 @@
 #include "utest_helper.hpp"
+#include "string.h"
 
 static void compiler_copy_image_3d(void)
 {
   const size_t w = 512;
   const size_t h = 512;
-  const size_t depth = 1;
+  const size_t depth = 4;
   cl_image_format format;
   cl_sampler sampler;
 
@@ -14,12 +15,14 @@ static void compiler_copy_image_3d(void)
   for (uint32_t k = 0; k < depth; k++)
     for (uint32_t j = 0; j < h; j++)
       for (uint32_t i = 0; i < w; i++)
-        ((uint32_t*)buf_data[0])[k*w*h + j*w + i] = k*w*h + j*w + i;
+        ((float*)buf_data[0])[k*w*h + j*w + i] = (k << 10) + (j << 10) + i;
 
   format.image_channel_order = CL_RGBA;
-  format.image_channel_data_type = CL_UNSIGNED_INT8;
-  OCL_CREATE_IMAGE3D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, depth, 0, 0, buf_data[0]);
+  format.image_channel_data_type = CL_UNORM_INT8;
+  OCL_CREATE_IMAGE3D(buf[0], CL_MEM_COPY_HOST_PTR, &format, w, h, depth, w*4, w*h*4, buf_data[0]);
   OCL_CREATE_IMAGE3D(buf[1], 0, &format, w, h, depth, 0, 0, NULL);
+  for(uint32_t i = 0; i < depth; i++)
+   OCL_CREATE_IMAGE2D(buf[2 + i], 0, &format, w, h, 0, NULL);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
   buf_data[0] = NULL;
@@ -28,21 +31,28 @@ static void compiler_copy_image_3d(void)
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  for(uint32_t i = 0; i < depth; i++)
+    OCL_SET_ARG(3 + i, sizeof(cl_mem), &buf[2 + i]);
   globals[0] = w;
   globals[1] = h;
-  locals[0] = 16;
-  locals[1] = 16;
-  OCL_NDRANGE(2);
+  globals[2] = depth;
+  locals[0] = 64;
+  locals[1] = 1;
+  locals[2] = 1;
+  OCL_NDRANGE(3);
 
   // Check result
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
+  for(uint32_t i = 0; i < depth + 2; i++)
+    OCL_MAP_BUFFER_GTT(i);
   for (uint32_t k = 0; k < depth; k++)
     for (uint32_t j = 0; j < h; ++j)
-      for (uint32_t i = 0; i < w; i++)
-        OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == ((uint32_t*)buf_data[1])[k*w*h + j*w + i]);
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
+      for (uint32_t i = 0; i < w; i++) {
+        OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[1])[k*w*((h+1)&-2LL) + j*w + i]);
+        OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[k + 2])[j * w + i]);
+      }
+
+  for(uint32_t i = 0; i < depth + 2; i++)
+    OCL_UNMAP_BUFFER_GTT(i);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_3d);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
index 4b3d4e3..6a679fb 100644
--- a/utests/compiler_fill_image_3d.cpp
+++ b/utests/compiler_fill_image_3d.cpp
@@ -4,7 +4,7 @@ static void compiler_fill_image_3d(void)
 {
   const size_t w = 512;
   const size_t h = 512;
-  const size_t depth = 1;
+  const size_t depth = 5;
   uint32_t color = 0x12345678;
   cl_image_format format;
 
@@ -21,9 +21,11 @@ static void compiler_fill_image_3d(void)
   OCL_SET_ARG(1, sizeof(color), &color);
   globals[0] = w;
   globals[1] = h;
+  globals[2] = depth;
   locals[0] = 16;
   locals[1] = 16;
-  OCL_NDRANGE(2);
+  locals[2] = 1;
+  OCL_NDRANGE(3);
 
   // Check result
   OCL_MAP_BUFFER(0);
diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp
index 8ecc3e3..f5ff792 100644
--- a/utests/compiler_fill_image_3d_2.cpp
+++ b/utests/compiler_fill_image_3d_2.cpp
@@ -4,7 +4,7 @@ static void compiler_fill_image_3d_2(void)
 {
   const size_t w = 512;
   const size_t h = 512;
-  const size_t depth = 1;
+  const size_t depth = 5;
   cl_image_format format;
 
   format.image_channel_order = CL_RGBA;
@@ -19,17 +19,19 @@ static void compiler_fill_image_3d_2(void)
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   globals[0] = w;
   globals[1] = h;
+  globals[2] = depth;
   locals[0] = 16;
   locals[1] = 16;
-  OCL_NDRANGE(2);
+  locals[2] = 1;
+  OCL_NDRANGE(3);
 
   // Check result
-  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER_GTT(0);
   for (uint32_t k = 0; k < depth; k++)
     for (uint32_t j = 0; j < h; ++j)
       for (uint32_t i = 0; i < w; i++)
         OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
-  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER_GTT(0);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d_2);
diff --git a/utests/compiler_function_constant0.cpp b/utests/compiler_function_constant0.cpp
index c0a8a9d..6fbbd30 100644
--- a/utests/compiler_function_constant0.cpp
+++ b/utests/compiler_function_constant0.cpp
@@ -7,7 +7,7 @@ void compiler_function_constant0(void)
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_function_constant0");
-  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(int32_t), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, 1 * sizeof(char), NULL);
   OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -17,7 +17,7 @@ void compiler_function_constant0(void)
 
   OCL_MAP_BUFFER(0);
   for(uint32_t i = 0; i < 69; ++i)
-    ((short *)buf_data[0])[i] = i;
+    ((int32_t *)buf_data[0])[i] = i;
   OCL_UNMAP_BUFFER(0);
 
   OCL_MAP_BUFFER(1);
diff --git a/utests/compiler_global_constant.cpp b/utests/compiler_global_constant.cpp
index 1547020..a2d0172 100644
--- a/utests/compiler_global_constant.cpp
+++ b/utests/compiler_global_constant.cpp
@@ -22,8 +22,83 @@ void compiler_global_constant(void)
   // Check results
   OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], m[i%3] + e + r);
     OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + e + r);
   OCL_UNMAP_BUFFER(0);
 }
 
+void compiler_global_constant1(void)  // Runs kernel "compiler_global_constant1" and checks each output element against host-side reference tables.
+{
+  const size_t n = 32;  // global size (globals[0]) and number of uint32_t output elements
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint32_t data1[] = {1, 4, 7};  // reference tables, both indexed with i % 3 in the check below;
+  uint32_t data2[]= {3, 7, 11};  // presumably they mirror constants in the kernel source -- TODO confirm
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + data2[i%3]);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + data2[i%3]);  // element i must equal data1[i%3] + data2[i%3]
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant2(void)  // Runs kernel "compiler_global_constant2" and checks that every output element equals 6.
+{
+  const size_t n = 32;  // global size (globals[0]) and number of uint32_t output elements
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], 6);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == 6);  // the same constant is expected for every index
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant3(void)  // Runs kernel "compiler_global_constant3" and checks each output against an integer table plus a char table.
+{
+  const size_t n = 32;  // global size (globals[0]) and number of uint32_t output elements
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant3");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint32_t data1[] = {3, 6, 9};  // integer reference table, indexed with i % 3
+  char data2[]= {'c', 'f', 'j'};  // character reference table; its values are widened to uint32_t in the check
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + (int)data2[i%3]);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + (uint32_t)data2[i%3]);  // element i must equal data1[i%3] + data2[i%3]
+  OCL_UNMAP_BUFFER(0);
+}
+
 MAKE_UTEST_FROM_FUNCTION(compiler_global_constant);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant1);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant3);
diff --git a/utests/compiler_global_constant_2.cpp b/utests/compiler_global_constant_2.cpp
index 56fccb5..cbe63ae 100644
--- a/utests/compiler_global_constant_2.cpp
+++ b/utests/compiler_global_constant_2.cpp
@@ -23,8 +23,37 @@ void compiler_global_constant_2(void)
   // Check results
   OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i)
+//    std::cout << ((uint32_t *)buf_data[0])[i] << std::endl;
     OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + t[i%5] + e + r);
   OCL_UNMAP_BUFFER(0);
 }
 
+void compiler_global_constant_2_long(void)  // 64-bit variant: runs kernel "compiler_global_constant_2_long" and checks uint64_t outputs.
+{
+  const size_t n = 2048;  // global size (globals[0]) and number of uint64_t output elements
+  const uint32_t e = 34, r = 77;  // scalar kernel arguments 1 and 2; both are added to the expected value below
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant_2", "compiler_global_constant_2_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(uint32_t), &e);
+  OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint64_t m[3] = {0x15b,0x25b,0xFFFFFFFFF};  // reference table; the last entry (nine hex F digits) does not fit in 32 bits
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    std::cout << ((uint64_t *)buf_data[0])[i] << std::endl;
+    OCL_ASSERT(((uint64_t *)buf_data[0])[i] == m[i%3] + e + r);  // element i must equal m[i%3] + e + r, compared as 64-bit values
+  OCL_UNMAP_BUFFER(0);
+}
+
 MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2_long);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
index 6d59aed..0c8881c 100644
--- a/utests/compiler_group_size.cpp
+++ b/utests/compiler_group_size.cpp
@@ -1,4 +1,11 @@
 #include "utest_helper.hpp"
+#include <string.h>
+
+struct xyz{  // Element type of buf[0] in compiler_group_size4; field meanings are defined by the kernel (not visible here).
+  unsigned short b;  // set to 0 by compiler_group_size4
+  unsigned short e;  // set to 2 by compiler_group_size4
+  unsigned int o;  // set to 0 by compiler_group_size4
+};
 
 void compiler_group_size1(void)
 {
@@ -80,7 +87,55 @@ void compiler_group_size3(void)
     OCL_UNMAP_BUFFER(0);
   }
 }
+
+void compiler_group_size4(void)  // Runs kernel "compiler_group_size4" once per entry of group_size[] and checks which output slots received `color`.
+{
+  const size_t n = 16;  // element count of both buffers
+  uint32_t color = 2;  // kernel argument 3; the value expected in the written output slots
+  uint32_t num = 1;  // number of group_size[] entries to exercise
+  int group_size[] = {1};  // used as kernel argument 2 and as both global and local size
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_group_size", "compiler_group_size4");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(struct xyz), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+
+  for(uint32_t i = 0; i < num; i++) {
+    // Run the kernel
+    OCL_MAP_BUFFER(0);
+    ((struct xyz*)buf_data[0])[0].b = 0;  // only element 0 of the struct buffer is initialized
+    ((struct xyz*)buf_data[0])[0].e = 2;
+    ((struct xyz*)buf_data[0])[0].o = 0;
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_MAP_BUFFER(1);
+    memset(((uint32_t*)buf_data[1]), 0x0, sizeof(uint32_t)*n);  // clear the output so untouched slots read back as 0
+    OCL_UNMAP_BUFFER(1);
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_int), &group_size[i]);
+    OCL_SET_ARG(3, sizeof(cl_int), &color);
+
+    globals[0] = group_size[i];
+    locals[0] = group_size[i];
+    OCL_NDRANGE(1);
+    OCL_MAP_BUFFER(1);
+
+    // Check results
+    for (uint32_t j = 0; j < n; ++j) {
+//      std::cout <<((uint32_t*)buf_data[1])[j] << "  ";
+      if(j >= i && j <= i+2) {  // slots i .. i+2 are expected to hold `color`
+       OCL_ASSERT(((uint32_t*)buf_data[1])[j] == color);
+      } else {  // every other slot must still be 0
+       OCL_ASSERT(((uint32_t*)buf_data[1])[j] == 0);
+      }
+
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
 MAKE_UTEST_FROM_FUNCTION(compiler_group_size1);
 MAKE_UTEST_FROM_FUNCTION(compiler_group_size2);
 MAKE_UTEST_FROM_FUNCTION(compiler_group_size3);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size4);
 
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
index fad2744..d7e1517 100644
--- a/utests/compiler_long.cpp
+++ b/utests/compiler_long.cpp
@@ -30,8 +30,8 @@ void compiler_long(void)
   src1[7] = -2L,                  src2[7] = -1L;
   src1[8] = 0,                    src2[8] = 0x8000000000000000UL;
   for (int32_t i = 9; i < (int32_t) n; ++i) {
-    src1[i] = ((long)rand() << 32) + rand();
-    src2[i] = ((long)rand() << 32) + rand();
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
   }
   OCL_MAP_BUFFER(0);
   OCL_MAP_BUFFER(1);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
index e3c6640..6c5da4b 100644
--- a/utests/compiler_long_2.cpp
+++ b/utests/compiler_long_2.cpp
@@ -21,8 +21,8 @@ void compiler_long_2(void)
 
   // Run random tests
   for (int32_t i = 0; i < (int32_t) n; ++i) {
-    src1[i] = ((long)rand() << 32) + rand();
-    src2[i] = ((long)rand() << 32) + rand();
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
   }
   src1[4] = 1;
   OCL_MAP_BUFFER(0);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
index 18e13ee..827a45b 100644
--- a/utests/compiler_long_convert.cpp
+++ b/utests/compiler_long_convert.cpp
@@ -3,6 +3,7 @@
 #include <iostream>
 #include "utest_helper.hpp"
 
+// convert shorter integer to 64-bit integer
 void compiler_long_convert(void)
 {
   const size_t n = 16;
@@ -65,3 +66,93 @@ void compiler_long_convert(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_long_convert);
+
+// Convert 64-bit integers to shorter integers (char, short, int) and check that each result keeps the source value.
+void compiler_long_convert_2(void)
+{
+  const size_t n = 16;  // element count of every buffer and the global size
+  int64_t src[n];  // host copy of the 64-bit input
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the source with the deterministic values 0, -1, ..., -(n-1) (not random)
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src[i] = -i;
+  }
+  OCL_MAP_BUFFER(3);
+  memcpy(buf_data[3], src, sizeof(src));  // buf[3] is the 64-bit input buffer
+  OCL_UNMAP_BUFFER(3);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  char *dst1 = ((char *)buf_data[0]);
+  short *dst2 = ((short *)buf_data[1]);
+  int *dst3 = ((int *)buf_data[2]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%x %x %x\n", dst1[i], dst2[i], dst3[i]);
+    OCL_ASSERT(dst1[i] == -i);  // each narrowed output must equal the source value -i
+    OCL_ASSERT(dst2[i] == -i);
+    OCL_ASSERT(dst3[i] == -i);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_2);
+
+// Convert 64-bit integers to 32-bit floats and check that each result equals the source value.
+void compiler_long_convert_to_float(void)
+{
+  const size_t n = 16;  // element count of both buffers and the global size
+  int64_t src[n];  // host copy of the 64-bit input
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_to_float");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the source with the deterministic values 0, -1, ..., -(n-1) (not random)
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src[i] = -(int64_t)i;
+  }
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[1], src, sizeof(src));  // buf[1] is the 64-bit input buffer
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);  // NOTE(review): buf_data[1] is not read below; the check uses the host copy `src`
+  float *dst = ((float *)buf_data[0]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%f\n", dst[i]);
+    OCL_ASSERT(dst[i] == src[i]);  // exact comparison is valid: 0 .. -15 are exactly representable as float
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_to_float);
diff --git a/utests/compiler_vector_inc.cpp b/utests/compiler_vector_inc.cpp
new file mode 100644
index 0000000..abc5408
--- /dev/null
+++ b/utests/compiler_vector_inc.cpp
@@ -0,0 +1,46 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_vector_inc(void)  // Runs kernel "compiler_vector_inc" and checks that each dst byte was incremented or decremented by 1.
+{
+  const int n = 64;  // number of bytes in each buffer
+  char dst[n];  // host copy of the initial contents of buf[0], which the kernel is expected to modify
+  char src[n];  // host copy of buf[1]; selects increment versus decrement in the check below
+
+  OCL_CREATE_KERNEL("compiler_vector_inc");
+  OCL_CREATE_BUFFER(buf[0], 0, n, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n / 2;  // presumably each work-item handles two chars -- TODO confirm against the kernel
+  locals[0] = 16;
+
+  for (int i = 0; i < n; ++i) {
+    dst[i] = i;
+    src[i] = (i / 2) % 4;  // pattern 0,0,1,1,2,2,3,3 repeating
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], dst, n);
+  memcpy(buf_data[1], src, n);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  char *dest = ((char *)buf_data[0]);
+  for (int i=0; i<n; ++i) {
+    char wish;  // expected value of dest[i]
+    if (src[i/2] < 2)  // the selector is read at index i/2, so two consecutive outputs share one selector
+      wish = dst[i] + 1;
+    else
+      wish = dst[i] - 1;
+    OCL_ASSERT(dest[i] == wish);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_vector_inc);
diff --git a/utests/load_program_from_bin.cpp b/utests/load_program_from_bin.cpp
new file mode 100644
index 0000000..d45c2bd
--- /dev/null
+++ b/utests/load_program_from_bin.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {  // CPU reference: writes ceilf(src[global_id]) into dst[global_id].
+    dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_bin(void)  // Creates a program from the binary file "compiler_ceil.bin", runs kernel "compiler_ceil", and compares with ceilf() on the CPU.
+{
+    const size_t n = 16;  // element count of both buffers
+    float cpu_dst[16], cpu_src[16];  // host-side input copy and CPU reference output
+    cl_int status;
+    cl_int binary_status;
+    char *ker_path = NULL;
+
+    cl_file_map_t *fm = cl_file_map_new();  // NOTE(review): fm and ker_path are never released in this function
+    ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
+    OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+    const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);  // start of the mapped binary
+    const size_t sz = cl_file_map_size(fm);  // size of the mapped binary
+
+    program = clCreateProgramWithBinary(ctx, 1,
+              &device, &sz, &src, &binary_status, &status);
+
+    OCL_ASSERT(program && status == CL_SUCCESS);
+
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(program, "compiler_ceil", &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;  // inputs lie in [-0.75, 0.75] in steps of 0.1
+        OCL_UNMAP_BUFFER(0);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+#if 0
+        printf("#### GPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", ((float *)buf_data[1])[i]);
+        printf("\n#### CPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", cpu_dst[i]);
+        printf("\n");
+#endif
+
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);  // GPU output must match the CPU reference exactly
+        OCL_UNMAP_BUFFER(1);
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
index 1ec8692..b974f6a 100644
--- a/utests/runtime_event.cpp
+++ b/utests/runtime_event.cpp
@@ -33,6 +33,8 @@ void runtime_event(void)
     OCL_ASSERT(status >= CL_SUBMITTED);
   }
 
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
   OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
 
   clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
@@ -45,13 +47,10 @@ void runtime_event(void)
     OCL_ASSERT(status <= CL_COMPLETE);
   }
 
-  // Check results
-  OCL_MAP_BUFFER(0);
-
   for (uint32_t i = 0; i < n; ++i) {
     OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
   }
-  OCL_UNMAP_BUFFER(0);
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
 
   for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
     clReleaseEvent(ev[i]);
diff --git a/utests/utest.cpp b/utests/utest.cpp
index fc3467e..18d10e8 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -32,7 +32,7 @@ using namespace std;
 vector<UTest> *UTest::utestList = NULL;
 void releaseUTestList(void) { delete UTest::utestList; }
 
-UTest::UTest(Function fn, const char *name) : fn(fn), name(name) {
+UTest::UTest(Function fn, const char *name, bool haveIssue) : fn(fn), name(name), haveIssue(haveIssue) {
   if (utestList == NULL) {
     utestList = new vector<UTest>;
     atexit(releaseUTestList);
@@ -40,7 +40,7 @@ UTest::UTest(Function fn, const char *name) : fn(fn), name(name) {
   utestList->push_back(*this);
 }
 
-UTest::UTest(void) : fn(NULL), name(NULL) {}
+UTest::UTest(void) : fn(NULL), name(NULL), haveIssue(false) {}
 
 static bool strequal(const char *s1, const char *s2) {
   if (strcmp(s1, s2) == 0) return true;
@@ -52,7 +52,7 @@ void UTest::run(const char *name) {
   if (utestList == NULL) return;
   for (size_t i = 0; i < utestList->size(); ++i) {
     const UTest &utest = (*utestList)[i];
-    if (utest.name == NULL || utest.fn == NULL) continue;
+    if (utest.name == NULL || utest.fn == NULL ) continue;
     if (strequal(utest.name, name)) {
       std::cout << utest.name << ":" << std::endl;
       (utest.fn)();
@@ -76,3 +76,25 @@ void UTest::runAll(void) {
   }
 }
 
+void UTest::runAllNoIssue(void) {  // Run every registered test that is not flagged haveIssue.
+  if (utestList == NULL) return;  // no test has been registered
+  for (size_t i = 0; i < utestList->size(); ++i) {
+    const UTest &utest = (*utestList)[i];
+    if (utest.name == NULL || utest.fn == NULL || utest.haveIssue) continue;  // same name/fn guard as run(): the name is streamed below
+    std::cout << utest.name << ":" << std::endl;
+    (utest.fn)();
+    std::cout << std::endl;
+    cl_kernel_destroy();  // tear down per-test kernel and buffer state before the next test
+    cl_buffer_destroy();
+  }
+}
+
+void UTest::listAllCases()  // Print the name of every registered test, one per line.
+{
+  if (utestList == NULL) return;  // no test has been registered
+  for (size_t i = 0; i < utestList->size(); ++i) {
+    const UTest &utest = (*utestList)[i];
+    if (utest.name == NULL || utest.fn == NULL) continue;  // same name/fn guard as run(): the name is streamed below
+    std::cout << utest.name << std::endl;
+  }
+}
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 338a4dc..d3a6a6f 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -39,17 +39,23 @@ struct UTest
   /*! Empty test */
   UTest(void);
   /*! Build a new unit test and append it to the unit test list */
-  UTest(Function fn, const char *name);
+  UTest(Function fn, const char *name, bool haveIssue = false);
   /*! Function to execute */
   Function fn;
   /*! Name of the test */
   const char *name;
+  /*! Indicates whether the current test case has a known issue still to be fixed */
+  bool haveIssue;
   /*! The tests that are registered */
   static std::vector<UTest> *utestList;
   /*! Run the test with the given name */
   static void run(const char *name);
+  /*! Run all the tests without known issue*/
+  static void runAllNoIssue(void);
   /*! Run all the tests */
   static void runAll(void);
+  /*! List all test cases */
+  static void listAllCases(void);
 };
 
 /*! Register a new unit test */
@@ -60,6 +66,12 @@ struct UTest
   static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
   static const UTest __##FN##__(__ANON__##FN##__, #FN);
 
+/*! Register a test case that has a known issue to be fixed: like MAKE_UTEST_FROM_FUNCTION but passes haveIssue = true, so UTest::runAllNoIssue skips it */
+#define MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(FN) \
+  static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+
+
 /*! No assert is expected */
 #define UTEST_EXPECT_SUCCESS(EXPR) \
  do { \
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 9069db2..8089799 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -205,8 +205,8 @@ clpanic(const char *msg, int rval)
   exit(-1);
 }
 
-static char*
-do_kiss_path(const char *file, cl_device_id device)
+char*
+cl_do_kiss_path(const char *file, cl_device_id device)
 {
   cl_int ver;
   const char *sub_path = NULL;
@@ -239,7 +239,7 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
   cl_int status = CL_SUCCESS;
 
   /* Load the program and build it */
-  ker_path = do_kiss_path(file_name, device);
+  ker_path = cl_do_kiss_path(file_name, device);
   if (format == LLVM)
     program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
   else if (format == SOURCE) {
@@ -294,10 +294,10 @@ error:
 #include <cstring>
 #define GET_DEVICE_STR_INFO(LOWER_NAME, NAME) \
     std::string LOWER_NAME ##Str; \
-    OCL_CALL (clGetDeviceInfo, device, NAME, 0, 0, &param_value_size); \
+    OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, 0, 0, &param_value_size); \
     { \
       std::vector<char> param_value(param_value_size); \
-      OCL_CALL (clGetDeviceInfo, device, NAME, \
+      OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, \
                 param_value_size, param_value.empty() ? NULL : &param_value.front(), \
                 &param_value_size); \
       if (!param_value.empty()) \
@@ -311,7 +311,9 @@ cl_ocl_init(void)
   cl_int status = CL_SUCCESS;
   cl_uint platform_n;
   size_t i;
+#ifdef HAS_EGL
   bool hasGLExt = false;
+#endif
   cl_context_properties *props = NULL;
 
   /* Get the platform number */
@@ -331,16 +333,17 @@ cl_ocl_init(void)
   OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   {
     size_t param_value_size;
-    GET_DEVICE_STR_INFO(profile, CL_DEVICE_PROFILE);
-    GET_DEVICE_STR_INFO(name, CL_DEVICE_NAME);
-    GET_DEVICE_STR_INFO(vendor, CL_DEVICE_VENDOR);
-    GET_DEVICE_STR_INFO(version, CL_DEVICE_VERSION);
-    GET_DEVICE_STR_INFO(opencl_c_version, CL_DEVICE_OPENCL_C_VERSION);
-    GET_DEVICE_STR_INFO(driver_version, CL_DRIVER_VERSION);
-    GET_DEVICE_STR_INFO(extensions, CL_DEVICE_EXTENSIONS);
+    GET_DEVICE_STR_INFO(profile, PROFILE);
+    GET_DEVICE_STR_INFO(name, NAME);
+    GET_DEVICE_STR_INFO(vendor, VENDOR);
+    GET_DEVICE_STR_INFO(version, VERSION);
+    GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+    GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
+#ifdef HAS_EGL
     if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
       hasGLExt = true;
     }
+#endif
   }
 
 #ifdef HAS_EGL
@@ -510,7 +513,7 @@ struct bmphdr {
 int *cl_read_bmp(const char *filename, int *width, int *height)
 {
   struct bmphdr hdr;
-  char *bmppath = do_kiss_path(filename, device);
+  char *bmppath = cl_do_kiss_path(filename, device);
   FILE *fp = fopen(bmppath, "rb");
   assert(fp);
 
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index e7f43fc..29a21d5 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -50,7 +50,7 @@ extern EGLSurface  eglSurface;
 #define OCL_THROW_ERROR(FN, STATUS) \
   do { \
     char msg[2048]; \
-    sprintf(msg, "error calling %s with error%s \n", #FN, err_msg[-STATUS]); \
+    sprintf(msg, "error calling %s with error %s \n", #FN, err_msg[-STATUS]); \
     OCL_ASSERTM(false, msg); \
   } while (0)
 
@@ -186,6 +186,9 @@ extern int cl_ocl_init(void);
 extern int cl_kernel_init(const char *file_name,
                 const char *kernel_name, int format, const char * build_opt);
 
+/* Get the file path */
+extern char* cl_do_kiss_path(const char *file, cl_device_id device);
+
 /* init the bunch of global varaibles here */
 extern int cl_test_init(const char *file_name, const char *kernel_name, int format);
 
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
index e577b7b..94fbbee 100644
--- a/utests/utest_run.cpp
+++ b/utests/utest_run.cpp
@@ -26,19 +26,93 @@
 #include "utest_helper.hpp"
 #include "utest_exception.hpp"
 #include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";  // short options for getopt_long: -c takes an argument; -l, -a, -n and -h take none
+struct option longopts[] = {  // long-option aliases; each one maps to the short option character in its last field
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},  // all-zero terminator entry
+};
+
+void usage()  // Print the command-line help text to std::cout.
+{
+    std::cout << "\
+Usage:\n\
+  ./utest_run <option>\n\
+\n\
+  option:\n\
+    -c <casename>: run sub-case named 'casename'\n\
+    -l           : list all the available case name\n\
+    -a           : run all test cases\n\
+    -n           : run all test cases without known issue (default option)\n\
+    -h           : display this usage\n\
+\
+    "<< std::endl;
+}
 
 int main(int argc, char *argv[])
 {
-  try {
-    cl_ocl_init();
-    if (argc >= 2)
-      for (int i = 1; i < argc; ++i)
-        UTest::run(argv[i]);
-    else
-      UTest::runAll();
-    cl_ocl_destroy();
-  } catch (Exception e) {
-      std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+  // Parse the command-line options and dispatch each one to the matching UTest entry point.
+  int c = 0;
+  cl_ocl_init();
+
+  c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+  if (argc == 1)  // no arguments: default to the tests without a known issue
+    c = 'n';
+  if (argc == 2 && c < 1 ){  // a single non-option argument is taken as a case name
+    c = 'c';
+    optarg = argv[1];
   }
+
+  do {  // handle the current option, then fetch the next one until getopt_long returns -1
+    switch (c)
+    {
+      case 'c':
+        try {
+          UTest::run(optarg);
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'l':
+        UTest::listAllCases();
+        break;
+
+      case 'a':
+        try {
+          UTest::runAll();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'n':
+        try {
+          UTest::runAllNoIssue();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'h':
+      default:
+        usage();
+        exit(1);
+    }
+  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+  cl_ocl_destroy();  // now runs exactly once, after all options have been processed
 }
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list