[Pkg-opencl-devel] [beignet] 55/66: Imported Upstream version 0.3

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:08 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 59b9b34c29b6350c4080535c15c2c443a4b17a47
Author: Simon Richter <sjr at debian.org>
Date:   Wed Oct 23 21:33:44 2013 +0200

    Imported Upstream version 0.3
---
 CMake/FindLLVM.cmake                        |    4 +-
 CMakeLists.txt                              |    2 +-
 backend/src/backend/context.cpp             |   29 +
 backend/src/backend/context.hpp             |    2 +
 backend/src/backend/gen_context.cpp         |   14 +
 backend/src/backend/gen_insn_scheduling.cpp |    4 +-
 backend/src/backend/gen_insn_selection.cpp  |   20 +-
 backend/src/backend/gen_insn_selection.hxx  |    1 +
 backend/src/backend/gen_program.cpp         |    6 +
 backend/src/backend/gen_program.h           |    3 +
 backend/src/backend/gen_reg_allocation.cpp  |   56 +-
 backend/src/backend/program.cpp             |   51 +-
 backend/src/backend/program.h               |    3 +
 backend/src/backend/program.hpp             |    5 +
 backend/src/gbe_bin_generater.cpp           |   47 +-
 backend/src/gen_convert.sh                  |  154 +++
 backend/src/ir/constant.cpp                 |    9 +-
 backend/src/ir/constant.hpp                 |    8 +-
 backend/src/ir/function.cpp                 |    2 +-
 backend/src/ir/function.hpp                 |    5 +
 backend/src/ir/instruction.cpp              |   10 +-
 backend/src/ir/instruction.hpp              |    2 +
 backend/src/ir/instruction.hxx              |    1 +
 backend/src/ir/unit.hpp                     |   20 -
 backend/src/llvm/llvm_gen_backend.cpp       |  343 ++++--
 backend/src/llvm/llvm_gen_backend.hpp       |    5 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx  |   31 +
 backend/src/llvm/llvm_passes.cpp            |   17 +
 backend/src/llvm/llvm_scalarize.cpp         |   22 +-
 backend/src/llvm/llvm_to_gen.cpp            |    2 +-
 backend/src/ocl_convert.h                   | 1542 +++++++++++++++++++++++++++
 backend/src/ocl_stdlib.tmpl.h               |   16 +-
 docs/Beignet.mdwn                           |    7 +-
 docs/Beignet/Backend/TODO.mdwn              |    9 +-
 include/CL/cl_intel.h                       |   32 +
 kernels/builtin_convert_sat.cl              |   48 +
 kernels/compiler_async_copy.cl              |   38 +-
 kernels/compiler_clod_function_call.cl      |   91 ++
 kernels/compiler_function_argument2.cl      |   12 +
 kernels/compiler_global_constant.cl         |   15 +-
 kernels/compiler_insert_vector.cl           |   11 +
 kernels/compiler_julia_function_call.cl     |  142 +++
 kernels/compiler_local_slm.cl               |   28 +-
 src/CMakeLists.txt                          |   18 +
 src/cl_api.c                                |  180 +++-
 src/cl_command_queue.c                      |    2 +-
 src/cl_command_queue_gen7.c                 |    8 +-
 src/cl_context.c                            |   26 +
 src/cl_context.h                            |   28 +-
 src/cl_driver.h                             |   12 +-
 src/cl_driver_defs.c                        |    3 +
 src/cl_event.c                              |   22 +
 src/cl_event.h                              |    2 +
 src/cl_gt_device.h                          |    2 +-
 src/cl_mem.c                                |  141 +++
 src/cl_mem.h                                |   19 +-
 src/cl_program.c                            |   22 +
 src/cl_program.h                            |    3 +
 src/intel/intel_defines.h                   |    4 +
 src/intel/intel_driver.c                    |   42 +-
 src/intel/intel_gpgpu.c                     |   71 +-
 src/kernels/cl_internal_copy_buf_align1.cl  |    8 +
 src/kernels/cl_internal_copy_buf_align16.cl |   12 +
 src/kernels/cl_internal_copy_buf_align4.cl  |    8 +
 utests/CMakeLists.txt                       |    5 +
 utests/builtin_convert_sat.cpp              |   80 ++
 utests/compiler_async_copy.cpp              |   86 +-
 utests/compiler_function_argument2.cpp      |   57 +
 utests/compiler_insert_vector.cpp           |   18 +
 utests/compiler_local_slm.cpp               |   30 +-
 utests/compiler_shader_toy.cpp              |   33 +-
 utests/compiler_vector_inc.cpp              |    2 +-
 utests/enqueue_copy_buf.cpp                 |   66 ++
 73 files changed, 3524 insertions(+), 355 deletions(-)

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index e76ab42..3fa9ad9 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -7,9 +7,9 @@
 # LLVM_MODULE_LIBS - list of llvm libs for working with modules.
 # LLVM_FOUND       - True if llvm found.
 if (LLVM_INSTALL_DIR)
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
 else (LLVM_INSTALL_DIR)
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config DOC "llvm-config executable")
+  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable")
 endif (LLVM_INSTALL_DIR)
 
 if (LLVM_CONFIG_EXECUTABLE)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d18f50..2ec6c08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 0)
-set (LIBCL_DRIVER_VERSION_MINOR 2)
+set (LIBCL_DRIVER_VERSION_MINOR 3)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 1)
 
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index cbd38f1..25d4f9c 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -60,6 +60,9 @@ namespace gbe
     /*! Free the given register file piece */
     void deallocate(int16_t offset);
 
+    /*! Split a block into 2 blocks */
+    void splitBlock(int16_t offset, int16_t subOffset);
+
   private:
     /*! May need to make that run-time in the future */
     static const int16_t RegisterFileSize = 4*KB;
@@ -268,6 +271,27 @@ namespace gbe
     }
   }
 
+  void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
+    // Retrieve the size in the allocation map
+    auto it = allocatedBlocks.find(offset);
+    GBE_ASSERT(it != allocatedBlocks.end());
+
+    while(subOffset > it->second) {
+      subOffset -= it->second;
+      offset += it->second;
+      it = allocatedBlocks.find(offset);
+      GBE_ASSERT(it != allocatedBlocks.end());
+    }
+
+    if(subOffset == 0)
+      return;
+    int16_t size = it->second;
+    allocatedBlocks.erase(it);
+    // Track the allocation to retrieve the size later
+    allocatedBlocks.insert(std::make_pair(offset, subOffset));
+    allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
+  }
+
   static int
   alignScratchSize(int size){
     int i = 0;
@@ -328,6 +352,10 @@ namespace gbe
 
   void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
 
+  void Context::splitBlock(int16_t offset, int16_t subOffset) {
+    partitioner->splitBlock(offset, subOffset);
+  }
+
   int32_t Context::allocConstBuf(uint32_t argID) {
      GBE_ASSERT(kernel->args[argID].type == GBE_ARG_CONSTANT_PTR);
 
@@ -632,6 +660,7 @@ namespace gbe
   void Context::handleSLM(void) {
     const bool useSLM = fn.getUseSLM();
     kernel->useSLM = useSLM;
+    kernel->slmSize = fn.getSLMSize();
   }
 
   bool Context::isScalarReg(const ir::Register &reg) const {
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index ca2c88d..000612e 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -86,6 +86,8 @@ namespace gbe
     int16_t allocate(int16_t size, int16_t alignment);
     /*! Deallocate previously allocated memory */
     void deallocate(int16_t offset);
+    /*! Split a block into 2 blocks, for registers that are allocated together but deallocated separately */
+    void splitBlock(int16_t offset, int16_t subOffset);
     /* allocate curbe for constant ptr argument */
     int32_t allocConstBuf(uint32_t argID);
     /* allocate a new entry for a specific image's information */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 858105a..43b3bc7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -189,6 +189,20 @@ namespace gbe
       case SEL_OP_MOV_DF:
         p->MOV_DF(dst, src, tmp);
         break;
+      case SEL_OP_CONVF_TO_I64:
+       {
+        tmp.type = GEN_TYPE_F;
+        GenRegister d = GenRegister::retype(tmp, GEN_TYPE_D);
+        float c = (1.f / 65536.f) * (1.f / 65536.f);
+        p->MUL(tmp, src, GenRegister::immf(c));
+        p->RNDZ(tmp, tmp);
+        p->MOV(d, tmp);
+        storeTopHalf(dst, d);
+        d.type = GEN_TYPE_UD;
+        p->MOV(d, GenRegister::abs(src));
+        storeBottomHalf(dst, d);
+        break;
+       }
       case SEL_OP_CONVI_TO_I64: {
         GenRegister middle;
         if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index f1f5775..a711f45 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -602,8 +602,8 @@ namespace gbe
     }
   }
 
-  BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, true);
-  BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, true);
+  BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, false);
+  BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, false);
 
   void schedulePostRegAllocation(GenContext &ctx, Selection &selection) {
     if (OCL_POST_ALLOC_INSN_SCHEDULE) {
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index bd52885..7eae7ca 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -459,6 +459,7 @@ namespace gbe
     ALU2(UPSAMPLE_INT)
     ALU2(UPSAMPLE_LONG)
     ALU1WithTemp(CONVI_TO_I64)
+    ALU1WithTemp(CONVF_TO_I64)
     ALU1(CONVI64_TO_I)
     I64Shift(I64SHL)
     I64Shift(I64SHR)
@@ -2296,7 +2297,7 @@ namespace gbe
       for (dstID = 0; dstID < tmpRegNum ; ++dstID)
         dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
       for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
-        dst[dstID] = sel.selReg(insn.getValue(valueID));
+        dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
       sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
     }
 
@@ -2416,7 +2417,7 @@ namespace gbe
         dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getValue(valueID));
+        src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
       sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
     }
 
@@ -2538,15 +2539,20 @@ namespace gbe
       const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
+      if(insn.getOpcode() == ir::OP_SAT_CVT) {
+        sel.push();
+        sel.curr.saturate = 1;
+      }
+
       // We need two instructions to make the conversion
       if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
-          const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+          const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
           unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
           unpacked = GenRegister::retype(unpacked, type);
         } else {
-          const uint32_t type = TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+          const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
           unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
           unpacked = GenRegister::retype(unpacked, type);
         }
@@ -2574,6 +2580,8 @@ namespace gbe
       } else if (dst.isint64()) {
         switch(src.type) {
           case GEN_TYPE_F:
+            sel.CONVF_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+            break;
           case GEN_TYPE_DF:
             NOT_IMPLEMENTED;
           default:
@@ -2581,6 +2589,10 @@ namespace gbe
         }
       } else
         sel.MOV(dst, src);
+
+      if(insn.getOpcode() == ir::OP_SAT_CVT)
+        sel.pop();
+
       return true;
     }
     DECL_CTOR(ConvertInstruction, 1, 1);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 21b0a43..4499006 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -73,6 +73,7 @@ DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
+DECL_SELECTION_IR(CONVF_TO_I64, UnaryWithTempInstruction)
 DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
 DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
 DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 33f07b2..781152d 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -160,3 +160,9 @@ void genSetupCallBacks(void)
   gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
 }
 
+sem_t llvm_semaphore;
+
+void genSetupLLVMSemaphore(void)
+{
+  sem_init(&llvm_semaphore, 0, 1);
+}
diff --git a/backend/src/backend/gen_program.h b/backend/src/backend/gen_program.h
index 9fae2e7..a498a5d 100644
--- a/backend/src/backend/gen_program.h
+++ b/backend/src/backend/gen_program.h
@@ -29,9 +29,12 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+#include <semaphore.h>
 
 /*! This will make the compiler output Gen ISA code */
 extern void genSetupCallBacks(void);
+extern sem_t llvm_semaphore;
+extern void genSetupLLVMSemaphore(void);
 
 #endif /* __GBE_GEN_PROGRAM_H__ */
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index ab8b7ee..30f9e38 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -88,8 +88,6 @@ namespace gbe
     map<ir::Register, VectorLocation> vectorMap;
     /*! All vectors used in the selection */
     vector<SelectionVector*> vectors;
-    /*! All vectors that are already expired */
-    set<SelectionVector*> expired;
     /*! The set of booleans that will go to GRF (cannot be kept into flags) */
     set<ir::Register> grfBooleans;
     /*! All the register intervals */
@@ -149,15 +147,16 @@ namespace gbe
     const Function &fn = ctx.getFunction();
     GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
     const Function::PushMap &pushMap = fn.getPushMap();
-    for (const auto &pushed : pushMap) {
-      const uint32_t argID = pushed.second.argID;
+    for (auto rit = pushMap.rbegin(); rit != pushMap.rend(); ++rit) {
+      const uint32_t argID = rit->second.argID;
       const FunctionArgument arg = fn.getArg(argID);
 
-      const uint32_t subOffset = pushed.second.offset;
-      const Register reg = pushed.second.getRegister();
+      const uint32_t subOffset = rit->second.offset;
+      const Register reg = rit->second.getRegister();
       auto it = this->ctx.curbeRegs.find(arg.reg);
       assert(it != ctx.curbeRegs.end());
       allocatePayloadReg(reg, it->second, subOffset);
+      ctx.splitBlock(it->second, subOffset);
     }
   }
 
@@ -309,28 +308,9 @@ namespace gbe
         continue;
       }
       // Case 1 - it does not belong to a vector. Just remove it
-      if (vectorMap.contains(reg) == false) {
         ctx.deallocate(it->second);
         this->expiringID++;
         return true;
-      // Case 2 - check that the vector has not been already removed. If not,
-      // since we equaled the intervals of all registers in the vector, we just
-      // remove the complete vector
-      } else {
-        SelectionVector *vector = vectorMap.find(reg)->second.first;
-        if (expired.contains(vector)) {
-          this->expiringID++;
-          continue;
-        } else {
-          const ir::Register first = vector->reg[0].reg();
-          auto it = RA.find(first);
-          GBE_ASSERT(it != RA.end());
-          ctx.deallocate(it->second);
-          expired.insert(vector);
-          this->expiringID++;
-          return true;
-        }
-      }
     }
 
     // We were not able to expire anything
@@ -540,11 +520,12 @@ namespace gbe
           }
           continue;
         }
-        for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
+        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
           const ir::Register reg = vector->reg[regID].reg();
           GBE_ASSERT(RA.contains(reg) == false
                      && ctx.sel->getRegisterData(reg).family == family);
-          RA.insert(std::make_pair(reg, grfOffset));
+          RA.insert(std::make_pair(reg, grfOffset + alignment * regID));
+          ctx.splitBlock(grfOffset, alignment * regID);  //splitBlock will not split if regID == 0
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
@@ -648,27 +629,6 @@ namespace gbe
       }
     }
 
-    // Extend the liveness of the registers that belong to vectors. Actually,
-    // this is way too brutal, we should instead maintain a list of allocated
-    // intervals to handle vector registers independently while doing the linear
-    // scan (or anything else)
-    for (auto vector : this->vectors) {
-      const uint32_t regNum = vector->regNum;
-      const ir::Register first = vector->reg[0].reg();
-      int32_t minID = this->intervals[first].minID;
-      int32_t maxID = this->intervals[first].maxID;
-      for (uint32_t regID = 1; regID < regNum; ++regID) {
-        const ir::Register reg = vector->reg[regID].reg();
-        minID = std::min(minID, this->intervals[reg].minID);
-        maxID = std::max(maxID, this->intervals[reg].maxID);
-      }
-      for (uint32_t regID = 0; regID < regNum; ++regID) {
-        const ir::Register reg = vector->reg[regID].reg();
-        this->intervals[reg].minID = minID;
-        this->intervals[reg].maxID = maxID;
-      }
-    }
-
     // Sort both intervals in starting point and ending point increasing orders
     const uint32_t regNum = ctx.sel->getRegNum();
     this->starting.resize(regNum);
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index ffd31d9..937f95b 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -75,7 +75,7 @@
 namespace gbe {
 
   Kernel::Kernel(const std::string &name) :
-    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL), samplerSet(NULL), imageSet(NULL)
+    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL)
   {}
   Kernel::~Kernel(void) {
     if(ctx) GBE_DELETE(ctx);
@@ -244,7 +244,9 @@ namespace gbe {
     OUT_UPDATE_SZ(curbeSize);
     OUT_UPDATE_SZ(simdWidth);
     OUT_UPDATE_SZ(stackSize);
+    OUT_UPDATE_SZ(scratchSize);
     OUT_UPDATE_SZ(useSLM);
+    OUT_UPDATE_SZ(slmSize);
 
     /* samplers. */
     if (samplerSet) {
@@ -331,7 +333,9 @@ namespace gbe {
     IN_UPDATE_SZ(curbeSize);
     IN_UPDATE_SZ(simdWidth);
     IN_UPDATE_SZ(stackSize);
+    IN_UPDATE_SZ(scratchSize);
     IN_UPDATE_SZ(useSLM);
+    IN_UPDATE_SZ(slmSize);
 
     IN_UPDATE_SZ(has_samplerset);
     if (has_samplerset) {
@@ -406,7 +410,9 @@ namespace gbe {
     outs << spaces_nl << "  curbeSize: " << curbeSize << "\n";
     outs << spaces_nl << "  simdWidth: " << simdWidth << "\n";
     outs << spaces_nl << "  stackSize: " << stackSize << "\n";
+    outs << spaces_nl << "  scratchSize: " << scratchSize << "\n";
     outs << spaces_nl << "  useSLM: " << useSLM << "\n";
+    outs << spaces_nl << "  slmSize: " << slmSize << "\n";
 
     outs << spaces_nl << "  Argument Number is " << argNum << "\n";
     for (uint32_t i = 0; i < argNum; i++) {
@@ -449,6 +455,7 @@ namespace gbe {
     // Arguments to pass to the clang frontend
     vector<const char *> args;
     bool bOpt = true;
+    bool bFastMath = false;
 
     vector<std::string> useless; //hold substrings to avoid c_str free
     size_t start = 0, end = 0;
@@ -465,9 +472,12 @@ namespace gbe {
       if(str.size() == 0)
         continue;
       if(str == "-cl-opt-disable") bOpt = false;
+      if(str == "-cl-fast-relaxed-math") bFastMath = true;
       useless.push_back(str);
       args.push_back(str.c_str());
     }
+    args.push_back("-mllvm");
+    args.push_back("-inline-threshold=200000");
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
     args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
 #endif
@@ -475,7 +485,10 @@ namespace gbe {
     // XXX we haven't implement those builtin functions,
     // so disable it currently.
     args.push_back("-fno-builtin");
-    if(bOpt)  args.push_back("-O3");
+    if(bOpt)
+      args.push_back("-O2");
+    if(bFastMath)
+      args.push_back("-D __FAST_RELAXED_MATH__=1");
 #if LLVM_VERSION_MINOR <= 2
     args.push_back("-triple");
     args.push_back("nvptx");
@@ -529,15 +542,36 @@ namespace gbe {
     clang::LangOptions & lang_opts = Clang.getLangOpts();
     lang_opts.OpenCL = 1;
 
+    //llvm flags need command line parsing to take effect
+    if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
+      unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
+      const char **Args = new const char*[NumArgs + 2];
+      Args[0] = "clang (LLVM option parsing)";
+      for (unsigned i = 0; i != NumArgs; ++i){
+        Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
+      }
+      Args[NumArgs + 1] = 0;
+      llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
+      delete [] Args;
+    }
+
     // Create an action and make the compiler instance carry it out
     llvm::OwningPtr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction());
-    if (!Clang.ExecuteAction(*Act))
+    sem_wait(&llvm_semaphore);
+    auto retVal = Clang.ExecuteAction(*Act);
+    sem_post(&llvm_semaphore);
+    if (!retVal)
       return;
 
     llvm::Module *module = Act->takeModule();
 
     std::string ErrorInfo;
-    llvm::raw_fd_ostream OS(output, ErrorInfo,llvm::raw_fd_ostream::F_Binary);
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+    auto mode = llvm::sys::fs::F_Binary;
+#else
+    auto mode = llvm::raw_fd_ostream::F_Binary;
+#endif
+    llvm::raw_fd_ostream OS(output, ErrorInfo, mode);
     //still write to temp file for code simply, otherwise need add another function.
     //because gbe_program_new_from_llvm also be used by cl_program_create_from_llvm, can't be removed
     //TODO: Pass module to llvmToGen, if use module, should return Act and use OwningPtr out of this funciton
@@ -704,6 +738,12 @@ namespace gbe {
     return kernel->getUseSLM() ? 1 : 0;
   }
 
+  static int32_t kernelGetSLMSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getSLMSize();
+  }
+
   static int32_t kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
     if (genKernel == NULL) return -1;
     gbe::Kernel *kernel = (gbe::Kernel*) genKernel;
@@ -771,6 +811,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size =
 GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
@@ -805,6 +846,7 @@ namespace gbe
       gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
+      gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
       gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
       gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
       gbe_kernel_get_image_size = gbe::kernelGetImageSize;
@@ -812,6 +854,7 @@ namespace gbe
       gbe_get_image_base_index = gbe::getImageBaseIndex;
       gbe_set_image_base_index = gbe::setImageBaseIndex;
       genSetupCallBacks();
+      genSetupLLVMSemaphore();
     }
   };
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 8774344..10fcc49 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -218,6 +218,9 @@ extern gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_
 /*! Says if SLM is used. Required to reconfigure the L3 complex */
 typedef int32_t (gbe_kernel_use_slm_cb)(gbe_kernel);
 extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
+/*! Get slm size needed for kernel local variables */
+typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
+extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
 
 #ifdef __cplusplus
 }
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 28a792d..9b33b7c 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -104,6 +104,8 @@ namespace gbe {
     INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
     /*! Says if SLM is needed for it */
     INLINE bool getUseSLM(void) const { return this->useSLM; }
+    /*! get slm size for kernel local variable */
+    INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
     /*! set constant buffer size and return the cb curbe offset */
     int32_t setConstBufSize(uint32_t argID, size_t sz) {
       if(argID >= argNum) return -1;
@@ -145,7 +147,9 @@ namespace gbe {
        curbeSize         |
        simdWidth         |
        stackSize         |
+       scratchSize       |
        useSLM            |
+       slmSize           |
        samplers          |
        images            |
        code_size         |
@@ -169,6 +173,7 @@ namespace gbe {
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
     uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
+    uint32_t slmSize;          //!< slm size for kernel variable
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
     ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index afe86f2..37e61e2 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -33,6 +33,9 @@
 #include <deque>
 #include <vector>
 #include <algorithm>
+#include <stdlib.h>
+#include <stdio.h>
+
 #include "backend/program.h"
 #include "backend/program.hpp"
 
@@ -49,6 +52,7 @@ protected:
     string prog_path;
     string build_opt;
     static string bin_path;
+    static bool str_fmt_out;
     int fd;
     int file_len;
     const char* code;
@@ -100,7 +104,6 @@ public:
         return *this;
     }
 
-
     const char* file_map_open (void) throw (int);
 
     const char* get_code (void) {
@@ -125,6 +128,10 @@ public:
         print_file();
     }
 
+    static void set_str_fmt_out (bool flag) {
+        str_fmt_out = flag;
+    }
+
     static int set_bin_path (const char* path) {
         if (bin_path.size())
             return 0;
@@ -138,14 +145,43 @@ public:
 };
 
 string program_build_instance::bin_path;
+bool program_build_instance::str_fmt_out = false;
 
 void program_build_instance::serialize_program(void) throw(int)
 {
     ofstream ofs;
     ostringstream oss;
+    size_t sz;
     ofs.open(bin_path, ofstream::out | ofstream::app | ofstream::binary);
 
-    size_t sz = gbe_prog->serializeToBin(ofs);
+    if (str_fmt_out) {
+        string array_name = "Unkown_name_array";
+        unsigned long last_slash = bin_path.rfind("/");
+        unsigned long last_dot = bin_path.rfind(".");
+
+        if (last_slash != string::npos &&  last_dot != string::npos)
+            array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
+
+        ofs << "char " << array_name << "[] = {" << "\n";
+
+        sz = gbe_prog->serializeToBin(oss);
+
+        for (size_t i = 0; i < sz; i++) {
+            unsigned char c = oss.str().c_str()[i];
+            char asic_str[9];
+            sprintf(asic_str, "%2.2x", c);
+            ofs << "0x";
+            ofs << asic_str << ((i == sz - 1) ? "" : ", ");
+        }
+
+        ofs << "};\n";
+
+	string array_size = array_name + "_size";
+	ofs << "int " << array_size << " = " << sz << ";" << "\n";
+    } else {
+        sz = gbe_prog->serializeToBin(ofs);
+    }
+
     ofs.close();
 
     if (!sz) {
@@ -211,7 +247,7 @@ int main (int argc, const char **argv)
         argv_saved.push_back(string(argv[i]));
     }
 
-    while ( (oc = getopt(argc, (char * const *)argv, "o:p:")) != -1 ) {
+    while ( (oc = getopt(argc, (char * const *)argv, "o:p:s")) != -1 ) {
         switch (oc) {
         case 'p':
         {
@@ -245,6 +281,11 @@ int main (int argc, const char **argv)
             used_index[optind-1] = 1;
             break;
 
+        case 's':
+            program_build_instance::set_str_fmt_out(true);
+            used_index[optind-1] = 1;
+            break;
+
         case ':':
             cout << "Miss the file option argument" << endl;
             return 1;
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
index 6cc81f1..047cc19 100755
--- a/backend/src/gen_convert.sh
+++ b/backend/src/gen_convert.sh
@@ -70,3 +70,157 @@ for vector_length in $VECTOR_LENGTHS; do
           done
         fi
 done
+
+echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x > MAX ? (DSTTYPE)MAX : x < MIN ? (DSTTYPE)MIN : x; \
+  }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x > MAX ? (DSTTYPE)MAX : x; \
+  }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+  ulong MAX = 0x7ffffffffffffffful;
+  return x > MAX ? MAX : x;
+}
+
+INLINE_OVERLOADABLE ulong convert_ulong_sat(long x) {
+  return x < 0 ? 0 : x;
+}
+
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x; \
+  }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, char);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, char);
+DEF(uint, uchar);
+DEF(uint, short);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, char);
+DEF(ulong, uchar);
+DEF(ulong, short);
+DEF(ulong, ushort);
+DEF(ulong, int);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+'
+
+# Vector convert_<type><n>_sat overloads.
+# Each one applies the scalar convert_<type>_sat to every component of the
+# source vector and packs the results into the destination vector type.
+# Types come from $TYPES; double sources and floating-point destinations are skipped.
+for width in $VECTOR_LENGTHS; do
+  # The width-1 entry of $VECTOR_LENGTHS is skipped here.
+  test $width -eq 1 && continue
+
+  for src_spec in $TYPES; do
+    # The base type name is the first ':'-separated field of each TYPES entry.
+    src_base=$(IFS=:; set -- dummy $src_spec; echo $2)
+    test $src_base = "double" && continue
+
+    for dst_spec in $TYPES; do
+      dst_base=$(IFS=:; set -- dummy $dst_spec; echo $2)
+      # No _sat overloads are generated for floating-point destinations.
+      test $dst_base = "double" && continue
+      test $dst_base = "float" && continue
+
+      src_vec=$src_base$width
+      dst_vec=$dst_base$width
+      conv="convert_${dst_base}_sat"
+
+      # Collect the component suffixes (s0..sF) to convert. Widths between
+      # 5 and 8 use eight components, widths above 8 use all sixteen.
+      lanes="0"
+      test $width -gt 1 && lanes="$lanes 1"
+      test $width -gt 2 && lanes="$lanes 2"
+      test $width -gt 3 && lanes="$lanes 3"
+      if test $width -gt 4; then
+        lanes="$lanes 4 5 6 7"
+      fi
+      if test $width -gt 8; then
+        lanes="$lanes 8 9 A B C D E F"
+      fi
+
+      # Join the per-component conversions with ", ".
+      construct=
+      for lane in $lanes; do
+        construct="${construct:+$construct, }$conv(v.s$lane)"
+      done
+
+      # Emit the overload definition followed by a blank line.
+      echo "INLINE OVERLOADABLE $dst_vec convert_${dst_vec}_sat($src_vec v) {"
+      echo "  return ($dst_vec)($construct);"
+      echo "}"
+      echo
+    done
+  done
+done
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index 7a8f80f..a38d392 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -61,8 +61,7 @@ namespace ir {
                      + cnst.getName().size()*sizeof(char) //name
                      + sizeof(cnst.getSize())             //size
                      + sizeof(cnst.getAlignment())        //alignment
-                     + sizeof(cnst.getOffset())	          //offset
-                     + sizeof(cnst.getReg());             //reg
+                     + sizeof(cnst.getOffset());	        //offset
       OUT_UPDATE_SZ(bytes);
 
       OUT_UPDATE_SZ(cnst.getName().size());
@@ -71,7 +70,6 @@ namespace ir {
       OUT_UPDATE_SZ(cnst.getSize());
       OUT_UPDATE_SZ(cnst.getAlignment());
       OUT_UPDATE_SZ(cnst.getOffset());
-      OUT_UPDATE_SZ(cnst.getReg());
     }
 
     OUT_UPDATE_SZ(magic_end);
@@ -111,21 +109,18 @@ namespace ir {
       c_name[name_len] = 0;
 
       uint32_t size, align, offset;
-      uint16_t reg;
       IN_UPDATE_SZ(size);
       IN_UPDATE_SZ(align);
       IN_UPDATE_SZ(offset);
-      IN_UPDATE_SZ(reg);
 
       ir::Constant constant(c_name, size, align, offset);
-      constant.setReg(reg);
       constants.push_back(constant);
 
       delete[] c_name;
 
       /* Saint check */
       if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
-              + sizeof(align) + sizeof(offset) + sizeof(reg))
+              + sizeof(align) + sizeof(offset))
         return 0;
     }
 
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index 4bb549e..70d09aa 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -36,17 +36,16 @@ namespace ir {
   public:
     /*! Build a constant description */
     INLINE Constant(const std::string &name, uint32_t size, uint32_t alignment, uint32_t offset) :
-      name(name), size(size), alignment(alignment), offset(offset), reg(0) {}
+      name(name), size(size), alignment(alignment), offset(offset) {}
     /*! Copy constructor */
     INLINE Constant(const Constant &other) :
-      name(other.name), size(other.size), alignment(other.alignment), offset(other.offset), reg(other.reg) {}
+      name(other.name), size(other.size), alignment(other.alignment), offset(other.offset) {}
     /*! Copy operator */
     INLINE Constant& operator= (const Constant &other) {
       this->name = other.name;
       this->size = other.size;
       this->alignment = other.alignment;
       this->offset = other.offset;
-      this->reg = other.reg;
       return *this;
     }
     /*! Nothing happens here */
@@ -55,14 +54,11 @@ namespace ir {
     uint32_t getSize (void) const { return size; }
     uint32_t getAlignment (void) const { return alignment; }
     uint32_t getOffset(void) const { return offset; }
-    uint16_t getReg(void) const { return reg; }
-    void setReg(uint16_t reg) { this->reg = reg; }
   private:
     std::string name; //!< Optional name of the constant
     uint32_t size;      //!< Size of the constant
     uint32_t alignment; //!< Alignment required for each constant
     uint32_t offset;    //!< Offset of the constant in the data segment
-    uint16_t reg; //!< Virtual register number
     GBE_CLASS(Constant);
   };
 
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 88aae08..c15c292 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -43,7 +43,7 @@ namespace ir {
   ///////////////////////////////////////////////////////////////////////////
 
   Function::Function(const std::string &name, const Unit &unit, Profile profile) :
-    name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false)
+    name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0)
   {
     initProfile(*this);
     samplerSet = GBE_NEW(SamplerSet);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 6e712cd..3d4733d 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -301,6 +301,10 @@ namespace ir {
     INLINE bool getUseSLM(void) const { return this->useSLM; }
     /*! Change the SLM config for the function */
     INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
+    /*! get SLM size needed for local variable inside kernel function */
+    INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+    /*! set slm size needed for local variable inside kernel function */
+    INLINE void setSLMSize(uint32_t size) { this->slmSize = size; }
     /*! Get sampler set in this function */
     SamplerSet* getSamplerSet(void) const {return samplerSet; }
     /*! Get image set in this function */
@@ -320,6 +324,7 @@ namespace ir {
     LocationMap locationMap;        //!< Pushed function arguments (loc->reg)
     uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
     bool useSLM;                    //!< Is SLM required?
+    uint32_t slmSize;               //!< local variable size inside kernel function
     SamplerSet *samplerSet;          //!< samplers used in this function.
     ImageSet* imageSet;              //!< Image set in this function's arguments..
     GBE_CLASS(Function);            //!< Use custom allocator
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 7c6c6c6..9b3e699 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -252,9 +252,10 @@ namespace ir {
       ConvertInstruction(Type dstType,
                          Type srcType,
                          Register dst,
-                         Register src)
+                         Register src,
+                         bool saturated=false)
       {
-        this->opcode = OP_CVT;
+        this->opcode = saturated ? OP_SAT_CVT : OP_CVT;
         this->dst[0] = dst;
         this->src[0] = src;
         this->dstType = dstType;
@@ -1469,6 +1470,11 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::ConvertInstruction(dstType, srcType, dst, src).convert();
   }
 
+  // saturated convert
+  Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(dstType, srcType, dst, src, true).convert();
+  }
+
   // For all unary functions with given opcode
   Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Tuple src) {
     return internal::AtomicInstruction(atomicOp, dst, space, src).convert();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 27a34d1..90c819b 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -623,6 +623,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! cvt.{dstType <- srcType} dst src */
   Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
+  /*! sat_cvt.{dstType <- srcType} dst src */
+  Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
   Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
   /*! bra labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 1a9f867..cd60349 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -61,6 +61,7 @@ DECL_INSN(LT, CompareInstruction)
 DECL_INSN(GE, CompareInstruction)
 DECL_INSN(GT, CompareInstruction)
 DECL_INSN(CVT, ConvertInstruction)
+DECL_INSN(SAT_CVT, ConvertInstruction)
 DECL_INSN(ATOMIC, AtomicInstruction)
 DECL_INSN(BRA, BranchInstruction)
 DECL_INSN(RET, BranchInstruction)
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 9e3d66a..d8eab79 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -42,7 +42,6 @@ namespace ir {
   {
   public:
     typedef hash_map<std::string, Function*> FunctionSet;
-    typedef std::pair<void*, uint32_t> ValueIndex;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -73,30 +72,11 @@ namespace ir {
     ConstantSet& getConstantSet(void) { return constantSet; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
-
-    /*! Some values will not be allocated. For example a vector extract and
-     * a vector insertion when scalarize the vector load/store
-     */
-    void newValueProxy(void *real,
-                       void *fake,
-                       uint32_t realIndex = 0u,
-                       uint32_t fakeIndex = 0u) {
-      const ValueIndex key(fake, fakeIndex);
-      const ValueIndex value(real, realIndex);
-      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
-      valueMap[key] = value;
-    }
-
-    void clearValueMap() { valueMap.clear(); }
-
-    /*! Return the value map */
-    const map<ValueIndex, ValueIndex> &getValueMap(void) const { return valueMap; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
-    map<ValueIndex, ValueIndex> valueMap; //!< fake to real value map for vector load/store
     GBE_CLASS(Unit);
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 8b73ac9..62d6eab 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -154,9 +154,9 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3)
-#error "Only LLVM 3.0 - 3.3 is supported"
-#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 3) */
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4)
+#error "Only LLVM 3.0 - 3.4 is supported"
+#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4) */
 
 using namespace llvm;
 
@@ -275,6 +275,17 @@ namespace gbe
     return ir::MEM_GLOBAL;
   }
 
+  /*! Return element `index` of CPV, which must be a ConstantVector */
+  static Constant *extractConstantElem(Constant *CPV, uint32_t index) {
+    ConstantVector *vec = dyn_cast<ConstantVector>(CPV);
+    GBE_ASSERT(vec != NULL);
+#if GBE_DEBUG
+    const uint32_t elemNum = vec->getNumOperands();
+    GBE_ASSERTM(index < elemNum, "Out-of-bound constant vector access");
+#endif /* GBE_DEBUG */
+    return cast<Constant>(vec->getOperand(index));
+  }
+
   /*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
    *  - Split the LLVM vector into several scalar values
    *  - Handle the transparent copies (bitcast or use of intrincics functions
@@ -305,13 +316,6 @@ namespace gbe
       GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
       valueMap[key] = value;
     }
-    /*! After scalarize pass, there are some valueMap in unit,
-     *  use this function to copy from unit valueMap */
-    void initValueMap(const map<ir::Unit::ValueIndex, ir::Unit::ValueIndex> &vMap) {
-      for(auto &it : vMap)
-        newValueProxy((Value*)it.second.first, (Value*)it.first.first,
-                      it.second.second, it.first.second);
-    }
     /*! Mostly used for the preallocated registers (lids, gids) */
     void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
       const ValueIndex key(value, index);
@@ -321,7 +325,9 @@ namespace gbe
     /*! Allocate a new scalar register */
     ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u)
     {
-      GBE_ASSERT(dyn_cast<Constant>(value) == NULL);
+      // we don't allow normal constant, but GlobalValue is a special case,
+      // it needs a register to store its address
+      GBE_ASSERT(! (isa<Constant>(value) && !isa<GlobalValue>(value)));
       Type *type = value->getType();
       auto typeID = type->getTypeID();
       switch (typeID) {
@@ -391,6 +397,8 @@ namespace gbe
       getRealValue(value, index);
 
       Constant *CPV = dyn_cast<Constant>(value);
+      if(CPV && dyn_cast<ConstantVector>(CPV))
+        CPV = extractConstantElem(CPV, index);
       return (CPV && (isa<UndefValue>(CPV)));
     }
   private:
@@ -470,6 +478,10 @@ namespace gbe
      if (F.hasAvailableExternallyLinkage())
        return false;
 
+      // All function calls are inlined, so non-kernel functions can be skipped
+      bool bKernel = isKernelFunction(F);
+      if(!bKernel) return false;
+
       LI = &getAnalysis<LoopInfo>();
 
       emitFunction(F);
@@ -477,7 +489,8 @@ namespace gbe
     }
 
     virtual bool doFinalization(Module &M) { return false; }
-
+    /*! handle global variable register allocation (local, constant space) */
+    void allocateGlobalVariableRegister(Function &F);
     /*! Emit the complete function code and declaration */
     void emitFunction(Function &F);
     /*! Handle input and output function parameters */
@@ -488,6 +501,8 @@ namespace gbe
     void emitMovForPHI(BasicBlock *curr, BasicBlock *succ);
     /*! Alocate one or several registers (if vector) for the value */
     INLINE void newRegister(Value *value, Value *key = NULL);
+    /*! get the register for a llvm::Constant */
+    ir::Register getConstantRegister(Constant *c, uint32_t index = 0);
     /*! Return a valid register from an operand (can use LOADI to make one) */
     INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
     /*! Create a new immediate from a constant */
@@ -575,12 +590,16 @@ namespace gbe
 
     GBE_ASSERT(c);
     if(isa<UndefValue>(c)) {
-      uint32_t n = c->getNumOperands();
-      Type * opTy = type->getArrayElementType();
-      uint32_t size = opTy->getIntegerBitWidth()/ 8;
-      offset += size*n;
+      uint32_t size = getTypeByteSize(unit, type);
+      offset += size;
+      return;
+    } else if(isa<ConstantAggregateZero>(c)) {
+      uint32_t size = getTypeByteSize(unit, type);
+      memset((char*)mem+offset, 0, size);
+      offset += size;
       return;
     }
+
     switch(id) {
       case Type::TypeID::StructTyID:
         {
@@ -688,17 +707,6 @@ namespace gbe
     return false;
   }
 
-  static Constant *extractConstantElem(Constant *CPV, uint32_t index) {
-    ConstantVector *CV = dyn_cast<ConstantVector>(CPV);
-    GBE_ASSERT(CV != NULL);
-#if GBE_DEBUG
-    const uint32_t elemNum = CV->getNumOperands();
-    GBE_ASSERTM(index < elemNum, "Out-of-bound constant vector access");
-#endif /* GBE_DEBUG */
-    CPV = cast<Constant>(CV->getOperand(index));
-    return CPV;
-  }
-
   template <typename U, typename T>
   static U processConstant(Constant *CPV, T doIt, uint32_t index = 0u)
   {
@@ -739,7 +747,34 @@ namespace gbe
 #endif /* LLVM_VERSION_MINOR > 0 */
 
     if (dyn_cast<ConstantAggregateZero>(CPV)) {
-      return doIt(uint32_t(0)); // XXX Handle type
+      // Zero-initialized aggregate: hand doIt a zero of the scalar (or vector element) type
+      Type* Ty = CPV->getType();
+      if(Ty->isVectorTy()) Ty = cast<VectorType>(Ty)->getElementType();
+      if (Ty == Type::getInt1Ty(CPV->getContext())) {
+        const bool b = 0;
+        return doIt(b);
+      } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+        const uint8_t u8 = 0;
+        return doIt(u8);
+      } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+        const uint16_t u16 = 0;
+        return doIt(u16);
+      } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+        const uint32_t u32 = 0;
+        return doIt(u32);
+      } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+        const uint64_t u64 = 0;
+        return doIt(u64);
+      } else if (Ty == Type::getFloatTy(CPV->getContext())) {
+        const float f32 = 0;
+        return doIt(f32);
+      } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+        const double f64 = 0; // must be double so doIt sees the 64-bit type
+        return doIt(f64);
+      } else {
+        GBE_ASSERTM(false, "Unsupported aggregate zero type.");
+        return doIt(uint32_t(0));
+      }
     } else {
       if (dyn_cast<ConstantVector>(CPV))
         CPV = extractConstantElem(CPV, index);
@@ -838,40 +873,46 @@ namespace gbe
     };
   }
 
-  ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
-    //the real value may be constant, so get real value before constant check
-    regTranslator.getRealValue(value, elemID);
+  ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
+    GBE_ASSERT(c != NULL);
 
-    if (dyn_cast<ConstantExpr>(value)) {
-      ConstantExpr *ce = dyn_cast<ConstantExpr>(value);
-      if(ce->isCast()) {
-        GBE_ASSERT(ce->getOpcode() == Instruction::PtrToInt);
-        const Value *pointer = ce->getOperand(0);
-        GBE_ASSERT(pointer->hasName());
-        auto name = pointer->getName().str();
-        uint16_t reg = unit.getConstantSet().getConstant(name).getReg();
-        return ir::Register(reg);
-      }
+    if(isa<GlobalValue>(c)) {
+      return regTranslator.getScalar(c, elemID);
     }
-    Constant *CPV = dyn_cast<Constant>(value);
-    if (CPV) {
-      if (isa<GlobalValue>(CPV)) {
-        auto name = CPV->getName().str();
-        uint16_t reg = unit.getConstantSet().getConstant(name).getReg();
-        return ir::Register(reg);
-      }
-      if (isa<ConstantExpr>(CPV)) {
+
+    if(isa<ConstantExpr>(c)) {
+      ConstantExpr * ce = dyn_cast<ConstantExpr>(c);
+
+      if(ce->isCast()) {
+        Value* op = ce->getOperand(0);
+        ir::Register pointer_reg;
+        if(isa<ConstantExpr>(op)) {
+          // try to get the real pointer register, for case like:
+          // store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ...
+          // in which ptrtoint and getelementptr are ConstantExpr.
+          pointer_reg = getConstantRegister(dyn_cast<Constant>(op), elemID);
+        } else {
+          pointer_reg = regTranslator.getScalar(op, elemID);
+        }
+        // if ptrToInt request another type other than 32bit, convert as requested
+        ir::Type dstType = getType(ctx, ce->getType());
+        if(ce->getOpcode() == Instruction::PtrToInt && ir::TYPE_S32 != dstType) {
+          ir::Register tmp = ctx.reg(getFamily(dstType));
+          ctx.CVT(dstType, ir::TYPE_S32, tmp, pointer_reg);
+          return tmp;
+        }
+        return pointer_reg;
+      } else {
         uint32_t TypeIndex;
         uint32_t constantOffset = 0;
         uint32_t offset = 0;
-        ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV);
 
         // currently only GetElementPtr is handled
-        GBE_ASSERT(CE->getOpcode() == Instruction::GetElementPtr);
-        Value *pointer = CE->getOperand(0);
+        GBE_ASSERT(ce->getOpcode() == Instruction::GetElementPtr);
+        Value *pointer = ce->getOperand(0);
         CompositeType* CompTy = cast<CompositeType>(pointer->getType());
-        for(uint32_t op=1; op<CE->getNumOperands(); ++op) {
-          ConstantInt* ConstOP = dyn_cast<ConstantInt>(CE->getOperand(op));
+        for(uint32_t op=1; op<ce->getNumOperands(); ++op) {
+          ConstantInt* ConstOP = dyn_cast<ConstantInt>(ce->getOperand(op));
           GBE_ASSERT(ConstOP);
           TypeIndex = ConstOP->getZExtValue();
           for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
@@ -889,21 +930,30 @@ namespace gbe
           CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
         }
 
-        const std::string &pointer_name = pointer->getName().str();
-        ir::Register pointer_reg = ir::Register(unit.getConstantSet().getConstant(pointer_name).getReg());
+        ir::Register pointer_reg;
+        pointer_reg = regTranslator.getScalar(pointer, elemID);
         ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
         ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
         ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
         ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
         return reg;
       }
-      const ir::ImmediateIndex immIndex = this->newImmediate(CPV, elemID);
-      const ir::Immediate imm = ctx.getImmediate(immIndex);
-      const ir::Register reg = ctx.reg(getFamily(imm.type));
-      ctx.LOADI(imm.type, reg, immIndex);
-      return reg;
     }
-    else
+
+    const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
+    const ir::Immediate imm = ctx.getImmediate(immIndex);
+    const ir::Register reg = ctx.reg(getFamily(imm.type));
+    ctx.LOADI(imm.type, reg, immIndex);
+    return reg;
+  }
+
+  ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+    //the real value may be constant, so get real value before constant check
+    regTranslator.getRealValue(value, elemID);
+    if(isa<Constant>(value)) {
+      Constant *c = dyn_cast<Constant>(value);
+      return getConstantRegister(c, elemID);
+    } else
       return regTranslator.getScalar(value, elemID);
   }
 
@@ -1273,6 +1323,55 @@ namespace gbe
   BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
   BVAR(OCL_OPTIMIZE_LOADI, true);
 
+  void GenWriter::allocateGlobalVariableRegister(Function &F)
+  {
+    // Allocate an address register for each used global variable (local or constant space)
+    const Module::GlobalListType &globalList = TheModule->getGlobalList();
+    size_t j = 0; // index of the next entry taken from the unit's constant set
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      const GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue;
+
+      ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
+      if(addrSpace == ir::MEM_LOCAL) {
+        const Value * parent = cast<Value>(&v);
+        // local variable can only be used in one kernel function. so, don't need to check its all uses.
+        // loop through the Constant to find the instruction that use the global variable
+        do {
+          Value::const_use_iterator it = parent->use_begin();
+          parent = cast<Value>(*it);
+        } while(isa<Constant>(parent));
+
+        const Instruction * insn = cast<Instruction>(parent);
+        const BasicBlock * bb = insn->getParent();
+        const Function * func = bb->getParent();
+        // Not used by F: skip it before its size is charged to F's SLM usage
+        if(func != &F) continue;
+
+        ir::Function &f = ctx.getFunction();
+        f.setUseSLM(true);
+        const Constant *c = v.getInitializer();
+        Type *ty = c->getType();
+        uint32_t oldSlm = f.getSLMSize();
+        uint32_t align = 8 * getAlignmentByte(unit, ty);
+        uint32_t padding = getPadding(oldSlm*8, align);
+        f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
+
+        this->newRegister(const_cast<GlobalVariable*>(&v));
+        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
+      } else if(addrSpace == ir::MEM_CONSTANT) {
+        GBE_ASSERT(v.hasInitializer());
+        this->newRegister(const_cast<GlobalVariable*>(&v));
+        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+        ir::Constant &con = unit.getConstantSet().getConstant(j ++);
+        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
+      } else {
+        GBE_ASSERT(0);
+      }
+    }
+  }
+
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
@@ -1289,25 +1388,10 @@ namespace gbe
 
     ctx.startFunction(F.getName());
     this->regTranslator.clear();
-    this->regTranslator.initValueMap(unit.getValueMap());
     this->labelMap.clear();
     this->emitFunctionPrototype(F);
 
-    // Allocate a virtual register for each global constant array
-    const Module::GlobalListType &globalList = TheModule->getGlobalList();
-    size_t j = 0;
-    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
-      const GlobalVariable &v = *i;
-      unsigned addrSpace = v.getType()->getAddressSpace();
-      if(addrSpace != ir::AddressSpace::MEM_CONSTANT)
-        continue;
-      GBE_ASSERT(v.hasInitializer());
-      ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
-      ir::Constant &con = unit.getConstantSet().getConstant(j ++);
-      con.setReg(reg.value());
-      ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
-    }
-
+    this->allocateGlobalVariableRegister(F);
     // Visit all the instructions and emit the IR registers or the value to
     // value mapping when a new register is not needed
     pass = PASS_EMIT_REGISTERS;
@@ -1626,10 +1710,34 @@ namespace gbe
   /*! Because there are still fake insert/extract instruction for
    *  load/store, so keep empty function here */
   void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
-  void GenWriter::emitInsertElement(InsertElementInst &I) {}
+  void GenWriter::emitInsertElement(InsertElementInst &I) {
+    // Registers one value proxy per element of the result; the index must be a constant
+    const VectorType *type = dyn_cast<VectorType>(I.getType());
+    GBE_ASSERT(type);
+    const int elemNum = type->getNumElements();
+    Value *vec = I.getOperand(0);
+    Value *value = I.getOperand(1);
+    const ConstantInt *c = dyn_cast<ConstantInt>(I.getOperand(2));
+    GBE_ASSERT(c);
+    int i = c->getValue().getSExtValue();
+
+    for(int j=0; j<elemNum; j++) {
+      if(i == j)
+        regTranslator.newValueProxy(value, &I, 0, i);
+      else
+        regTranslator.newValueProxy(vec, &I, j, j);
+    }
+  }
 
   void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
-  void GenWriter::emitExtractElement(ExtractElementInst &I) {}
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {
+    // Register the result as a proxy for element `lane` of the source vector
+    const ConstantInt *constIndex = dyn_cast<ConstantInt>(I.getIndexOperand());
+    GBE_ASSERT(constIndex);
+    const int lane = constIndex->getValue().getSExtValue();
+    Value *srcVector = I.getVectorOperand();
+    regTranslator.newValueProxy(srcVector, &I, lane, 0);
+  }
 
   void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
@@ -1910,6 +2018,30 @@ namespace gbe
       case GEN_OCL_I64RHADD:
       case GEN_OCL_I64_MAD_SAT:
       case GEN_OCL_I64_MAD_SATU:
+      case GEN_OCL_SAT_CONV_U8_TO_I8:
+      case GEN_OCL_SAT_CONV_I16_TO_I8:
+      case GEN_OCL_SAT_CONV_U16_TO_I8:
+      case GEN_OCL_SAT_CONV_I32_TO_I8:
+      case GEN_OCL_SAT_CONV_U32_TO_I8:
+      case GEN_OCL_SAT_CONV_F32_TO_I8:
+      case GEN_OCL_SAT_CONV_I8_TO_U8:
+      case GEN_OCL_SAT_CONV_I16_TO_U8:
+      case GEN_OCL_SAT_CONV_U16_TO_U8:
+      case GEN_OCL_SAT_CONV_I32_TO_U8:
+      case GEN_OCL_SAT_CONV_U32_TO_U8:
+      case GEN_OCL_SAT_CONV_F32_TO_U8:
+      case GEN_OCL_SAT_CONV_U16_TO_I16:
+      case GEN_OCL_SAT_CONV_I32_TO_I16:
+      case GEN_OCL_SAT_CONV_U32_TO_I16:
+      case GEN_OCL_SAT_CONV_F32_TO_I16:
+      case GEN_OCL_SAT_CONV_I16_TO_U16:
+      case GEN_OCL_SAT_CONV_I32_TO_U16:
+      case GEN_OCL_SAT_CONV_U32_TO_U16:
+      case GEN_OCL_SAT_CONV_F32_TO_U16:
+      case GEN_OCL_SAT_CONV_U32_TO_I32:
+      case GEN_OCL_SAT_CONV_F32_TO_I32:
+      case GEN_OCL_SAT_CONV_I32_TO_U32:
+      case GEN_OCL_SAT_CONV_F32_TO_U32:
         this->newRegister(&I);
         break;
       default:
@@ -2415,6 +2547,57 @@ namespace gbe
             ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1);
             break;
            }
+#define DEF(DST_TYPE, SRC_TYPE) \
+  { ctx.SAT_CVT(DST_TYPE, SRC_TYPE, getRegister(&I), getRegister(I.getOperand(0))); break; }
+          case GEN_OCL_SAT_CONV_U8_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U8);
+          case GEN_OCL_SAT_CONV_I16_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_U16_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I8_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S8);
+          case GEN_OCL_SAT_CONV_I16_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_U16_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_U16_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I16_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_I32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_U32_TO_I32:
+            DEF(ir::TYPE_S32, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I32:
+            DEF(ir::TYPE_S32, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I32_TO_U32:
+            DEF(ir::TYPE_U32, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_F32_TO_U32:
+            DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
+#undef DEF
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 2ad879e..55079f5 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -75,13 +75,16 @@ namespace gbe
   /*! Get the type size in bytes */
   uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
 
+  /*! Whether f is listed in its module's "opencl.kernels" named metadata */
+  bool isKernelFunction(const llvm::Function &f);
+
   /*! Create a Gen-IR unit */
   llvm::FunctionPass *createGenPass(ir::Unit &unit);
 
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
-  llvm::FunctionPass* createScalarizePass(ir::Unit &unit);
+  llvm::FunctionPass* createScalarizePass();  /*!< Create the pass that expands all vector ops */
 
 } /* namespace gbe */
 
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 321fc4e..3f44be8 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -146,3 +146,34 @@ DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
 
 // get sampler info
 DECL_LLVM_GEN_FUNCTION(GET_SAMPLER_INFO, __gen_ocl_get_sampler_info)
+
+// saturated conversions convert_<dst>_sat(<src>); the last letter of each mangled name is the Itanium code of <src>: c=char h=uchar s=short t=ushort i=int j=uint f=float
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8,  _Z16convert_char_sath)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_I8, _Z16convert_char_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I8, _Z16convert_char_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I8, _Z16convert_char_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I8, _Z16convert_char_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I8, _Z16convert_char_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I8_TO_U8,  _Z17convert_uchar_satc)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U8, _Z17convert_uchar_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_U8, _Z17convert_uchar_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U8, _Z17convert_uchar_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U8, _Z17convert_uchar_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U8, _Z17convert_uchar_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I16, _Z17convert_short_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I16, _Z17convert_short_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I16, _Z17convert_short_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I16, _Z17convert_short_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U16, _Z18convert_ushort_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U16, _Z18convert_ushort_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U16, _Z18convert_ushort_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U16, _Z18convert_ushort_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I32, _Z15convert_int_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I32, _Z15convert_int_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U32, _Z16convert_uint_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 4bafc0d..60c9df1 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -105,6 +105,23 @@ using namespace llvm;
 
 namespace gbe
 {
+  // True iff F is operand 0 of some entry in the "opencl.kernels" metadata.
+  bool isKernelFunction(const llvm::Function &F) {
+    const Module::NamedMDListType& globalMD = F.getParent()->getNamedMDList();
+    for(auto i = globalMD.begin(); i != globalMD.end(); i++) {
+      const NamedMDNode &md = *i;
+      // StringRef::data() need not be NUL-terminated, so compare by value.
+      if(md.getName() != "opencl.kernels") continue;
+      const uint32_t ops = md.getNumOperands();
+      for(uint32_t x = 0; x < ops; x++) {
+        const MDNode* node = md.getOperand(x);
+        if(node && node->getNumOperands() > 0 && node->getOperand(0) == &F)
+          return true; // one match is enough; stop scanning
+      }
+    }
+    return false;
+  }
+
   uint32_t getPadding(uint32_t offset, uint32_t align) {
     return (align - (offset % align)) % align; 
   }
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 7a40616..a29bc59 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -92,7 +92,6 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include "llvm/llvm_gen_backend.hpp"
-#include "ir/unit.hpp"
 #include "sys/map.hpp"
 
 
@@ -126,7 +125,7 @@ namespace gbe {
     // Standard pass stuff
     static char ID;
 
-    Scalarize(ir::Unit& unit) : FunctionPass(ID), unit(unit)
+    Scalarize() : FunctionPass(ID)
     {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
       initializeDominatorTreePass(*PassRegistry::getPassRegistry());
@@ -228,7 +227,6 @@ namespace gbe {
 
     Type* intTy;
     Type* floatTy;
-    ir::Unit &unit;
 
     std::vector<Instruction*> deadList;
 
@@ -598,14 +596,11 @@ namespace gbe {
       Value *cv = ConstantInt::get(intTy, i);
       Value *EI = builder->CreateExtractElement(insn, cv);
       vVals.setComponent(i, EI);
-      //unit.fakeInsnMap[EI] = insn;
-      unit.newValueProxy(insn, EI, i, 0);
     }
   }
 
   Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
     //VectorValues& vVals = vectorVals[writeValue];
-    //unit.vecValuesMap[call] = vectorVals[writeValue];
 
     //add fake insert instructions to avoid removed
     Value *II = NULL;
@@ -613,14 +608,8 @@ namespace gbe {
       Value *vec = II ? II : UndefValue::get(vecValue->getType());
       Value *cv = ConstantInt::get(intTy, i);
       II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
-      //unit.vecValuesMap[insn].setComponent(i, getComponent(i, writeValue));
-      //unit.newValueProxy(getComponent(i, vecValue), vecValue, 0, i);
-      //unit.fakeInsnMap[II] = insn;
     }
 
-    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
-      unit.newValueProxy(getComponent(i, vecValue), II, 0, i);
-    }
     return II;
   }
 
@@ -767,12 +756,15 @@ namespace gbe {
     default: GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
+    // All function calls are inlined, so non-kernel functions can be skipped
+    bool bKernel = isKernelFunction(F);
+    if(!bKernel) return false;
+
     bool changed = false;
     module = F.getParent();
     intTy = IntegerType::get(module->getContext(), 32);
     floatTy = Type::getFloatTy(module->getContext());
     builder = new IRBuilder<>(module->getContext());
-    unit.clearValueMap();
 
     scalarizeArgs(F);
     typedef ReversePostOrderTraversal<Function*> RPOTType;
@@ -844,9 +836,9 @@ namespace gbe {
   {
       return;
   }
-  FunctionPass* createScalarizePass(ir::Unit &unit)
+  FunctionPass* createScalarizePass()
   {
-      return new Scalarize(unit);
+    return new Scalarize();
   }
   char Scalarize::ID = 0;
 
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 788a3dd..111514f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -80,7 +80,7 @@ namespace gbe
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
       passes.add(createPrintModulePass(&*o));
-    passes.add(createScalarizePass(unit));        // Expand all vector ops
+    passes.add(createScalarizePass());        // Expand all vector ops
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
index 13ae5ba..a667bee 100644
--- a/backend/src/ocl_convert.h
+++ b/backend/src/ocl_convert.h
@@ -2210,3 +2210,1545 @@ INLINE OVERLOADABLE double16 convert_double16(float16 v) {
 }
 
 INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
+
+#define DEF(DSTTYPE, SRCTYPE) /* declaration only: no body is given in this header */ \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF /* the overloads above match the SAT_CONV_* names registered in llvm_gen_ocl_function.hxx */
+
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) /* clamp a long source to [MIN, MAX] */ \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x > (MAX) ? (DSTTYPE)(MAX) : x < (MIN) ? (DSTTYPE)(MIN) : (DSTTYPE)x; }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+#undef DEF
+/* float -> 64-bit: the thresholds 2^63 / 2^64 do not fit the destination, so test with >= and return the integer limit itself; NaN fails every test and yields 0. */
+INLINE_OVERLOADABLE long convert_long_sat(float x) { return x >= 9.223372036854775808e+18f ? 0x7fffffffffffffffL : x <= -9.223372036854775808e+18f ? -0x7fffffffffffffffL - 1 : x == x ? (long)x : 0; }
+INLINE_OVERLOADABLE ulong convert_ulong_sat(float x) { return x >= 1.8446744073709551616e+19f ? 0xfffffffffffffffful : x > 0 ? (ulong)x : 0; }
+
+#define DEF(DSTTYPE, SRCTYPE, MAX) /* used with ulong sources only, so just the upper bound is checked */ \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x > MAX ? (DSTTYPE)MAX : x; \
+  }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+  const ulong limit = 0x7ffffffffffffffful; /* 2^63 - 1, the upper clamp */
+  return (x <= limit) ? x : limit;
+}
+
+INLINE_OVERLOADABLE ulong convert_ulong_sat(long x) {
+  return x >= 0 ? x : 0; /* negative inputs saturate to zero */
+}
+
+#define DEF(DSTTYPE, SRCTYPE) /* DSTTYPE is at least as wide as SRCTYPE: only a negative source into an unsigned destination is out of range */ \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    /* (DSTTYPE)-1 < (DSTTYPE)1 holds only when DSTTYPE is signed; otherwise non-positive inputs become 0 */ \
+    return (x > 0 || (DSTTYPE)-1 < (DSTTYPE)1) ? (DSTTYPE)x : (DSTTYPE)0; }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, char);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, char);
+DEF(uint, uchar);
+DEF(uint, short);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, char);
+DEF(ulong, uchar);
+DEF(ulong, short);
+DEF(ulong, ushort);
+DEF(ulong, int);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+
+INLINE OVERLOADABLE long2 convert_long2_sat(long2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(long2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(long2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(long2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(long2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(long2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(long2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(long2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ulong2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ulong2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ulong2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ulong2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ulong2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ulong2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ulong2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ulong2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(int2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(int2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(int2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(int2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(int2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(int2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(int2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(int2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uint2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uint2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uint2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uint2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uint2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uint2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uint2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uint2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(short2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(short2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(short2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(short2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(short2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(short2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(short2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(short2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ushort2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ushort2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ushort2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ushort2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ushort2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ushort2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ushort2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ushort2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(char2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(char2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(char2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(char2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(char2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(char2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(char2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(char2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uchar2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uchar2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uchar2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uchar2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uchar2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uchar2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uchar2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uchar2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(float2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(float2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(float2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(float2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(float2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(float2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(float2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(float2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(long3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(long3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(long3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(long3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(long3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(long3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(long3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(long3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ulong3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ulong3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ulong3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ulong3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ulong3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ulong3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ulong3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ulong3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(int3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(int3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(int3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(int3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(int3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(int3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(int3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(int3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uint3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uint3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uint3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uint3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uint3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uint3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uint3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uint3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(short3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(short3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(short3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(short3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(short3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(short3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(short3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(short3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ushort3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ushort3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ushort3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ushort3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ushort3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ushort3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ushort3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ushort3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(char3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(char3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(char3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(char3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(char3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(char3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(char3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(char3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uchar3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uchar3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uchar3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uchar3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uchar3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uchar3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uchar3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uchar3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+/* float3 sources: lanes x, y, z (aliases of s0..s2) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long3 convert_long3_sat(float3 v) {
+  return (long3)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(float3 v) {
+  return (ulong3)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(float3 v) {
+  return (int3)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(float3 v) {
+  return (uint3)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(float3 v) {
+  return (short3)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(float3 v) {
+  return (ushort3)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(float3 v) {
+  return (char3)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(float3 v) {
+  return (uchar3)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z));
+}
+
+/* long4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(long4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(long4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(long4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(long4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(long4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(long4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(long4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(long4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* ulong4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(ulong4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ulong4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ulong4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ulong4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ulong4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ulong4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ulong4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ulong4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* int4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(int4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(int4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(int4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(int4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(int4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(int4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(int4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(int4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* uint4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(uint4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uint4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uint4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uint4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uint4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uint4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uint4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uint4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* short4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(short4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(short4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(short4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(short4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(short4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(short4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(short4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(short4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* ushort4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(ushort4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ushort4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ushort4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ushort4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ushort4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ushort4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ushort4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ushort4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* char4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(char4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(char4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(char4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(char4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(char4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(char4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(char4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(char4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* uchar4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(uchar4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uchar4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uchar4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uchar4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uchar4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uchar4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uchar4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uchar4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* float4 sources: lanes x, y, z, w (aliases of s0..s3) each go through the scalar convert_<dst>_sat overload. */
+INLINE OVERLOADABLE long4 convert_long4_sat(float4 v) {
+  return (long4)(convert_long_sat(v.x), convert_long_sat(v.y), convert_long_sat(v.z), convert_long_sat(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(float4 v) {
+  return (ulong4)(convert_ulong_sat(v.x), convert_ulong_sat(v.y), convert_ulong_sat(v.z), convert_ulong_sat(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(float4 v) {
+  return (int4)(convert_int_sat(v.x), convert_int_sat(v.y), convert_int_sat(v.z), convert_int_sat(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(float4 v) {
+  return (uint4)(convert_uint_sat(v.x), convert_uint_sat(v.y), convert_uint_sat(v.z), convert_uint_sat(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(float4 v) {
+  return (short4)(convert_short_sat(v.x), convert_short_sat(v.y), convert_short_sat(v.z), convert_short_sat(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(float4 v) {
+  return (ushort4)(convert_ushort_sat(v.x), convert_ushort_sat(v.y), convert_ushort_sat(v.z), convert_ushort_sat(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(float4 v) {
+  return (char4)(convert_char_sat(v.x), convert_char_sat(v.y), convert_char_sat(v.z), convert_char_sat(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(float4 v) {
+  return (uchar4)(convert_uchar_sat(v.x), convert_uchar_sat(v.y), convert_uchar_sat(v.z), convert_uchar_sat(v.w));
+}
+
+/* long8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(long4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(long8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(long8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(long8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(long8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(long8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(long8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(long8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(long8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* ulong8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(ulong4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(ulong8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ulong8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ulong8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ulong8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ulong8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ulong8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ulong8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ulong8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* int8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(int4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(int8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(int8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(int8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(int8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(int8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(int8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(int8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(int8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* uint8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(uint4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(uint8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uint8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uint8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uint8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uint8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uint8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uint8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uint8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* short8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(short4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(short8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(short8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(short8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(short8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(short8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(short8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(short8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(short8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* ushort8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(ushort4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(ushort8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ushort8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ushort8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ushort8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ushort8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ushort8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ushort8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ushort8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* char8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(char4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(char8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(char8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(char8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(char8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(char8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(char8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(char8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(char8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+/* uchar8 sources: the .lo and .hi 4-lane halves go through the matching convert_<dst>4_sat(uchar4) overload and are rejoined, so lanes s0..s7 still reach the scalar convert_<dst>_sat in order. */
+INLINE OVERLOADABLE long8 convert_long8_sat(uchar8 v) {
+  return (long8)(convert_long4_sat(v.lo), convert_long4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uchar8 v) {
+  return (ulong8)(convert_ulong4_sat(v.lo), convert_ulong4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uchar8 v) {
+  return (int8)(convert_int4_sat(v.lo), convert_int4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uchar8 v) {
+  return (uint8)(convert_uint4_sat(v.lo), convert_uint4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uchar8 v) {
+  return (short8)(convert_short4_sat(v.lo), convert_short4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uchar8 v) {
+  return (ushort8)(convert_ushort4_sat(v.lo), convert_ushort4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uchar8 v) {
+  return (char8)(convert_char4_sat(v.lo), convert_char4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uchar8 v) {
+  return (uchar8)(convert_uchar4_sat(v.lo), convert_uchar4_sat(v.hi));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(float8 v) { /* float8 -> long8, lane by lane: s0..s7 each go through the scalar convert_long_sat overload (defined outside this view) and are packed in order. */
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(float8 v) { /* float8 -> ulong8, lane by lane: s0..s7 each go through the scalar convert_ulong_sat overload (defined outside this view) and are packed in order. */
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(float8 v) { /* float8 -> int8, lane by lane: s0..s7 each go through the scalar convert_int_sat overload (defined outside this view) and are packed in order. */
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(float8 v) { /* float8 -> uint8, lane by lane: s0..s7 each go through the scalar convert_uint_sat overload (defined outside this view) and are packed in order. */
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(float8 v) { /* float8 -> short8, lane by lane: s0..s7 each go through the scalar convert_short_sat overload (defined outside this view) and are packed in order. */
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(float8 v) { /* float8 -> ushort8, lane by lane: s0..s7 each go through the scalar convert_ushort_sat overload (defined outside this view) and are packed in order. */
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(float8 v) { /* float8 -> char8, lane by lane: s0..s7 each go through the scalar convert_char_sat overload (defined outside this view) and are packed in order. */
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(float8 v) { /* float8 -> uchar8, lane by lane: s0..s7 each go through the scalar convert_uchar_sat overload (defined outside this view) and are packed in order. */
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(long16 v) { /* long16 -> long16 (same element type), lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(long16 v) { /* long16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(long16 v) { /* long16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(long16 v) { /* long16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(long16 v) { /* long16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(long16 v) { /* long16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(long16 v) { /* long16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(long16 v) { /* long16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ulong16 v) { /* ulong16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ulong16 v) { /* ulong16 -> ulong16 (same element type), lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ulong16 v) { /* ulong16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ulong16 v) { /* ulong16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ulong16 v) { /* ulong16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ulong16 v) { /* ulong16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ulong16 v) { /* ulong16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ulong16 v) { /* ulong16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(int16 v) { /* int16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(int16 v) { /* int16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(int16 v) { /* int16 -> int16 (same element type), lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(int16 v) { /* int16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(int16 v) { /* int16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(int16 v) { /* int16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(int16 v) { /* int16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(int16 v) { /* int16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uint16 v) { /* uint16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uint16 v) { /* uint16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uint16 v) { /* uint16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uint16 v) { /* uint16 -> uint16 (same element type), lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uint16 v) { /* uint16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uint16 v) { /* uint16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uint16 v) { /* uint16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uint16 v) { /* uint16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(short16 v) { /* short16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(short16 v) { /* short16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(short16 v) { /* short16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(short16 v) { /* short16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(short16 v) { /* short16 -> short16 (same element type), lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(short16 v) { /* short16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(short16 v) { /* short16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(short16 v) { /* short16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ushort16 v) { /* ushort16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ushort16 v) { /* ushort16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ushort16 v) { /* ushort16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ushort16 v) { /* ushort16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ushort16 v) { /* ushort16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ushort16 v) { /* ushort16 -> ushort16 (same element type), lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ushort16 v) { /* ushort16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ushort16 v) { /* ushort16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(char16 v) { /* char16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(char16 v) { /* char16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(char16 v) { /* char16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(char16 v) { /* char16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(char16 v) { /* char16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(char16 v) { /* char16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(char16 v) { /* char16 -> char16 (same element type), lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(char16 v) { /* char16 -> uchar16, lane by lane: s0..sF each go through the scalar convert_uchar_sat overload and are packed in order. */
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uchar16 v) { /* uchar16 -> long16, lane by lane: s0..sF each go through the scalar convert_long_sat overload and are packed in order. */
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uchar16 v) { /* uchar16 -> ulong16, lane by lane: s0..sF each go through the scalar convert_ulong_sat overload and are packed in order. */
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uchar16 v) { /* uchar16 -> int16, lane by lane: s0..sF each go through the scalar convert_int_sat overload and are packed in order. */
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uchar16 v) { /* uchar16 -> uint16, lane by lane: s0..sF each go through the scalar convert_uint_sat overload and are packed in order. */
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uchar16 v) { /* uchar16 -> short16, lane by lane: s0..sF each go through the scalar convert_short_sat overload and are packed in order. */
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uchar16 v) { /* uchar16 -> ushort16, lane by lane: s0..sF each go through the scalar convert_ushort_sat overload and are packed in order. */
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uchar16 v) { /* uchar16 -> char16, lane by lane: s0..sF each go through the scalar convert_char_sat overload and are packed in order. */
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uchar16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(float16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(float16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(float16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(float16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(float16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(float16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(float16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(float16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 170ec70..7948b7c 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -85,7 +85,7 @@ struct _image2d_t;
 typedef __texture struct _image2d_t* __image2d_t;
 struct _image3d_t;
 typedef __texture struct _image3d_t* __image3d_t;
-typedef uint __sampler_t;
+typedef const uint __sampler_t;
 typedef size_t __event_t;
 #define image2d_t __image2d_t
 #define image3d_t __image3d_t
@@ -110,6 +110,7 @@ typedef size_t __event_t;
 #define __CL_VERSION_1_0__ 100
 #define __CL_VERSION_1_1__ 110
 #define __ENDIAN_LITTLE__ 1
+#define __IMAGE_SUPPORT__ 1
 #define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
                                         __attribute__((vec_type_hint(TYPE)))
 #define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
@@ -218,13 +219,13 @@ UDEF(uint);
 UDEF(ulong);
 #undef UDEF
 
-uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
-    return add_sat((uchar)x, (uchar)0);
-}
-
 INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
 INLINE_OVERLOADABLE int isinf(float x) { return __builtin_isinf(x); }
-INLINE_OVERLOADABLE int isnan(float x) { return __builtin_isnan(x); }
+INLINE_OVERLOADABLE int isnan(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return (u.u & 0x7FFFFFFF) > 0x7F800000;
+}
 INLINE_OVERLOADABLE int isnormal(float x) { return __builtin_isnormal(x); }
 INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x, x) && isequal(y, y); }
 INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
@@ -2093,7 +2094,10 @@ DEF(short)
 DEF(ushort)
 DEF(int)
 DEF(uint)
+DEF(long)
+DEF(ulong)
 DEF(float)
+DEF(double)
 #undef BODY
 #undef DEFN
 #undef DEF
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index c0f88de..97b568b 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -55,7 +55,7 @@ with any thing older.
 
 [http://llvm.org/releases/](http://llvm.org/releases/)
 
-LLVM 3.1,3.2 and 3.3 are supported.
+LLVM 3.1,3.2,3.3 and 3.4 are supported.
 
 Also note that the code was compiled on GCC 4.6 and GCC 4.7. Since the code uses
 really recent C++11 features, you may expect problems with older compilers. Last
@@ -101,14 +101,9 @@ do:
   as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
   the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D.
 
-- Support for events.
-
 - Check that NDRangeKernels can be pushed into _different_ queues from several
   threads.
 
-- Support for nonblocking mode Enqueue\*Buffer. Now we only use the map extension to
-  implement those Enqueue\*Buffer functions.
-
 - No state tracking at all. One batch buffer is created at each "draw call"
   (i.e. for each NDRangeKernels). This is really inefficient since some
   expensive pipe controls are issued for each batch buffer
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
index f14433d..adc7fd2 100644
--- a/docs/Beignet/Backend/TODO.mdwn
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -31,8 +31,6 @@ many things must be implemented:
 - From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
   represent sampler_t/image2d_t/image1d_t/....
 
-- Adding support for long (int64).
-
 Gen IR
 ------
 
@@ -56,17 +54,12 @@ The code is defined in `src/ir`. Main things to do are:
   This will obviously impact both instruction selection and the register
   allocation.
 
-- Adding support for long (int64).
-
 Backend
 -------
 
 The code is defined in `src/backend`. Main things to do are:
 
-- Int64 support?
-
-- Implementing register spilling (see the [[compiler backend
-  description|compiler_backend]] for more details)
+- Optimize register spilling (see the [[compiler backend description|compiler_backend]] for more details)
 
 - Implementing proper instruction selection. A "simple" tree matching algorithm
   should provide good results for Gen
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 135e340..3fd73da 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -90,6 +90,38 @@ typedef CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithLLVMIntel_fn)(
                                  const char *            /* file */,
                                  cl_int *                /* errcode_ret */);
 
+/* Create buffer from libva's buffer object */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferFromLibvaIntel(cl_context      /* context */,
+                             unsigned int    /* bo_name */,
+                             cl_int *        /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateBufferFromLibvaIntel_fn)(
+                             cl_context     /* context */,
+                             unsigned int   /* bo_name */,
+                             cl_int *       /* errcode_ret */);
+
+/* Create image from libva's buffer object */
+typedef struct _cl_libva_image {
+    unsigned int            bo_name;
+    uint32_t                offset;
+    uint32_t                width;
+    uint32_t                height;
+    cl_image_format         fmt;
+    uint32_t                row_pitch;
+    uint32_t                reserved[8];
+} cl_libva_image;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageFromLibvaIntel(cl_context               /* context */,
+                            const cl_libva_image *   /* info */,
+                            cl_int *                 /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromLibvaIntel_fn)(
+                             cl_context             /* context */,
+                             const cl_libva_image * /* info */,
+                             cl_int *               /* errcode_ret */);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/kernels/builtin_convert_sat.cl b/kernels/builtin_convert_sat.cl
new file mode 100644
index 0000000..1485f1d
--- /dev/null
+++ b/kernels/builtin_convert_sat.cl
@@ -0,0 +1,48 @@
+#define DEF(DSTTYPE, SRCTYPE) \
+  kernel void builtin_convert_ ## SRCTYPE ## _to_ ## DSTTYPE ## _sat(global SRCTYPE *src, global DSTTYPE *dst) { \
+  int i = get_global_id(0); \
+  dst[i] = convert_ ## DSTTYPE ## _sat(src[i]); \
+}
+
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, long);
+DEF(char, ulong);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, long);
+DEF(uchar, ulong);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, long);
+DEF(short, ulong);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, long);
+DEF(ushort, ulong);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, long);
+DEF(int, ulong);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, long);
+DEF(uint, ulong);
+DEF(uint, float);
+DEF(long, ulong);
+DEF(long, float);
+DEF(ulong, long);
+DEF(ulong, float);
+#undef DEF
+
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
index a2432a4..06ec8e7 100644
--- a/kernels/compiler_async_copy.cl
+++ b/kernels/compiler_async_copy.cl
@@ -1,16 +1,24 @@
-__kernel void
-compiler_async_copy(__global int2 *dst, __global int2 *src, __local int2 *localBuffer, int copiesPerWorkItem)
-{
-  event_t event;
-  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
-  int i;
-  event = async_work_group_copy((__local int2*)localBuffer, (__global const int2*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 );
-  wait_group_events( 1, &event );
-
-  for(i=0; i<copiesPerWorkItem; i++)
-    localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (int2)(3, 3);
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  event = async_work_group_copy((__global int2*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const int2*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 );
-  wait_group_events( 1, &event );
+#define DEF(TYPE) \
+kernel void \
+compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE *localBuffer, int copiesPerWorkItem) \
+{ \
+  event_t event; \
+  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
+  int i; \
+  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  wait_group_events( 1, &event ); \
+\
+  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  wait_group_events( 1, &event ); \
 }
+
+DEF(char2);
+DEF(uchar2);
+DEF(short2);
+DEF(ushort2);
+DEF(int2);
+DEF(uint2);
+DEF(long2);
+DEF(ulong2);
+DEF(float2);
+DEF(double2);
diff --git a/kernels/compiler_clod_function_call.cl b/kernels/compiler_clod_function_call.cl
new file mode 100644
index 0000000..ecfac46
--- /dev/null
+++ b/kernels/compiler_clod_function_call.cl
@@ -0,0 +1,91 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+
+vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+#define time 1.f
+
+float f(vec3 o)
+{
+    float a=(sin(o.x)+o.y*.25f)*.35f;
+    o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
+    return dot(cos(o)*cos(o),(vec3)(1.f))-1.2f;
+}
+
+// XXX front end does not inline this function
+vec3 s(vec3 o,vec3 d)
+{
+    float t=0.0f;
+    float dt = 0.2f;
+    float nh = 0.0f;
+    float lh = 0.0f;
+    for(int i=0;i<50;i++)
+    {
+        nh = f(o+d*t);
+        if(nh>0.0f) { lh=nh; t+=dt; }
+    }
+
+    if( nh>0.0f ) return (vec3)(.93f,.94f,.85f);
+
+    t = t - dt*nh/(nh-lh);
+
+    vec3 exyy=(vec3)(0.1f,0.0f,0.0f);
+    vec3 eyxy=(vec3)(0.0f,0.1f,0.0f);
+    vec3 eyyx=(vec3)(0.0f,0.0f,0.1f);
+    vec3 p=o+d*t;
+    vec3 n=-normalize((vec3)(f(p+exyy),f(p+eyxy),f(p+eyyx))+(vec3)((sin(p*75.f)))*.01f);
+
+    return (vec3)(mix( ((max(-dot(n,(vec3)(.577f)),0.f) + 0.125f*max(-dot(n,(vec3)(-.707f,-.707f,0.f)),0.f)))*(mod
+    (length(p.xy)*20.f,2.f)<1.0f?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f))
+                           ,(vec3)(.93f,.94f,.85f), (vec3)(pow(t/9.f,5.f)) ) );
+}
+
+#if 0
+// XXX vector type in the function arguments not supported yet
+__kernel void compiler_clod(__global uint *dst, vec2 resolution, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resolution.x;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resolution.y;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    OUTPUT;
+}
+#else
+__kernel void compiler_clod(__global uint *dst, float resx, float resy, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resx;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resy;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    OUTPUT;
+}
+
+#endif
+
diff --git a/kernels/compiler_function_argument2.cl b/kernels/compiler_function_argument2.cl
new file mode 100644
index 0000000..24e5795
--- /dev/null
+++ b/kernels/compiler_function_argument2.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_function_argument2(
+char8 c, uchar8 uc, short8 s, ushort8 us, int8 i, uint8 ui, float8 f,
+__global float8 *result)
+{
+  result[0] = convert_float8(c);
+  result[1] = convert_float8(uc);
+  result[2] = convert_float8(s);
+  result[3] = convert_float8(us);
+  result[4] = convert_float8(i);
+  result[5] = convert_float8(ui);
+  result[6] = f;
+}
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
index 71fe86c..53e24b3 100644
--- a/kernels/compiler_global_constant.cl
+++ b/kernels/compiler_global_constant.cl
@@ -19,9 +19,16 @@ struct Test2 {
   char a0;
   int a1;
 };
+struct Test3 {
+  int a0;
+  int a1;
+};
+struct Test4 {
+  float a0;
+  float a1;
+};
 
 constant struct Person james= {{"james"}, (int3)(1, 2, 3)};
-
 constant struct Test1 t0 = {1, 2};
 constant struct Test2 t1 = {1, 2};
 
@@ -29,6 +36,10 @@ constant int3 c[3] = {(int3)(0, 1, 2), (int3)(3, 4, 5), (int3)(6,7,8) };
 constant char4 d[3] = {(char4)(0, 1, 2, 3), (char4)(4, 5, 6, 7), (char4)(8, 9, 10, 11)};
 
 constant struct Person members[3] = {{{"abc"}, (int3)(1, 2, 3)}, { {"defg"}, (int3)(4,5,6)}, { {"hijk"}, (int3)(7,8,9)} };
+constant struct Test3 zero_struct = {0, 0};
+constant int3 zero_vec = {0,0,0};
+constant int zero_arr[3] = {0,0,0};
+constant float zero_flt[3] = {0.0f, 0.0f, 0.0f};
 
 __kernel void
 compiler_global_constant(__global int *dst, int e, int r)
@@ -36,7 +47,7 @@ compiler_global_constant(__global int *dst, int e, int r)
   int id = (int)get_global_id(0);
 
   int4 x = a + b;
-  dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x;
+  dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x + zero_struct.a0 + zero_vec.x + zero_arr[1] + (int)zero_flt[2];
 }
 // array of vectors
 __kernel void
diff --git a/kernels/compiler_insert_vector.cl b/kernels/compiler_insert_vector.cl
new file mode 100644
index 0000000..0f0e20f
--- /dev/null
+++ b/kernels/compiler_insert_vector.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_insert_vector(__global int4 *out )
+{
+    int tid = get_global_id(0);
+    int4 output = (int4)(0, 0, 0, 1); //black
+    if (tid > 16)
+    {
+        output = (int4)(tid, tid, 1, 1);
+    }
+    out[tid] = output;
+}
diff --git a/kernels/compiler_julia_function_call.cl b/kernels/compiler_julia_function_call.cl
new file mode 100644
index 0000000..7b3aa46
--- /dev/null
+++ b/kernels/compiler_julia_function_call.cl
@@ -0,0 +1,142 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+#define time 1.f
+
+vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+float jinteresct(vec3 rO, vec3 rD, vec4 c, float *ao)
+{
+    float mz2,md2,dist,t;
+    float res=1000.0f;
+    vec4 z,nz;
+    int update_ao = 1;
+    *ao = 0.0f;
+    for(t=0.0f;t<6.0f;t+=dist)
+    {
+        if (update_ao) *ao += 1.0f;
+        vec3 p=rO+t*rD;
+
+        // calc distance
+        z=(vec4)(p,(c.y+c.x)*.3f);
+        md2=1.0f;
+        mz2=dot(z,z);
+
+        for(int i=0;i<9;i++)
+        {
+             // |dz|^2 -> 4*|dz|^2
+             //if (mz2 <= 4.0f)
+             {
+             md2*=4.0f*mz2;
+             // z -> z2 + c
+             nz.x=z.x*z.x-dot(z.yzw,z.yzw);
+             nz.yzw=2.0f*z.x*z.yzw;
+             z=nz+c;
+             mz2=dot(z,z);
+            }
+             if(mz2>4.0f)
+                 break;
+         }
+
+         dist=0.25f*sqrt(mz2/md2)*log(mz2);
+         if(dist<0.0005f)
+         {
+             res=t;
+             break;
+         }
+         t+= dist;
+    }
+
+    return res;
+}
+
+#if 1
+vec3 calcNormal(vec3 p, vec4 c)
+{
+    vec4 nz,ndz,dz[4];
+
+    vec4 z=(vec4)(p,(c.y+c.x)*.3f);
+
+    dz[0]=(vec4)(1.0f,0.0f,0.0f,0.0f);
+    dz[1]=(vec4)(0.0f,1.0f,0.0f,0.0f);
+    dz[2]=(vec4)(0.0f,0.0f,1.0f,0.0f);
+  //dz[3]=(vec4)(0.0f,0.0f,0.0f,1.0f);
+
+    for(int i=0;i<9;i++)
+    {
+        vec4 mz = (vec4)(z.x,-z.y,-z.z,-z.w);
+        // derivative
+        dz[0]=(vec4)(dot(mz,dz[0]),z.x*dz[0].yzw+dz[0].x*z.yzw);
+        dz[1]=(vec4)(dot(mz,dz[1]),z.x*dz[1].yzw+dz[1].x*z.yzw);
+        dz[2]=(vec4)(dot(mz,dz[2]),z.x*dz[2].yzw+dz[2].x*z.yzw);
+        //dz[3]=(vec4)(dot(mz,dz[3]),z.x*dz[3].yzw+dz[3].x*z.yzw);
+
+        // z = z2 + c
+        nz.x=dot(z, mz);
+        nz.yzw=2.0f*z.x*z.yzw;
+        z=nz+c;
+
+        if(dot(z,z)>4.0f)
+            break;
+    }
+
+    return normalize((vec3)(dot(z,dz[0]),dot(z,dz[1]),dot(z,dz[2])));
+}
+#endif
+
+__kernel void compiler_julia(__global uint *dst, float resx, float resy, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    vec2 p=-1.0f+2.0f*gl_FragCoord.xy/(vec2)(resx,resy);
+    vec3 color = (vec3)(0.0f);
+    vec4 cccc = (vec4)( .7f*cos(.5f*time), .7f*sin(.3f*time), .7f*cos(1.0f*time), 0.0f );
+    vec3 edir = normalize((vec3)(p,1.0f));
+    vec3 wori = (vec3)(0.0f,0.0f,-2.0f);
+
+    float ao;
+    float t = jinteresct(wori,edir,cccc,&ao);
+    if(t<100.0f)
+    {
+#if 1
+        vec3 inter = wori + t*edir;
+        vec3 nor = calcNormal(inter,cccc);
+
+        float dif = .5f + .5f*dot( nor, (vec3)(0.57703f) );
+        ao = max( 1.0f-ao*0.005f, 0.0f);
+
+        color = (vec3)(1.0f,.9f,.5f)*dif*ao +  .5f*(vec3)(.6f,.7f,.8f)*ao;
+#else
+        color = (vec3)(0.5f,0.0f,0.0f);
+#endif
+    }
+    else
+    {
+        color = (vec3)(0.5f,0.51f,0.52f)+(vec3)(0.5f,0.47f,0.45f)*p.y;
+    }
+
+    vec4 gl_FragColor = (vec4)(color,1.0f);
+    OUTPUT;
+}
+
diff --git a/kernels/compiler_local_slm.cl b/kernels/compiler_local_slm.cl
index 1a4b175..52c078c 100644
--- a/kernels/compiler_local_slm.cl
+++ b/kernels/compiler_local_slm.cl
@@ -1,10 +1,24 @@
-#if 0
-__kernel void compiler_local_slm(__global int *dst, __local int *hop) {
-#else
+struct Test{
+  char t0;
+  int t1;
+};
+
+constant int two= 2;
+
 __kernel void compiler_local_slm(__global int *dst) {
-  __local int hop[10];
-#endif
-  hop[get_global_id(0)] = get_local_id(1);
-  dst[get_global_id(0)] = hop[get_local_id(0)];
+  __local int hop[16];
+  __local char a;
+  __local struct Test c;
+
+  c.t1 = get_group_id(0);
+  a = two;// seems clang currently has a bug if I write 'a=2;' so currently workaroud it.
+  hop[get_local_id(0)] = get_local_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_global_id(0)] = hop[get_local_id(0)] + (int)a + hop[1] + c.t1;
 }
 
+__kernel void compiler_local_slm1(__global ulong *dst) {
+  __local int hop[16];
+  dst[1] = (ulong)&hop[1];
+  dst[0] = (ulong)&hop[0];
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3fc8689..1e28c6c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,7 +4,30 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
                     ${MESA_SOURCE_INCLUDES})
 
+# MakeKernelBinStr(KERNEL_PATH KERNEL_FILES)
+# For every name in KERNEL_FILES, add a rule that runs gbe_bin_generater on
+# ${KERNEL_PATH}/<name>.cl to produce ${KERNEL_PATH}/<name>_str.c, and append
+# that output to KERNEL_STR_FILES in the caller's scope (hence a macro).
+macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
+foreach (KF ${KERNEL_FILES})
+  set (input_file ${KERNEL_PATH}/${KF}.cl)
+  set (output_file ${KERNEL_PATH}/${KF}_str.c)
+  list (APPEND KERNEL_STR_FILES ${output_file})
+  add_custom_command(
+    OUTPUT ${output_file}
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${output_file}
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater -s ${input_file} -o${output_file}
+    DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater
+    VERBATIM)
+endforeach (KF)
+endmacro (MakeKernelBinStr)
+
+set (KERNEL_STR_FILES)
+set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16)
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels" "${KERNEL_NAMES}")
+
 set(OPENCL_SRC
+    ${KERNEL_STR_FILES}
     cl_api.c
     cl_alloc.c
     cl_kernel.c
diff --git a/src/cl_api.c b/src/cl_api.c
index ded0e0c..0e562ed 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -398,7 +398,7 @@ clCreateCommandQueue(cl_context                   context,
   INVALID_DEVICE_IF (device != context->device);
   INVALID_VALUE_IF (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE));
 
-  if(properties) {
+  if(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {/*not supported now.*/
     err = CL_INVALID_QUEUE_PROPERTIES;
     goto error;
   }
@@ -854,8 +854,7 @@ error:
 cl_int
 clUnloadCompiler(void)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  return CL_SUCCESS;
 }
 
 cl_int
@@ -981,8 +980,26 @@ clCreateKernelsInProgram(cl_program      program,
                          cl_kernel *     kernels,
                          cl_uint *       num_kernels_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_PROGRAM (program);
+  if (program->is_built == CL_FALSE) {
+    err = CL_INVALID_PROGRAM_EXECUTABLE;
+    goto error;
+  }
+  if (kernels && num_kernels < program->ker_n) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(num_kernels_ret)
+    *num_kernels_ret = program->ker_n;
+
+  if(kernels)
+    err = cl_program_create_kernels_in_program(program, kernels);
+
+error:
+  return err;
 }
 
 cl_int
@@ -1206,8 +1223,37 @@ clGetEventProfilingInfo(cl_event             event,
                         void *               param_value,
                         size_t *             param_value_size_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  cl_ulong ret_val;
+
+  CHECK_EVENT(event);
+  /* Test the type first: a user event may have no queue to dereference. */
+  if (event->type == CL_COMMAND_USER ||
+          !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+          event->status != CL_COMPLETE) {
+    err = CL_PROFILING_INFO_NOT_AVAILABLE;
+    goto error;
+  }
+
+  if ((param_name != CL_PROFILING_COMMAND_QUEUED &&
+          param_name != CL_PROFILING_COMMAND_SUBMIT &&
+          param_name != CL_PROFILING_COMMAND_START &&
+          param_name != CL_PROFILING_COMMAND_END) ||
+          (param_value && param_value_size < sizeof(cl_ulong))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_event_profiling(event, param_name, &ret_val);
+
+  if (err == CL_SUCCESS) {
+    if (param_value)
+      *(cl_ulong*)param_value = ret_val;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_ulong);
+  }
+error:
+  return err;
 }
 
 cl_int
@@ -1252,9 +1298,6 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
      goto error;
   }
 
-  if (blocking_read != CL_TRUE)
-     NOT_IMPLEMENTED;
-
   if (!ptr || !size || offset + size > buffer->size) {
      err = CL_INVALID_VALUE;
      goto error;
@@ -1311,9 +1354,6 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
     goto error;
   }
 
-  if (blocking_read != CL_TRUE)
-    NOT_IMPLEMENTED;
-
   if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -1391,9 +1431,6 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
     goto error;
   }
 
-  if (blocking_write != CL_TRUE)
-    NOT_IMPLEMENTED;
-
   if (!ptr || !size || offset + size > buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -1450,10 +1487,6 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
     goto error;
   }
 
-  if (blocking_write != CL_TRUE)
-    NOT_IMPLEMENTED;
-
-
   if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -1521,8 +1554,57 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
                     const cl_event *     event_wait_list,
                     cl_event *           event)
 {
-  NOT_IMPLEMENTED;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  if (command_queue->ctx != src_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (src_offset < 0 || src_offset + cb > src_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Same buffer: reject if either region starts inside the other one. */
+  if (src_buffer == dst_buffer
+         && ((src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
+         || (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1))) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  // TODO: Need to check the sub buffer cases.
+  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
-  return 0;
+  return err;
+
+error:
+  return err;
 }
 
 cl_int
@@ -1638,9 +1720,6 @@ clEnqueueReadImage(cl_command_queue      command_queue,
      goto error;
   }
 
-  if (blocking_read != CL_TRUE)
-     NOT_IMPLEMENTED;
-
   if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
      err = CL_INVALID_VALUE;
      goto error;
@@ -1720,9 +1799,6 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
     goto error;
   }
 
-  if (blocking_write != CL_TRUE)
-    NOT_IMPLEMENTED;
-
   if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -2055,9 +2131,6 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     goto error;
   }
 
-  if (blocking_map != CL_TRUE)
-    NOT_IMPLEMENTED;
-
   if (!size || offset + size > buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -2123,9 +2196,6 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     goto error;
   }
 
-  if (blocking_map != CL_TRUE)
-    NOT_IMPLEMENTED;
-
   if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
     err = CL_INVALID_VALUE;
     goto error;
@@ -2467,6 +2537,8 @@ clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clPinBufferIntel)
   EXTFUNC(clUnpinBufferIntel)
   EXTFUNC(clReportUnfreedIntel)
+  EXTFUNC(clCreateBufferFromLibvaIntel)
+  EXTFUNC(clCreateImageFromLibvaIntel)
   return NULL;
 }
 
@@ -2564,3 +2636,45 @@ clCreateProgramWithLLVMIntel(cl_context              context,
                                      errcode_ret);
 }
 
+cl_mem
+clCreateBufferFromLibvaIntel(cl_context  context,
+                             unsigned int bo_name,
+                             cl_int *errorcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  mem = cl_mem_new_libva_buffer(context, bo_name, &err);
+
+error:
+  if (errorcode_ret)
+    *errorcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateImageFromLibvaIntel(cl_context context,
+                            const cl_libva_image *info,
+                            cl_int *errorcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  if (!info) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  mem = cl_mem_new_libva_image(context,
+                               info->bo_name, info->offset, info->width, info->height,
+                               info->fmt, info->row_pitch,
+                               &err);
+
+error:
+  if (errorcode_ret)
+    *errorcode_ret = err;
+  return mem;
+}
+
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index ff78770..13789f6 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -125,7 +125,7 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
     assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo,
+    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo, image->offset,
                         image->intel_fmt, image->image_type,
                         image->w, image->h, image->depth,
                         image->row_pitch, image->tiling);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index b85c0cd..65f8e17 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -200,7 +200,8 @@ cl_curbe_fill(cl_kernel ker,
   }
   /* Handle the various offsets to SLM */
   const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
-  int32_t arg, slm_offset = 0;
+  /* Start past the kernel's own SLM usage, rounded up to 32, so that kernel arguments get good alignment. */
+  int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32);
   for (arg = 0; arg < arg_n; ++arg) {
     const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
     if (type != GBE_ARG_LOCAL_PTR)
@@ -286,7 +287,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
 
   /* Setup the kernel */
-  cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
+  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
+  else
+    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
diff --git a/src/cl_context.c b/src/cl_context.c
index 4f1c611..b62e946 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -268,3 +268,29 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
 
   return ctx->internel_kernels[index];
 }
+
+cl_kernel
+cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option)
+{
+  cl_int ret;
+  cl_int binary_status = CL_SUCCESS;
+  if (!ctx->internal_prgs[index])
+  {
+    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+      &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
+
+    if (!ctx->internal_prgs[index])
+      return NULL;
+
+    ret = cl_program_build(ctx->internal_prgs[index], str_option);
+    if (ret != CL_SUCCESS)
+      return NULL;
+
+    ctx->internal_prgs[index]->is_built = 1;
+
+    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+  }
+
+  return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h
index 7016733..29bcb9f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -40,17 +40,19 @@ enum _cl_gl_context_type {
 };
 
 enum _cl_internal_ker_type {
-  CL_ENQUEUE_COPY_BUFFER = 0,
-  CL_ENQUEUE_COPY_BUFFER_RECT = 1,
-  CL_ENQUEUE_COPY_IMAGE_0 = 2,             //copy image 2d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_1 = 3,             //copy image 3d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2 = 4,             //copy image 2d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3 = 5,             //copy image 3d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0 = 6,   //copy image 2d to buffer
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1 = 7,   //copy image 3d tobuffer
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0 = 8,   //copy buffer to image 2d
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1 = 9,   //copy buffer to image 3d
-  CL_INTERNAL_KERNEL_MAX = 10
+  CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,   //copy buffer, size or an offset is not a multiple of 4
+  CL_ENQUEUE_COPY_BUFFER_ALIGN4,       //copy buffer, size and offsets are multiples of 4 but not all of 16
+  CL_ENQUEUE_COPY_BUFFER_ALIGN16,      //copy buffer, size and offsets are all multiples of 16
+  CL_ENQUEUE_COPY_BUFFER_RECT,
+  CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
+  CL_INTERNAL_KERNEL_MAX               //number of internal kernels, not a kernel itself
 };
 
 struct _cl_context_prop {
@@ -137,5 +139,9 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 /* Get the internal used kernel */
 extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
 
+/* Get the internally used kernel, created from a program binary */
+extern cl_kernel cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option);
+
 #endif /* __CL_CONTEXT_H__ */
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 100b38d..0e9b487 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -110,6 +110,7 @@ extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
                                       uint32_t id,
                                       cl_buffer obj_bo,
+                                      uint32_t obj_bo_offset,
                                       uint32_t format,
                                       uint32_t type,
                                       int32_t w,
@@ -129,7 +130,7 @@ typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
 extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
 
 /* Configure internal state */
-typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
+typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
 
 /* Set the buffer object where to report performance counters */
@@ -191,6 +192,9 @@ extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
 typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
+/* Get an event time stamp (index 0: start, index 1: end) */
+typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
 
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
@@ -223,6 +227,12 @@ extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
 typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
 extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
 
+typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
+extern cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva;
+
+typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image);
+extern cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva;
+
 /* Unref a buffer and destroy it if no more ref */
 typedef int (cl_buffer_unreference_cb)(cl_buffer);
 extern cl_buffer_unreference_cb *cl_buffer_unreference;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index ac4ff7a..54fa62e 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -45,6 +45,8 @@ LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
 LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
 LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
 LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
+LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
+LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
 
 /* cl_khr_gl_sharing */
 LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
@@ -78,4 +80,5 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
 LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
+LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
 
diff --git a/src/cl_event.c b/src/cl_event.c
index 918e245..212f1ee 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -490,3 +490,25 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
   cl_event_set_status(*event, CL_COMPLETE);
   return CL_SUCCESS;
 }
+
+cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
+{
+  if (!event->gpgpu_event) {
+    /* Some event like read buffer do not need GPU involved, so
+       we just return all the profiling to 0 now. */
+    *ret_val = 0;
+    return CL_SUCCESS;
+  }
+
+  if(param_name == CL_PROFILING_COMMAND_START ||
+     param_name == CL_PROFILING_COMMAND_QUEUED ||
+     param_name == CL_PROFILING_COMMAND_SUBMIT) {
+    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
+    return CL_SUCCESS;
+  } else if (param_name == CL_PROFILING_COMMAND_END) {
+    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
+    return CL_SUCCESS;
+  } else {
+    return CL_INVALID_VALUE;
+  }
+}
diff --git a/src/cl_event.h b/src/cl_event.h
index 7dde24b..722486a 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -90,5 +90,7 @@ void cl_event_set_status(cl_event, cl_int);
 void cl_event_update_status(cl_event);
 /* Create the marker event */
 cl_int cl_event_marker(cl_command_queue, cl_event*);
+/* Do the event profiling */
+cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 6bfc453..87c4a24 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -44,7 +44,7 @@
 .image2d_max_height = 8192,
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
-.image3d_max_depth = 8192,
+.image3d_max_depth = 2048,
 .max_samplers = 8,
 .mem_base_addr_align = sizeof(cl_uint) * 8,
 .min_data_type_align_size = sizeof(cl_uint),
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 68753f1..5e70ef1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -601,6 +601,66 @@ cl_mem_add_ref(cl_mem mem)
 #define LOCAL_SZ_2   4
 
 LOCAL cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+            size_t src_offset, size_t dst_offset, size_t cb)
+{
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  if ((cb % 4) || (src_offset % 4) || (dst_offset % 4)) {
+    extern char cl_internal_copy_buf_align1_str[];
+    extern int cl_internal_copy_buf_align1_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
+             cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
+  } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
+    extern char cl_internal_copy_buf_align4_str[];
+    extern int cl_internal_copy_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+             cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
+    cb = cb/4;
+    src_offset = src_offset/4;
+    dst_offset = dst_offset/4;
+  } else {
+    extern char cl_internal_copy_buf_align16_str[];
+    extern int cl_internal_copy_buf_align16_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+    cb = cb/16;
+    src_offset = src_offset/4;
+    dst_offset = dst_offset/4;
+  }
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  if (cb < LOCAL_SZ_0) {
+    local_sz[0] = 1;
+  } else {
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+  cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  return ret;
+}
+
+LOCAL cl_int
 cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
                        const size_t *src_origin, const size_t *dst_origin, const size_t *region,
                        size_t src_row_pitch, size_t src_slice_pitch,
@@ -1056,3 +1116,111 @@ cl_mem_unpin(cl_mem mem)
   cl_buffer_unpin(mem->bo);
   return CL_SUCCESS;
 }
+
+/*
+ * Create a buffer memory object backed by the buffer object that
+ * cl_buffer_get_buffer_from_libva() returns for bo_name. The size of the new
+ * object is the size reported by that callback.
+ *
+ * Returns the new cl_mem, or NULL on failure; the status is stored in
+ * *errcode when errcode is not NULL.
+ */
+LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+                                     unsigned int bo_name,
+                                     cl_int* errcode)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+
+  size_t sz = 0;
+  mem->bo = cl_buffer_get_buffer_from_libva(ctx, bo_name, &sz);
+  /* Never hand out a memory object that has no backing buffer object. */
+  if (mem->bo == NULL) {
+    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    goto error;
+  }
+  mem->size = sz;
+
+exit:
+  if (errcode)
+    *errcode = err;
+  return mem;
+
+error:
+  /* NOTE(review): relies on cl_mem_delete() coping with a NULL mem / NULL mem->bo -- confirm. */
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
+/*
+ * Create a 2D image memory object on top of the buffer object that
+ * cl_buffer_get_image_from_libva() returns for bo_name. Width, height,
+ * format, row pitch and the offset inside the buffer object are taken from
+ * the arguments; the tiling mode is filled in by the driver callback.
+ *
+ * Returns the new cl_mem, or NULL on failure; the status is stored in
+ * *errcode when errcode is not NULL.
+ */
+LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
+                                    unsigned int bo_name, size_t offset,
+                                    size_t width, size_t height,
+                                    cl_image_format fmt,
+                                    size_t row_pitch,
+                                    cl_int *errcode)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  struct _cl_mem_image *image = NULL;
+  uint32_t intel_fmt, bpp;
+
+  intel_fmt = cl_image_get_intel_format(&fmt);
+  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
+    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+    goto error;
+  }
+
+  cl_image_byte_per_pixel(&fmt, &bpp);
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
+  if (mem == NULL || err != CL_SUCCESS) {
+    err = CL_OUT_OF_HOST_MEMORY;
+    goto error;
+  }
+
+  image = cl_mem_image(mem);
+
+  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image);
+  /* Never hand out an image that has no backing buffer object. */
+  if (mem->bo == NULL) {
+    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    goto error;
+  }
+
+  image->w = width;
+  image->h = height;
+  image->image_type = CL_MEM_OBJECT_IMAGE2D;
+  image->depth = 2;  /* NOTE(review): depth 2 on a 2D image -- confirm this is intended */
+  image->fmt = fmt;
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->row_pitch = row_pitch;
+  image->slice_pitch = 0;
+  // NOTE: tiling of image is set in cl_buffer_get_image_from_libva().
+  image->tile_x = 0;
+  image->tile_y = 0;
+  image->offset = offset;
+
+exit:
+  if (errcode)
+    *errcode = err;
+  return mem;
+
+error:
+  /* mem is still NULL here when the format check failed before allocation. */
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
diff --git a/src/cl_mem.h b/src/cl_mem.h
index ca601f9..75d5cf4 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -73,8 +73,8 @@ enum cl_mem_type {
 #define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
 
 typedef  struct _cl_mem {
-  uint64_t magic;           /* To identify it as a memory object */
   DEFINE_ICD(dispatch)
+  uint64_t magic;           /* To identify it as a memory object */
   cl_mem prev, next;        /* We chain the memory buffers together */
   enum cl_mem_type type;
   volatile int ref_n;       /* This object is reference counted */
@@ -100,7 +100,7 @@ struct _cl_mem_image {
   size_t host_row_pitch, host_slice_pitch;
   cl_image_tiling_t tiling;       /* only IVB+ supports TILE_[X,Y] (image only) */
   size_t tile_x, tile_y;          /* tile offset, used for mipmap images.  */
-  size_t offset;
+  size_t offset;                  /* offset for dri_bo, used when it's reloc. */
 };
 
 struct _cl_mem_gl_image {
@@ -188,6 +188,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
 /* Add one more reference to this object */
 extern void cl_mem_add_ref(cl_mem);
 
+/* api clEnqueueCopyBuffer help function */
+extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+              size_t src_offset, size_t dst_offset, size_t cb);
+
 /* api clEnqueueCopyBufferRect help function */
 extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,
@@ -244,5 +248,16 @@ cl_mem_copy_image_region(const size_t *origin, const size_t *region,
                          const void *src, size_t src_row_pitch, size_t src_slice_pitch,
                          const struct _cl_mem_image *image);
 
+extern cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+                                      unsigned int bo_name,
+                                      cl_int *errcode);
+
+extern cl_mem cl_mem_new_libva_image(cl_context ctx,
+                                     unsigned int bo_name, size_t offset,
+                                     size_t width, size_t height,
+                                     cl_image_format fmt,
+                                     size_t row_pitch,
+                                     cl_int *errcode);
+
 #endif /* __CL_MEM_H__ */
 
diff --git a/src/cl_program.c b/src/cl_program.c
index a0e0104..7ae8e8a 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -386,3 +386,36 @@ error:
   goto exit;
 }
 
+/*
+ * Fill ker[0 .. p->ker_n-1] with duplicates of the kernels of program p.
+ *
+ * A NULL ker is accepted and reported as CL_SUCCESS without doing anything.
+ * Returns CL_SUCCESS when every kernel was duplicated. On failure, every
+ * slot written so far is released and reset to NULL, and
+ * CL_OUT_OF_HOST_MEMORY is returned.
+ */
+LOCAL cl_int
+cl_program_create_kernels_in_program(cl_program p, cl_kernel* ker)
+{
+  int i = 0;
+
+  if(ker == NULL)
+    return CL_SUCCESS;
+
+  for (i = 0; i < p->ker_n; ++i) {
+    /* NOTE(review): TRY_ALLOC_NO_ERR presumably jumps to error when the duplicate is NULL -- confirm. */
+    TRY_ALLOC_NO_ERR(ker[i], cl_kernel_dup(p->ker[i]));
+  }
+
+  return CL_SUCCESS;
+
+error:
+  /* Walk back from the failing slot down to and including ker[0]; stopping
+   * at i > 0 would leave the first duplicated kernel unreleased. */
+  do {
+    cl_kernel_delete(ker[i]);
+    ker[i--] = NULL;
+  } while(i >= 0);
+
+  return CL_OUT_OF_HOST_MEMORY;
+}
diff --git a/src/cl_program.h b/src/cl_program.h
index de82fd5..2cb547a 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -68,6 +68,9 @@ extern void cl_program_add_ref(cl_program);
 /* Create a kernel for the OCL user */
 extern cl_kernel cl_program_create_kernel(cl_program, const char*, cl_int*);
 
+/* creates kernel objects for all kernel functions in program. */
+extern cl_int cl_program_create_kernels_in_program(cl_program, cl_kernel*);
+
 /* Create a program from OCL source */
 extern cl_program
 cl_program_create_from_source(cl_context ctx,
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 19bdbed..e5015ec 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -62,6 +62,7 @@
 #define CMD_MEDIA_GATEWAY_STATE                 CMD(2, 0, 3)
 #define CMD_MEDIA_STATE_FLUSH                   CMD(2, 0, 4)
 #define CMD_GPGPU_WALKER                        CMD(2, 1, 5)
+#define CMD_PIPE_CONTROL                        CMD(3, 2, 0)
 
 #define CMD_LOAD_REGISTER_IMM                   (0x22 << 23)
 
@@ -300,6 +301,9 @@
 #define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
 #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
 #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
+#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP        (3 << 14)
+#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE       (1 << 2)
+
 
 #define GEN_MAPFILTER_NEAREST        0x0
 #define GEN_MAPFILTER_LINEAR         0x1
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index cc33914..cfbb302 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -56,6 +56,7 @@
 #include "intel_gpgpu.h"
 #include "intel_batchbuffer.h"
 #include "intel_bufmgr.h"
+#include <X11/Xlibint.h>
 #include "x11/dricommon.h"
 #include "cl_mem.h"
 
@@ -192,8 +193,10 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
   if(intel->x11_display) {
     if((intel->dri_ctx = getDRI2State(intel->x11_display,
                                      DefaultScreen(intel->x11_display),
-                                     &driver_name)))
+                                     &driver_name))) {
       intel_driver_init_shared(intel, intel->dri_ctx);
+      Xfree(driver_name);
+    }
     else
       printf("X server found. dri2 connection failed! \n");
   } else {
@@ -405,9 +408,6 @@ intel_driver_get_ver(struct intel_driver *drv)
 static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
 static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 
-#if defined(HAS_EGL)
-#include "intel_dri_resource_sharing.h"
-#include "cl_image.h"
 static int get_cl_tiling(uint32_t drm_tiling)
 {
   switch(drm_tiling) {
@@ -420,6 +420,9 @@ static int get_cl_tiling(uint32_t drm_tiling)
   return CL_NO_TILE;
 }
 
+#if defined(HAS_EGL)
+#include "intel_dri_resource_sharing.h"
+#include "cl_image.h"
 static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
 {
   cl_int ret = CL_SUCCESS;
@@ -583,6 +586,35 @@ intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
 }
 #endif
 
+cl_buffer intel_share_buffer_from_libva(cl_context ctx,
+                                        unsigned int bo_name,
+                                        size_t *sz)
+{
+  /* Look up the buffer object named bo_name through the driver; if sz is not NULL it receives the bo size. */
+  drm_intel_bo *intel_bo =
+    intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+  if (intel_bo == NULL)  /* lookup failed: report it instead of dereferencing NULL; *sz is left untouched */
+    return NULL;
+  if (sz)
+    *sz = intel_bo->size;
+  return (cl_buffer)intel_bo;
+}
+
+cl_buffer intel_share_image_from_libva(cl_context ctx,
+                                       unsigned int bo_name,
+                                       struct _cl_mem_image *image)
+{
+  drm_intel_bo *intel_bo;
+  uint32_t intel_tiling, intel_swizzle_mode;
+  /* Look up the buffer object named bo_name and record its tiling mode in *image. */
+  intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+  if (intel_bo == NULL)  /* lookup failed: image->tiling is left untouched */
+    return NULL;
+  drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);  /* swizzle mode is read but not used */
+  image->tiling = get_cl_tiling(intel_tiling);
+  return (cl_buffer)intel_bo;
+}
+
 static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
 {
   switch (tiling) {
@@ -630,6 +662,8 @@ intel_setup_callbacks(void)
   cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
   intel_set_cl_gl_callbacks();
 #endif
+  cl_buffer_get_buffer_from_libva = (cl_buffer_get_buffer_from_libva_cb *) intel_share_buffer_from_libva;
+  cl_buffer_get_image_from_libva = (cl_buffer_get_image_from_libva_cb *) intel_share_image_from_libva;
   cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
   cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
   cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 5d93a67..b9bf2f9 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -60,6 +60,7 @@ typedef struct surface_heap {
 typedef struct intel_event {
   intel_batchbuffer_t *batch;
   drm_intel_bo* buffer;
+  drm_intel_bo* ts_buf;
   int status;
 } intel_event_t;
 
@@ -98,6 +99,7 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } perf_b;
   struct { drm_intel_bo *bo; } scratch_b;
   struct { drm_intel_bo *bo; } constant_b;
+  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
 
   uint32_t per_thread_scratch;
   struct {
@@ -123,6 +125,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
 {
   if (gpgpu == NULL)
     return;
+  if(gpgpu->time_stamp_b.bo)
+    drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
   if (gpgpu->surface_heap_b.bo)
     drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
   if (gpgpu->idrt_b.bo)
@@ -158,7 +162,6 @@ intel_gpgpu_new(intel_driver_t *drv)
   state->drv = drv;
   state->batch = intel_batchbuffer_new(state->drv);
   assert(state->batch);
-  intel_batchbuffer_init(state->batch, state->drv);
 
 exit:
   return state;
@@ -281,6 +284,21 @@ static const uint32_t gpgpu_l3_config_reg2[] = {
   0x00204080, 0x00244890, 0x00284490, 0x002444A0
 };
 
+/* Emit one PIPE_CONTROL that writes the GPU timestamp into 64-bit slot idx of gpgpu->time_stamp_b.bo. */
+static void
+intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
+{
+  BEGIN_BATCH(gpgpu->batch, 5);  /* the command is 5 dwords long */
+  OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
+  OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
+  OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
+          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+          GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));  /* flag bit OR'ed with the byte offset of slot idx */
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  ADVANCE_BATCH(gpgpu->batch);  /* pass the batch, as every other ADVANCE_BATCH in this file does */
+}
+
 static void
 intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
 {
@@ -346,11 +364,19 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
     OUT_BATCH(gpgpu->batch, 0);
     ADVANCE_BATCH(gpgpu->batch);
   }
+
+  /* Insert PIPE_CONTROL for time stamp of start*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 0);
 }
 
 static void
 intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
 {
+  /* Insert PIPE_CONTROL for time stamp of end*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 1);
+
   /* Insert the performance counter command */
   if (gpgpu->perf_b.bo) {
     BEGIN_BATCH(gpgpu->batch, 3);
@@ -395,7 +421,8 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 static void
 intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
                        uint32_t max_threads,
-                       uint32_t size_cs_entry)
+                       uint32_t size_cs_entry,
+                       int profiling)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   drm_intel_bo *bo;
@@ -411,6 +438,16 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->urb.size_cs_entry = size_cs_entry;
   gpgpu->max_threads = max_threads;
 
+  /* Set the profile buffer*/
+  if(gpgpu->time_stamp_b.bo)
+    dri_bo_unreference(gpgpu->time_stamp_b.bo);
+  gpgpu->time_stamp_b.bo = NULL;
+  if (profiling) {
+    bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
+    assert(bo);
+    gpgpu->time_stamp_b.bo = bo;
+  }
+
   /* Constant URB  buffer */
   if(gpgpu->curbe_b.bo)
     dri_bo_unreference(gpgpu->curbe_b.bo);
@@ -478,7 +515,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
 }
 
 static void
-intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo)
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
 {
   surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
   heap->binding_table[index] = offsetof(surface_heap_t, surface) +
@@ -486,7 +523,7 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
   dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
                     I915_GEM_DOMAIN_RENDER,
                     I915_GEM_DOMAIN_RENDER,
-                    0,
+                    obj_bo_offset,
                     heap->binding_table[index] +
                     offsetof(gen7_surface_state_t, ss1),
                     obj_bo);
@@ -570,6 +607,7 @@ static void
 intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               uint32_t index,
                               dri_bo* obj_bo,
+                              uint32_t obj_bo_offset,
                               uint32_t format,
                               cl_mem_object_type type,
                               int32_t w,
@@ -601,7 +639,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
     ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
   }
   ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
-  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo);
+  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
   gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
 }
 
@@ -643,6 +681,7 @@ static void
 intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
                        uint32_t index,
                        cl_buffer *obj_bo,
+                       uint32_t obj_bo_offset,
                        uint32_t format,
                        cl_mem_object_type type,
                        int32_t w,
@@ -651,7 +690,7 @@ intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
                        int32_t pitch,
                        cl_gpgpu_tiling tiling)
 {
-  intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, depth, pitch, tiling);
+  intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, obj_bo_offset, format, type, w, h, depth, pitch, tiling);
   assert(index < GEN_MAX_SURFACES);
 }
 
@@ -927,6 +966,11 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
   if(event->buffer != NULL)
     drm_intel_bo_reference(event->buffer);
 
+  if(gpgpu->time_stamp_b.bo) {
+    event->ts_buf = gpgpu->time_stamp_b.bo;
+    drm_intel_bo_reference(event->ts_buf);
+  }
+
 exit:
   return event;
 error:
@@ -989,9 +1033,23 @@ intel_gpgpu_event_delete(intel_event_t *event)
   assert(event->batch == NULL);   //This command must have been flushed.
   if(event->buffer)
     drm_intel_bo_unreference(event->buffer);
+  if(event->ts_buf)
+    drm_intel_bo_unreference(event->ts_buf);
   cl_free(event);
 }
 
+static void
+intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
+{
+  assert(event->ts_buf != NULL);
+  assert(index == 0 || index == 1);
+  drm_intel_gem_bo_map_gtt(event->ts_buf);
+  uint64_t* ptr = event->ts_buf->virtual;
+
+  *ret_ts = ptr[index] * 80; //convert to nanoseconds
+  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+}
+
 LOCAL void
 intel_set_gpgpu_callbacks(void)
 {
@@ -1019,5 +1077,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
   cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
+  cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
 }
 
diff --git a/src/kernels/cl_internal_copy_buf_align1.cl b/src/kernels/cl_internal_copy_buf_align1.cl
new file mode 100644
index 0000000..cd3ec7b
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align1.cl
@@ -0,0 +1,8 @@
+kernel void __cl_cpy_region_align1 ( global char* src, unsigned int src_offset,
+                                     global char* dst, unsigned int dst_offset,
+				     unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size)
+        dst[i+dst_offset] = src[i+src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_align16.cl b/src/kernels/cl_internal_copy_buf_align16.cl
new file mode 100644
index 0000000..75b1a4a
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align16.cl
@@ -0,0 +1,12 @@
+kernel void __cl_cpy_region_align16 ( global float* src, unsigned int src_offset,
+                                      global float* dst, unsigned int dst_offset,
+				      unsigned int size)
+{
+    int i = get_global_id(0) * 4;
+    if (i < size*4) {
+        dst[i+dst_offset] = src[i+src_offset];
+        dst[i+dst_offset + 1] = src[i+src_offset + 1];
+        dst[i+dst_offset + 2] = src[i+src_offset + 2];
+        dst[i+dst_offset + 3] = src[i+src_offset + 3];
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_align4.cl b/src/kernels/cl_internal_copy_buf_align4.cl
new file mode 100644
index 0000000..44a0f81
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_cpy_region_align4 ( global float* src, unsigned int src_offset,
+                                     global float* dst, unsigned int dst_offset,
+				     unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size)
+        dst[i+dst_offset] = src[i+src_offset];
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index f18bd46..37240fe 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -41,6 +41,7 @@ set (utests_sources
   compiler_fill_image_3d_2.cpp
   compiler_function_argument0.cpp
   compiler_function_argument1.cpp
+  compiler_function_argument2.cpp
   compiler_function_argument.cpp
   compiler_function_constant0.cpp
   compiler_function_constant1.cpp
@@ -52,6 +53,7 @@ set (utests_sources
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
+  compiler_insert_vector.cpp
   compiler_lower_return0.cpp
   compiler_lower_return1.cpp
   compiler_lower_return2.cpp
@@ -96,6 +98,7 @@ set (utests_sources
   compiler_local_memory_barrier.cpp
   compiler_local_memory_barrier_wg64.cpp
   compiler_local_memory_barrier_2.cpp
+  compiler_local_slm.cpp
   compiler_movforphi_undef.cpp
   compiler_volatile.cpp
   compiler_copy_image1.cpp
@@ -126,6 +129,7 @@ set (utests_sources
   builtin_num_groups.cpp
   builtin_local_id.cpp
   builtin_acos_asin.cpp
+  builtin_convert_sat.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
   runtime_event.cpp
@@ -143,6 +147,7 @@ set (utests_sources
   compiler_long_cmp.cpp
   compiler_bool_cross_basic_block.cpp
   load_program_from_bin.cpp
+  enqueue_copy_buf.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/builtin_convert_sat.cpp b/utests/builtin_convert_sat.cpp
new file mode 100644
index 0000000..7272057
--- /dev/null
+++ b/utests/builtin_convert_sat.cpp
@@ -0,0 +1,80 @@
+#include <cstdint>
+#include "utest_helper.hpp"
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+int64_t my_rand(void) {  /* signed product of two rand() draws, each shifted by RAND_MAX/2 */
+  const int64_t lhs = rand() - RAND_MAX/2;
+  const int64_t rhs = rand() - RAND_MAX/2;
+  return lhs * rhs;
+}
+
+#define DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, REAL_SRC_TYPE) /* defines and registers one saturating-conversion test; REAL_SRC_TYPE is the host type of the source buffer */ \
+void builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat(void) \
+{ \
+  const int n = 128; /* number of elements converted */ \
+  OCL_CREATE_KERNEL_FROM_FILE("builtin_convert_sat", "builtin_convert_" # SRC_TYPE "_to_" # DST_TYPE "_sat"); \
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(REAL_SRC_TYPE), NULL); \
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(DST_TYPE), NULL); \
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+  globals[0] = n; \
+  locals[0] = 16; \
+  OCL_MAP_BUFFER(0); /* fill the source buffer with my_rand() values converted to REAL_SRC_TYPE */ \
+  for (int i = 0; i < n; i++) \
+    ((REAL_SRC_TYPE *)buf_data[0])[i] = my_rand(); \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_NDRANGE(1); \
+  OCL_MAP_BUFFER(0); \
+  OCL_MAP_BUFFER(1); \
+  for (int i = 0; i < n; i++) { /* recompute the expected value on the host and compare */ \
+    REAL_SRC_TYPE src = ((REAL_SRC_TYPE *)buf_data[0])[i]; \
+    DST_TYPE dst; \
+    if ((double)src > (double)DST_MAX) /* above the destination range: clamp to DST_MAX */ \
+      dst = DST_MAX; \
+    else if ((double)src < (double)DST_MIN) /* below the destination range: clamp to DST_MIN */ \
+      dst = DST_MIN; \
+    else /* in range: ordinary C++ conversion to DST_TYPE */ \
+      dst = src; \
+    OCL_ASSERT(((DST_TYPE *)buf_data[1])[i] == dst); \
+  } \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_UNMAP_BUFFER(1); \
+} \
+MAKE_UTEST_FROM_FUNCTION(builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat);
+
+#define DEF(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX) \
+  DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, SRC_TYPE)
+// DEF reuses SRC_TYPE as the host buffer type; "long" sources call DEF2 directly with int64_t.
+DEF(char, uchar, -128, 127);
+DEF(char, short, -128, 127);
+DEF(char, ushort, -128, 127);
+DEF(char, int, -128, 127);
+DEF(char, uint, -128, 127);
+DEF2(char, long, -128, 127, int64_t);
+DEF(char, float, -128, 127);
+DEF(uchar, char, 0, 255);
+DEF(uchar, short, 0, 255);
+DEF(uchar, ushort, 0, 255);
+DEF(uchar, int, 0, 255);
+DEF(uchar, uint, 0, 255);
+DEF2(uchar, long, 0, 255, int64_t);
+DEF(uchar, float, 0, 255);
+DEF(short, ushort, -32768, 32767);
+DEF(short, int, -32768, 32767);
+DEF(short, uint, -32768, 32767);
+DEF2(short, long, -32768, 32767, int64_t);
+DEF(short, float, -32768, 32767);
+DEF(ushort, short, 0, 65535);
+DEF(ushort, int, 0, 65535);
+DEF(ushort, uint, 0, 65535);
+DEF2(ushort, long, 0, 65535, int64_t);
+DEF(ushort, float, 0, 65535);
+DEF(int, uint, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF2(int, long, -0x7FFFFFFF-1, 0x7FFFFFFF, int64_t);
+DEF(int, float, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF(uint, int, 0, 0xffffffffu);
+DEF2(uint, long, 0, 0xffffffffu, int64_t);
+DEF(uint, float, 0, 0xffffffffu);
+#undef DEF  // NOTE(review): DEF2 is left defined; harmless at end of file, but consider undefining it too
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
index 9384f85..7951ff7 100644
--- a/utests/compiler_async_copy.cpp
+++ b/utests/compiler_async_copy.cpp
@@ -1,39 +1,55 @@
 #include "utest_helper.hpp"
+#include <stdint.h>
 
-static void compiler_async_copy(void)
-{
-  const size_t n = 1024;
-  const size_t local_size = 32;
-  const int copiesPerWorkItem = 5;
+typedef unsigned char uchar;
+typedef unsigned short ushort;
 
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_async_copy");
-  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(int)*2, NULL);
-  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+#define DEF(TYPE, KER_TYPE, VEC_SIZE) /* TYPE: host element type; KER_TYPE and VEC_SIZE are pasted into the test and kernel names */ \
+static void compiler_async_copy_##KER_TYPE##VEC_SIZE(void) \
+{ \
+  const size_t n = 1024; \
+  const size_t local_size = 32; \
+  const int copiesPerWorkItem = 5; \
+\
+  /* Setup kernel and buffers: each buffer holds n * copiesPerWorkItem * VEC_SIZE elements of TYPE */\
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_async_copy", "compiler_async_copy_" # KER_TYPE # VEC_SIZE); \
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(TYPE)*VEC_SIZE, NULL); /* size with NULL value: presumably a __local scratch argument - confirm against the kernel */ \
+  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem); \
+\
+  OCL_MAP_BUFFER(1); /* buffer 1 is the source: fill it with rand() values */ \
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; ++i) \
+      ((TYPE*)buf_data[1])[i] = rand(); \
+  OCL_UNMAP_BUFFER(1); \
+\
+  /* Run the kernel with n work-items in groups of local_size */\
+  globals[0] = n; \
+  locals[0] = local_size; \
+  OCL_NDRANGE(1); \
+  OCL_MAP_BUFFER(0); \
+  OCL_MAP_BUFFER(1); \
+\
+  /* Check results: buffer 0 must equal buffer 1 element for element */\
+  TYPE *dst = (TYPE*)buf_data[0]; \
+  TYPE *src = (TYPE*)buf_data[1]; \
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; i++) \
+    OCL_ASSERT(dst[i] == src[i]); \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_UNMAP_BUFFER(1); \
+} \
+\
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy_##KER_TYPE##VEC_SIZE);
 
-  OCL_MAP_BUFFER(1);
-  for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; ++i)
-      ((int*)buf_data[1])[i] = rand();
-  OCL_UNMAP_BUFFER(1);
-
-  // Run the kernel
-  globals[0] = n;
-  locals[0] = local_size;
-  OCL_NDRANGE(1);
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
-
-  // Check results
-  int *dst = (int*)buf_data[0];
-  int *src = (int*)buf_data[1];
-  for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; i++)
-    OCL_ASSERT(dst[i] == src[i] + 3);
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_async_copy);
+DEF(char, char, 2);  // arguments: host element type, kernel type name, vector width
+DEF(uchar, uchar, 2);
+DEF(short, short, 2);
+DEF(ushort, ushort, 2);
+DEF(int, int, 2);
+DEF(uint, uint, 2);
+DEF(int64_t, long, 2);  // host int64_t is paired with the kernel-side name "long"
+DEF(uint64_t, ulong, 2);  // host uint64_t is paired with the kernel-side name "ulong"
+DEF(float, float, 2);
+DEF(double, double, 2);
diff --git a/utests/compiler_function_argument2.cpp b/utests/compiler_function_argument2.cpp
new file mode 100644
index 0000000..c352a9e
--- /dev/null
+++ b/utests/compiler_function_argument2.cpp
@@ -0,0 +1,57 @@
+#include "utest_helper.hpp"
+
+#define VECSIZE 8
+// Passes seven VECSIZE-element arrays by value; expects row k of the output to hold argument k as floats.
+void compiler_function_argument2(void)
+{
+  char arg0[VECSIZE] = { 0 };
+  unsigned char arg1[VECSIZE] = { 0 };
+  short arg2[VECSIZE] = { 0 };
+  unsigned short arg3[VECSIZE] = { 0 };
+  int arg4[VECSIZE] = { 0 };
+  unsigned int arg5[VECSIZE] = { 0 };
+  float arg6[VECSIZE] = { 0 };
+
+  for (uint32_t i = 0; i < VECSIZE; ++i) {
+      arg0[i] = rand();
+      arg1[i] = rand();
+      arg2[i] = rand();
+      arg3[i] = rand();
+      arg4[i] = rand();
+      arg5[i] = rand();
+      arg6[i] = rand();
+  }
+
+  // Setup kernel and buffers: the output has room for 8 rows of VECSIZE floats (7 rows are checked)
+  OCL_CREATE_KERNEL("compiler_function_argument2");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(float) * 8 * VECSIZE, NULL);
+  OCL_SET_ARG(0, sizeof(arg0), arg0);
+  OCL_SET_ARG(1, sizeof(arg1), arg1);
+  OCL_SET_ARG(2, sizeof(arg2), arg2);
+  OCL_SET_ARG(3, sizeof(arg3), arg3);
+  OCL_SET_ARG(4, sizeof(arg4), arg4);
+  OCL_SET_ARG(5, sizeof(arg5), arg5);
+  OCL_SET_ARG(6, sizeof(arg6), arg6);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel with a single work-item
+  globals[0] = 1;
+  locals[0] = 1;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  /* Check results: dst[k*VECSIZE + i] must equal element i of argument k converted to float */
+  float *dst = (float*)buf_data[0];
+  for (uint32_t i = 0; i < VECSIZE; ++i) {
+      OCL_ASSERT((float)arg0[i] == dst[0*VECSIZE + i]);
+      OCL_ASSERT((float)arg1[i] == dst[1*VECSIZE + i]);
+      OCL_ASSERT((float)arg2[i] == dst[2*VECSIZE + i]);
+      OCL_ASSERT((float)arg3[i] == dst[3*VECSIZE + i]);
+      OCL_ASSERT((float)arg4[i] == dst[4*VECSIZE + i]);
+      OCL_ASSERT((float)arg5[i] == dst[5*VECSIZE + i]);
+      OCL_ASSERT((float)arg6[i] == dst[6*VECSIZE + i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument2);
diff --git a/utests/compiler_insert_vector.cpp b/utests/compiler_insert_vector.cpp
new file mode 100644
index 0000000..c7c239f
--- /dev/null
+++ b/utests/compiler_insert_vector.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void compiler_insert_vector(void)
+{
+  const size_t work_items = 2048;
+  // Only builds and launches the kernel; the output buffer is never mapped or checked here.
+  // The buffer provides room for 4 ints per work-item.
+  OCL_CREATE_KERNEL("compiler_insert_vector");
+  OCL_CREATE_BUFFER(buf[0], 0, work_items * 4 * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Launch one work-item per slot, in work-groups of 16
+  globals[0] = work_items;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insert_vector);
diff --git a/utests/compiler_local_slm.cpp b/utests/compiler_local_slm.cpp
index aa9a2fe..48a072f 100644
--- a/utests/compiler_local_slm.cpp
+++ b/utests/compiler_local_slm.cpp
@@ -2,9 +2,33 @@
 
 void compiler_local_slm(void)
 {
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_local_slm");
+  const size_t n = 32;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  // Element i must equal i%16 + 3 + i/16, where 16 is the work-group size set above.
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == (i%16 + 2 + 1+ i/16));
+  OCL_UNMAP_BUFFER(0);
 }
 
+// Runs compiler_local_slm1 with one work-item; the two 64-bit outputs must differ by exactly 4.
+void compiler_local_slm1(void)
+{
+  const size_t count = 2;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm1");
+  OCL_CREATE_BUFFER(buf[0], 0, count * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = 1; locals[0] = 1;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  uint64_t * slots = (uint64_t*)buf_data[0];
+  OCL_ASSERT((slots[1] - slots[0]) == 4);
+  OCL_UNMAP_BUFFER(0);
+}
 MAKE_UTEST_FROM_FUNCTION(compiler_local_slm);
-
+MAKE_UTEST_FROM_FUNCTION(compiler_local_slm1);
diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp
index ead9120..58bcc6f 100644
--- a/utests/compiler_shader_toy.cpp
+++ b/utests/compiler_shader_toy.cpp
@@ -31,7 +31,9 @@
 
 static const int dim = 256;
 
-static void run_kernel(int w, int h, const char *name)
+// Note: 'name' selects both the kernel function and the reference image (<name>_ref.bmp);
+// 'file' selects the .cl source file (<file>.cl) and the output image (<file>.bmp).
+static void run_kernel(int w, int h, const char *file, const char *name)
 {
   const size_t global[2] = {size_t(w), size_t(h)};
   const size_t local[2] = {16, 1};
@@ -42,8 +44,8 @@ static void run_kernel(int w, int h, const char *name)
   char dst_img[256];
   char ref_img[256];
 
-  snprintf(kernel_file, sizeof(kernel_file), "%s.cl", name);
-  snprintf(dst_img, sizeof(dst_img), "%s.bmp", name);
+  snprintf(kernel_file, sizeof(kernel_file), "%s.cl", file);
+  snprintf(dst_img, sizeof(dst_img), "%s.bmp", file);
   snprintf(ref_img, sizeof(ref_img), "%s_ref.bmp", name);
   OCL_CALL (cl_kernel_init, kernel_file, name, SOURCE, NULL);
 
@@ -63,20 +65,23 @@ static void run_kernel(int w, int h, const char *name)
   OCL_CHECK_IMAGE(dst, w, h, ref_img);
 }
 
-#define DECL_SHADER_TOY_TEST(W,H,NAME) \
-  static void NAME(void) { run_kernel(W,H,#NAME); } \
-  MAKE_UTEST_FROM_FUNCTION(NAME);
+#define DECL_SHADER_TOY_TEST(W,H,FILE_NAME, KERNEL_NAME) \
+  static void FILE_NAME(void) { run_kernel(W,H,#FILE_NAME, #KERNEL_NAME); } \
+  MAKE_UTEST_FROM_FUNCTION(FILE_NAME);
 
-DECL_SHADER_TOY_TEST(dim,dim,compiler_clod);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge_no_shadow);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_julia);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_no_break);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon,compiler_ribbon);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus,compiler_nautilus);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge_no_shadow,compiler_menger_sponge_no_shadow);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia,compiler_julia);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_no_break,compiler_julia_no_break);
+// test for function calls
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod_function_call,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_function_call,compiler_julia);
 
 // Still issues here for LLVM 3.2
-// DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux);
-// DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge);
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux,compiler_chocolux);
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge,compiler_menger_sponge);
 
 #undef DECL_SHADER_TOY_TEST
 
diff --git a/utests/compiler_vector_inc.cpp b/utests/compiler_vector_inc.cpp
index abc5408..c44424b 100644
--- a/utests/compiler_vector_inc.cpp
+++ b/utests/compiler_vector_inc.cpp
@@ -43,4 +43,4 @@ void compiler_vector_inc(void)
   OCL_UNMAP_BUFFER(0);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_vector_inc);
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_vector_inc);
diff --git a/utests/enqueue_copy_buf.cpp b/utests/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..969eaa8
--- /dev/null
+++ b/utests/enqueue_copy_buf.cpp
@@ -0,0 +1,66 @@
+#include "utest_helper.hpp"
+
+// Fills buf[0] with random bytes, copies cb bytes to buf[1] and compares; out-of-range requests must fail.
+void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+    unsigned int i;
+    OCL_MAP_BUFFER(0);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[0])[i] = (rand() & 63);
+    }
+
+    OCL_UNMAP_BUFFER(0);
+
+    if (src_off + cb > sz || dst_off + cb > sz) {
+        /* Region runs past sz (assumed to be the size of both buffers): expect a non-zero error code. */
+        OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                       src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+        return;
+    }
+
+    OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+
+#if 0
+    printf("\n########### Src buffer: \n");
+    for (i = 0; i < cb; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[0])[i + src_off]);
+
+    printf("\n########### dst buffer: \n");
+    for (i = 0; i < cb; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[1])[i + dst_off]);
+#endif
+
+    // Check results: report the first mismatching index (i is unsigned, hence %u) and fail
+    for (i = 0; i < cb; ++i) {
+        if (((char*)buf_data[0])[i + src_off] != ((char*)buf_data[1])[i + dst_off]) {
+            printf ("different index is %u\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+}
+
+void enqueue_copy_buf(void)
+{
+    const size_t sz = 1024;
+    const size_t cb = sz/2;
+
+    // Two sz-byte buffers: buf[0] is the copy source, buf[1] the destination.
+    OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+    // Sweep source offsets in steps of 8 and destination offsets in steps of 10,
+    // always copying half the buffer; offsets past sz/2 take the expected-error path.
+    for (size_t src_off = 0; src_off < sz; src_off += 8)
+        for (size_t dst_off = 0; dst_off < sz; dst_off += 10)
+            test_copy_buf(sz, src_off, dst_off, cb);
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list