[Pkg-opencl-devel] [beignet] 42/66: Imported Upstream version 0.2

Fri Oct 31 07:27:06 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 7799aef10c3a06f2340faaa55603535129181480
Author: Simon Richter <sjr at debian.org>
Date:   Fri Jul 5 15:01:51 2013 +0200

    Imported Upstream version 0.2
---
 CMakeLists.txt                                    |   2 +-
 backend/src/backend/gen_context.cpp               |   2 +
 backend/src/backend/gen_encoder.cpp               |  24 ++
 backend/src/backend/gen_encoder.hpp               |   2 +
 backend/src/backend/gen_insn_selection.cpp        |  22 +-
 backend/src/backend/gen_insn_selection.hxx        |   2 +
 backend/src/ir/instruction.cpp                    |   2 +
 backend/src/ir/instruction.hpp                    |   4 +
 backend/src/ir/instruction.hxx                    |   2 +
 backend/src/ir/unit.cpp                           |  14 -
 backend/src/ir/unit.hpp                           |  20 +-
 backend/src/llvm/llvm_gen_backend.cpp             |  68 ++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx        |   2 +
 backend/src/llvm/llvm_scalarize.cpp               |   2 +-
 backend/src/ocl_stdlib.h                          | 245 +++++++++++++---
 docs/Beignet.mdwn                                 |   5 +-
 docs/Beignet/Backend/TODO.mdwn                    |  34 +--
 kernels/compiler_abs.cl                           |   1 +
 kernels/{compiler_abs.cl => compiler_abs_diff.cl} |  13 +-
 kernels/compiler_atomic_functions.cl              |  19 +-
 kernels/compiler_global_constant.cl               |   3 +-
 kernels/compiler_step.cl                          |  38 +++
 kernels/compiler_upsample_int.cl                  |   4 +
 src/cl_mem.c                                      |   5 +
 src/cl_mem.h                                      |   4 +
 src/cl_mem_gl.c                                   |   9 +
 src/cl_platform_id.h                              |   2 +-
 src/cl_utils.h                                    |  24 +-
 src/intel/intel_driver.c                          |   1 +
 src/x11/gbm_dri2_x11_platform.c                   | 103 ++++++-
 utests/CMakeLists.txt                             |   3 +
 utests/compiler_abs.cpp                           |  52 +++-
 utests/compiler_abs_diff.cpp                      | 267 +++++++++++++++++
 utests/compiler_atomic_functions.cpp              |  23 +-
 utests/compiler_fill_gl_image.cpp                 |   4 +
 utests/compiler_global_memory_barrier.cpp         |   2 +-
 utests/compiler_local_memory_barrier_2.cpp        |   2 +-
 utests/compiler_step.cpp                          | 338 ++++++++++++++++++++++
 utests/compiler_upsample_int.cpp                  |  37 +++
 utests/utest_helper.cpp                           |  19 +-
 40 files changed, 1260 insertions(+), 165 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6a5d49..41ac43b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_VERSION_MAJOR 0)
-set (LIBCL_VERSION_MINOR 1)
+set (LIBCL_VERSION_MINOR 2)
 
 configure_file (
   "src/OCLConfig.h.in"
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index acd9c1d..e33d8da 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -169,6 +169,8 @@ namespace gbe
       case SEL_OP_ADD:  p->ADD(dst, src0, src1); break;
       case SEL_OP_MUL:  p->MUL(dst, src0, src1); break;
       case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
       default: NOT_IMPLEMENTED;
     }
   }
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 25303b4..f84c6dd 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -812,6 +812,30 @@ namespace gbe
     pop();
   }
 
+  void GenEncoder::UPSAMPLE_SHORT(GenRegister dest, GenRegister src0, GenRegister src1) {
+    dest.type = GEN_TYPE_B;
+    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src0.type = GEN_TYPE_B;
+    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src1.type = GEN_TYPE_B;
+    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+    MOV(dest, src1);
+    dest.subnr ++;
+    MOV(dest, src0);
+  }
+
+  void GenEncoder::UPSAMPLE_INT(GenRegister dest, GenRegister src0, GenRegister src1) {
+    dest.type = GEN_TYPE_W;
+    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src0.type = GEN_TYPE_W;
+    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src1.type = GEN_TYPE_W;
+    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+    MOV(dest, src1);
+    dest.subnr += 2;
+    MOV(dest, src0);
+  }
+
   void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
     int w = curr.execWidth;
     if (src0.isdf()) {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index a7cbc89..d3a7165 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -92,6 +92,8 @@ namespace gbe
     ALU1(MOV)
     ALU1(FBH)
     ALU1(FBL)
+    ALU2(UPSAMPLE_SHORT)
+    ALU2(UPSAMPLE_INT)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU1(RNDD)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index bbe392d..d4be8bf 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -438,6 +438,8 @@ namespace gbe
     ALU1(FBL)
     ALU3(HADD)
     ALU3(RHADD)
+    ALU2(UPSAMPLE_SHORT)
+    ALU2(UPSAMPLE_INT)
 #undef ALU1
 #undef ALU2
 #undef ALU3
@@ -1451,6 +1453,12 @@ namespace gbe
             sel.RHADD(dst, src0, src1, temp);
             break;
           }
+        case OP_UPSAMPLE_SHORT:
+          sel.UPSAMPLE_SHORT(dst, src0, src1);
+          break;
+        case OP_UPSAMPLE_INT:
+          sel.UPSAMPLE_INT(dst, src0, src1);
+          break;
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -1792,24 +1800,22 @@ namespace gbe
       const ir::Register reg = sel.reg(FAMILY_DWORD);
 
       const uint32_t params = insn.getParameters();
-      //XXX TODO need to double check local barrier whether need fence or not
-      if(params == syncGlobalBarrier || params == syncLocalBarrier) {
+      if(params == syncGlobalBarrier) {
         const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
         sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
       }
 
       sel.push();
         sel.curr.predicate = GEN_PREDICATE_NONE;
+
+        // As only the payload.2 is used and all the other regions are ignored
+        // SIMD8 mode here is safe.
         sel.curr.execWidth = 8;
         sel.curr.physicalFlag = 0;
         sel.curr.noMask = 1;
+        // Copy barrier id from r0.
+        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
 
-        sel.SHL(GenRegister::ud8grf(reg),
-                GenRegister::ud1grf(ocl::threadn),
-                GenRegister::immud(0x9));
-        sel.OR(GenRegister::ud8grf(reg),
-               GenRegister::ud8grf(reg),
-               GenRegister::immud(0x00088000));
         // A barrier is OK to start the thread synchronization *and* SLM fence
         sel.BARRIER(GenRegister::f8grf(reg));
         // Now we wait for the other threads
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index c85d328..33c3937 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -47,3 +47,5 @@ DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
 DECL_SELECTION_IR(HADD, TernaryInstruction)
 DECL_SELECTION_IR(RHADD, TernaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 21b82ce..2a77454 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1330,6 +1330,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(SUB)
   DECL_EMIT_FUNCTION(SUBSAT)
   DECL_EMIT_FUNCTION(MUL_HI)
+  DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
+  DECL_EMIT_FUNCTION(UPSAMPLE_INT)
   DECL_EMIT_FUNCTION(DIV)
   DECL_EMIT_FUNCTION(REM)
   DECL_EMIT_FUNCTION(SHL)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index fc1c984..48e6963 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -519,6 +519,10 @@ namespace ir {
   Instruction SIN(Type type, Register dst, Register src);
   /*! mul_hi.type dst src */
   Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
+  /*! upsample_short.type dst src */
+  Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
+  /*! upsample_int.type dst src */
+  Instruction UPSAMPLE_INT(Type type, Register dst, Register src0, Register src1);
   /*! fbh.type dst src */
   Instruction FBH(Type type, Register dst, Register src);
   /*! fbl.type dst src */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 0e1c575..b9f0e73 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -77,3 +77,5 @@ DECL_INSN(FBH, UnaryInstruction)
 DECL_INSN(FBL, UnaryInstruction)
 DECL_INSN(HADD, BinaryInstruction)
 DECL_INSN(RHADD, BinaryInstruction)
+DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 01e1eb1..4aeffe9 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -21,12 +21,6 @@
  * \file unit.cpp
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
-#include "llvm/Config/config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Instructions.h"
-#else
-#include "llvm/IR/Instructions.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "ir/unit.hpp"
 #include "ir/function.hpp"
 
@@ -59,14 +53,6 @@ namespace ir {
     constantSet.append(data, name, size, alignment);
   }
 
-  void Unit::removeDeadValues()
-  {
-    for(auto &it : valueMap) {
-      llvm::Instruction* I = llvm::dyn_cast<llvm::Instruction>(it.first.first);  //fake value
-      if((I == NULL) || (I->getParent() == NULL))
-        valueMap.erase(it.first);
-    }
-  }
   std::ostream &operator<< (std::ostream &out, const Unit &unit) {
     unit.apply([&out] (const Function &fn) { out << fn << std::endl; });
     return out;
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 1017f5f..9e3d66a 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -24,13 +24,6 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
-#include "llvm/Config/config.h"
-#if LLVM_VERSION_MINOR <= 2
-#include "llvm/Value.h"
-#else
-#include "llvm/IR/Value.h"
-#endif  /* LLVM_VERSION_MINOR <= 2 */
-
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
 #include "sys/hash_map.hpp"
@@ -49,7 +42,7 @@ namespace ir {
   {
   public:
     typedef hash_map<std::string, Function*> FunctionSet;
-    typedef std::pair<llvm::Value*, uint32_t> ValueIndex;
+    typedef std::pair<void*, uint32_t> ValueIndex;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -84,8 +77,8 @@ namespace ir {
     /*! Some values will not be allocated. For example a vector extract and
      * a vector insertion when scalarize the vector load/store
      */
-    void newValueProxy(llvm::Value *real,
-                       llvm::Value *fake,
+    void newValueProxy(void *real,
+                       void *fake,
                        uint32_t realIndex = 0u,
                        uint32_t fakeIndex = 0u) {
       const ValueIndex key(fake, fakeIndex);
@@ -93,10 +86,11 @@ namespace ir {
       GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
       valueMap[key] = value;
     }
-    /* remove fake values that removed by other pass */
-    void removeDeadValues(void);
+
+    void clearValueMap() { valueMap.clear(); }
+
     /*! Return the value map */
-    const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
+    const map<ValueIndex, ValueIndex> &getValueMap(void) const { return valueMap; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 8385e21..cebe0f4 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -307,8 +307,10 @@ namespace gbe
     }
     /*! After scalarize pass, there are some valueMap in unit,
      *  use this function to copy from unit valueMap */
-    void initValueMap(const map<ValueIndex, ValueIndex>& vMap) {
-      valueMap.insert(vMap.begin(), vMap.end());
+    void initValueMap(const map<ir::Unit::ValueIndex, ir::Unit::ValueIndex> &vMap) {
+      for(auto &it : vMap)
+        newValueProxy((Value*)it.second.first, (Value*)it.first.first,
+                      it.second.second, it.first.second);
     }
     /*! Mostly used for the preallocated registers (lids, gids) */
     void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
@@ -567,6 +569,35 @@ namespace gbe
       if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
         GBE_ASSERT(v.hasInitializer());
         const Constant *c = v.getInitializer();
+        if (c->getType()->getTypeID() != Type::ArrayTyID) {
+          void *mem = malloc(sizeof(double));
+          int size = 0;
+          switch(c->getType()->getTypeID()) {
+            case Type::TypeID::IntegerTyID: {
+              const ConstantInt *ci = dyn_cast<ConstantInt>(c);
+              *(int *)mem = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
+              size = sizeof(int);
+              break;
+            }
+            case Type::TypeID::FloatTyID: {
+              const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+              *(float *)mem = cf->getValueAPF().convertToFloat();
+              size = sizeof(float);
+              break;
+            }
+            case Type::TypeID::DoubleTyID: {
+              const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+              *(double *)mem = cf->getValueAPF().convertToDouble();
+              size = sizeof(double);
+              break;
+            }
+            default:
+              NOT_IMPLEMENTED;
+          }
+          unit.newConstant((char *)mem, name, size, size);
+          free(mem);
+          continue;
+        }
         GBE_ASSERT(c->getType()->getTypeID() == Type::ArrayTyID);
         const ConstantDataArray *cda = dyn_cast<ConstantDataArray>(c);
         GBE_ASSERT(cda);
@@ -782,7 +813,11 @@ namespace gbe
     }
     Constant *CPV = dyn_cast<Constant>(value);
     if (CPV) {
-      GBE_ASSERT(isa<GlobalValue>(CPV) == false);
+      if (isa<GlobalValue>(CPV)) {
+        auto name = CPV->getName().str();
+        uint16_t reg = unit.getConstantSet().getConstant(name).getReg();
+        return ir::Register(reg);
+      }
       const ir::ImmediateIndex immIndex = this->newImmediate(CPV, elemID);
       const ir::Immediate imm = ctx.getImmediate(immIndex);
       const ir::Register reg = ctx.reg(getFamily(imm.type));
@@ -1174,7 +1209,6 @@ namespace gbe
     }
 
     ctx.startFunction(F.getName());
-    unit.removeDeadValues();
     this->regTranslator.clear();
     this->regTranslator.initValueMap(unit.getValueMap());
     this->labelMap.clear();
@@ -1189,10 +1223,6 @@ namespace gbe
       if(addrSpace != ir::AddressSpace::MEM_CONSTANT)
         continue;
       GBE_ASSERT(v.hasInitializer());
-      const Constant *c = v.getInitializer();
-      GBE_ASSERT(c->getType()->getTypeID() == Type::ArrayTyID);
-      const ConstantDataArray *cda = dyn_cast<ConstantDataArray>(c);
-      GBE_ASSERT(cda);
       ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
       ir::Constant &con = unit.getConstantSet().getConstant(j ++);
       con.setReg(reg.value());
@@ -1741,6 +1771,7 @@ namespace gbe
       case GEN_OCL_LBARRIER:
       case GEN_OCL_GBARRIER:
       case GEN_OCL_LGBARRIER:
+        ctx.getFunction().setUseSLM(true);
         break;
       case GEN_OCL_WRITE_IMAGE0:
       case GEN_OCL_WRITE_IMAGE1:
@@ -1777,6 +1808,8 @@ namespace gbe
       }
       case GEN_OCL_MUL_HI_INT:
       case GEN_OCL_MUL_HI_UINT:
+      case GEN_OCL_UPSAMPLE_SHORT:
+      case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -1823,10 +1856,9 @@ namespace gbe
 
   void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
     CallSite::arg_iterator AI = CS.arg_begin();
-#if GBE_DEBUG
     CallSite::arg_iterator AE = CS.arg_end();
-#endif /* GBE_DEBUG */
     GBE_ASSERT(AI != AE);
+
     unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register dst = this->getRegister(&I);
@@ -2158,6 +2190,22 @@ namespace gbe
             ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_UPSAMPLE_SHORT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_SHORT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_UPSAMPLE_INT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_SADD_SAT_CHAR:
           case GEN_OCL_SADD_SAT_SHORT:
           case GEN_OCL_SADD_SAT_INT:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index f448a50..8e940bc 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -133,3 +133,5 @@ DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
 DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
 DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
 DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index bab2236..41674b6 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -773,7 +773,7 @@ namespace gbe {
     intTy = IntegerType::get(module->getContext(), 32);
     floatTy = Type::getFloatTy(module->getContext());
     builder = new IRBuilder<>(module->getContext());
-    unit.removeDeadValues();
+    unit.clearValueMap();
 
     scalarizeArgs(F);
     typedef ReversePostOrderTraversal<Function*> RPOTType;
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 04984d8..09f92d0 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -4175,11 +4175,11 @@ uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
     return add_sat((uchar)x, (uchar)0);
 }
 
-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (name(x.s0), name(x.s1)); }
-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (name(x.s0), name(x.s1), name(x.s2)); }
-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
+#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (int2)(name(x.s0), name(x.s1)); }
+#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (int3)(name(x.s0), name(x.s1), name(x.s2)); }
+#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (int4)(name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
+#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (int8)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
+#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (int16)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
 INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
 DEC2(isfinite);
 DEC3(isfinite);
@@ -4216,11 +4216,11 @@ DEC16(signbit);
 #undef DEC8
 #undef DEC16
 
-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (name(x.s0, y.s0), name(x.s1, y.s1)); }
-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
+#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (int2)(name(x.s0, y.s0), name(x.s1, y.s1)); }
+#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (int3)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
+#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (int4)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
+#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (int8)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
+#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (int16)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
 INLINE_OVERLOADABLE int islessgreater(float x, float y) { return (x<y)||(x>y); }
 DEC2(islessgreater);
 DEC3(islessgreater);
@@ -4338,11 +4338,11 @@ INLINE_OVERLOADABLE uint clz(uint x) {
   return __gen_ocl_fbh(x);
 }
 
-#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (clz(a.s0), clz(a.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (clz(a.s0), clz(a.s1), clz(a.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
+#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (type##2)(clz(a.s0), clz(a.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (type##3)(clz(a.s0), clz(a.s1), clz(a.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (type##4)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (type##8)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (type##16)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
 #define DEC(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint) 
 DEC(2)
 DEC(3)
@@ -4364,11 +4364,11 @@ INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
 INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
 INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
 INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
+#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (type##2)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (type##3)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (type##4)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (type##8)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (type##16)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
 #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
 DEF(2)
 DEF(3)
@@ -4390,11 +4390,11 @@ DEF(ushort)
 DEF(int)
 DEF(uint)
 #undef DEF
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.se, c.se),  [...]
+#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (type##2)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (type##3)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (type##4)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (type##8)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (type##16)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.s [...]
 #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
 DEF(2)
 DEF(3)
@@ -4422,11 +4422,11 @@ DEF(ushort, 15)
 DEF(int, 31)
 DEF(uint, 31)
 #undef DEF
-#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
+#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (type##2)(rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (type##3)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (type##4)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (type##8)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (type##16)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
 #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
 DEF(2)
 DEF(3)
@@ -4440,6 +4440,47 @@ DEF(16)
 #undef DEC8
 #undef DEC16
 
+OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
+OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+#define DEC2(type, type2) INLINE_OVERLOADABLE type2##2 upsample(type##2 a, type##2 b) { return (type2##2)(upsample(a.s0, b.s0), upsample(a.s1, b.s1)); }
+#define DEC3(type, type2) INLINE_OVERLOADABLE type2##3 upsample(type##3 a, type##3 b) { return (type2##3)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2)); }
+#define DEC4(type, type2) INLINE_OVERLOADABLE type2##4 upsample(type##4 a, type##4 b) { return (type2##4)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3)); }
+#define DEC8(type, type2) INLINE_OVERLOADABLE type2##8 upsample(type##8 a, type##8 b) { return (type2##8)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7)); }
+#define DEC16(type, type2) INLINE_OVERLOADABLE type2##16 upsample(type##16 a, type##16 b) { return (type2##16)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7), upsample(a.s8, b.s8), upsample(a.s9, b.s9), upsample(a.sa, b.sa), upsample(a.sb, b.sb), upsample(a.sc, b.sc), upsample(a.sd, b.sd), upsample(a.se, b.se), upsample(a.sf, b.sf)); }
+#define DEF(n) DEC##n(uchar, ushort); DEC##n(ushort, uint)
+DEF(2)
+DEF(3)
+DEF(4)
+DEF(8)
+DEF(16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+#define DEC2(type, type2) INLINE_OVERLOADABLE type2##2 upsample(type##2 a, u##type##2 b) { return (type2##2)(upsample(a.s0, b.s0), upsample(a.s1, b.s1)); }
+#define DEC3(type, type2) INLINE_OVERLOADABLE type2##3 upsample(type##3 a, u##type##3 b) { return (type2##3)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2)); }
+#define DEC4(type, type2) INLINE_OVERLOADABLE type2##4 upsample(type##4 a, u##type##4 b) { return (type2##4)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3)); }
+#define DEC8(type, type2) INLINE_OVERLOADABLE type2##8 upsample(type##8 a, u##type##8 b) { return (type2##8)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7)); }
+#define DEC16(type, type2) INLINE_OVERLOADABLE type2##16 upsample(type##16 a, u##type##16 b) { return (type2##16)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7), upsample(a.s8, b.s8), upsample(a.s9, b.s9), upsample(a.sa, b.sa), upsample(a.sb, b.sb), upsample(a.sc, b.sc), upsample(a.sd, b.sd), upsample(a.se, b.se), upsample(a.sf, b.sf)); }
+#define DEF(n) DEC##n(char, short); DEC##n(short, int)
+DEF(2)
+DEF(3)
+DEF(4)
+DEF(8)
+DEF(16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
 PURE CONST uint __gen_ocl_hadd(uint x, uint y);
 PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
 #define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
@@ -4454,11 +4495,11 @@ INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 &
 INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
 INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
 INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
-#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (func(a.s0, b.s0), func(a.s1, b.s1)); }
-#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
-#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
-#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
-#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
+#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (type##2)(func(a.s0, b.s0), func(a.s1, b.s1)); }
+#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (type##3)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
+#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (type##4)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
+#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (type##8)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
+#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (type##16)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
 #define DEF(func, n) DEC##n(func, char); DEC##n(func, uchar); DEC##n(func, short); DEC##n(func, ushort); DEC##n(func, int); DEC##n(func, uint)
 DEF(hadd, 2)
 DEF(hadd, 3)
@@ -4481,6 +4522,7 @@ int __gen_ocl_abs(int x);
 #define ABS_I(I, CVT)  (CVT)__gen_ocl_abs(x.s##I)
 #define ABS_VEC1(CVT)  (CVT)__gen_ocl_abs(x)
 #define ABS_VEC2(CVT)  ABS_I(0, CVT), ABS_I(1, CVT)
+#define ABS_VEC3(CVT)  ABS_I(0, CVT), ABS_I(1, CVT), ABS_I(2, CVT)
 #define ABS_VEC4(CVT)  ABS_VEC2(CVT), ABS_I(2, CVT), ABS_I(3, CVT)
 #define ABS_VEC8(CVT)  ABS_VEC4(CVT), ABS_I(4, CVT), ABS_I(5, CVT),\
 	               ABS_I(6, CVT), ABS_I(7, CVT)
@@ -4490,7 +4532,7 @@ int __gen_ocl_abs(int x);
 
 #define DEC_1(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return ABS_VEC1(u##TYPE); }
 #define DEC_N(TYPE, N) INLINE_OVERLOADABLE u##TYPE##N abs(TYPE##N x) { return (u##TYPE##N)(ABS_VEC##N(u##TYPE)); };
-#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
+#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 3) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
 
 DEC(int)
 DEC(short)
@@ -4509,10 +4551,95 @@ DEC(uchar)
 #undef ABS_I
 #undef ABS_VEC1
 #undef ABS_VEC2
+#undef ABS_VEC3
 #undef ABS_VEC4
 #undef ABS_VEC8
 #undef ABS_VEC16
 
+
+/* Char and short type abs diff */
+/* promote char and short to int and will be no module overflow */
+#define ABS_DIFF(CVT) (CVT)(abs((int)x - (int)y))
+#define ABS_DIFF_I(CVT, I)  (CVT)(abs((int)x.s##I - (int)y.s##I))
+
+#define ABS_DIFF_VEC1(CVT)  ABS_DIFF(CVT)
+#define ABS_DIFF_VEC2(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1)
+#define ABS_DIFF_VEC3(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1), ABS_DIFF_I(CVT, 2)
+#define ABS_DIFF_VEC4(CVT)  ABS_DIFF_VEC2(CVT), ABS_DIFF_I(CVT, 2), ABS_DIFF_I(CVT, 3)
+#define ABS_DIFF_VEC8(CVT)  ABS_DIFF_VEC4(CVT), ABS_DIFF_I(CVT, 4), ABS_DIFF_I(CVT, 5), \
+                            ABS_DIFF_I(CVT, 6), ABS_DIFF_I(CVT, 7)
+#define ABS_DIFF_VEC16(CVT)  ABS_DIFF_VEC8(CVT), ABS_DIFF_I(CVT, 8), ABS_DIFF_I(CVT, 9), \
+                            ABS_DIFF_I(CVT, A), ABS_DIFF_I(CVT, B), \
+                            ABS_DIFF_I(CVT, C), ABS_DIFF_I(CVT, D), \
+                            ABS_DIFF_I(CVT, E), ABS_DIFF_I(CVT, F)
+
+#define DEC_1(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
+                           { return ABS_DIFF_VEC1(UTYPE); }
+#define DEC_N(TYPE, UTYPE, N) INLINE_OVERLOADABLE UTYPE##N abs_diff(TYPE##N x, TYPE##N y) \
+                              { return (UTYPE##N)(ABS_DIFF_VEC##N(UTYPE)); };
+#define DEC(TYPE, UTYPE)  DEC_1(TYPE, UTYPE) DEC_N(TYPE, UTYPE, 2)  DEC_N(TYPE, UTYPE, 3 ) \
+                          DEC_N(TYPE, UTYPE, 4) DEC_N(TYPE, UTYPE, 8) DEC_N(TYPE, UTYPE, 16)
+DEC(char, uchar)
+DEC(uchar, uchar)
+DEC(short, ushort)
+DEC(ushort, ushort)
+
+#undef DEC
+#undef DEC_1
+#undef DEC_N
+#undef ABS_DIFF
+#undef ABS_DIFF_I
+#undef ABS_DIFF_VEC1
+#undef ABS_DIFF_VEC2
+#undef ABS_DIFF_VEC3
+#undef ABS_DIFF_VEC4
+#undef ABS_DIFF_VEC8
+#undef ABS_DIFF_VEC16
+
+INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
+    /* same signed will never overflow. */
+    return y > x ? (y -x) : (x - y);
+}
+
+INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
+    /* same signed will never module overflow. */
+    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
+        return abs(x - y);
+
+    return (abs(x) + abs(y));
+}
+
+#define ABS_DIFF_I(I)  abs_diff(x.s##I, y.s##I)
+
+#define ABS_DIFF_VEC2  ABS_DIFF_I(0), ABS_DIFF_I(1)
+#define ABS_DIFF_VEC3  ABS_DIFF_I(0), ABS_DIFF_I(1), ABS_DIFF_I(2)
+#define ABS_DIFF_VEC4  ABS_DIFF_VEC2, ABS_DIFF_I(2), ABS_DIFF_I(3)
+#define ABS_DIFF_VEC8  ABS_DIFF_VEC4, ABS_DIFF_I(4), ABS_DIFF_I(5), \
+                       ABS_DIFF_I(6), ABS_DIFF_I(7)
+#define ABS_DIFF_VEC16  ABS_DIFF_VEC8, ABS_DIFF_I(8), ABS_DIFF_I(9), \
+                            ABS_DIFF_I(A), ABS_DIFF_I(B), \
+                            ABS_DIFF_I(C), ABS_DIFF_I(D), \
+                            ABS_DIFF_I(E), ABS_DIFF_I(F)
+
+#define DEC_N(TYPE, N) INLINE_OVERLOADABLE uint##N abs_diff(TYPE##N x, TYPE##N y) \
+				      { return (uint##N)(ABS_DIFF_VEC##N); };
+#define DEC(TYPE)   DEC_N(TYPE, 2)  DEC_N(TYPE, 3 ) \
+                           DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
+DEC(int)
+DEC(uint)
+
+#undef DEC
+#undef DEC_1
+#undef DEC_N
+#undef ABS_DIFF
+#undef ABS_DIFF_I
+#undef ABS_DIFF_VEC1
+#undef ABS_DIFF_VEC2
+#undef ABS_DIFF_VEC3
+#undef ABS_DIFF_VEC4
+#undef ABS_DIFF_VEC8
+#undef ABS_DIFF_VEC16
+
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
@@ -4827,6 +4954,52 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
 /////////////////////////////////////////////////////////////////////////////
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float step(float edge, float x) {
+  return x < edge ? 0.0 : 1.0;
+}
+#define STEP(I)  x.s##I < edge.s##I ? 0.0 : 1.0
+INLINE_OVERLOADABLE float2 step(float2 edge, float2 x) {
+  return (float2)(STEP(0), STEP(1));
+}
+INLINE_OVERLOADABLE float3 step(float3 edge, float3 x) {
+  return (float3)(STEP(0), STEP(1), STEP(2));
+}
+INLINE_OVERLOADABLE float4 step(float4 edge, float4 x) {
+  return (float4)(STEP(0), STEP(1), STEP(2), STEP(3));
+}
+INLINE_OVERLOADABLE float8 step(float8 edge, float8 x) {
+  return (float8)(STEP(0), STEP(1), STEP(2), STEP(3),
+                  STEP(4), STEP(5), STEP(6), STEP(7));
+}
+INLINE_OVERLOADABLE float16 step(float16 edge, float16 x) {
+  return (float16)(STEP(0), STEP(1), STEP(2), STEP(3),
+                   STEP(4), STEP(5), STEP(6), STEP(7),
+                   STEP(8), STEP(9), STEP(A), STEP(B),
+                   STEP(C), STEP(D), STEP(E), STEP(F));
+}
+#undef STEP
+#define STEP(I)  x.s##I < edge ? 0.0 : 1.0
+INLINE_OVERLOADABLE float2 step(float edge, float2 x) {
+  return (float2)(STEP(0), STEP(1));
+}
+INLINE_OVERLOADABLE float3 step(float edge, float3 x) {
+  return (float3)(STEP(0), STEP(1), STEP(2));
+}
+INLINE_OVERLOADABLE float4 step(float edge, float4 x) {
+  return (float4)(STEP(0), STEP(1), STEP(2), STEP(3));
+}
+INLINE_OVERLOADABLE float8 step(float edge, float8 x) {
+  return (float8)(STEP(0), STEP(1), STEP(2), STEP(3),
+                  STEP(4), STEP(5), STEP(6), STEP(7));
+}
+INLINE_OVERLOADABLE float16 step(float edge, float16 x) {
+  return (float16)(STEP(0), STEP(1), STEP(2), STEP(3),
+                   STEP(4), STEP(5), STEP(6), STEP(7),
+                   STEP(8), STEP(9), STEP(A), STEP(B),
+                   STEP(C), STEP(D), STEP(E), STEP(F));
+}
+#undef STEP
+
 #define DECL_MIN_MAX_CLAMP(TYPE) \
 INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
   return a > b ? a : b; \
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index e31cce1..c0f88de 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -135,4 +135,7 @@ How to contribute
 You are always welcome to contribute to this project, just need to subscribe
 to the beignet mail list and send patches to it for review.
 The official mail list is as below:
-http://lists.freedesktop.org/mailman/listinfo/beignet
+[http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)
+
+The wiki url is as below:
+[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
index 3f1ccb4..f14433d 100644
--- a/docs/Beignet/Backend/TODO.mdwn
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -25,31 +25,19 @@ many things must be implemented:
 
 - Lowering down of various intrinsics like `llvm.memcpy`
 
--  Conformance test for all OpenCL built-ins (`native_cos`, `native_sin`,
-  `mad`, atomic operations, barriers...).
-
-- Lowering down of int16 / int8 / float16 / char16 / char8 / char4 loads and
-  stores into the supported loads and stores
-
-- Support for local declaration of local array (the OpenCL profile will properly
-  declare them as global arrays)
-
-- Support for doubles
-
-- Support atomic extensions.
-
 - Better resolving of the PHI functions. Today, we always generate MOV
   instructions at the end of each basic block . They can be easily optimized.
 
+- From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
+  represent sampler_t/image2d_t/image1d_t/....
+
+- Adding support for long (int64).
+
 Gen IR
 ------
 
 The code is defined in `src/ir`. Main things to do are:
 
-- Bringing support for doubles
-
-- Adding support for atomic extensions.
-
 - Finishing the handling of function arguments (see the [[IR
   description|gen_ir]] for more details)
 
@@ -68,14 +56,14 @@ The code is defined in `src/ir`. Main things to do are:
   This will obviously impact both instruction selection and the register
   allocation.
 
+- Adding support for long (int64).
+
 Backend
 -------
 
 The code is defined in `src/backend`. Main things to do are:
 
-- Implementing support for doubles
-
-- Implementing atomic extensions.
+- Int64 support?
 
 - Implementing register spilling (see the [[compiler backend
   description|compiler_backend]] for more details)
@@ -99,10 +87,8 @@ I also purely and simply copied and pasted the Gen ISA disassembler from Mesa.
 This leads to code duplication. Also some messages used by OpenCL (untyped reads
 and writes) are not properly decoded yet.
 
-There are some quick and dirty hacks also like the use of function call `system`
-(...). This should be cleanly replaced by popen and stuff. I also directly
-called the LLVM compiler executable instead of using Clang library. All of this
-should be improved and cleaned up. Track "XXX" comments in the code.
+All of those code should be improved and cleaned up are tracked with "XXX"
+comments in the code.
 
 Parts of the code leaks memory when exceptions are used. There are some pointers
 to track and replace with std::unique_ptr. Note that we also add a custom memory
diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
index 9e77c2b..549575c 100644
--- a/kernels/compiler_abs.cl
+++ b/kernels/compiler_abs.cl
@@ -15,6 +15,7 @@
 #define COMPILER_ABS(TYPE, UTYPE)  \
     COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs_diff.cl
similarity index 60%
copy from kernels/compiler_abs.cl
copy to kernels/compiler_abs_diff.cl
index 9e77c2b..583ba2b 100644
--- a/kernels/compiler_abs.cl
+++ b/kernels/compiler_abs_diff.cl
@@ -1,20 +1,21 @@
 #define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
-    kernel void compiler_abs_##TYPE ( \
-           global TYPE* src, global UTYPE* dst) { \
+    kernel void compiler_abs_diff_##TYPE ( \
+           global TYPE* x, global TYPE* y, global UTYPE* diff) { \
         int i = get_global_id(0); \
-        dst[i] = abs(src[i]);     \
+        diff[i] = abs_diff(x[i], y[i]);     \
     }
 
 #define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
-    kernel void compiler_abs_##TYPE##N ( \
-           global TYPE##N* src, global UTYPE##N* dst) { \
+    kernel void compiler_abs_diff_##TYPE##N ( \
+           global TYPE##N* x, global TYPE##N* y, global UTYPE##N* diff) { \
         int i = get_global_id(0); \
-        dst[i] = abs(src[i]);     \
+        diff[i] = abs_diff(x[i], y[i]);     \
     }
 
 #define COMPILER_ABS(TYPE, UTYPE)  \
     COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
index 61ce2f4..fbc16fb 100644
--- a/kernels/compiler_atomic_functions.cl
+++ b/kernels/compiler_atomic_functions.cl
@@ -1,14 +1,21 @@
 __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
   int lid = get_local_id(0);
   int i = lid % 12;
-  atomic_xchg(&tmp[4], -1);
+  if(lid == 0) {
+    for(int j=0; j<12; j=j+1) {
+      atomic_xchg(&tmp[j], 0);
+    }
+    atomic_xchg(&tmp[4], -1);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
   switch(i) {
     case 0: atomic_inc(&tmp[i]); break;
     case 1: atomic_dec(&tmp[i]); break;
     case 2: atomic_add(&tmp[i], src[lid]); break;
     case 3: atomic_sub(&tmp[i], src[lid]); break;
-    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
-    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
+    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
     case 6: atomic_xor(&tmp[i], src[lid]); break;
     case 7: atomic_min(&tmp[i], -src[lid]); break;
     case 8: atomic_max(&tmp[i], src[lid]); break;
@@ -23,8 +30,8 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
     case 1: atomic_dec(&dst[i]); break;
     case 2: atomic_add(&dst[i], src[lid]); break;
     case 3: atomic_sub(&dst[i], src[lid]); break;
-    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
-    case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
+    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
     case 6: atomic_xor(&dst[i], src[lid]); break;
     case 7: atomic_min(&dst[i], -src[lid]); break;
     case 8: atomic_max(&dst[i], src[lid]); break;
@@ -38,6 +45,6 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
 
   if(get_global_id(0) == 0) {
     for(i=0; i<12; i=i+1)
-      atomic_add(&dst[i], tmp[i]);
+      atomic_xchg(&dst[i+12], tmp[i]);
   }
 }
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
index af3c7b1..5e2e0b4 100644
--- a/kernels/compiler_global_constant.cl
+++ b/kernels/compiler_global_constant.cl
@@ -1,8 +1,9 @@
 constant int m[3] = {71,72,73};
+constant int n = 1;
 
 __kernel void
 compiler_global_constant(__global int *dst, int e, int r)
 {
   int id = (int)get_global_id(0);
-  dst[id] = m[id%3] + e + r;
+  dst[id] = m[id%3] * n + e + r;
 }
diff --git a/kernels/compiler_step.cl b/kernels/compiler_step.cl
new file mode 100644
index 0000000..ef77f05
--- /dev/null
+++ b/kernels/compiler_step.cl
@@ -0,0 +1,38 @@
+#define COMPILER_STEP_FUNC_N(TYPE, N) \
+    kernel void compiler_step_##TYPE##N ( \
+           global TYPE##N* edge, global TYPE##N* x, global TYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = step(edge[i], x[i]);     \
+    }
+
+kernel void compiler_step_float (global float* edge,
+                                 global float* x, global float* dst)
+{
+    int i = get_global_id(0);
+    dst[i] = step(edge[i], x[i]);
+}
+
+COMPILER_STEP_FUNC_N(float, 2)
+COMPILER_STEP_FUNC_N(float, 3)
+COMPILER_STEP_FUNC_N(float, 4)
+COMPILER_STEP_FUNC_N(float, 8)
+COMPILER_STEP_FUNC_N(float, 16)
+
+#define COMPILER_STEPF_FUNC_N(TYPE, N) \
+    kernel void compiler_stepf_##TYPE##N ( \
+           float edge, global TYPE##N* x, global TYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = step(edge, x[i]);     \
+    }
+
+kernel void compiler_stepf_float (float edge, global float* x, global float* dst)
+{
+    int i = get_global_id(0);
+    dst[i] = step(edge, x[i]);
+}
+
+COMPILER_STEPF_FUNC_N(float, 2)
+COMPILER_STEPF_FUNC_N(float, 3)
+COMPILER_STEPF_FUNC_N(float, 4)
+COMPILER_STEPF_FUNC_N(float, 8)
+COMPILER_STEPF_FUNC_N(float, 16)
diff --git a/kernels/compiler_upsample_int.cl b/kernels/compiler_upsample_int.cl
new file mode 100644
index 0000000..d7945b5
--- /dev/null
+++ b/kernels/compiler_upsample_int.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_int(global short *src1, global ushort *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4afc207..5465aa9 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -400,6 +400,11 @@ cl_mem_delete(cl_mem mem)
     return;
   if (LIKELY(mem->bo != NULL))
     cl_buffer_unreference(mem->bo);
+#ifdef HAS_EGL
+  if (UNLIKELY(mem->egl_image != NULL)) {
+     cl_mem_gl_delete(mem);
+  }
+#endif
 
   /* Remove it from the list */
   assert(mem->ctx);
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 33ad174..c204992 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -55,6 +55,7 @@ struct _cl_mem {
   uint64_t magic;           /* To identify it as a memory object */
   volatile int ref_n;       /* This object is reference counted */
   cl_buffer bo;             /* Data in GPU memory */
+  void *egl_image;          /* created from external egl image*/
   size_t size;              /* original request size, not alignment size, used in constant buffer */
   cl_mem prev, next;        /* We chain the memory buffers together */
   cl_context ctx;           /* Context it belongs to */
@@ -87,6 +88,9 @@ cl_mem_new_image(cl_context context,
 /* Unref the object and delete it if no more reference */
 extern void cl_mem_delete(cl_mem);
 
+/* Destroy egl image. */
+extern void cl_mem_gl_delete(cl_mem);
+
 /* Add one more reference to this object */
 extern void cl_mem_add_ref(cl_mem);
 
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index c2a5395..04641a5 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -183,6 +183,7 @@ LOCAL cl_mem cl_mem_new_gl_texture(cl_context ctx,
     err = CL_INVALID_GL_OBJECT;
     goto error;
   }
+  mem->egl_image = egl_image;
   mem->bo = cl_buffer_alloc_from_eglimage(ctx, (void*)egl_image, &gl_format, &w, &h, &pitch, &tiling);
   if (UNLIKELY(mem->bo == NULL)) {
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
@@ -232,3 +233,11 @@ error:
   goto exit;
 
 }
+
+LOCAL void cl_mem_gl_delete(cl_mem mem)
+{
+  struct cl_gl_ext_deps *egl_funcs;
+  EGLDisplay egl_display = (EGLDisplay)mem->ctx->props.egl_display;
+  egl_funcs =  CL_EXTENSION_GET_FUNCS(mem->ctx, khr_gl_sharing, gl_ext_deps);
+  egl_funcs->eglDestroyImageKHR_func(egl_display, mem->egl_image);
+}
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 3a74028..5701a50 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -60,7 +60,7 @@ extern cl_int cl_get_platform_info(cl_platform_id    platform,
                                    size_t *          param_value_size_ret);
 
 #define OCL_VERSION_MAJOR 1
-#define OCL_VERSION_MINOR 0
+#define OCL_VERSION_MINOR 1
 
 #define _STR(x) #x
 #define _JOINT(x, y) _STR(x) "." _STR(y)
diff --git a/src/cl_utils.h b/src/cl_utils.h
index dfb1369..59b7a2b 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -39,14 +39,22 @@ struct JOIN(__,JOIN(__,__LINE__)) {                                 \
 }
 
 /* Throw errors */
-#define ERR(ERROR, ...)                                             \
-do {                                                                \
-  fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
-  fprintf(stderr, __VA_ARGS__);                                     \
-  fprintf(stderr, "\n");                                            \
-  err = ERROR;                                                      \
-  goto error;                                                       \
-} while (0)
+#ifdef NDEBUG
+  #define ERR(ERROR, ...)                                             \
+  do {                                                                \
+    err = ERROR;                                                      \
+    goto error;                                                       \
+  } while (0)
+#else
+  #define ERR(ERROR, ...)                                             \
+  do {                                                                \
+    fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
+    fprintf(stderr, __VA_ARGS__);                                     \
+    fprintf(stderr, "\n");                                            \
+    err = ERROR;                                                      \
+    goto error;                                                       \
+  } while (0)
+#endif
 
 #define DO_ALLOC_ERR                                                \
 do {                                                                \
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ebc4961..6c6b9fb 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -208,6 +208,7 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
 
 #if defined(HAS_GBM) && defined(HAS_EGL)
   if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
+    assert(props->egl_display);
     intel->gbm = gbm_create_device(intel->fd);
     if (intel->gbm == NULL) {
       printf("GBM device create failed.\n");
diff --git a/src/x11/gbm_dri2_x11_platform.c b/src/x11/gbm_dri2_x11_platform.c
index 56b2467..481f407 100644
--- a/src/x11/gbm_dri2_x11_platform.c
+++ b/src/x11/gbm_dri2_x11_platform.c
@@ -1,11 +1,93 @@
+#include <string.h>
 #include "GL/gl.h" /* dri_interface need gl types definitions. */
 #include "GL/internal/dri_interface.h"
 #include "gbm_deps/gbm_driint.h"
 #include "gbm_deps/gbmint.h"
 #include "dricommon.h"
 
-/* image_lookup_extension is from egl_dri2.c. */
-extern const __DRIimageLookupExtension image_lookup_extension;
+typedef struct EGLDisplay _EGLDisplay;
+typedef struct EGLDriver  _EGLDriver;
+/* XXX should check whether we support pthread.*/
+typedef pthread_mutex_t _EGLMutex;
+
+enum _egl_platform_type {
+   _EGL_PLATFORM_WINDOWS,
+   _EGL_PLATFORM_X11,
+   _EGL_PLATFORM_WAYLAND,
+   _EGL_PLATFORM_DRM,
+   _EGL_PLATFORM_FBDEV,
+   _EGL_PLATFORM_NULL,
+   _EGL_PLATFORM_ANDROID,
+
+   _EGL_NUM_PLATFORMS,
+   _EGL_INVALID_PLATFORM = -1
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+typedef unsigned int EGLBoolean;
+typedef int32_t EGLint;
+
+struct _hack_egl_display
+{
+   /* used to link displays */
+   _EGLDisplay *Next;
+
+   _EGLMutex Mutex;
+
+   _EGLPlatformType Platform; /**< The type of the platform display */
+   void *PlatformDisplay;     /**< A pointer to the platform display */
+
+   _EGLDriver *Driver;        /**< Matched driver of the display */
+
+   EGLBoolean Initialized;    /**< True if the display is initialized */
+
+   /* options that affect how the driver initializes the display */
+   struct {
+      EGLBoolean TestOnly;    /**< Driver should not set fields when true */
+      EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+   } Options;
+
+   /* these fields are set by the driver during init */
+   void *DriverData;          /**< Driver private data */
+   EGLint VersionMajor;       /**< EGL major version */
+   EGLint VersionMinor;       /**< EGL minor version */
+   EGLint ClientAPIs;         /**< Bitmask of APIs supported (EGL_xxx_BIT) */
+};
+
+struct _hack_dri2_egl_display
+{
+   int                       dri2_major;
+   int                       dri2_minor;
+   __DRIscreen              *dri_screen;
+   int                       own_dri_screen;
+   const __DRIconfig       **driver_configs;
+   void                     *driver;
+   __DRIcoreExtension       *core;
+   __DRIdri2Extension       *dri2;
+   __DRIswrastExtension     *swrast;
+   __DRI2flushExtension     *flush;
+   __DRItexBufferExtension  *tex_buffer;
+   __DRIimageExtension      *image;
+   __DRIrobustnessExtension *robustness;
+   __DRI2configQueryExtension *config;
+   int                       fd;
+
+   int                       own_device;
+   int                       swap_available;
+   int                       invalidate_available;
+   int                       min_swap_interval;
+   int                       max_swap_interval;
+   int                       default_swap_interval;
+   struct gbm_dri_device    *gbm_dri;
+
+   char                     *device_name;
+   char                     *driver_name;
+
+   __DRIdri2LoaderExtension    dri2_loader_extension;
+   __DRIswrastLoaderExtension  swrast_loader_extension;
+   const __DRIextension     *extensions[4];
+};
+
+static __DRIimageLookupExtension *image_lookup_extension;
 
 /* We are use DRI2 x11 platform, and by default, gbm doesn't register
  * a valid image extension, and actually, it doesn't know how to register
@@ -13,8 +95,21 @@ extern const __DRIimageLookupExtension image_lookup_extension;
 void cl_gbm_set_image_extension(struct gbm_device *gbm, void *display)
 {
   struct gbm_dri_device *gbm_dri = gbm_dri_device(gbm);
-  if (gbm_dri->lookup_image == NULL) {
-    gbm_dri->lookup_image = image_lookup_extension.lookupEGLImage;
+  struct _hack_egl_display *egl_dpy = (struct _hack_egl_display*)display;
+  struct _hack_dri2_egl_display *dri2_dpy = (struct _hack_dri2_egl_display*)egl_dpy->DriverData;
+  int i;
+
+  if (gbm_dri->lookup_image == NULL
+      && egl_dpy->Platform == _EGL_PLATFORM_X11) {
+    for(i = 0; i < 4; i++)
+     if (dri2_dpy->extensions[i]
+         && ((strncmp(dri2_dpy->extensions[i]->name,
+                      __DRI_IMAGE_LOOKUP,
+                      sizeof(__DRI_IMAGE_LOOKUP))) == 0))
+       break;
+    if (i >= 4) return;
+    image_lookup_extension = (__DRIimageLookupExtension*)dri2_dpy->extensions[i];
+    gbm_dri->lookup_image = image_lookup_extension->lookupEGLImage;
     gbm_dri->lookup_user_data = display;
   }
 }
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 3fe0065..fafacb5 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -30,8 +30,10 @@ set (utests_sources
   compiler_copy_image.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
+  compiler_step.cpp
   compiler_fabs.cpp
   compiler_abs.cpp
+  compiler_abs_diff.cpp
   compiler_fill_image.cpp
   compiler_fill_image0.cpp
   compiler_fill_image_3d.cpp
@@ -66,6 +68,7 @@ set (utests_sources
   compiler_uint8_copy.cpp
   compiler_uint16_copy.cpp
   compiler_uint3_unaligned_copy.cpp
+  compiler_upsample_int.cpp
   compiler_unstructured_branch0.cpp
   compiler_unstructured_branch1.cpp
   compiler_unstructured_branch2.cpp
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
index 59d8365..9457b9b 100644
--- a/utests/compiler_abs.cpp
+++ b/utests/compiler_abs.cpp
@@ -3,23 +3,26 @@
 
 template <typename T, int N>
 struct cl_vec {
-    T ptr[N];
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
 
     typedef cl_vec<T, N> vec_type;
 
     cl_vec(void) {
-        memset(ptr, 0, sizeof(T) * N);
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
     }
     cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
         memcpy (this->ptr, other.ptr, sizeof(T) * N);
     }
 
     vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
         memcpy (this->ptr, other.ptr, sizeof(T) * N);
         return *this;
     }
 
     template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
         memcpy (this->ptr, other.ptr, sizeof(T) * N);
         return *this;
     }
@@ -56,6 +59,8 @@ template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
 template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
 {
     int i = 0;
+
+    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
     for (; i < N; i++) {
         vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
     }
@@ -66,25 +71,34 @@ template <typename T> static void gen_rand_val (T & val)
     val = static_cast<T>((rand() & 63) - 32);
 }
 
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
 template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
-	cl_vec<U, N>* dst, int n)
+        cl_vec<U, N>* dst, int n)
 {
     U* val = reinterpret_cast<U *>(dst);
 
-    n = n*N;
+    n = n*((N+1)/2)*2;
 
     printf("\nRaw: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", ((T *)buf_data[0])[i]);
+        print_data(((T *)buf_data[0])[i]);
     }
 
     printf("\nCPU: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", val[i]);
+        print_data(val[i]);
     }
     printf("\nGPU: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", ((U *)buf_data[1])[i]);
+        print_data(((U *)buf_data[1])[i]);
     }
 }
 
@@ -92,16 +106,16 @@ template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
 {
     printf("\nRaw: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", ((T *)buf_data[0])[i]);
+        print_data(((T *)buf_data[0])[i]);
     }
 
     printf("\nCPU: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", dst[i]);
+        print_data(dst[i]);
     }
     printf("\nGPU: \n");
     for (int32_t i = 0; i < (int32_t) n; ++i) {
-        printf(" %d", ((U *)buf_data[1])[i]);
+        print_data(((U *)buf_data[1])[i]);
     }
 }
 
@@ -122,6 +136,12 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
     // Run random tests
     for (uint32_t pass = 0; pass < 8; ++pass) {
         OCL_MAP_BUFFER(0);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(1);
+        memset(buf_data[1], 0, sizeof(U) * n);
+        OCL_UNMAP_BUFFER(1);
+
         for (int32_t i = 0; i < (int32_t) n; ++i) {
             gen_rand_val(cpu_src[i]);
         }
@@ -166,54 +186,66 @@ ABS_TEST_TYPE(uchar, uchar)
 
 
 typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
 typedef cl_vec<int, 4> int4;
 typedef cl_vec<int, 8> int8;
 typedef cl_vec<int, 16> int16;
 typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
 typedef cl_vec<unsigned int, 4> uint4;
 typedef cl_vec<unsigned int, 8> uint8;
 typedef cl_vec<unsigned int, 16> uint16;
 ABS_TEST_TYPE(int2, uint2)
+ABS_TEST_TYPE(int3, uint3)
 ABS_TEST_TYPE(int4, uint4)
 ABS_TEST_TYPE(int8, uint8)
 ABS_TEST_TYPE(int16, uint16)
 ABS_TEST_TYPE(uint2, uint2)
+ABS_TEST_TYPE(uint3, uint3)
 ABS_TEST_TYPE(uint4, uint4)
 ABS_TEST_TYPE(uint8, uint8)
 ABS_TEST_TYPE(uint16, uint16)
 
 
 typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
 typedef cl_vec<char, 4> char4;
 typedef cl_vec<char, 8> char8;
 typedef cl_vec<char, 16> char16;
 typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
 typedef cl_vec<unsigned char, 4> uchar4;
 typedef cl_vec<unsigned char, 8> uchar8;
 typedef cl_vec<unsigned char, 16> uchar16;
 ABS_TEST_TYPE(char2, uchar2)
+ABS_TEST_TYPE(char3, uchar3)
 ABS_TEST_TYPE(char4, uchar4)
 ABS_TEST_TYPE(char8, uchar8)
 ABS_TEST_TYPE(char16, uchar16)
 ABS_TEST_TYPE(uchar2, uchar2)
+ABS_TEST_TYPE(uchar3, uchar3)
 ABS_TEST_TYPE(uchar4, uchar4)
 ABS_TEST_TYPE(uchar8, uchar8)
 ABS_TEST_TYPE(uchar16, uchar16)
 
 
 typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
 typedef cl_vec<short, 4> short4;
 typedef cl_vec<short, 8> short8;
 typedef cl_vec<short, 16> short16;
 typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
 typedef cl_vec<unsigned short, 4> ushort4;
 typedef cl_vec<unsigned short, 8> ushort8;
 typedef cl_vec<unsigned short, 16> ushort16;
 ABS_TEST_TYPE(short2, ushort2)
+ABS_TEST_TYPE(short3, ushort3)
 ABS_TEST_TYPE(short4, ushort4)
 ABS_TEST_TYPE(short8, ushort8)
 ABS_TEST_TYPE(short16, ushort16)
 ABS_TEST_TYPE(ushort2, ushort2)
+ABS_TEST_TYPE(ushort3, ushort3)
 ABS_TEST_TYPE(ushort4, ushort4)
 ABS_TEST_TYPE(ushort8, ushort8)
 ABS_TEST_TYPE(ushort16, ushort16)
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
new file mode 100644
index 0000000..384a654
--- /dev/null
+++ b/utests/compiler_abs_diff.cpp
@@ -0,0 +1,267 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    }
+    cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void abs_diff(vec_type & other) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T b = other.ptr[i];
+            T f = a > b ? (a - b) : (b - a);
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *x, cl_vec<T, N> *y, cl_vec<U, N> *diff)
+{
+    cl_vec<T, N> v  = x[global_id];
+    v.abs_diff(y[global_id]);
+    diff[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *x, T *y, U *diff)
+{
+    T a = x[global_id];
+    T b = y[global_id];
+    U f = a > b ? (a - b) : (b - a);
+    diff[global_id] = f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* x,
+        cl_vec<T, N>* y, cl_vec<U, N>* diff, int n)
+{
+    U* val = reinterpret_cast<U *>(diff);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nRaw x: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nRaw y: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, int n)
+{
+    printf("\nRaw x: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nRaw y: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(diff[i]);
+    }
+    printf("\nGPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void compiler_abs_diff_with_type(void)
+{
+    const size_t n = 16;
+    U cpu_diff[16];
+    T cpu_x[16];
+    T cpu_y[16];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(U), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        OCL_MAP_BUFFER(1);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(2);
+        memset(buf_data[2], 0, sizeof(U) * n);
+        OCL_UNMAP_BUFFER(2);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_x[i]);
+            gen_rand_val(cpu_y[i]);
+        }
+
+        memcpy(buf_data[0], cpu_x, sizeof(T) * n);
+        memcpy(buf_data[1], cpu_y, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, cpu_x, cpu_y, cpu_diff);
+
+        // Compare
+        OCL_MAP_BUFFER(2);
+
+//      dump_data(cpu_x, cpu_y, cpu_diff, n);
+
+        OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
+
+        OCL_UNMAP_BUFFER(0);
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(2);
+    }
+}
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) \
+	static void compiler_abs_diff_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#TYPE, SOURCE, NULL);  \
+           compiler_abs_diff_with_type<TYPE, UTYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##TYPE);
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+ABS_TEST_DIFF_TYPE(int, uint)
+ABS_TEST_DIFF_TYPE(short, ushort)
+ABS_TEST_DIFF_TYPE(char, uchar)
+ABS_TEST_DIFF_TYPE(uint, uint)
+ABS_TEST_DIFF_TYPE(ushort, ushort)
+ABS_TEST_DIFF_TYPE(uchar, uchar)
+
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_DIFF_TYPE(int2, uint2)
+ABS_TEST_DIFF_TYPE(int3, uint3)
+ABS_TEST_DIFF_TYPE(int4, uint4)
+ABS_TEST_DIFF_TYPE(int8, uint8)
+ABS_TEST_DIFF_TYPE(int16, uint16)
+ABS_TEST_DIFF_TYPE(uint2, uint2)
+ABS_TEST_DIFF_TYPE(uint3, uint3)
+ABS_TEST_DIFF_TYPE(uint4, uint4)
+ABS_TEST_DIFF_TYPE(uint8, uint8)
+ABS_TEST_DIFF_TYPE(uint16, uint16)
+
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_DIFF_TYPE(char2, uchar2)
+ABS_TEST_DIFF_TYPE(char3, uchar3)
+ABS_TEST_DIFF_TYPE(char4, uchar4)
+ABS_TEST_DIFF_TYPE(char8, uchar8)
+ABS_TEST_DIFF_TYPE(char16, uchar16)
+ABS_TEST_DIFF_TYPE(uchar2, uchar2)
+ABS_TEST_DIFF_TYPE(uchar3, uchar3)
+ABS_TEST_DIFF_TYPE(uchar4, uchar4)
+ABS_TEST_DIFF_TYPE(uchar8, uchar8)
+ABS_TEST_DIFF_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_DIFF_TYPE(short2, ushort2)
+ABS_TEST_DIFF_TYPE(short3, ushort3)
+ABS_TEST_DIFF_TYPE(short4, ushort4)
+ABS_TEST_DIFF_TYPE(short8, ushort8)
+ABS_TEST_DIFF_TYPE(short16, ushort16)
+ABS_TEST_DIFF_TYPE(ushort2, ushort2)
+ABS_TEST_DIFF_TYPE(ushort3, ushort3)
+ABS_TEST_DIFF_TYPE(ushort4, ushort4)
+ABS_TEST_DIFF_TYPE(ushort8, ushort8)
+ABS_TEST_DIFF_TYPE(ushort16, ushort16)
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
index 571e0c6..65f1c5a 100644
--- a/utests/compiler_atomic_functions.cpp
+++ b/utests/compiler_atomic_functions.cpp
@@ -4,12 +4,12 @@
 #include <string.h>
 
 #define GROUP_NUM 16
-#define LOCAL_SIZE 64
+#define LOCAL_SIZE 256
 static void cpu_compiler_atomic(int *dst, int *src)
 {
   dst[4] = 0xffffffff;
   int tmp[16] = { 0 };
-
+  tmp[4] = -1;
   for(int j=0; j<LOCAL_SIZE; j++) {
     int i = j % 12;
 
@@ -18,8 +18,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
       case 1: tmp[i] -= 1; break;
       case 2: tmp[i] += src[j]; break;
       case 3: tmp[i] -= src[j]; break;
-      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
-      case 5: tmp[i] |= src[j]<<(j>>2); break;
+      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+      case 5: tmp[i] |= src[j]<<(j>>4); break;
       case 6: tmp[i] ^= src[j]; break;
       case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
       case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
@@ -39,8 +39,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
         case 1: dst[i] -= 1; break;
         case 2: dst[i] += src[j]; break;
         case 3: dst[i] -= src[j]; break;
-        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
-        case 5: dst[i] |= src[j]<<(j>>2); break;
+        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+        case 5: dst[i] |= src[j]<<(j>>4); break;
         case 6: dst[i] ^= src[j]; break;
         case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
         case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
@@ -53,27 +53,28 @@ static void cpu_compiler_atomic(int *dst, int *src)
   }
 
   for(int i=0; i<12; i++)
-    dst[i] += tmp[i];
+    dst[i+12] = tmp[i];
 }
 
 static void compiler_atomic_functions(void)
 {
   const size_t n = GROUP_NUM * LOCAL_SIZE;
-  int cpu_dst[16] = {0}, cpu_src[256];
+  int cpu_dst[24] = {0}, cpu_src[256];
 
   globals[0] = n;
   locals[0] = LOCAL_SIZE;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_atomic_functions");
-  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, 16 * sizeof(int), NULL);
   OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
 
   OCL_MAP_BUFFER(0);
-  memset(buf_data[0], 0, 16 * sizeof(int));
+  memset(buf_data[0], 0, 24 * sizeof(int));
+  ((int *)buf_data[0])[4] = -1;
   OCL_UNMAP_BUFFER(0);
 
   OCL_MAP_BUFFER(1);
@@ -86,7 +87,7 @@ static void compiler_atomic_functions(void)
   OCL_MAP_BUFFER(0);
 
   // Check results
-  for(int i=0; i<12; i++) {
+  for(int i=0; i<24; i++) {
     //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
     OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
   }
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
index b070b8f..437fcf4 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -33,6 +33,10 @@ static void compiler_fill_gl_image(void)
   uint32_t *resultColor;
   GLuint tex;
 
+  if (eglContext == EGL_NO_CONTEXT) {
+    fprintf(stderr, "There is no valid egl context. Ignore this case.\n");
+    return;
+  }
   // Setup kernel and images
   glGenTextures(1, &tex);
   glBindTexture(GL_TEXTURE_2D, tex);
diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
index a6496a7..ea84e72 100644
--- a/utests/compiler_global_memory_barrier.cpp
+++ b/utests/compiler_global_memory_barrier.cpp
@@ -13,7 +13,7 @@ static void compiler_global_memory_barrier(void)
 
   // Run the kernel
   globals[0] = n/2;
-  locals[0] = 32;
+  locals[0] = 256;
   OCL_NDRANGE(1);
   OCL_MAP_BUFFER(0);
 
diff --git a/utests/compiler_local_memory_barrier_2.cpp b/utests/compiler_local_memory_barrier_2.cpp
index b074123..4fa090b 100644
--- a/utests/compiler_local_memory_barrier_2.cpp
+++ b/utests/compiler_local_memory_barrier_2.cpp
@@ -5,7 +5,7 @@ static void compiler_local_memory_barrier_2(void)
   const size_t n = 16*1024;
 
   globals[0] = n/2;
-  locals[0] = 32;
+  locals[0] = 256;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_local_memory_barrier_2");
diff --git a/utests/compiler_step.cpp b/utests/compiler_step.cpp
new file mode 100644
index 0000000..3285dda
--- /dev/null
+++ b/utests/compiler_step.cpp
@@ -0,0 +1,338 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    }
+    cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void step (vec_type & other) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T edge = other.ptr[i];
+            T f = a < edge ? 0.0 : 1.0;
+            ptr[i] = f;
+        }
+    }
+
+    void step (float & edge) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T f = a < edge ? 0.0 : 1.0;
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.step(edge[global_id]);
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *edge, T *src, U *dst)
+{
+    T f = src[global_id];
+    T e = edge[global_id];
+    f = f < e ? 0.0 : 1.0;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        float edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.step(edge);
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, float edge, T *src, U *dst)
+{
+    T f = src[global_id];
+    f = f < edge ? 0.0 : 1.0;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+
+    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>(.1f * (rand() & 15) - .75f);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>(.1f * (rand() & 15) - .75f);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
+inline static void print_data (float& val)
+{
+    printf(" %f", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* edge,
+        cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nEdge: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* edge, T* src, U* dst, int n)
+{
+    printf("\nedge: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U, int N> static void dump_data (float edge,
+        cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nEdge: %f\n", edge);
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (float edge, T* src, U* dst, int n)
+{
+    printf("\nedge: %f\n", edge);
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T> static void compiler_step_with_type(void)
+{
+    const size_t n = 16;
+    T cpu_dst[n], cpu_src[n];
+    T edge[n];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+    globals[0] = n;
+    locals[0] = n;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        OCL_MAP_BUFFER(1);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(2);
+        memset(buf_data[2], 0, sizeof(T) * n);
+        OCL_UNMAP_BUFFER(2);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+            gen_rand_val(edge[i]);
+        }
+
+        memcpy(buf_data[1], cpu_src, sizeof(T) * n);
+        memcpy(buf_data[0], edge, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, edge, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(2);
+
+        //dump_data(edge, cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[2], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(2);
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define STEP_TEST_TYPE(TYPE) \
+	static void compiler_step_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_step_"#TYPE, SOURCE, NULL);  \
+           compiler_step_with_type<TYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION(compiler_step_##TYPE);
+
+typedef cl_vec<float, 2> float2;
+typedef cl_vec<float, 3> float3;
+typedef cl_vec<float, 4> float4;
+typedef cl_vec<float, 8> float8;
+typedef cl_vec<float, 16> float16;
+STEP_TEST_TYPE(float)
+STEP_TEST_TYPE(float2)
+STEP_TEST_TYPE(float3)
+STEP_TEST_TYPE(float4)
+STEP_TEST_TYPE(float8)
+STEP_TEST_TYPE(float16)
+
+
+template <typename T> static void compiler_stepf_with_type(void)
+{
+    const size_t n = 16;
+    T cpu_dst[n], cpu_src[n];
+    float edge = (float)(.1f * (rand() & 15) - .75f);
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(float), &edge);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+    globals[0] = n;
+    locals[0] = n;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(1);
+        memset(buf_data[1], 0, sizeof(T) * n);
+        OCL_UNMAP_BUFFER(1);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+        }
+
+        memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, edge, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+        //dump_data(edge, cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define STEPF_TEST_TYPE(TYPE) \
+	static void compiler_stepf_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_stepf_"#TYPE, SOURCE, NULL);  \
+           compiler_stepf_with_type<TYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION(compiler_stepf_##TYPE);
+
+STEPF_TEST_TYPE(float)
+STEPF_TEST_TYPE(float2)
+STEPF_TEST_TYPE(float3)
+STEPF_TEST_TYPE(float4)
+STEPF_TEST_TYPE(float8)
+STEPF_TEST_TYPE(float16)
diff --git a/utests/compiler_upsample_int.cpp b/utests/compiler_upsample_int.cpp
new file mode 100644
index 0000000..ee912f9
--- /dev/null
+++ b/utests/compiler_upsample_int.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+void compiler_upsample_int(void)
+{
+  const int n = 32;
+  short src1[n];
+  unsigned short src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_upsample_int");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((short*)buf_data[0])[i] = rand();
+    src2[i] = ((short*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)((src1[i] << 16) | src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_int);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 941b5f9..504f80f 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -89,6 +89,7 @@ bool init_egl_window(int width, int height) {
     EGLConfig  ecfg;
     EGLint     numConfig;
 
+    eglContext = EGL_NO_CONTEXT;
     xDisplay = XOpenDisplay(NULL);
     if (xDisplay == NULL) {
       fprintf(stderr, "Failed to open DISPLAY.\n");
@@ -343,15 +344,17 @@ cl_ocl_init(void)
 
 #ifdef HAS_EGL
   if (hasGLExt) {
-    init_egl_window(EGL_WINDOW_WIDTH, EGL_WINDOW_HEIGHT);
+    int i = 0;
     props = new cl_context_properties[7];
-    props[0] = CL_CONTEXT_PLATFORM;
-    props[1] = (cl_context_properties)platform;
-    props[2] = CL_EGL_DISPLAY_KHR;
-    props[3] = (cl_context_properties)eglGetCurrentDisplay();
-    props[4] = CL_GL_CONTEXT_KHR;
-    props[5] = (cl_context_properties)eglGetCurrentContext();
-    props[6] = 0;
+    props[i++] = CL_CONTEXT_PLATFORM;
+    props[i++] = (cl_context_properties)platform;
+    if (init_egl_window(EGL_WINDOW_WIDTH, EGL_WINDOW_HEIGHT)) {
+      props[i++] = CL_EGL_DISPLAY_KHR;
+      props[i++] = (cl_context_properties)eglGetCurrentDisplay();
+      props[i++] = CL_GL_CONTEXT_KHR;
+      props[i++] = (cl_context_properties)eglGetCurrentContext();
+    }
+    props[i++] = 0;
   }
 #endif
   /* Now create a context */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git