[Pkg-opencl-devel] [beignet] 24/47: Imported Debian patch 0.1+git20130703+84f63e8-1

Fri Oct 31 21:45:48 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit b19c9f03686a760678aa16cba8631a9e6a85a093
Author: Simon Richter <sjr at debian.org>
Date:   Wed Jul 3 09:52:32 2013 +0200

    Imported Debian patch 0.1+git20130703+84f63e8-1
---
 debian/changelog                                   |   6 +
 .../0001-Add-vector-argument-test-case.patch       |   8 +-
 .../0002-Fix-atomic-test-failed-in-GT1.patch       | 150 +++++++++
 .../0003-GBE-fixed-a-barrier-related-bug.patch     |  71 +++++
 ...ease-local-size-in-the-two-barrier-test-c.patch |  45 +++
 ...e-error-message-output-in-release-version.patch |  71 +++++
 ...the-builtin-function-vect-return-to-vect_.patch | 144 +++++++++
 ...-vector3-support-for-builtin-abs-function.patch | 252 +++++++++++++++
 ...Add-the-abs_diff-builtin-function-support.patch | 109 +++++++
 ...e-test-case-for-builtin-abs_diff-function.patch | 355 +++++++++++++++++++++
 ...d-OpenCL-1.2-definitions-required-for-ICD.patch |  95 ++++++
 debian/patches/series                              |  17 +-
 12 files changed, 1310 insertions(+), 13 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index d5fb18e..532b64e 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+beignet (0.1+git20130703+84f63e8-1) unstable; urgency=low
+
+  * New upstream release
+
+ -- Simon Richter <sjr at debian.org>  Wed, 03 Jul 2013 09:52:32 +0200
+
 beignet (0.1+git20130626+41005e0-1) unstable; urgency=low
 
   * New upstream release
diff --git a/debian/patches/0001-Add-vector-argument-test-case.patch b/debian/patches/0001-Add-vector-argument-test-case.patch
index 86a7636..4989208 100644
--- a/debian/patches/0001-Add-vector-argument-test-case.patch
+++ b/debian/patches/0001-Add-vector-argument-test-case.patch
@@ -1,7 +1,7 @@
-From 5e262194e009622863d19e17c03cb44d0bd066cc Mon Sep 17 00:00:00 2001
+From a5bb2bdda3a0cf4105623565e0f814dda580fc48 Mon Sep 17 00:00:00 2001
 From: Yang Rong <rong.r.yang at intel.com>
 Date: Thu, 16 May 2013 12:36:35 +0800
-Subject: [PATCH 01/11] Add vector argument test case.
+Subject: [PATCH 01/10] Add vector argument test case.
 To: beignet at lists.freedesktop.org
 
 Signed-off-by: Yang Rong <rong.r.yang at intel.com>
@@ -26,10 +26,10 @@ index 0000000..0985dbd
 +  dst[id] = value.w;
 +}
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index df59feb..8a58ff4 100644
+index 3fe0065..cc99370 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
-@@ -39,6 +39,7 @@ set (utests_sources
+@@ -38,6 +38,7 @@ set (utests_sources
    compiler_fill_image_3d_2.cpp
    compiler_function_argument0.cpp
    compiler_function_argument1.cpp
diff --git a/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch b/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch
new file mode 100644
index 0000000..ca71f65
--- /dev/null
+++ b/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch
@@ -0,0 +1,150 @@
+From 8470b1b15e78673f951ef8e58c5ff043909b5152 Mon Sep 17 00:00:00 2001
+From: Yang Rong <rong.r.yang at intel.com>
+Date: Tue, 2 Jul 2013 15:22:24 +0800
+Subject: [PATCH 02/10] Fix atomic test failed in GT1.
+To: beignet at lists.freedesktop.org
+
+Barrier only ensure one work group finish, can't guarantee all work item's atomic ops
+have finished before the last atomic_add.
+So use atomic_xchg to update first work group's local buffer to other global buffer position.
+
+Signed-off-by: Yang Rong <rong.r.yang at intel.com>
+---
+ kernels/compiler_atomic_functions.cl |   19 +++++++++++++------
+ utests/compiler_atomic_functions.cpp |   23 ++++++++++++-----------
+ 2 files changed, 25 insertions(+), 17 deletions(-)
+
+diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
+index 61ce2f4..fbc16fb 100644
+--- a/kernels/compiler_atomic_functions.cl
++++ b/kernels/compiler_atomic_functions.cl
+@@ -1,14 +1,21 @@
+ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
+   int lid = get_local_id(0);
+   int i = lid % 12;
+-  atomic_xchg(&tmp[4], -1);
++  if(lid == 0) {
++    for(int j=0; j<12; j=j+1) {
++      atomic_xchg(&tmp[j], 0);
++    }
++    atomic_xchg(&tmp[4], -1);
++  }
++  barrier(CLK_LOCAL_MEM_FENCE);
++
+   switch(i) {
+     case 0: atomic_inc(&tmp[i]); break;
+     case 1: atomic_dec(&tmp[i]); break;
+     case 2: atomic_add(&tmp[i], src[lid]); break;
+     case 3: atomic_sub(&tmp[i], src[lid]); break;
+-    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
+-    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
++    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
++    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
+     case 6: atomic_xor(&tmp[i], src[lid]); break;
+     case 7: atomic_min(&tmp[i], -src[lid]); break;
+     case 8: atomic_max(&tmp[i], src[lid]); break;
+@@ -23,8 +30,8 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
+     case 1: atomic_dec(&dst[i]); break;
+     case 2: atomic_add(&dst[i], src[lid]); break;
+     case 3: atomic_sub(&dst[i], src[lid]); break;
+-    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
+-    case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
++    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
++    case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
+     case 6: atomic_xor(&dst[i], src[lid]); break;
+     case 7: atomic_min(&dst[i], -src[lid]); break;
+     case 8: atomic_max(&dst[i], src[lid]); break;
+@@ -38,6 +45,6 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
+ 
+   if(get_global_id(0) == 0) {
+     for(i=0; i<12; i=i+1)
+-      atomic_add(&dst[i], tmp[i]);
++      atomic_xchg(&dst[i+12], tmp[i]);
+   }
+ }
+diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
+index 571e0c6..65f1c5a 100644
+--- a/utests/compiler_atomic_functions.cpp
++++ b/utests/compiler_atomic_functions.cpp
+@@ -4,12 +4,12 @@
+ #include <string.h>
+ 
+ #define GROUP_NUM 16
+-#define LOCAL_SIZE 64
++#define LOCAL_SIZE 256
+ static void cpu_compiler_atomic(int *dst, int *src)
+ {
+   dst[4] = 0xffffffff;
+   int tmp[16] = { 0 };
+-
++  tmp[4] = -1;
+   for(int j=0; j<LOCAL_SIZE; j++) {
+     int i = j % 12;
+ 
+@@ -18,8 +18,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
+       case 1: tmp[i] -= 1; break;
+       case 2: tmp[i] += src[j]; break;
+       case 3: tmp[i] -= src[j]; break;
+-      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
+-      case 5: tmp[i] |= src[j]<<(j>>2); break;
++      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
++      case 5: tmp[i] |= src[j]<<(j>>4); break;
+       case 6: tmp[i] ^= src[j]; break;
+       case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+       case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+@@ -39,8 +39,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
+         case 1: dst[i] -= 1; break;
+         case 2: dst[i] += src[j]; break;
+         case 3: dst[i] -= src[j]; break;
+-        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
+-        case 5: dst[i] |= src[j]<<(j>>2); break;
++        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
++        case 5: dst[i] |= src[j]<<(j>>4); break;
+         case 6: dst[i] ^= src[j]; break;
+         case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+         case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+@@ -53,27 +53,28 @@ static void cpu_compiler_atomic(int *dst, int *src)
+   }
+ 
+   for(int i=0; i<12; i++)
+-    dst[i] += tmp[i];
++    dst[i+12] = tmp[i];
+ }
+ 
+ static void compiler_atomic_functions(void)
+ {
+   const size_t n = GROUP_NUM * LOCAL_SIZE;
+-  int cpu_dst[16] = {0}, cpu_src[256];
++  int cpu_dst[24] = {0}, cpu_src[256];
+ 
+   globals[0] = n;
+   locals[0] = LOCAL_SIZE;
+ 
+   // Setup kernel and buffers
+   OCL_CREATE_KERNEL("compiler_atomic_functions");
+-  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
++  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
+   OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+   OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+   OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+ 
+   OCL_MAP_BUFFER(0);
+-  memset(buf_data[0], 0, 16 * sizeof(int));
++  memset(buf_data[0], 0, 24 * sizeof(int));
++  ((int *)buf_data[0])[4] = -1;
+   OCL_UNMAP_BUFFER(0);
+ 
+   OCL_MAP_BUFFER(1);
+@@ -86,7 +87,7 @@ static void compiler_atomic_functions(void)
+   OCL_MAP_BUFFER(0);
+ 
+   // Check results
+-  for(int i=0; i<12; i++) {
++  for(int i=0; i<24; i++) {
+     //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+     OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+   }
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch b/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch
new file mode 100644
index 0000000..4b6f8fc
--- /dev/null
+++ b/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch
@@ -0,0 +1,71 @@
+From 43493e052b553f94f8a05f21bdb5203bddc12870 Mon Sep 17 00:00:00 2001
+From: Zhigang Gong <zhigang.gong at linux.intel.com>
+Date: Tue, 2 Jul 2013 18:49:48 +0800
+Subject: [PATCH 03/10] GBE: fixed a barrier related bug.
+To: beignet at lists.freedesktop.org
+
+Actually, this commit fixed two bugs related to barrier.
+1. We should set useSLM to true if we use barrier.
+2. We need to set barrier id to the barrierMsg payload according to
+r0.2. And we don't need to reprogram the barrierCount.
+
+And after this fix, we don't need the work around for the local
+memory barrier, thus we don't need the memory fence for local memory
+barrier.
+
+Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ backend/src/backend/gen_insn_selection.cpp |   14 ++++++--------
+ backend/src/llvm/llvm_gen_backend.cpp      |    1 +
+ 2 files changed, 7 insertions(+), 8 deletions(-)
+
+diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
+index bbe392d..bfe1e28 100644
+--- a/backend/src/backend/gen_insn_selection.cpp
++++ b/backend/src/backend/gen_insn_selection.cpp
+@@ -1792,24 +1792,22 @@ namespace gbe
+       const ir::Register reg = sel.reg(FAMILY_DWORD);
+ 
+       const uint32_t params = insn.getParameters();
+-      //XXX TODO need to double check local barrier whether need fence or not
+-      if(params == syncGlobalBarrier || params == syncLocalBarrier) {
++      if(params == syncGlobalBarrier) {
+         const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
+         sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
+       }
+ 
+       sel.push();
+         sel.curr.predicate = GEN_PREDICATE_NONE;
++
++        // As only the payload.2 is used and all the other regions are ignored
++        // SIMD8 mode here is safe.
+         sel.curr.execWidth = 8;
+         sel.curr.physicalFlag = 0;
+         sel.curr.noMask = 1;
++        // Copy barrier id from r0.
++        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
+ 
+-        sel.SHL(GenRegister::ud8grf(reg),
+-                GenRegister::ud1grf(ocl::threadn),
+-                GenRegister::immud(0x9));
+-        sel.OR(GenRegister::ud8grf(reg),
+-               GenRegister::ud8grf(reg),
+-               GenRegister::immud(0x00088000));
+         // A barrier is OK to start the thread synchronization *and* SLM fence
+         sel.BARRIER(GenRegister::f8grf(reg));
+         // Now we wait for the other threads
+diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
+index 8385e21..db34296 100644
+--- a/backend/src/llvm/llvm_gen_backend.cpp
++++ b/backend/src/llvm/llvm_gen_backend.cpp
+@@ -1741,6 +1741,7 @@ namespace gbe
+       case GEN_OCL_LBARRIER:
+       case GEN_OCL_GBARRIER:
+       case GEN_OCL_LGBARRIER:
++        ctx.getFunction().setUseSLM(true);
+         break;
+       case GEN_OCL_WRITE_IMAGE0:
+       case GEN_OCL_WRITE_IMAGE1:
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch b/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
new file mode 100644
index 0000000..f0100c0
--- /dev/null
+++ b/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
@@ -0,0 +1,45 @@
+From 5c5e5e83918b23709d1135299596b968feff00bc Mon Sep 17 00:00:00 2001
+From: Zhigang Gong <zhigang.gong at linux.intel.com>
+Date: Tue, 2 Jul 2013 18:49:49 +0800
+Subject: [PATCH 04/10] utests: increase local size in the two barrier test
+ cases.
+To: beignet at lists.freedesktop.org
+
+Increasing the local size to 256 to bring more pressure
+to barrier testing.
+
+Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ utests/compiler_global_memory_barrier.cpp  |    2 +-
+ utests/compiler_local_memory_barrier_2.cpp |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
+index a6496a7..ea84e72 100644
+--- a/utests/compiler_global_memory_barrier.cpp
++++ b/utests/compiler_global_memory_barrier.cpp
+@@ -13,7 +13,7 @@ static void compiler_global_memory_barrier(void)
+ 
+   // Run the kernel
+   globals[0] = n/2;
+-  locals[0] = 32;
++  locals[0] = 256;
+   OCL_NDRANGE(1);
+   OCL_MAP_BUFFER(0);
+ 
+diff --git a/utests/compiler_local_memory_barrier_2.cpp b/utests/compiler_local_memory_barrier_2.cpp
+index b074123..4fa090b 100644
+--- a/utests/compiler_local_memory_barrier_2.cpp
++++ b/utests/compiler_local_memory_barrier_2.cpp
+@@ -5,7 +5,7 @@ static void compiler_local_memory_barrier_2(void)
+   const size_t n = 16*1024;
+ 
+   globals[0] = n/2;
+-  locals[0] = 32;
++  locals[0] = 256;
+ 
+   // Setup kernel and buffers
+   OCL_CREATE_KERNEL("compiler_local_memory_barrier_2");
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0005-Disable-error-message-output-in-release-version.patch b/debian/patches/0005-Disable-error-message-output-in-release-version.patch
new file mode 100644
index 0000000..8be79bc
--- /dev/null
+++ b/debian/patches/0005-Disable-error-message-output-in-release-version.patch
@@ -0,0 +1,71 @@
+From 31473819a135cb8c9048617e12b0b5453104aeec Mon Sep 17 00:00:00 2001
+From: Ruiling Song <ruiling.song at intel.com>
+Date: Tue, 2 Jul 2013 16:44:43 +0800
+Subject: [PATCH 05/10] Disable error message output in release version.
+To: beignet at lists.freedesktop.org
+
+As piglit will got the error message we output to stderr and mark the case 'WARN'.
+so, we disable the message to stderr, and use release version to run piglit.
+
+also fix a minor compile fail under release version.
+
+Signed-off-by: Ruiling Song <ruiling.song at intel.com>
+---
+ backend/src/llvm/llvm_gen_backend.cpp |    3 +--
+ src/cl_utils.h                        |   24 ++++++++++++++++--------
+ 2 files changed, 17 insertions(+), 10 deletions(-)
+
+diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
+index db34296..c2e37fa 100644
+--- a/backend/src/llvm/llvm_gen_backend.cpp
++++ b/backend/src/llvm/llvm_gen_backend.cpp
+@@ -1824,10 +1824,9 @@ namespace gbe
+ 
+   void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
+     CallSite::arg_iterator AI = CS.arg_begin();
+-#if GBE_DEBUG
+     CallSite::arg_iterator AE = CS.arg_end();
+-#endif /* GBE_DEBUG */
+     GBE_ASSERT(AI != AE);
++
+     unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
+     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+     const ir::Register dst = this->getRegister(&I);
+diff --git a/src/cl_utils.h b/src/cl_utils.h
+index dfb1369..59b7a2b 100644
+--- a/src/cl_utils.h
++++ b/src/cl_utils.h
+@@ -39,14 +39,22 @@ struct JOIN(__,JOIN(__,__LINE__)) {                                 \
+ }
+ 
+ /* Throw errors */
+-#define ERR(ERROR, ...)                                             \
+-do {                                                                \
+-  fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
+-  fprintf(stderr, __VA_ARGS__);                                     \
+-  fprintf(stderr, "\n");                                            \
+-  err = ERROR;                                                      \
+-  goto error;                                                       \
+-} while (0)
++#ifdef NDEBUG
++  #define ERR(ERROR, ...)                                             \
++  do {                                                                \
++    err = ERROR;                                                      \
++    goto error;                                                       \
++  } while (0)
++#else
++  #define ERR(ERROR, ...)                                             \
++  do {                                                                \
++    fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
++    fprintf(stderr, __VA_ARGS__);                                     \
++    fprintf(stderr, "\n");                                            \
++    err = ERROR;                                                      \
++    goto error;                                                       \
++  } while (0)
++#endif
+ 
+ #define DO_ALLOC_ERR                                                \
+ do {                                                                \
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch b/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
new file mode 100644
index 0000000..96651e7
--- /dev/null
+++ b/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
@@ -0,0 +1,144 @@
+From 5dda5e3155c9188b60ef0787402614b744a686d9 Mon Sep 17 00:00:00 2001
+From: Junyan He <junyan.he at linux.intel.com>
+Date: Wed, 3 Jul 2013 12:41:17 +0800
+Subject: [PATCH 06/10] Modify all the builtin function vect return to
+ (vect_name)(e1, e2, e3)
+To: beignet at lists.freedesktop.org
+
+Some builtin functions has the prototype like:
+int3 function_name (int3 x) { return (x.s0, x.s1, x.s2);}
+which not comply with CL spec and will cause the clang IR
+be translated error.
+The vector declare should be (vect)(e1, e2, e3)
+
+Signed-off-by: Junyan He <junyan.he at linux.intel.com>
+---
+ backend/src/ocl_stdlib.h |   70 +++++++++++++++++++++++-----------------------
+ 1 file changed, 35 insertions(+), 35 deletions(-)
+
+diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
+index 04984d8..bc9a0d4 100644
+--- a/backend/src/ocl_stdlib.h
++++ b/backend/src/ocl_stdlib.h
+@@ -4175,11 +4175,11 @@ uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
+     return add_sat((uchar)x, (uchar)0);
+ }
+ 
+-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (name(x.s0), name(x.s1)); }
+-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (name(x.s0), name(x.s1), name(x.s2)); }
+-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
+-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
+-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
++#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (int2)(name(x.s0), name(x.s1)); }
++#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (int3)(name(x.s0), name(x.s1), name(x.s2)); }
++#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (int4)(name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
++#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (int8)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
++#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (int16)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
+ INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
+ DEC2(isfinite);
+ DEC3(isfinite);
+@@ -4216,11 +4216,11 @@ DEC16(signbit);
+ #undef DEC8
+ #undef DEC16
+ 
+-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (name(x.s0, y.s0), name(x.s1, y.s1)); }
+-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
+-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
+-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
+-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
++#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (int2)(name(x.s0, y.s0), name(x.s1, y.s1)); }
++#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (int3)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
++#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (int4)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
++#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (int8)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
++#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (int16)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
+ INLINE_OVERLOADABLE int islessgreater(float x, float y) { return (x<y)||(x>y); }
+ DEC2(islessgreater);
+ DEC3(islessgreater);
+@@ -4338,11 +4338,11 @@ INLINE_OVERLOADABLE uint clz(uint x) {
+   return __gen_ocl_fbh(x);
+ }
+ 
+-#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (clz(a.s0), clz(a.s1)); }
+-#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (clz(a.s0), clz(a.s1), clz(a.s2)); }
+-#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
+-#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
+-#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
++#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (type##2)(clz(a.s0), clz(a.s1)); }
++#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (type##3)(clz(a.s0), clz(a.s1), clz(a.s2)); }
++#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (type##4)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
++#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (type##8)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
++#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (type##16)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
+ #define DEC(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint) 
+ DEC(2)
+ DEC(3)
+@@ -4364,11 +4364,11 @@ INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+ INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
+ INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+ INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+-#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
+-#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
+-#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
+-#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
+-#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
++#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (type##2)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
++#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (type##3)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
++#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (type##4)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
++#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (type##8)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
++#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (type##16)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
+ #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+ DEF(2)
+ DEF(3)
+@@ -4390,11 +4390,11 @@ DEF(ushort)
+ DEF(int)
+ DEF(uint)
+ #undef DEF
+-#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
+-#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
+-#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
+-#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
+-#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.se, c.se), [...]
++#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (type##2)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
++#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (type##3)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
++#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (type##4)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
++#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (type##8)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
++#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (type##16)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b. [...]
+ #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+ DEF(2)
+ DEF(3)
+@@ -4422,11 +4422,11 @@ DEF(ushort, 15)
+ DEF(int, 31)
+ DEF(uint, 31)
+ #undef DEF
+-#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
+-#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
+-#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
+-#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
+-#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
++#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (type##2)(rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
++#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (type##3)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
++#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (type##4)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
++#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (type##8)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
++#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (type##16)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
+ #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+ DEF(2)
+ DEF(3)
+@@ -4454,11 +4454,11 @@ INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 &
+ INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+ INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
+ INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+-#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (func(a.s0, b.s0), func(a.s1, b.s1)); }
+-#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
+-#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
+-#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
+-#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
++#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (type##2)(func(a.s0, b.s0), func(a.s1, b.s1)); }
++#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (type##3)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
++#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (type##4)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
++#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (type##8)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
++#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (type##16)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
+ #define DEF(func, n) DEC##n(func, char); DEC##n(func, uchar); DEC##n(func, short); DEC##n(func, ushort); DEC##n(func, int); DEC##n(func, uint)
+ DEF(hadd, 2)
+ DEF(hadd, 3)
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch b/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch
new file mode 100644
index 0000000..e116290
--- /dev/null
+++ b/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch
@@ -0,0 +1,252 @@
+From bebce26d3bb8daa227f73ae3353c544b3d8f36ed Mon Sep 17 00:00:00 2001
+From: Junyan He <junyan.he at linux.intel.com>
+Date: Wed, 3 Jul 2013 15:16:59 +0800
+Subject: [PATCH 07/10] Add the vector3 support for builtin abs function
+To: beignet at lists.freedesktop.org
+
+Add the forgetten abs vector3 for all the types.
+Because the kernel input alignment, improve the test
+case to match the alignment request.
+
+Signed-off-by: Junyan He <junyan.he at linux.intel.com>
+---
+ backend/src/ocl_stdlib.h |    4 +++-
+ kernels/compiler_abs.cl  |    1 +
+ utests/compiler_abs.cpp  |   54 +++++++++++++++++++++++++++++++++++++---------
+ 3 files changed, 48 insertions(+), 11 deletions(-)
+
+diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
+index bc9a0d4..4bfdf9a 100644
+--- a/backend/src/ocl_stdlib.h
++++ b/backend/src/ocl_stdlib.h
+@@ -4481,6 +4481,7 @@ int __gen_ocl_abs(int x);
+ #define ABS_I(I, CVT)  (CVT)__gen_ocl_abs(x.s##I)
+ #define ABS_VEC1(CVT)  (CVT)__gen_ocl_abs(x)
+ #define ABS_VEC2(CVT)  ABS_I(0, CVT), ABS_I(1, CVT)
++#define ABS_VEC3(CVT)  ABS_I(0, CVT), ABS_I(1, CVT), ABS_I(2, CVT)
+ #define ABS_VEC4(CVT)  ABS_VEC2(CVT), ABS_I(2, CVT), ABS_I(3, CVT)
+ #define ABS_VEC8(CVT)  ABS_VEC4(CVT), ABS_I(4, CVT), ABS_I(5, CVT),\
+ 	               ABS_I(6, CVT), ABS_I(7, CVT)
+@@ -4490,7 +4491,7 @@ int __gen_ocl_abs(int x);
+ 
+ #define DEC_1(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return ABS_VEC1(u##TYPE); }
+ #define DEC_N(TYPE, N) INLINE_OVERLOADABLE u##TYPE##N abs(TYPE##N x) { return (u##TYPE##N)(ABS_VEC##N(u##TYPE)); };
+-#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
++#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 3) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
+ 
+ DEC(int)
+ DEC(short)
+@@ -4509,6 +4510,7 @@ DEC(uchar)
+ #undef ABS_I
+ #undef ABS_VEC1
+ #undef ABS_VEC2
++#undef ABS_VEC3
+ #undef ABS_VEC4
+ #undef ABS_VEC8
+ #undef ABS_VEC16
+diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
+index 9e77c2b..549575c 100644
+--- a/kernels/compiler_abs.cl
++++ b/kernels/compiler_abs.cl
+@@ -15,6 +15,7 @@
+ #define COMPILER_ABS(TYPE, UTYPE)  \
+     COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
+     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
+index 59d8365..a1b14b4 100644
+--- a/utests/compiler_abs.cpp
++++ b/utests/compiler_abs.cpp
+@@ -3,23 +3,26 @@
+ 
+ template <typename T, int N>
+ struct cl_vec {
+-    T ptr[N];
++    T ptr[((N+1)/2)*2]; //align to 2 elements.
+ 
+     typedef cl_vec<T, N> vec_type;
+ 
+     cl_vec(void) {
+-        memset(ptr, 0, sizeof(T) * N);
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+     }
+     cl_vec(vec_type & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+         memcpy (this->ptr, other.ptr, sizeof(T) * N);
+     }
+ 
+     vec_type& operator= (vec_type & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+         memcpy (this->ptr, other.ptr, sizeof(T) * N);
+         return *this;
+     }
+ 
+     template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+         memcpy (this->ptr, other.ptr, sizeof(T) * N);
+         return *this;
+     }
+@@ -56,6 +59,8 @@ template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
+ template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+ {
+     int i = 0;
++
++    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+     for (; i < N; i++) {
+         vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+     }
+@@ -66,25 +71,34 @@ template <typename T> static void gen_rand_val (T & val)
+     val = static_cast<T>((rand() & 63) - 32);
+ }
+ 
++template <typename T>
++inline static void print_data (T& val)
++{
++    if (std::is_unsigned<T>::value)
++        printf(" %u", val);
++    else
++        printf(" %d", val);
++}
++
+ template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
+-	cl_vec<U, N>* dst, int n)
++        cl_vec<U, N>* dst, int n)
+ {
+     U* val = reinterpret_cast<U *>(dst);
+ 
+-    n = n*N;
++    n = n*((N+1)/2)*2;
+ 
+     printf("\nRaw: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", ((T *)buf_data[0])[i]);
++        print_data(((T *)buf_data[0])[i]);
+     }
+ 
+     printf("\nCPU: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", val[i]);
++        print_data(val[i]);
+     }
+     printf("\nGPU: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", ((U *)buf_data[1])[i]);
++        print_data(((U *)buf_data[1])[i]);
+     }
+ }
+ 
+@@ -92,16 +106,16 @@ template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
+ {
+     printf("\nRaw: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", ((T *)buf_data[0])[i]);
++        print_data(((T *)buf_data[0])[i]);
+     }
+ 
+     printf("\nCPU: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", dst[i]);
++        print_data(dst[i]);
+     }
+     printf("\nGPU: \n");
+     for (int32_t i = 0; i < (int32_t) n; ++i) {
+-        printf(" %d", ((U *)buf_data[1])[i]);
++        print_data(((U *)buf_data[1])[i]);
+     }
+ }
+ 
+@@ -111,6 +125,8 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
+     U cpu_dst[16];
+     T cpu_src[16];
+ 
++    printf("sizeof T, is %u, sizeof U is %u\n", (int)sizeof(T), (int)sizeof(U));
++
+     // Setup buffers
+     OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+     OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+@@ -122,6 +138,12 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
+     // Run random tests
+     for (uint32_t pass = 0; pass < 8; ++pass) {
+         OCL_MAP_BUFFER(0);
++
++        /* Clear the dst buffer to avoid random data. */
++        OCL_MAP_BUFFER(1);
++        memset(buf_data[1], 0, sizeof(U) * n);
++        OCL_UNMAP_BUFFER(1);
++
+         for (int32_t i = 0; i < (int32_t) n; ++i) {
+             gen_rand_val(cpu_src[i]);
+         }
+@@ -166,54 +188,66 @@ ABS_TEST_TYPE(uchar, uchar)
+ 
+ 
+ typedef cl_vec<int, 2> int2;
++typedef cl_vec<int, 3> int3;
+ typedef cl_vec<int, 4> int4;
+ typedef cl_vec<int, 8> int8;
+ typedef cl_vec<int, 16> int16;
+ typedef cl_vec<unsigned int, 2> uint2;
++typedef cl_vec<unsigned int, 3> uint3;
+ typedef cl_vec<unsigned int, 4> uint4;
+ typedef cl_vec<unsigned int, 8> uint8;
+ typedef cl_vec<unsigned int, 16> uint16;
+ ABS_TEST_TYPE(int2, uint2)
++ABS_TEST_TYPE(int3, uint3)
+ ABS_TEST_TYPE(int4, uint4)
+ ABS_TEST_TYPE(int8, uint8)
+ ABS_TEST_TYPE(int16, uint16)
+ ABS_TEST_TYPE(uint2, uint2)
++ABS_TEST_TYPE(uint3, uint3)
+ ABS_TEST_TYPE(uint4, uint4)
+ ABS_TEST_TYPE(uint8, uint8)
+ ABS_TEST_TYPE(uint16, uint16)
+ 
+ 
+ typedef cl_vec<char, 2> char2;
++typedef cl_vec<char, 3> char3;
+ typedef cl_vec<char, 4> char4;
+ typedef cl_vec<char, 8> char8;
+ typedef cl_vec<char, 16> char16;
+ typedef cl_vec<unsigned char, 2> uchar2;
++typedef cl_vec<unsigned char, 3> uchar3;
+ typedef cl_vec<unsigned char, 4> uchar4;
+ typedef cl_vec<unsigned char, 8> uchar8;
+ typedef cl_vec<unsigned char, 16> uchar16;
+ ABS_TEST_TYPE(char2, uchar2)
++ABS_TEST_TYPE(char3, uchar3)
+ ABS_TEST_TYPE(char4, uchar4)
+ ABS_TEST_TYPE(char8, uchar8)
+ ABS_TEST_TYPE(char16, uchar16)
+ ABS_TEST_TYPE(uchar2, uchar2)
++ABS_TEST_TYPE(uchar3, uchar3)
+ ABS_TEST_TYPE(uchar4, uchar4)
+ ABS_TEST_TYPE(uchar8, uchar8)
+ ABS_TEST_TYPE(uchar16, uchar16)
+ 
+ 
+ typedef cl_vec<short, 2> short2;
++typedef cl_vec<short, 3> short3;
+ typedef cl_vec<short, 4> short4;
+ typedef cl_vec<short, 8> short8;
+ typedef cl_vec<short, 16> short16;
+ typedef cl_vec<unsigned short, 2> ushort2;
++typedef cl_vec<unsigned short, 3> ushort3;
+ typedef cl_vec<unsigned short, 4> ushort4;
+ typedef cl_vec<unsigned short, 8> ushort8;
+ typedef cl_vec<unsigned short, 16> ushort16;
+ ABS_TEST_TYPE(short2, ushort2)
++ABS_TEST_TYPE(short3, ushort3)
+ ABS_TEST_TYPE(short4, ushort4)
+ ABS_TEST_TYPE(short8, ushort8)
+ ABS_TEST_TYPE(short16, ushort16)
+ ABS_TEST_TYPE(ushort2, ushort2)
++ABS_TEST_TYPE(ushort3, ushort3)
+ ABS_TEST_TYPE(ushort4, ushort4)
+ ABS_TEST_TYPE(ushort8, ushort8)
+ ABS_TEST_TYPE(ushort16, ushort16)
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch b/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch
new file mode 100644
index 0000000..35d5535
--- /dev/null
+++ b/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch
@@ -0,0 +1,109 @@
+From b1229e1adbea64a9d3d4bbb5f01740c3b3595485 Mon Sep 17 00:00:00 2001
+From: Junyan He <junyan.he at linux.intel.com>
+Date: Wed, 3 Jul 2013 15:17:05 +0800
+Subject: [PATCH 08/10] Add the abs_diff builtin function support
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Junyan He <junyan.he at linux.intel.com>
+---
+ backend/src/ocl_stdlib.h |   84 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 84 insertions(+)
+
+diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
+index 4bfdf9a..9b80445 100644
+--- a/backend/src/ocl_stdlib.h
++++ b/backend/src/ocl_stdlib.h
+@@ -4515,6 +4515,90 @@ DEC(uchar)
+ #undef ABS_VEC8
+ #undef ABS_VEC16
+ 
++
++/* Char and short type abs diff */
++/* promote char and short to int and will be no module overflow */
++#define ABS_DIFF(CVT) (CVT)(abs((int)x - (int)y))
++#define ABS_DIFF_I(CVT, I)  (CVT)(abs((int)x.s##I - (int)y.s##I))
++
++#define ABS_DIFF_VEC1(CVT)  ABS_DIFF(CVT)
++#define ABS_DIFF_VEC2(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1)
++#define ABS_DIFF_VEC3(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1), ABS_DIFF_I(CVT, 2)
++#define ABS_DIFF_VEC4(CVT)  ABS_DIFF_VEC2(CVT), ABS_DIFF_I(CVT, 2), ABS_DIFF_I(CVT, 3)
++#define ABS_DIFF_VEC8(CVT)  ABS_DIFF_VEC4(CVT), ABS_DIFF_I(CVT, 4), ABS_DIFF_I(CVT, 5), \
++                            ABS_DIFF_I(CVT, 6), ABS_DIFF_I(CVT, 7)
++#define ABS_DIFF_VEC16(CVT)  ABS_DIFF_VEC8(CVT), ABS_DIFF_I(CVT, 8), ABS_DIFF_I(CVT, 9), \
++                            ABS_DIFF_I(CVT, A), ABS_DIFF_I(CVT, B), \
++                            ABS_DIFF_I(CVT, C), ABS_DIFF_I(CVT, D), \
++                            ABS_DIFF_I(CVT, E), ABS_DIFF_I(CVT, F)
++
++#define DEC_1(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
++                           { return ABS_DIFF_VEC1(UTYPE); }
++#define DEC_N(TYPE, UTYPE, N) INLINE_OVERLOADABLE UTYPE##N abs_diff(TYPE##N x, TYPE##N y) \
++                              { return (UTYPE##N)(ABS_DIFF_VEC##N(UTYPE)); };
++#define DEC(TYPE, UTYPE)  DEC_1(TYPE, UTYPE) DEC_N(TYPE, UTYPE, 2)  DEC_N(TYPE, UTYPE, 3 ) \
++                          DEC_N(TYPE, UTYPE, 4) DEC_N(TYPE, UTYPE, 8) DEC_N(TYPE, UTYPE, 16)
++DEC(char, uchar)
++DEC(uchar, uchar)
++DEC(short, ushort)
++DEC(ushort, ushort)
++
++#undef DEC
++#undef DEC_1
++#undef DEC_N
++#undef ABS_DIFF
++#undef ABS_DIFF_I
++#undef ABS_DIFF_VEC1
++#undef ABS_DIFF_VEC2
++#undef ABS_DIFF_VEC3
++#undef ABS_DIFF_VEC4
++#undef ABS_DIFF_VEC8
++#undef ABS_DIFF_VEC16
++
++INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
++    /* same signed will never overflow. */
++    return y > x ? (y -x) : (x - y);
++}
++
++INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
++    /* same signed will never module overflow. */
++    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
++        return abs(x - y);
++
++    return (abs(x) + abs(y));
++}
++
++#define ABS_DIFF_I(I)  abs_diff(x.s##I, y.s##I)
++
++#define ABS_DIFF_VEC2  ABS_DIFF_I(0), ABS_DIFF_I(1)
++#define ABS_DIFF_VEC3  ABS_DIFF_I(0), ABS_DIFF_I(1), ABS_DIFF_I(2)
++#define ABS_DIFF_VEC4  ABS_DIFF_VEC2, ABS_DIFF_I(2), ABS_DIFF_I(3)
++#define ABS_DIFF_VEC8  ABS_DIFF_VEC4, ABS_DIFF_I(4), ABS_DIFF_I(5), \
++                       ABS_DIFF_I(6), ABS_DIFF_I(7)
++#define ABS_DIFF_VEC16  ABS_DIFF_VEC8, ABS_DIFF_I(8), ABS_DIFF_I(9), \
++                            ABS_DIFF_I(A), ABS_DIFF_I(B), \
++                            ABS_DIFF_I(C), ABS_DIFF_I(D), \
++                            ABS_DIFF_I(E), ABS_DIFF_I(F)
++
++#define DEC_N(TYPE, N) INLINE_OVERLOADABLE uint##N abs_diff(TYPE##N x, TYPE##N y) \
++				      { return (uint##N)(ABS_DIFF_VEC##N); };
++#define DEC(TYPE)   DEC_N(TYPE, 2)  DEC_N(TYPE, 3 ) \
++                           DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
++DEC(int)
++DEC(uint)
++
++#undef DEC
++#undef DEC_1
++#undef DEC_N
++#undef ABS_DIFF
++#undef ABS_DIFF_I
++#undef ABS_DIFF_VEC1
++#undef ABS_DIFF_VEC2
++#undef ABS_DIFF_VEC3
++#undef ABS_DIFF_VEC4
++#undef ABS_DIFF_VEC8
++#undef ABS_DIFF_VEC16
++
+ /////////////////////////////////////////////////////////////////////////////
+ // Work Items functions (see 6.11.1 of OCL 1.1 spec)
+ /////////////////////////////////////////////////////////////////////////////
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch b/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch
new file mode 100644
index 0000000..49bc014
--- /dev/null
+++ b/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch
@@ -0,0 +1,355 @@
+From 7337895767f8dbd7a7637ffbd97efed0352e7e06 Mon Sep 17 00:00:00 2001
+From: Junyan He <junyan.he at linux.intel.com>
+Date: Wed, 3 Jul 2013 15:17:10 +0800
+Subject: [PATCH 09/10] Add the test case for builtin abs_diff() function
+To: beignet at lists.freedesktop.org
+
+All the integer value types check are supported.
+Please use the case named compiler_abs_diff_xxxx,
+where xxxx means the data type such as int2, char4
+
+Signed-off-by: Junyan He <junyan.he at linux.intel.com>
+---
+ kernels/compiler_abs_diff.cl |   28 +++++
+ utests/CMakeLists.txt        |    1 +
+ utests/compiler_abs.cpp      |    2 -
+ utests/compiler_abs_diff.cpp |  267 ++++++++++++++++++++++++++++++++++++++++++
+ 4 files changed, 296 insertions(+), 2 deletions(-)
+ create mode 100644 kernels/compiler_abs_diff.cl
+ create mode 100644 utests/compiler_abs_diff.cpp
+
+diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
+new file mode 100644
+index 0000000..583ba2b
+--- /dev/null
++++ b/kernels/compiler_abs_diff.cl
+@@ -0,0 +1,28 @@
++#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
++    kernel void compiler_abs_diff_##TYPE ( \
++           global TYPE* x, global TYPE* y, global UTYPE* diff) { \
++        int i = get_global_id(0); \
++        diff[i] = abs_diff(x[i], y[i]);     \
++    }
++
++#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
++    kernel void compiler_abs_diff_##TYPE##N ( \
++           global TYPE##N* x, global TYPE##N* y, global UTYPE##N* diff) { \
++        int i = get_global_id(0); \
++        diff[i] = abs_diff(x[i], y[i]);     \
++    }
++
++#define COMPILER_ABS(TYPE, UTYPE)  \
++    COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
++    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
++
++COMPILER_ABS(int, uint)
++COMPILER_ABS(uint, uint)
++COMPILER_ABS(char, uchar)
++COMPILER_ABS(uchar, uchar)
++COMPILER_ABS(short, ushort)
++COMPILER_ABS(ushort, ushort)
+diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
+index cc99370..8cdf10d 100644
+--- a/utests/CMakeLists.txt
++++ b/utests/CMakeLists.txt
+@@ -32,6 +32,7 @@ set (utests_sources
+   compiler_copy_buffer_row.cpp
+   compiler_fabs.cpp
+   compiler_abs.cpp
++  compiler_abs_diff.cpp
+   compiler_fill_image.cpp
+   compiler_fill_image0.cpp
+   compiler_fill_image_3d.cpp
+diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
+index a1b14b4..9457b9b 100644
+--- a/utests/compiler_abs.cpp
++++ b/utests/compiler_abs.cpp
+@@ -125,8 +125,6 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
+     U cpu_dst[16];
+     T cpu_src[16];
+ 
+-    printf("sizeof T, is %u, sizeof U is %u\n", (int)sizeof(T), (int)sizeof(U));
+-
+     // Setup buffers
+     OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+     OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
+new file mode 100644
+index 0000000..384a654
+--- /dev/null
++++ b/utests/compiler_abs_diff.cpp
+@@ -0,0 +1,267 @@
++#include "utest_helper.hpp"
++#include "string.h"
++
++template <typename T, int N>
++struct cl_vec {
++    T ptr[((N+1)/2)*2]; //align to 2 elements.
++
++    typedef cl_vec<T, N> vec_type;
++
++    cl_vec(void) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
++    }
++    cl_vec(vec_type & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
++        memcpy (this->ptr, other.ptr, sizeof(T) * N);
++    }
++
++    vec_type& operator= (vec_type & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
++        memcpy (this->ptr, other.ptr, sizeof(T) * N);
++        return *this;
++    }
++
++    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
++        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
++        memcpy (this->ptr, other.ptr, sizeof(T) * N);
++        return *this;
++    }
++
++    bool operator== (vec_type & other) {
++        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
++    }
++
++    void abs_diff(vec_type & other) {
++        int i = 0;
++        for (; i < N; i++) {
++            T a = ptr[i];
++            T b = other.ptr[i];
++            T f = a > b ? (a - b) : (b - a);
++            ptr[i] = f;
++        }
++    }
++};
++
++template <typename T, typename U, int N> static void cpu (int global_id,
++        cl_vec<T, N> *x, cl_vec<T, N> *y, cl_vec<U, N> *diff)
++{
++    cl_vec<T, N> v  = x[global_id];
++    v.abs_diff(y[global_id]);
++    diff[global_id] = v;
++}
++
++template <typename T, typename U> static void cpu(int global_id, T *x, T *y, U *diff)
++{
++    T a = x[global_id];
++    T b = y[global_id];
++    U f = a > b ? (a - b) : (b - a);
++    diff[global_id] = f;
++}
++
++template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
++{
++    int i = 0;
++    for (; i < N; i++) {
++        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
++    }
++}
++
++template <typename T> static void gen_rand_val (T & val)
++{
++    val = static_cast<T>((rand() & 63) - 32);
++}
++
++template <typename T>
++inline static void print_data (T& val)
++{
++    if (std::is_unsigned<T>::value)
++        printf(" %u", val);
++    else
++        printf(" %d", val);
++}
++
++template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* x,
++        cl_vec<T, N>* y, cl_vec<U, N>* diff, int n)
++{
++    U* val = reinterpret_cast<U *>(diff);
++
++    n = n*((N+1)/2)*2;
++
++    printf("\nRaw x: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((T *)buf_data[0])[i]);
++    }
++    printf("\nRaw y: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((T *)buf_data[1])[i]);
++    }
++
++    printf("\nCPU diff: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(val[i]);
++    }
++    printf("\nGPU diff: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((U *)buf_data[2])[i]);
++    }
++}
++
++template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, int n)
++{
++    printf("\nRaw x: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((T *)buf_data[0])[i]);
++    }
++    printf("\nRaw y: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((T *)buf_data[1])[i]);
++    }
++
++    printf("\nCPU diff: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(diff[i]);
++    }
++    printf("\nGPU diff: \n");
++    for (int32_t i = 0; i < (int32_t) n; ++i) {
++        print_data(((U *)buf_data[2])[i]);
++    }
++}
++
++template <typename T, typename U> static void compiler_abs_diff_with_type(void)
++{
++    const size_t n = 16;
++    U cpu_diff[16];
++    T cpu_x[16];
++    T cpu_y[16];
++
++    // Setup buffers
++    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
++    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
++    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(U), NULL);
++    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
++    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
++    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
++    globals[0] = 16;
++    locals[0] = 16;
++
++    // Run random tests
++    for (uint32_t pass = 0; pass < 8; ++pass) {
++        OCL_MAP_BUFFER(0);
++        OCL_MAP_BUFFER(1);
++
++        /* Clear the dst buffer to avoid random data. */
++        OCL_MAP_BUFFER(2);
++        memset(buf_data[2], 0, sizeof(U) * n);
++        OCL_UNMAP_BUFFER(2);
++
++        for (int32_t i = 0; i < (int32_t) n; ++i) {
++            gen_rand_val(cpu_x[i]);
++            gen_rand_val(cpu_y[i]);
++        }
++
++        memcpy(buf_data[0], cpu_x, sizeof(T) * n);
++        memcpy(buf_data[1], cpu_y, sizeof(T) * n);
++
++        // Run the kernel on GPU
++        OCL_NDRANGE(1);
++
++        // Run on CPU
++        for (int32_t i = 0; i < (int32_t) n; ++i)
++            cpu(i, cpu_x, cpu_y, cpu_diff);
++
++        // Compare
++        OCL_MAP_BUFFER(2);
++
++//      dump_data(cpu_x, cpu_y, cpu_diff, n);
++
++        OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
++
++        OCL_UNMAP_BUFFER(0);
++        OCL_UNMAP_BUFFER(1);
++        OCL_UNMAP_BUFFER(2);
++    }
++}
++
++#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) \
++	static void compiler_abs_diff_##TYPE (void) \
++        { \
++           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#TYPE, SOURCE, NULL);  \
++           compiler_abs_diff_with_type<TYPE, UTYPE>(); \
++        } \
++	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##TYPE);
++
++typedef unsigned char uchar;
++typedef unsigned short ushort;
++typedef unsigned int uint;
++ABS_TEST_DIFF_TYPE(int, uint)
++ABS_TEST_DIFF_TYPE(short, ushort)
++ABS_TEST_DIFF_TYPE(char, uchar)
++ABS_TEST_DIFF_TYPE(uint, uint)
++ABS_TEST_DIFF_TYPE(ushort, ushort)
++ABS_TEST_DIFF_TYPE(uchar, uchar)
++
++
++typedef cl_vec<int, 2> int2;
++typedef cl_vec<int, 3> int3;
++typedef cl_vec<int, 4> int4;
++typedef cl_vec<int, 8> int8;
++typedef cl_vec<int, 16> int16;
++typedef cl_vec<unsigned int, 2> uint2;
++typedef cl_vec<unsigned int, 3> uint3;
++typedef cl_vec<unsigned int, 4> uint4;
++typedef cl_vec<unsigned int, 8> uint8;
++typedef cl_vec<unsigned int, 16> uint16;
++ABS_TEST_DIFF_TYPE(int2, uint2)
++ABS_TEST_DIFF_TYPE(int3, uint3)
++ABS_TEST_DIFF_TYPE(int4, uint4)
++ABS_TEST_DIFF_TYPE(int8, uint8)
++ABS_TEST_DIFF_TYPE(int16, uint16)
++ABS_TEST_DIFF_TYPE(uint2, uint2)
++ABS_TEST_DIFF_TYPE(uint3, uint3)
++ABS_TEST_DIFF_TYPE(uint4, uint4)
++ABS_TEST_DIFF_TYPE(uint8, uint8)
++ABS_TEST_DIFF_TYPE(uint16, uint16)
++
++
++typedef cl_vec<char, 2> char2;
++typedef cl_vec<char, 3> char3;
++typedef cl_vec<char, 4> char4;
++typedef cl_vec<char, 8> char8;
++typedef cl_vec<char, 16> char16;
++typedef cl_vec<unsigned char, 2> uchar2;
++typedef cl_vec<unsigned char, 3> uchar3;
++typedef cl_vec<unsigned char, 4> uchar4;
++typedef cl_vec<unsigned char, 8> uchar8;
++typedef cl_vec<unsigned char, 16> uchar16;
++ABS_TEST_DIFF_TYPE(char2, uchar2)
++ABS_TEST_DIFF_TYPE(char3, uchar3)
++ABS_TEST_DIFF_TYPE(char4, uchar4)
++ABS_TEST_DIFF_TYPE(char8, uchar8)
++ABS_TEST_DIFF_TYPE(char16, uchar16)
++ABS_TEST_DIFF_TYPE(uchar2, uchar2)
++ABS_TEST_DIFF_TYPE(uchar3, uchar3)
++ABS_TEST_DIFF_TYPE(uchar4, uchar4)
++ABS_TEST_DIFF_TYPE(uchar8, uchar8)
++ABS_TEST_DIFF_TYPE(uchar16, uchar16)
++
++
++typedef cl_vec<short, 2> short2;
++typedef cl_vec<short, 3> short3;
++typedef cl_vec<short, 4> short4;
++typedef cl_vec<short, 8> short8;
++typedef cl_vec<short, 16> short16;
++typedef cl_vec<unsigned short, 2> ushort2;
++typedef cl_vec<unsigned short, 3> ushort3;
++typedef cl_vec<unsigned short, 4> ushort4;
++typedef cl_vec<unsigned short, 8> ushort8;
++typedef cl_vec<unsigned short, 16> ushort16;
++ABS_TEST_DIFF_TYPE(short2, ushort2)
++ABS_TEST_DIFF_TYPE(short3, ushort3)
++ABS_TEST_DIFF_TYPE(short4, ushort4)
++ABS_TEST_DIFF_TYPE(short8, ushort8)
++ABS_TEST_DIFF_TYPE(short16, ushort16)
++ABS_TEST_DIFF_TYPE(ushort2, ushort2)
++ABS_TEST_DIFF_TYPE(ushort3, ushort3)
++ABS_TEST_DIFF_TYPE(ushort4, ushort4)
++ABS_TEST_DIFF_TYPE(ushort8, ushort8)
++ABS_TEST_DIFF_TYPE(ushort16, ushort16)
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch b/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
new file mode 100644
index 0000000..0dd14f4
--- /dev/null
+++ b/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
@@ -0,0 +1,95 @@
+From cddef6876d11c289a0bcfe75d5f11ffe982ba5e4 Mon Sep 17 00:00:00 2001
+From: Simon Richter <Simon.Richter at hogyros.de>
+Date: Wed, 19 Jun 2013 11:30:36 +0200
+Subject: [PATCH 10/10] Readd OpenCL 1.2 definitions required for ICD
+To: beignet at lists.freedesktop.org
+
+The definition for the ICD dispatch table requires a few additional
+definitions from OpenCL 1.2.
+---
+ include/CL/cl.h          |   15 +++++++++++++++
+ include/CL/cl_platform.h |    2 ++
+ src/cl_mem.h             |   12 ------------
+ 3 files changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/include/CL/cl.h b/include/CL/cl.h
+index 4355e74..a7f25d1 100644
+--- a/include/CL/cl.h
++++ b/include/CL/cl.h
+@@ -67,6 +67,7 @@ typedef cl_uint             cl_channel_type;
+ typedef cl_bitfield         cl_mem_flags;
+ typedef cl_uint             cl_mem_object_type;
+ typedef cl_uint             cl_mem_info;
++typedef cl_bitfield         cl_mem_migration_flags;
+ typedef cl_uint             cl_image_info;
+ typedef cl_uint             cl_buffer_create_type;
+ typedef cl_uint             cl_addressing_mode;
+@@ -75,8 +76,10 @@ typedef cl_uint             cl_sampler_info;
+ typedef cl_bitfield         cl_map_flags;
+ typedef cl_uint             cl_program_info;
+ typedef cl_uint             cl_program_build_info;
++typedef intptr_t            cl_device_partition_property;
+ typedef cl_int              cl_build_status;
+ typedef cl_uint             cl_kernel_info;
++typedef cl_uint             cl_kernel_arg_info;
+ typedef cl_uint             cl_kernel_work_group_info;
+ typedef cl_uint             cl_event_info;
+ typedef cl_uint             cl_command_type;
+@@ -87,6 +90,18 @@ typedef struct _cl_image_format {
+     cl_channel_type         image_channel_data_type;
+ } cl_image_format;
+ 
++typedef struct _cl_image_desc {
++    cl_mem_object_type      image_type;
++    size_t                  image_width;
++    size_t                  image_height;
++    size_t                  image_depth;
++    size_t                  image_array_size;
++    size_t                  image_row_pitch;
++    size_t                  image_slice_pitch;
++    cl_uint                 num_mip_levels;
++    cl_uint                 num_samples;
++    cl_mem                  buffer;
++} cl_image_desc;
+ 
+ typedef struct _cl_buffer_region {
+     size_t                  origin;
+diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
+index 043b048..9a2f17a 100644
+--- a/include/CL/cl_platform.h
++++ b/include/CL/cl_platform.h
+@@ -58,6 +58,8 @@ extern "C" {
+     #define CL_EXT_SUFFIX__VERSION_1_0
+     #define CL_API_SUFFIX__VERSION_1_1
+     #define CL_EXT_SUFFIX__VERSION_1_1
++    #define CL_API_SUFFIX__VERSION_1_2
++    #define CL_EXT_SUFFIX__VERSION_1_2
+     #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #endif
+ 
+diff --git a/src/cl_mem.h b/src/cl_mem.h
+index 33ad174..66815fe 100644
+--- a/src/cl_mem.h
++++ b/src/cl_mem.h
+@@ -29,18 +29,6 @@
+ #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+ #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+ #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+-typedef struct _cl_image_desc {
+-    cl_mem_object_type      image_type;
+-    size_t                  image_width;
+-    size_t                  image_height;
+-    size_t                  image_depth;
+-    size_t                  image_array_size;
+-    size_t                  image_row_pitch;
+-    size_t                  image_slice_pitch;
+-    cl_uint                 num_mip_levels;
+-    cl_uint                 num_samples;
+-    cl_mem                  buffer;
+-} cl_image_desc;
+ #endif
+ 
+ typedef enum cl_image_tiling {
+-- 
+1.7.10.4
+
diff --git a/debian/patches/series b/debian/patches/series
index 90fcaf8..9f3609d 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -4,12 +4,11 @@ khronos
 deprecated-in-utest
 private
 0001-Add-vector-argument-test-case.patch
-0003-Add-the-builtin-function-abs-and-the-according-test-.patch
-0004-PATCH-Refine-the-get_local_id-.-builtins.patch
-0005-Add-atomic-help-functions.-Support-global-and-local-.patch
-0006-Add-all-atomic-built-in-functions-support.patch
-0007-Add-atomic-test-case.patch
-0008-support-built-in-function-rotate.patch
-0009-test-case-for-function-rotate.patch
-0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
-0011-utests-Add-basic-arithmetic-test-case.patch
+0002-Fix-atomic-test-failed-in-GT1.patch
+0003-GBE-fixed-a-barrier-related-bug.patch
+0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
+0005-Disable-error-message-output-in-release-version.patch
+0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
+0007-Add-the-vector3-support-for-builtin-abs-function.patch
+0008-Add-the-abs_diff-builtin-function-support.patch
+0009-Add-the-test-case-for-builtin-abs_diff-function.patch

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git