[Pkg-opencl-devel] [beignet] 25/47: Imported Debian patch 0.2-1

Fri Oct 31 21:45:48 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 6dbd2d024c6c93269b900e05eeb1c59d526df928
Author: Simon Richter <sjr at debian.org>
Date:   Fri Jul 5 15:01:51 2013 +0200

    Imported Debian patch 0.2-1
---
 debian/changelog                                   |   6 +
 .../0001-Add-vector-argument-test-case.patch       |  74 ---
 .../0002-Fix-atomic-test-failed-in-GT1.patch       | 150 -----
 ...d-OpenCL-1.2-definitions-required-for-ICD.patch |  95 ----
 ...ltin-function-abs-and-the-according-test-.patch | 219 --------
 .../0003-GBE-fixed-a-barrier-related-bug.patch     |  71 ---
 ...-PATCH-Refine-the-get_local_id-.-builtins.patch |  55 --
 ...ease-local-size-in-the-two-barrier-test-c.patch |  45 --
 ...help-functions.-Support-global-and-local-.patch | 611 ---------------------
 ...e-error-message-output-in-release-version.patch |  71 ---
 ...Add-all-atomic-built-in-functions-support.patch | 289 ----------
 ...the-builtin-function-vect-return-to-vect_.patch | 144 -----
 debian/patches/0007-Add-atomic-test-case.patch     | 188 -------
 ...-vector3-support-for-builtin-abs-function.patch | 252 ---------
 ...Add-the-abs_diff-builtin-function-support.patch | 109 ----
 .../0008-support-built-in-function-rotate.patch    |  58 --
 ...e-test-case-for-builtin-abs_diff-function.patch | 355 ------------
 .../0009-test-case-for-function-rotate.patch       |  87 ---
 ...more-support-of-char-and-short-arithmetic.patch | 153 ------
 ...d-OpenCL-1.2-definitions-required-for-ICD.patch |  95 ----
 ...011-utests-Add-basic-arithmetic-test-case.patch | 336 -----------
 debian/patches/series                              |   9 -
 22 files changed, 6 insertions(+), 3466 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index 532b64e..922cd70 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+beignet (0.2-1) unstable; urgency=low
+
+  * New upstream release (Closes: #712903)
+
+ -- Simon Richter <sjr at debian.org>  Fri, 05 Jul 2013 15:01:51 +0200
+
 beignet (0.1+git20130703+84f63e8-1) unstable; urgency=low
 
   * New upstream release
diff --git a/debian/patches/0001-Add-vector-argument-test-case.patch b/debian/patches/0001-Add-vector-argument-test-case.patch
deleted file mode 100644
index 4989208..0000000
--- a/debian/patches/0001-Add-vector-argument-test-case.patch
+++ /dev/null
@@ -1,74 +0,0 @@
-From a5bb2bdda3a0cf4105623565e0f814dda580fc48 Mon Sep 17 00:00:00 2001
-From: Yang Rong <rong.r.yang at intel.com>
-Date: Thu, 16 May 2013 12:36:35 +0800
-Subject: [PATCH 01/10] Add vector argument test case.
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Yang Rong <rong.r.yang at intel.com>
----
- kernels/compiler_function_argument2.cl |    6 ++++++
- utests/CMakeLists.txt                  |    1 +
- utests/compiler_function_argument2.cpp |   26 ++++++++++++++++++++++++++
- 3 files changed, 33 insertions(+)
- create mode 100644 kernels/compiler_function_argument2.cl
- create mode 100644 utests/compiler_function_argument2.cpp
-
-diff --git a/kernels/compiler_function_argument2.cl b/kernels/compiler_function_argument2.cl
-new file mode 100644
-index 0000000..0985dbd
---- /dev/null
-+++ b/kernels/compiler_function_argument2.cl
-@@ -0,0 +1,6 @@
-+__kernel void
-+compiler_function_argument2(__global int *dst, int4 value)
-+{
-+  int id = (int)get_global_id(0);
-+  dst[id] = value.w;
-+}
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index 3fe0065..cc99370 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -38,6 +38,7 @@ set (utests_sources
-   compiler_fill_image_3d_2.cpp
-   compiler_function_argument0.cpp
-   compiler_function_argument1.cpp
-+  compiler_function_argument2.cpp
-   compiler_function_argument.cpp
-   compiler_function_constant0.cpp
-   compiler_function_constant1.cpp
-diff --git a/utests/compiler_function_argument2.cpp b/utests/compiler_function_argument2.cpp
-new file mode 100644
-index 0000000..1e398a9
---- /dev/null
-+++ b/utests/compiler_function_argument2.cpp
-@@ -0,0 +1,26 @@
-+#include "utest_helper.hpp"
-+
-+struct int4 {int x,y,z,w;};
-+void compiler_function_argument2(void)
-+{
-+  const size_t n = 2048;
-+  const int4 value = {31, 32, 33, 34};
-+
-+  // Setup kernel and buffers
-+  OCL_CREATE_KERNEL("compiler_function_argument2");
-+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
-+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+  OCL_SET_ARG(1, sizeof(int4), &value);
-+
-+  // Run the kernel
-+  globals[0] = n;
-+  locals[0] = 16;
-+  OCL_NDRANGE(1);
-+  OCL_MAP_BUFFER(0);
-+
-+  // Check results
-+  for (uint32_t i = 0; i < n; ++i)
-+    OCL_ASSERT(((int*)buf_data[0])[i] == value.w);
-+}
-+
-+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument2);
--- 
-1.7.10.4
-
diff --git a/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch b/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch
deleted file mode 100644
index ca71f65..0000000
--- a/debian/patches/0002-Fix-atomic-test-failed-in-GT1.patch
+++ /dev/null
@@ -1,150 +0,0 @@
-From 8470b1b15e78673f951ef8e58c5ff043909b5152 Mon Sep 17 00:00:00 2001
-From: Yang Rong <rong.r.yang at intel.com>
-Date: Tue, 2 Jul 2013 15:22:24 +0800
-Subject: [PATCH 02/10] Fix atomic test failed in GT1.
-To: beignet at lists.freedesktop.org
-
-Barrier only ensure one work group finish, can't guarantee all work item's atomic ops
-have finished before the last atomic_add.
-So use atomic_xchg to update first work group's local buffer to other global buffer position.
-
-Signed-off-by: Yang Rong <rong.r.yang at intel.com>
----
- kernels/compiler_atomic_functions.cl |   19 +++++++++++++------
- utests/compiler_atomic_functions.cpp |   23 ++++++++++++-----------
- 2 files changed, 25 insertions(+), 17 deletions(-)
-
-diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
-index 61ce2f4..fbc16fb 100644
---- a/kernels/compiler_atomic_functions.cl
-+++ b/kernels/compiler_atomic_functions.cl
-@@ -1,14 +1,21 @@
- __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
-   int lid = get_local_id(0);
-   int i = lid % 12;
--  atomic_xchg(&tmp[4], -1);
-+  if(lid == 0) {
-+    for(int j=0; j<12; j=j+1) {
-+      atomic_xchg(&tmp[j], 0);
-+    }
-+    atomic_xchg(&tmp[4], -1);
-+  }
-+  barrier(CLK_LOCAL_MEM_FENCE);
-+
-   switch(i) {
-     case 0: atomic_inc(&tmp[i]); break;
-     case 1: atomic_dec(&tmp[i]); break;
-     case 2: atomic_add(&tmp[i], src[lid]); break;
-     case 3: atomic_sub(&tmp[i], src[lid]); break;
--    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
--    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
-+    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
-+    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
-     case 6: atomic_xor(&tmp[i], src[lid]); break;
-     case 7: atomic_min(&tmp[i], -src[lid]); break;
-     case 8: atomic_max(&tmp[i], src[lid]); break;
-@@ -23,8 +30,8 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
-     case 1: atomic_dec(&dst[i]); break;
-     case 2: atomic_add(&dst[i], src[lid]); break;
-     case 3: atomic_sub(&dst[i], src[lid]); break;
--    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
--    case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
-+    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
-+    case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
-     case 6: atomic_xor(&dst[i], src[lid]); break;
-     case 7: atomic_min(&dst[i], -src[lid]); break;
-     case 8: atomic_max(&dst[i], src[lid]); break;
-@@ -38,6 +45,6 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
- 
-   if(get_global_id(0) == 0) {
-     for(i=0; i<12; i=i+1)
--      atomic_add(&dst[i], tmp[i]);
-+      atomic_xchg(&dst[i+12], tmp[i]);
-   }
- }
-diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
-index 571e0c6..65f1c5a 100644
---- a/utests/compiler_atomic_functions.cpp
-+++ b/utests/compiler_atomic_functions.cpp
-@@ -4,12 +4,12 @@
- #include <string.h>
- 
- #define GROUP_NUM 16
--#define LOCAL_SIZE 64
-+#define LOCAL_SIZE 256
- static void cpu_compiler_atomic(int *dst, int *src)
- {
-   dst[4] = 0xffffffff;
-   int tmp[16] = { 0 };
--
-+  tmp[4] = -1;
-   for(int j=0; j<LOCAL_SIZE; j++) {
-     int i = j % 12;
- 
-@@ -18,8 +18,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
-       case 1: tmp[i] -= 1; break;
-       case 2: tmp[i] += src[j]; break;
-       case 3: tmp[i] -= src[j]; break;
--      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
--      case 5: tmp[i] |= src[j]<<(j>>2); break;
-+      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
-+      case 5: tmp[i] |= src[j]<<(j>>4); break;
-       case 6: tmp[i] ^= src[j]; break;
-       case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
-       case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
-@@ -39,8 +39,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
-         case 1: dst[i] -= 1; break;
-         case 2: dst[i] += src[j]; break;
-         case 3: dst[i] -= src[j]; break;
--        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
--        case 5: dst[i] |= src[j]<<(j>>2); break;
-+        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
-+        case 5: dst[i] |= src[j]<<(j>>4); break;
-         case 6: dst[i] ^= src[j]; break;
-         case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
-         case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
-@@ -53,27 +53,28 @@ static void cpu_compiler_atomic(int *dst, int *src)
-   }
- 
-   for(int i=0; i<12; i++)
--    dst[i] += tmp[i];
-+    dst[i+12] = tmp[i];
- }
- 
- static void compiler_atomic_functions(void)
- {
-   const size_t n = GROUP_NUM * LOCAL_SIZE;
--  int cpu_dst[16] = {0}, cpu_src[256];
-+  int cpu_dst[24] = {0}, cpu_src[256];
- 
-   globals[0] = n;
-   locals[0] = LOCAL_SIZE;
- 
-   // Setup kernel and buffers
-   OCL_CREATE_KERNEL("compiler_atomic_functions");
--  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
-+  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
-   OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
-   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-   OCL_SET_ARG(1, 16 * sizeof(int), NULL);
-   OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
- 
-   OCL_MAP_BUFFER(0);
--  memset(buf_data[0], 0, 16 * sizeof(int));
-+  memset(buf_data[0], 0, 24 * sizeof(int));
-+  ((int *)buf_data[0])[4] = -1;
-   OCL_UNMAP_BUFFER(0);
- 
-   OCL_MAP_BUFFER(1);
-@@ -86,7 +87,7 @@ static void compiler_atomic_functions(void)
-   OCL_MAP_BUFFER(0);
- 
-   // Check results
--  for(int i=0; i<12; i++) {
-+  for(int i=0; i<24; i++) {
-     //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
-     OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
-   }
--- 
-1.7.10.4
-
diff --git a/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch b/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
deleted file mode 100644
index bbda3ac..0000000
--- a/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
+++ /dev/null
@@ -1,95 +0,0 @@
-From 116a8feff8b91cedcd91c087ba14214ee939fa72 Mon Sep 17 00:00:00 2001
-From: Simon Richter <Simon.Richter at hogyros.de>
-Date: Wed, 19 Jun 2013 11:30:36 +0200
-Subject: [PATCH 02/11] Readd OpenCL 1.2 definitions required for ICD
-To: beignet at lists.freedesktop.org
-
-The definition for the ICD dispatch table requires a few additional
-definitions from OpenCL 1.2.
----
- include/CL/cl.h          |   15 +++++++++++++++
- include/CL/cl_platform.h |    2 ++
- src/cl_mem.h             |   12 ------------
- 3 files changed, 17 insertions(+), 12 deletions(-)
-
-diff --git a/include/CL/cl.h b/include/CL/cl.h
-index 4355e74..a7f25d1 100644
---- a/include/CL/cl.h
-+++ b/include/CL/cl.h
-@@ -67,6 +67,7 @@ typedef cl_uint             cl_channel_type;
- typedef cl_bitfield         cl_mem_flags;
- typedef cl_uint             cl_mem_object_type;
- typedef cl_uint             cl_mem_info;
-+typedef cl_bitfield         cl_mem_migration_flags;
- typedef cl_uint             cl_image_info;
- typedef cl_uint             cl_buffer_create_type;
- typedef cl_uint             cl_addressing_mode;
-@@ -75,8 +76,10 @@ typedef cl_uint             cl_sampler_info;
- typedef cl_bitfield         cl_map_flags;
- typedef cl_uint             cl_program_info;
- typedef cl_uint             cl_program_build_info;
-+typedef intptr_t            cl_device_partition_property;
- typedef cl_int              cl_build_status;
- typedef cl_uint             cl_kernel_info;
-+typedef cl_uint             cl_kernel_arg_info;
- typedef cl_uint             cl_kernel_work_group_info;
- typedef cl_uint             cl_event_info;
- typedef cl_uint             cl_command_type;
-@@ -87,6 +90,18 @@ typedef struct _cl_image_format {
-     cl_channel_type         image_channel_data_type;
- } cl_image_format;
- 
-+typedef struct _cl_image_desc {
-+    cl_mem_object_type      image_type;
-+    size_t                  image_width;
-+    size_t                  image_height;
-+    size_t                  image_depth;
-+    size_t                  image_array_size;
-+    size_t                  image_row_pitch;
-+    size_t                  image_slice_pitch;
-+    cl_uint                 num_mip_levels;
-+    cl_uint                 num_samples;
-+    cl_mem                  buffer;
-+} cl_image_desc;
- 
- typedef struct _cl_buffer_region {
-     size_t                  origin;
-diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
-index 043b048..9a2f17a 100644
---- a/include/CL/cl_platform.h
-+++ b/include/CL/cl_platform.h
-@@ -58,6 +58,8 @@ extern "C" {
-     #define CL_EXT_SUFFIX__VERSION_1_0
-     #define CL_API_SUFFIX__VERSION_1_1
-     #define CL_EXT_SUFFIX__VERSION_1_1
-+    #define CL_API_SUFFIX__VERSION_1_2
-+    #define CL_EXT_SUFFIX__VERSION_1_2
-     #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
- #endif
- 
-diff --git a/src/cl_mem.h b/src/cl_mem.h
-index 33ad174..66815fe 100644
---- a/src/cl_mem.h
-+++ b/src/cl_mem.h
-@@ -29,18 +29,6 @@
- #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
- #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
- #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
--typedef struct _cl_image_desc {
--    cl_mem_object_type      image_type;
--    size_t                  image_width;
--    size_t                  image_height;
--    size_t                  image_depth;
--    size_t                  image_array_size;
--    size_t                  image_row_pitch;
--    size_t                  image_slice_pitch;
--    cl_uint                 num_mip_levels;
--    cl_uint                 num_samples;
--    cl_mem                  buffer;
--} cl_image_desc;
- #endif
- 
- typedef enum cl_image_tiling {
--- 
-1.7.10.4
-
diff --git a/debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch b/debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch
deleted file mode 100644
index cf69bf3..0000000
--- a/debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch
+++ /dev/null
@@ -1,219 +0,0 @@
-From 8b61fc4c2644e6748577d30567e6e4bf70aef436 Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Tue, 25 Jun 2013 15:50:54 +0800
-Subject: [PATCH 03/11] Add the builtin function abs() and the according test
- case
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- backend/src/backend/gen_insn_selection.cpp |   10 ++++-
- backend/src/llvm/llvm_gen_backend.cpp      |   10 ++++-
- backend/src/llvm/llvm_gen_ocl_function.hxx |    3 +-
- backend/src/ocl_stdlib.h                   |    4 ++
- kernels/compiler_abs.cl                    |    5 +++
- utests/CMakeLists.txt                      |    1 +
- utests/compiler_abs.cpp                    |   61 ++++++++++++++++++++++++++++
- 7 files changed, 91 insertions(+), 3 deletions(-)
- create mode 100644 kernels/compiler_abs.cl
- create mode 100644 utests/compiler_abs.cpp
-
-diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index 8fb2a80..11efcb9 100644
---- a/backend/src/backend/gen_insn_selection.cpp
-+++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -1224,7 +1224,15 @@ namespace gbe
-       const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode));
-       const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode));
-       switch (opcode) {
--        case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
-+        case ir::OP_ABS:
-+          if (insn.getType() == ir::TYPE_S32 || insn.getType() == ir::TYPE_U32) {
-+            const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
-+            const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
-+            sel.MOV(dst_, GenRegister::abs(src_));
-+          } else {
-+            sel.MOV(dst, GenRegister::abs(src));
-+          }
-+	  break;
-         case ir::OP_MOV:
-           if (dst.isdf()) {
-             ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
-diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index 08500ba..ba2192a 100644
---- a/backend/src/llvm/llvm_gen_backend.cpp
-+++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1688,6 +1688,7 @@ namespace gbe
-       case GEN_OCL_POW:
-       case GEN_OCL_RCP:
-       case GEN_OCL_ABS:
-+      case GEN_OCL_FABS:
-       case GEN_OCL_RNDZ:
-       case GEN_OCL_RNDE:
-       case GEN_OCL_RNDU:
-@@ -1844,6 +1845,13 @@ namespace gbe
-             ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
-             break;
-           }
-+          case GEN_OCL_ABS:
-+          {
-+            const ir::Register src = this->getRegister(*AI);
-+            const ir::Register dst = this->getRegister(&I);
-+            ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
-+            break;
-+          }
-           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
-           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
-           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
-@@ -1852,7 +1860,7 @@ namespace gbe
-           case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
-           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
-           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
--          case GEN_OCL_ABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
-+          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
-           case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
-           case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
-           case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
-diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
-index fe19844..89a04ea 100644
---- a/backend/src/llvm/llvm_gen_ocl_function.hxx
-+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
-@@ -19,7 +19,8 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
- DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
- 
- // Math function
--DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
-+DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
-+DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
- DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
- DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
- DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 3b191ab..5ad829e 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4401,6 +4401,7 @@ INLINE uint get_global_id(uint dim) {
- /////////////////////////////////////////////////////////////////////////////
- // Math Functions (see 6.11.2 of OCL 1.1 spec)
- /////////////////////////////////////////////////////////////////////////////
-+PURE CONST int __gen_ocl_abs(int x);
- PURE CONST float __gen_ocl_fabs(float x);
- PURE CONST float __gen_ocl_sin(float x);
- PURE CONST float __gen_ocl_cos(float x);
-@@ -4579,6 +4580,7 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
- // XXX work-around PTX profile
- #define sqrt native_sqrt
- INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-+INLINE_OVERLOADABLE int __gen_ocl_internal_abs(int x)  { return __gen_ocl_abs(x); }
- INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
- INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
- INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-@@ -4924,6 +4926,7 @@ DECL_VECTOR_1OP(native_exp10, float);
- DECL_VECTOR_1OP(__gen_ocl_internal_expm1, float);
- DECL_VECTOR_1OP(__gen_ocl_internal_cbrt, float);
- DECL_VECTOR_1OP(__gen_ocl_internal_fabs, float);
-+DECL_VECTOR_1OP(__gen_ocl_internal_abs, int);
- DECL_VECTOR_1OP(__gen_ocl_internal_trunc, float);
- DECL_VECTOR_1OP(__gen_ocl_internal_round, float);
- DECL_VECTOR_1OP(__gen_ocl_internal_floor, float);
-@@ -5051,6 +5054,7 @@ INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(fl
- INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
- 
- // XXX workaround ptx profile
-+#define abs __gen_ocl_internal_abs
- #define fabs __gen_ocl_internal_fabs
- #define trunc __gen_ocl_internal_trunc
- #define round __gen_ocl_internal_round
-diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
-new file mode 100644
-index 0000000..7030a26
---- /dev/null
-+++ b/kernels/compiler_abs.cl
-@@ -0,0 +1,5 @@
-+kernel void compiler_abs(global int *src, global int *dst) {
-+  int i = get_global_id(0);
-+  dst[i] = abs(src[i]);
-+}
-+
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index 8a58ff4..b75f3b4 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -33,6 +33,7 @@ set (utests_sources
-   compiler_double_2.cpp
-   compiler_double_3.cpp
-   compiler_fabs.cpp
-+  compiler_abs.cpp
-   compiler_fill_image.cpp
-   compiler_fill_image0.cpp
-   compiler_fill_image_3d.cpp
-diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
-new file mode 100644
-index 0000000..908a32a
---- /dev/null
-+++ b/utests/compiler_abs.cpp
-@@ -0,0 +1,61 @@
-+#include "utest_helper.hpp"
-+
-+static void cpu(int global_id, int *src, int *dst) {
-+    int f = src[global_id];
-+    f = f < 0 ? -f : f;
-+    dst[global_id] = f;
-+}
-+
-+void compiler_abs(void)
-+{
-+    const size_t n = 16;
-+    int cpu_dst[16], cpu_src[16];
-+
-+    // Setup kernel and buffers
-+    OCL_CREATE_KERNEL("compiler_abs");
-+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
-+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
-+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-+    globals[0] = 16;
-+    locals[0] = 16;
-+
-+    // Run random tests
-+    for (uint32_t pass = 0; pass < 8; ++pass) {
-+        OCL_MAP_BUFFER(0);
-+        for (int32_t i = 0; i < (int32_t) n; ++i)
-+            cpu_src[i] = ((int*)buf_data[0])[i] = (rand() & 15) - 7;
-+
-+        // Run the kernel on GPU
-+        OCL_NDRANGE(1);
-+
-+        // Run on CPU
-+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
-+
-+        // Compare
-+        OCL_MAP_BUFFER(1);
-+
-+#if 0
-+        printf("Raw DATA: \n");
-+        for (int32_t i = 0; i < (int32_t) n; ++i) {
-+            printf(" %d", ((int *)buf_data[0])[i]);
-+        }
-+
-+        printf("\nCPU: \n");
-+        for (int32_t i = 0; i < (int32_t) n; ++i) {
-+            printf(" %d", cpu_dst[i]);
-+        }
-+        printf("\nGPU: \n");
-+        for (int32_t i = 0; i < (int32_t) n; ++i) {
-+            printf(" %d", ((int *)buf_data[1])[i]);
-+        }
-+#endif
-+
-+        for (int32_t i = 0; i < (int32_t) n; ++i)
-+            OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
-+        OCL_UNMAP_BUFFER(1);
-+        OCL_UNMAP_BUFFER(0);
-+    }
-+}
-+
-+MAKE_UTEST_FROM_FUNCTION(compiler_abs);
--- 
-1.7.10.4
-
diff --git a/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch b/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch
deleted file mode 100644
index 4b6f8fc..0000000
--- a/debian/patches/0003-GBE-fixed-a-barrier-related-bug.patch
+++ /dev/null
@@ -1,71 +0,0 @@
-From 43493e052b553f94f8a05f21bdb5203bddc12870 Mon Sep 17 00:00:00 2001
-From: Zhigang Gong <zhigang.gong at linux.intel.com>
-Date: Tue, 2 Jul 2013 18:49:48 +0800
-Subject: [PATCH 03/10] GBE: fixed a barrier related bug.
-To: beignet at lists.freedesktop.org
-
-Actually, this commit fixed two bugs related to barrier.
-1. We should set useSLM to true if we use barrier.
-2. We need to set barrier id to the barrierMsg payload according to
-r0.2. And we don't need to reprogram the barrierCount.
-
-And after this fix, we don't need the work around for the local
-memory barrier, thus we don't need the memory fence for local memory
-barrier.
-
-Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
----
- backend/src/backend/gen_insn_selection.cpp |   14 ++++++--------
- backend/src/llvm/llvm_gen_backend.cpp      |    1 +
- 2 files changed, 7 insertions(+), 8 deletions(-)
-
-diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index bbe392d..bfe1e28 100644
---- a/backend/src/backend/gen_insn_selection.cpp
-+++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -1792,24 +1792,22 @@ namespace gbe
-       const ir::Register reg = sel.reg(FAMILY_DWORD);
- 
-       const uint32_t params = insn.getParameters();
--      //XXX TODO need to double check local barrier whether need fence or not
--      if(params == syncGlobalBarrier || params == syncLocalBarrier) {
-+      if(params == syncGlobalBarrier) {
-         const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
-         sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
-       }
- 
-       sel.push();
-         sel.curr.predicate = GEN_PREDICATE_NONE;
-+
-+        // As only the payload.2 is used and all the other regions are ignored
-+        // SIMD8 mode here is safe.
-         sel.curr.execWidth = 8;
-         sel.curr.physicalFlag = 0;
-         sel.curr.noMask = 1;
-+        // Copy barrier id from r0.
-+        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
- 
--        sel.SHL(GenRegister::ud8grf(reg),
--                GenRegister::ud1grf(ocl::threadn),
--                GenRegister::immud(0x9));
--        sel.OR(GenRegister::ud8grf(reg),
--               GenRegister::ud8grf(reg),
--               GenRegister::immud(0x00088000));
-         // A barrier is OK to start the thread synchronization *and* SLM fence
-         sel.BARRIER(GenRegister::f8grf(reg));
-         // Now we wait for the other threads
-diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index 8385e21..db34296 100644
---- a/backend/src/llvm/llvm_gen_backend.cpp
-+++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1741,6 +1741,7 @@ namespace gbe
-       case GEN_OCL_LBARRIER:
-       case GEN_OCL_GBARRIER:
-       case GEN_OCL_LGBARRIER:
-+        ctx.getFunction().setUseSLM(true);
-         break;
-       case GEN_OCL_WRITE_IMAGE0:
-       case GEN_OCL_WRITE_IMAGE1:
--- 
-1.7.10.4
-
diff --git a/debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch b/debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch
deleted file mode 100644
index 888ffc4..0000000
--- a/debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch
+++ /dev/null
@@ -1,55 +0,0 @@
-From 3c0ee8cab035426e7e28425d2142e15051d83c90 Mon Sep 17 00:00:00 2001
-From: Zhigang Gong <zhigang.gong at linux.intel.com>
-Date: Tue, 25 Jun 2013 14:15:09 +0800
-Subject: [PATCH 04/11] [PATCH] Refine the get_local_id/... builtins.
-To: beignet at lists.freedesktop.org
-
-As we could prepare correct value on runtime library side and give
-a correct value in the payload for dim 0, 1 and 2. So for these 3
-dim argument, we don't need to check it whether in the valid range,
-we just read the payload's value.
-
-This way, we can avoid any unecessary branching for normal usage of
-these builtin functions. And could avoid a known bool related bug.
-
-Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
----
- backend/src/ocl_stdlib.h |    8 +++-----
- src/cl_api.c             |    2 +-
- 2 files changed, 4 insertions(+), 6 deletions(-)
-
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 5ad829e..227454d 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4379,11 +4379,9 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
- #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
- INLINE unsigned NAME(unsigned int dim) {             \
-   if (dim == 0) return __gen_ocl_##NAME##0();        \
--  else if (dim > 0 && dim < get_work_dim()) {        \
--    if (dim == 1) return __gen_ocl_##NAME##1();      \
--    else if (dim == 2) return __gen_ocl_##NAME##2(); \
--  }                                                  \
--  return OTHER_RET;                                  \
-+  else if (dim == 1) return __gen_ocl_##NAME##1();   \
-+  else if (dim == 2) return __gen_ocl_##NAME##2();   \
-+  else return OTHER_RET;                             \
- }
- 
- DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
-diff --git a/src/cl_api.c b/src/cl_api.c
-index f7db4bc..bb09c07 100644
---- a/src/cl_api.c
-+++ b/src/cl_api.c
-@@ -1581,7 +1581,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
- {
-   size_t fixed_global_off[] = {0,0,0};
-   size_t fixed_global_sz[] = {1,1,1};
--  size_t fixed_local_sz[] = {16,1,1};
-+  size_t fixed_local_sz[] = {1,1,1};
-   cl_int err = CL_SUCCESS;
-   cl_uint i;
- 
--- 
-1.7.10.4
-
diff --git a/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch b/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
deleted file mode 100644
index f0100c0..0000000
--- a/debian/patches/0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
+++ /dev/null
@@ -1,45 +0,0 @@
-From 5c5e5e83918b23709d1135299596b968feff00bc Mon Sep 17 00:00:00 2001
-From: Zhigang Gong <zhigang.gong at linux.intel.com>
-Date: Tue, 2 Jul 2013 18:49:49 +0800
-Subject: [PATCH 04/10] utests: increase local size in the two barrier test
- cases.
-To: beignet at lists.freedesktop.org
-
-Increasing the local size to 256 to bring more pressure
-to barrier testing.
-
-Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
----
- utests/compiler_global_memory_barrier.cpp  |    2 +-
- utests/compiler_local_memory_barrier_2.cpp |    2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
-index a6496a7..ea84e72 100644
---- a/utests/compiler_global_memory_barrier.cpp
-+++ b/utests/compiler_global_memory_barrier.cpp
-@@ -13,7 +13,7 @@ static void compiler_global_memory_barrier(void)
- 
-   // Run the kernel
-   globals[0] = n/2;
--  locals[0] = 32;
-+  locals[0] = 256;
-   OCL_NDRANGE(1);
-   OCL_MAP_BUFFER(0);
- 
-diff --git a/utests/compiler_local_memory_barrier_2.cpp b/utests/compiler_local_memory_barrier_2.cpp
-index b074123..4fa090b 100644
---- a/utests/compiler_local_memory_barrier_2.cpp
-+++ b/utests/compiler_local_memory_barrier_2.cpp
-@@ -5,7 +5,7 @@ static void compiler_local_memory_barrier_2(void)
-   const size_t n = 16*1024;
- 
-   globals[0] = n/2;
--  locals[0] = 32;
-+  locals[0] = 256;
- 
-   // Setup kernel and buffers
-   OCL_CREATE_KERNEL("compiler_local_memory_barrier_2");
--- 
-1.7.10.4
-
diff --git a/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch b/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch
deleted file mode 100644
index f011afd..0000000
--- a/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch
+++ /dev/null
@@ -1,611 +0,0 @@
-From 6e70caed68eb0ace38a8b42b105c0fd4c38e8f9e Mon Sep 17 00:00:00 2001
-From: Yang Rong <rong.r.yang at intel.com>
-Date: Wed, 26 Jun 2013 15:29:21 +0800
-Subject: [PATCH 05/11] Add atomic help functions. Support global and local
- buffer.
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Yang Rong <rong.r.yang at intel.com>
----
- backend/src/backend/gen_context.cpp                |    9 +++
- backend/src/backend/gen_context.hpp                |    1 +
- backend/src/backend/gen_defs.hpp                   |   49 ++++++++++++--
- backend/src/backend/gen_encoder.cpp                |   35 ++++++++++
- backend/src/backend/gen_encoder.hpp                |    2 +
- .../src/backend/gen_insn_gen7_schedule_info.hxx    |    2 +-
- backend/src/backend/gen_insn_selection.cpp         |   61 +++++++++++++++++
- backend/src/backend/gen_insn_selection.hpp         |    4 +-
- backend/src/backend/gen_insn_selection.hxx         |    1 +
- backend/src/ir/context.hpp                         |    6 ++
- backend/src/ir/instruction.cpp                     |   70 +++++++++++++++++++-
- backend/src/ir/instruction.hpp                     |   43 +++++++++++-
- backend/src/ir/instruction.hxx                     |   13 ++++
- 13 files changed, 283 insertions(+), 13 deletions(-)
-
-diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
-index 93d3932..41cab90 100644
---- a/backend/src/backend/gen_context.cpp
-+++ b/backend/src/backend/gen_context.cpp
-@@ -226,6 +226,15 @@ namespace gbe
-     }
-   }
- 
-+  void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
-+    const GenRegister src = ra->genReg(insn.src(0));
-+    const GenRegister dst = ra->genReg(insn.dst(0));
-+    const uint32_t function = insn.extra.function;
-+    const uint32_t bti = insn.extra.elem;
-+
-+    p->ATOMIC(dst, function, src, bti, insn.srcNum);
-+  }
-+
-   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
-     GenRegister src = ra->genReg(insn.src(0));
-     if(isScalarReg(src.reg()))
-diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
-index 804384d..5dfaef9 100644
---- a/backend/src/backend/gen_context.hpp
-+++ b/backend/src/backend/gen_context.hpp
-@@ -91,6 +91,7 @@ namespace gbe
-     void emitWriteFloat64Instruction(const SelectionInstruction &insn);
-     void emitUntypedReadInstruction(const SelectionInstruction &insn);
-     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
-+    void emitAtomicInstruction(const SelectionInstruction &insn);
-     void emitByteGatherInstruction(const SelectionInstruction &insn);
-     void emitByteScatterInstruction(const SelectionInstruction &insn);
-     void emitSampleInstruction(const SelectionInstruction &insn);
-diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
-index d1ce6b2..61412c4 100644
---- a/backend/src/backend/gen_defs.hpp
-+++ b/backend/src/backend/gen_defs.hpp
-@@ -1,4 +1,4 @@
--/* 
-+/*
-  * Copyright © 2012 Intel Corporation
-  *
-  * This library is free software; you can redistribute it and/or
-@@ -21,7 +21,7 @@
-  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
-  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
-  develop this 3D driver.
-- 
-+
-  Permission is hereby granted, free of charge, to any person obtaining
-  a copy of this software and associated documentation files (the
-  "Software"), to deal in the Software without restriction, including
-@@ -29,11 +29,11 @@
-  distribute, sublicense, and/or sell copies of the Software, and to
-  permit persons to whom the Software is furnished to do so, subject to
-  the following conditions:
-- 
-+
-  The above copyright notice and this permission notice (including the
-  next paragraph) shall be included in all copies or substantial
-  portions of the Software.
-- 
-+
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-@@ -41,7 +41,7 @@
-  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-- 
-+
-  **********************************************************************/
-  /*
-   * Authors:
-@@ -169,6 +169,28 @@ enum opcode {
-   GEN_OPCODE_NOP = 126,
- };
- 
-+#define GEN_ATOMIC_SIMD16   0
-+#define GEN_ATOMIC_SIMD8    1
-+
-+enum GenAtomicOpCode {
-+  GEN_ATOMIC_OP_CMPWR8B   = 0,
-+  GEN_ATOMIC_OP_AND       = 1,
-+  GEN_ATOMIC_OP_OR        = 2,
-+  GEN_ATOMIC_OP_XOR       = 3,
-+  GEN_ATOMIC_OP_MOV       = 4,
-+  GEN_ATOMIC_OP_INC       = 5,
-+  GEN_ATOMIC_OP_DEC       = 6,
-+  GEN_ATOMIC_OP_ADD       = 7,
-+  GEN_ATOMIC_OP_SUB       = 8,
-+  GEN_ATOMIC_OP_REVSUB    = 9,
-+  GEN_ATOMIC_OP_IMAX      = 10,
-+  GEN_ATOMIC_OP_IMIN      = 11,
-+  GEN_ATOMIC_OP_UMAX      = 12,
-+  GEN_ATOMIC_OP_UMIN      = 13,
-+  GEN_ATOMIC_OP_CMPWR     = 14,
-+  GEN_ATOMIC_OP_PREDEC    = 15
-+};
-+
- /*! Gen SFID */
- enum GenMessageTarget {
-   GEN_SFID_NULL                     = 0,
-@@ -772,7 +794,7 @@ struct GenInstruction
-     /*! Memory fence */
-     struct {
-       uint32_t bti:8;
--      uint32_t ingored:5;
-+      uint32_t pad:5;
-       uint32_t commit_enable:1;
-       uint32_t msg_type:4;
-       uint32_t pad2:1;
-@@ -783,6 +805,21 @@ struct GenInstruction
-       uint32_t end_of_thread:1;
-     } gen7_memory_fence;
- 
-+    /*! atomic messages */
-+    struct {
-+      uint32_t bti:8;
-+      uint32_t aop_type:4;
-+      uint32_t simd_mode:1;
-+      uint32_t return_data:1;
-+      uint32_t msg_type:4;
-+      uint32_t category:1;
-+      uint32_t header_present:1;
-+      uint32_t response_length:5;
-+      uint32_t msg_length:4;
-+      uint32_t pad3:2;
-+      uint32_t end_of_thread:1;
-+    } gen7_atomic_op;
-+
-     struct {
-       uint32_t src1_subreg_nr_high:1;
-       uint32_t src1_reg_nr:8;
-diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
-index e96678b..43658e8 100644
---- a/backend/src/backend/gen_encoder.cpp
-+++ b/backend/src/backend/gen_encoder.cpp
-@@ -558,6 +558,41 @@ namespace gbe
-                            response_length);
-   }
- 
-+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
-+    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
-+    uint32_t msg_length = 0;
-+    uint32_t response_length = 0;
-+
-+    if (this->curr.execWidth == 8) {
-+      msg_length = srcNum;
-+      response_length = 1;
-+    } else if (this->curr.execWidth == 16) {
-+      msg_length = 2*srcNum;
-+      response_length = 2;
-+    } else
-+      NOT_IMPLEMENTED;
-+
-+    this->setHeader(insn);
-+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
-+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-+    this->setSrc1(insn, GenRegister::immud(0));
-+
-+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
-+    setMessageDescriptor(this, insn, sfid, msg_length, response_length);
-+    insn->bits3.gen7_atomic_op.msg_type = GEN_UNTYPED_ATOMIC_READ;
-+    insn->bits3.gen7_atomic_op.bti = bti;
-+    insn->bits3.gen7_atomic_op.return_data = 1;
-+    insn->bits3.gen7_atomic_op.aop_type = function;
-+
-+    if (this->curr.execWidth == 8)
-+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
-+    else if (this->curr.execWidth == 16)
-+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
-+    else
-+      NOT_SUPPORTED;
-+
-+  }
-+
-   GenInstruction *GenEncoder::next(uint32_t opcode) {
-      GenInstruction insn;
-      std::memset(&insn, 0, sizeof(GenInstruction));
-diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
-index 88a3e77..3ff8c97 100644
---- a/backend/src/backend/gen_encoder.hpp
-+++ b/backend/src/backend/gen_encoder.hpp
-@@ -136,6 +136,8 @@ namespace gbe
-     void NOP(void);
-     /*! Wait instruction (used for the barrier) */
-     void WAIT(void);
-+    /*! Atomic instructions */
-+    void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-     /*! Read 64-bits float arrays */
-     void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-     /*! Write 64-bits float arrays */
-diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
-index a3b4621..f3f4a25 100644
---- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
-+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
-@@ -21,4 +21,4 @@ DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
- DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
- DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
- DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
--
-+DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
-diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index 11efcb9..c64afd9 100644
---- a/backend/src/backend/gen_insn_selection.cpp
-+++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -168,12 +168,14 @@ namespace gbe
-   bool SelectionInstruction::isRead(void) const {
-     return this->opcode == SEL_OP_UNTYPED_READ ||
-            this->opcode == SEL_OP_READ_FLOAT64 ||
-+           this->opcode == SEL_OP_ATOMIC       ||
-            this->opcode == SEL_OP_BYTE_GATHER;
-   }
- 
-   bool SelectionInstruction::isWrite(void) const {
-     return this->opcode == SEL_OP_UNTYPED_WRITE ||
-            this->opcode == SEL_OP_WRITE_FLOAT64 ||
-+           this->opcode == SEL_OP_ATOMIC        ||
-            this->opcode == SEL_OP_BYTE_SCATTER;
-   }
- 
-@@ -456,6 +458,8 @@ namespace gbe
-     void NOP(void);
-     /*! Wait instruction (used for the barrier) */
-     void WAIT(void);
-+    /*! Atomic instruction */
-+    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
-     /*! Read 64 bits float array */
-     void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
-     /*! Write 64 bits float array */
-@@ -730,6 +734,23 @@ namespace gbe
-     insn->src(0) = src;
-   }
- 
-+  void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
-+                                     uint32_t srcNum, Reg src0,
-+                                     Reg src1, Reg src2, uint32_t bti) {
-+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
-+    insn->dst(0) = dst;
-+    insn->src(0) = src0;
-+    if(srcNum > 1) insn->src(1) = src1;
-+    if(srcNum > 2) insn->src(2) = src2;
-+    insn->extra.function = function;
-+    insn->extra.elem     = bti;
-+    SelectionVector *vector = this->appendVector();
-+
-+    vector->regNum = srcNum;
-+    vector->reg = &insn->src(0);
-+    vector->isSrc = 1;
-+  }
-+
-   void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
-   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
-   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
-@@ -916,6 +937,7 @@ namespace gbe
-   bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
-     if (insn.getDstNum() > 1 ||
-         insn.hasSideEffect() ||
-+        insn.isMemberOf<ir::AtomicInstruction>() ||
-         insn.isMemberOf<ir::BranchInstruction>() ||
-         insn.isMemberOf<ir::LabelInstruction>())
-     return true;
-@@ -2033,6 +2055,44 @@ namespace gbe
-     DECL_CTOR(ConvertInstruction, 1, 1);
-   };
- 
-+  /*! Convert instruction pattern */
-+  DECL_PATTERN(AtomicInstruction)
-+  {
-+    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn) const
-+    {
-+      using namespace ir;
-+      const Opcode opcode = insn.getOpcode();
-+      const AddressSpace space = insn.getAddressSpace();
-+      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
-+      const uint32_t srcNum = insn.getSrcNum();
-+      const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
-+      GenRegister src1 = src0, src2 = src0;
-+      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
-+      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
-+      GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
-+      GenAtomicOpCode aop = GEN_ATOMIC_OP_CMPWR8B;
-+      switch (opcode) {
-+        case OP_ATOMIC_ADD:     aop = GEN_ATOMIC_OP_ADD; break;
-+        case OP_ATOMIC_SUB:     aop = GEN_ATOMIC_OP_SUB; break;
-+        case OP_ATOMIC_AND:     aop = GEN_ATOMIC_OP_AND; break;
-+        case OP_ATOMIC_OR :     aop = GEN_ATOMIC_OP_OR;  break;
-+        case OP_ATOMIC_XOR:     aop = GEN_ATOMIC_OP_XOR; break;
-+        case OP_ATOMIC_XCHG:    aop = GEN_ATOMIC_OP_MOV; break;
-+        case OP_ATOMIC_UMIN:    aop = GEN_ATOMIC_OP_UMIN; break;
-+        case OP_ATOMIC_UMAX:    aop = GEN_ATOMIC_OP_UMAX; break;
-+        case OP_ATOMIC_IMIN:    aop = GEN_ATOMIC_OP_IMIN; break;
-+        case OP_ATOMIC_IMAX:    aop = GEN_ATOMIC_OP_IMAX; break;
-+        case OP_ATOMIC_INC:     aop = GEN_ATOMIC_OP_INC; break;
-+        case OP_ATOMIC_DEC:     aop = GEN_ATOMIC_OP_DEC; break;
-+        case OP_ATOMIC_CMPXCHG: aop = GEN_ATOMIC_OP_CMPWR; break;
-+        default: NOT_SUPPORTED;
-+      }
-+      sel.ATOMIC(dst, aop, srcNum, src0, src1, src2, bti);
-+      return true;
-+    }
-+    DECL_CTOR(AtomicInstruction, 1, 1);
-+  };
-+
-   /*! Select instruction pattern */
-   class SelectInstructionPattern : public SelectionPattern
-   {
-@@ -2379,6 +2439,7 @@ namespace gbe
-     this->insert<SelectInstructionPattern>();
-     this->insert<CompareInstructionPattern>();
-     this->insert<ConvertInstructionPattern>();
-+    this->insert<AtomicInstructionPattern>();
-     this->insert<LabelInstructionPattern>();
-     this->insert<BranchInstructionPattern>();
-     this->insert<Int32x32MulInstructionPattern>();
-diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
-index 778eb1f..5ae6e42 100644
---- a/backend/src/backend/gen_insn_selection.hpp
-+++ b/backend/src/backend/gen_insn_selection.hpp
-@@ -92,9 +92,9 @@ namespace gbe
-     GenInstructionState state;
-     union {
-       struct {
--        /*! Store bti for loads/stores and function for math and compares */
-+        /*! Store bti for loads/stores and function for math, atomic and compares */
-         uint16_t function:8;
--        /*! elemSize for byte scatters / gathers, elemNum for untyped msg */
-+        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
-         uint16_t elem:8;
-       };
-       struct {
-diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
-index cc2be08..f1a4701 100644
---- a/backend/src/backend/gen_insn_selection.hxx
-+++ b/backend/src/backend/gen_insn_selection.hxx
-@@ -20,6 +20,7 @@ DECL_SELECTION_IR(RSL, BinaryInstruction)
- DECL_SELECTION_IR(ASR, BinaryInstruction)
- DECL_SELECTION_IR(ADD, BinaryInstruction)
- DECL_SELECTION_IR(MUL, BinaryInstruction)
-+DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
- DECL_SELECTION_IR(MACH, BinaryInstruction)
- DECL_SELECTION_IR(CMP, CompareInstruction)
- DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
-diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
-index c286f1d..55e76f2 100644
---- a/backend/src/ir/context.hpp
-+++ b/backend/src/ir/context.hpp
-@@ -150,6 +150,12 @@ namespace ir {
-       this->append(insn);
-     }
- 
-+    /*! For all atomic functions */
-+    void ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src) {
-+      const Instruction insn = gbe::ir::ATOMIC(opcode, dst, space, src);
-+      this->append(insn);
-+    }
-+
-     /*! LOAD with the destinations directly specified */
-     template <typename... Args>
-     void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
-diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
-index 67a4c12..578f5d2 100644
---- a/backend/src/ir/instruction.cpp
-+++ b/backend/src/ir/instruction.cpp
-@@ -1,4 +1,4 @@
--/* 
-+/*
-  * Copyright © 2012 Intel Corporation
-  *
-  * This library is free software; you can redistribute it and/or
-@@ -56,7 +56,7 @@ namespace ir {
-     };
- 
-     /*! For regular n source instructions */
--    template <typename T, uint32_t srcNum> 
-+    template <typename T, uint32_t srcNum>
-     struct NSrcPolicy {
-       INLINE uint32_t getSrcNum(void) const { return srcNum; }
-       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
-@@ -246,6 +246,37 @@ namespace ir {
-       Type srcType; //!< Type to convert from
-     };
- 
-+    class ALIGNED_INSTRUCTION AtomicInstruction :
-+      public BasePolicy,
-+      public TupleSrcPolicy<AtomicInstruction>,
-+      public NDstPolicy<AtomicInstruction, 1>
-+    {
-+    public:
-+      AtomicInstruction(Opcode opcode,
-+                         Register dst,
-+                         AddressSpace addrSpace,
-+                         Tuple src)
-+      {
-+        this->opcode = opcode;
-+        this->dst[0] = dst;
-+        this->src = src;
-+        this->addrSpace = addrSpace;
-+        srcNum = 2;
-+        if((opcode == OP_ATOMIC_INC) ||
-+          (opcode == OP_ATOMIC_DEC))
-+          srcNum = 1;
-+        if(opcode == OP_ATOMIC_CMPXCHG)
-+          srcNum = 3;
-+      }
-+      INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
-+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
-+      INLINE void out(std::ostream &out, const Function &fn) const;
-+      Register dst[1];
-+      Tuple src;
-+      AddressSpace addrSpace; //!< Address space
-+      uint8_t srcNum;         //!<Source Number
-+    };
-+
-     class ALIGNED_INSTRUCTION BranchInstruction :
-       public BasePolicy,
-       public NDstPolicy<BranchInstruction, 0>
-@@ -738,6 +769,20 @@ namespace ir {
-       return true;
-     }
- 
-+    // We can convert anything to anything, but types and families must match
-+    INLINE bool AtomicInstruction::wellFormed(const Function &fn, std::string &whyNot) const
-+    {
-+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
-+        return false;
-+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
-+        return false;
-+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
-+        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
-+          return false;
-+
-+      return true;
-+    }
-+
-     /*! Loads and stores follow the same restrictions */
-     template <typename T>
-     INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
-@@ -883,6 +928,15 @@ namespace ir {
-       ternaryOrSelectOut(*this, out, fn);
-     }
- 
-+    INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
-+      this->outOpcode(out);
-+      out << "." << addrSpace;
-+      out << " %" << this->getDst(fn, 0);
-+      out << " {" << "%" << this->getSrc(fn, 0) << "}";
-+      for (uint32_t i = 1; i < srcNum; ++i)
-+        out << " %" << this->getSrc(fn, i);
-+    }
-+
-     INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
-       this->outOpcode(out);
-       out << "." << this->getDstType()
-@@ -1009,6 +1063,10 @@ START_INTROSPECTION(ConvertInstruction)
- #include "ir/instruction.hxx"
- END_INTROSPECTION(ConvertInstruction)
- 
-+START_INTROSPECTION(AtomicInstruction)
-+#include "ir/instruction.hxx"
-+END_INTROSPECTION(AtomicInstruction)
-+
- START_INTROSPECTION(SelectInstruction)
- #include "ir/instruction.hxx"
- END_INTROSPECTION(SelectInstruction)
-@@ -1180,7 +1238,7 @@ END_FUNCTION(Instruction, Register)
-   }
- 
-   bool Instruction::hasSideEffect(void) const {
--    return opcode == OP_STORE || 
-+    return opcode == OP_STORE ||
-            opcode == OP_TYPED_WRITE ||
-            opcode == OP_SYNC;
-   }
-@@ -1197,6 +1255,7 @@ DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
- DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
- DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
- DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
-+DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
- DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
- DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
- DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-@@ -1304,6 +1363,11 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
-     return internal::ConvertInstruction(dstType, srcType, dst, src).convert();
-   }
- 
-+  // For all unary functions with given opcode
-+  Instruction ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src) {
-+    return internal::AtomicInstruction(opcode, dst, space, src).convert();
-+  }
-+
-   // BRA
-   Instruction BRA(LabelIndex labelIndex) {
-     return internal::BranchInstruction(OP_BRA, labelIndex).convert();
-diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
-index 0f3bd34..91d280d 100644
---- a/backend/src/ir/instruction.hpp
-+++ b/backend/src/ir/instruction.hpp
-@@ -1,4 +1,4 @@
--/* 
-+/*
-  * Copyright © 2012 Intel Corporation
-  *
-  * This library is free software; you can redistribute it and/or
-@@ -228,6 +228,19 @@ namespace ir {
-     static bool isClassOf(const Instruction &insn);
-   };
- 
-+  /*! Atomic instruction */
-+  class AtomicInstruction : public Instruction {
-+  public:
-+    /*! Where the address register goes */
-+    static const uint32_t addressIndex = 0;
-+    /*! Address space that is manipulated here */
-+    AddressSpace getAddressSpace(void) const;
-+    /*! Return the register that contains the addresses */
-+    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
-+    /*! Return true if the given instruction is an instance of this class */
-+    static bool isClassOf(const Instruction &insn);
-+  };
-+
-   /*! Store instruction. First source is the address. Next sources are the
-    *  values to store contiguously at the given address
-    */
-@@ -555,6 +568,34 @@ namespace ir {
-   Instruction GT(Type type, Register dst, Register src0, Register src1);
-   /*! cvt.{dstType <- srcType} dst src */
-   Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
-+  /*! atomic dst addr.space {src1 {src2}} */
-+  Instruction ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src);
-+  /*! atomic_add dst addr.space src1 */
-+  Instruction ATOMIC_ADD(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_sub dst addr.space src1 */
-+  Instruction ATOMIC_SUB(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_and dst addr.space src1 */
-+  Instruction ATOMIC_AND(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_or dst addr.space src1 */
-+  Instruction ATOMIC_OR(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_xor dst addr.space src1 */
-+  Instruction ATOMIC_XOR(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_xchg dst addr.space src1 */
-+  Instruction ATOMIC_XCHG(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_imin dst addr.space src1 */
-+  Instruction ATOMIC_IMIN(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_imax dst addr.space src1 */
-+  Instruction ATOMIC_IMAX(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_umin dst addr.space src1 */
-+  Instruction ATOMIC_UMIN(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_umax dst addr.space src1 */
-+  Instruction ATOMIC_UMAX(Register dst, Register addr, AddressSpace space, Register src1);
-+  /*! atomic_inc dst addr.space */
-+  Instruction ATOMIC_INC(Register dst, Register addr, AddressSpace space);
-+  /*! atomic_dec dst addr.space */
-+  Instruction ATOMIC_DEC(Register dst, Register addr, AddressSpace space);
-+  /*! atomic_cmpxchg dst addr.space src1 src2 */
-+  Instruction ATOMIC_CMPXCHG(Register dst, Register addr, AddressSpace space, Register src1, Register src2);
-   /*! bra labelIndex */
-   Instruction BRA(LabelIndex labelIndex);
-   /*! (pred) bra labelIndex */
-diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
-index acfb45a..42c1e89 100644
---- a/backend/src/ir/instruction.hxx
-+++ b/backend/src/ir/instruction.hxx
-@@ -61,6 +61,19 @@ DECL_INSN(LT, CompareInstruction)
- DECL_INSN(GE, CompareInstruction)
- DECL_INSN(GT, CompareInstruction)
- DECL_INSN(CVT, ConvertInstruction)
-+DECL_INSN(ATOMIC_ADD, AtomicInstruction)
-+DECL_INSN(ATOMIC_SUB, AtomicInstruction)
-+DECL_INSN(ATOMIC_AND, AtomicInstruction)
-+DECL_INSN(ATOMIC_OR, AtomicInstruction)
-+DECL_INSN(ATOMIC_XOR, AtomicInstruction)
-+DECL_INSN(ATOMIC_XCHG, AtomicInstruction)
-+DECL_INSN(ATOMIC_UMIN, AtomicInstruction)
-+DECL_INSN(ATOMIC_UMAX, AtomicInstruction)
-+DECL_INSN(ATOMIC_IMIN, AtomicInstruction)
-+DECL_INSN(ATOMIC_IMAX, AtomicInstruction)
-+DECL_INSN(ATOMIC_INC, AtomicInstruction)
-+DECL_INSN(ATOMIC_DEC, AtomicInstruction)
-+DECL_INSN(ATOMIC_CMPXCHG, AtomicInstruction)
- DECL_INSN(BRA, BranchInstruction)
- DECL_INSN(RET, BranchInstruction)
- DECL_INSN(LOADI, LoadImmInstruction)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0005-Disable-error-message-output-in-release-version.patch b/debian/patches/0005-Disable-error-message-output-in-release-version.patch
deleted file mode 100644
index 8be79bc..0000000
--- a/debian/patches/0005-Disable-error-message-output-in-release-version.patch
+++ /dev/null
@@ -1,71 +0,0 @@
-From 31473819a135cb8c9048617e12b0b5453104aeec Mon Sep 17 00:00:00 2001
-From: Ruiling Song <ruiling.song at intel.com>
-Date: Tue, 2 Jul 2013 16:44:43 +0800
-Subject: [PATCH 05/10] Disable error message output in release version.
-To: beignet at lists.freedesktop.org
-
-As piglit will got the error message we output to stderr and mark the case 'WARN'.
-so, we disable the message to stderr, and use release version to run piglit.
-
-also fix a minor compile fail under release version.
-
-Signed-off-by: Ruiling Song <ruiling.song at intel.com>
----
- backend/src/llvm/llvm_gen_backend.cpp |    3 +--
- src/cl_utils.h                        |   24 ++++++++++++++++--------
- 2 files changed, 17 insertions(+), 10 deletions(-)
-
-diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index db34296..c2e37fa 100644
---- a/backend/src/llvm/llvm_gen_backend.cpp
-+++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1824,10 +1824,9 @@ namespace gbe
- 
-   void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
-     CallSite::arg_iterator AI = CS.arg_begin();
--#if GBE_DEBUG
-     CallSite::arg_iterator AE = CS.arg_end();
--#endif /* GBE_DEBUG */
-     GBE_ASSERT(AI != AE);
-+
-     unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
-     const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
-     const ir::Register dst = this->getRegister(&I);
-diff --git a/src/cl_utils.h b/src/cl_utils.h
-index dfb1369..59b7a2b 100644
---- a/src/cl_utils.h
-+++ b/src/cl_utils.h
-@@ -39,14 +39,22 @@ struct JOIN(__,JOIN(__,__LINE__)) {                                 \
- }
- 
- /* Throw errors */
--#define ERR(ERROR, ...)                                             \
--do {                                                                \
--  fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
--  fprintf(stderr, __VA_ARGS__);                                     \
--  fprintf(stderr, "\n");                                            \
--  err = ERROR;                                                      \
--  goto error;                                                       \
--} while (0)
-+#ifdef NDEBUG
-+  #define ERR(ERROR, ...)                                             \
-+  do {                                                                \
-+    err = ERROR;                                                      \
-+    goto error;                                                       \
-+  } while (0)
-+#else
-+  #define ERR(ERROR, ...)                                             \
-+  do {                                                                \
-+    fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
-+    fprintf(stderr, __VA_ARGS__);                                     \
-+    fprintf(stderr, "\n");                                            \
-+    err = ERROR;                                                      \
-+    goto error;                                                       \
-+  } while (0)
-+#endif
- 
- #define DO_ALLOC_ERR                                                \
- do {                                                                \
--- 
-1.7.10.4
-
diff --git a/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch b/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch
deleted file mode 100644
index fac6999..0000000
--- a/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch
+++ /dev/null
@@ -1,289 +0,0 @@
-From 6d471350375328273070c9b73b22e9ab042c3313 Mon Sep 17 00:00:00 2001
-From: Yang Rong <rong.r.yang at intel.com>
-Date: Wed, 26 Jun 2013 15:29:22 +0800
-Subject: [PATCH 06/11] Add all atomic built-in functions support.
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Yang Rong <rong.r.yang at intel.com>
----
- backend/src/llvm/llvm_gen_backend.cpp      |   76 +++++++++++++++++++++
- backend/src/llvm/llvm_gen_ocl_function.hxx |   28 ++++++++
- backend/src/ocl_stdlib.h                   |  100 +++++++++++++++++++++++++++-
- 3 files changed, 203 insertions(+), 1 deletion(-)
-
-diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index ba2192a..5db3fbe 100644
---- a/backend/src/llvm/llvm_gen_backend.cpp
-+++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -534,6 +534,8 @@ namespace gbe
- 
-     // Emit unary instructions from gen native function
-     void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
-+    // Emit unary instructions from gen native function
-+    void emitAtomicInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
- 
-     // These instructions are not supported at all
-     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
-@@ -693,10 +695,12 @@ namespace gbe
-           return doIt(uint64_t(0));
-         }
-       }
-+
-       // NULL pointers
-       if(isa<ConstantPointerNull>(CPV)) {
-         return doIt(uint32_t(0));
-       }
-+
-       // Floats and doubles
-       const Type::TypeID typeID = CPV->getType()->getTypeID();
-       switch (typeID) {
-@@ -1698,6 +1702,32 @@ namespace gbe
-       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
-       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
-       case GEN_OCL_GET_IMAGE_DEPTH:
-+      case GEN_OCL_ATOMIC_ADD0:
-+      case GEN_OCL_ATOMIC_ADD1:
-+      case GEN_OCL_ATOMIC_SUB0:
-+      case GEN_OCL_ATOMIC_SUB1:
-+      case GEN_OCL_ATOMIC_AND0:
-+      case GEN_OCL_ATOMIC_AND1:
-+      case GEN_OCL_ATOMIC_OR0:
-+      case GEN_OCL_ATOMIC_OR1:
-+      case GEN_OCL_ATOMIC_XOR0:
-+      case GEN_OCL_ATOMIC_XOR1:
-+      case GEN_OCL_ATOMIC_XCHG0:
-+      case GEN_OCL_ATOMIC_XCHG1:
-+      case GEN_OCL_ATOMIC_UMAX0:
-+      case GEN_OCL_ATOMIC_UMAX1:
-+      case GEN_OCL_ATOMIC_UMIN0:
-+      case GEN_OCL_ATOMIC_UMIN1:
-+      case GEN_OCL_ATOMIC_IMAX0:
-+      case GEN_OCL_ATOMIC_IMAX1:
-+      case GEN_OCL_ATOMIC_IMIN0:
-+      case GEN_OCL_ATOMIC_IMIN1:
-+      case GEN_OCL_ATOMIC_INC0:
-+      case GEN_OCL_ATOMIC_INC1:
-+      case GEN_OCL_ATOMIC_DEC0:
-+      case GEN_OCL_ATOMIC_DEC1:
-+      case GEN_OCL_ATOMIC_CMPXCHG0:
-+      case GEN_OCL_ATOMIC_CMPXCHG1:
-         // No structure can be returned
-         this->newRegister(&I);
-         break;
-@@ -1782,6 +1812,26 @@ namespace gbe
-     ctx.ALU1(opcode, ir::TYPE_FLOAT, dst, src);
-   }
- 
-+  void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
-+    CallSite::arg_iterator AI = CS.arg_begin();
-+#if GBE_DEBUG
-+    CallSite::arg_iterator AE = CS.arg_end();
-+#endif /* GBE_DEBUG */
-+    GBE_ASSERT(AI != AE);
-+    unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
-+    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
-+    const ir::Register dst = this->getRegister(&I);
-+
-+    vector<ir::Register> src;
-+    uint32_t srcNum = 0;
-+    while(AI != AE) {
-+      src.push_back(this->getRegister(*(AI++)));
-+      srcNum++;
-+    }
-+    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
-+    ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
-+  }
-+
-   void GenWriter::emitCallInst(CallInst &I) {
-     if (Function *F = I.getCalledFunction()) {
-       if (F->getIntrinsicID() != 0) {
-@@ -1870,6 +1920,32 @@ namespace gbe
-           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
-           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
-           case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
-+          case GEN_OCL_ATOMIC_ADD0:
-+          case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_ADD); break;
-+          case GEN_OCL_ATOMIC_SUB0:
-+          case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_SUB); break;
-+          case GEN_OCL_ATOMIC_AND0:
-+          case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_AND); break;
-+          case GEN_OCL_ATOMIC_OR0:
-+          case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_OR); break;
-+          case GEN_OCL_ATOMIC_XOR0:
-+          case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_XOR); break;
-+          case GEN_OCL_ATOMIC_XCHG0:
-+          case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_XCHG); break;
-+          case GEN_OCL_ATOMIC_INC0:
-+          case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_INC); break;
-+          case GEN_OCL_ATOMIC_DEC0:
-+          case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_DEC); break;
-+          case GEN_OCL_ATOMIC_UMIN0:
-+          case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_UMIN); break;
-+          case GEN_OCL_ATOMIC_UMAX0:
-+          case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_UMAX); break;
-+          case GEN_OCL_ATOMIC_IMIN0:
-+          case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_IMIN); break;
-+          case GEN_OCL_ATOMIC_IMAX0:
-+          case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_IMAX); break;
-+          case GEN_OCL_ATOMIC_CMPXCHG0:
-+          case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_CMPXCHG); break;
-           case GEN_OCL_GET_IMAGE_WIDTH:
-           case GEN_OCL_GET_IMAGE_HEIGHT:
-           case GEN_OCL_GET_IMAGE_DEPTH:
-diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
-index 89a04ea..2f79690 100644
---- a/backend/src/llvm/llvm_gen_ocl_function.hxx
-+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
-@@ -79,6 +79,34 @@ DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH,  __gen_ocl_get_image_depth)
- DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE,  __gen_ocl_get_image_channel_data_type)
- DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER,  __gen_ocl_get_image_channel_order)
- 
-+// atomic related functions.
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD0, _Z20__gen_ocl_atomic_addPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD1, _Z20__gen_ocl_atomic_addPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB0, _Z20__gen_ocl_atomic_subPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB1, _Z20__gen_ocl_atomic_subPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND0, _Z20__gen_ocl_atomic_andPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND1, _Z20__gen_ocl_atomic_andPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR0,  _Z19__gen_ocl_atomic_orPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR1,  _Z19__gen_ocl_atomic_orPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR0, _Z20__gen_ocl_atomic_xorPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR1, _Z20__gen_ocl_atomic_xorPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN0, _Z21__gen_ocl_atomic_uminPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN1, _Z21__gen_ocl_atomic_uminPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX0, _Z21__gen_ocl_atomic_umaxPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX1, _Z21__gen_ocl_atomic_umaxPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN0, _Z21__gen_ocl_atomic_iminPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN1, _Z21__gen_ocl_atomic_iminPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX0, _Z21__gen_ocl_atomic_imaxPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX1, _Z21__gen_ocl_atomic_imaxPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG0, _Z21__gen_ocl_atomic_xchgPU3AS1jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG1, _Z21__gen_ocl_atomic_xchgPU3AS3jj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC0, _Z20__gen_ocl_atomic_incPU3AS1j)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC1, _Z20__gen_ocl_atomic_incPU3AS3j)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC0, _Z20__gen_ocl_atomic_decPU3AS1j)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC1, _Z20__gen_ocl_atomic_decPU3AS3j)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG0, _Z24__gen_ocl_atomic_cmpxchgPU3AS1jjj)
-+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG1, _Z24__gen_ocl_atomic_cmpxchgPU3AS3jjj)
-+
- // saturation related functions.
- DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
- DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 227454d..7a98e04 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -1,4 +1,4 @@
--/* 
-+/*
-  * Copyright © 2012 Intel Corporation
-  *
-  * This library is free software; you can redistribute it and/or
-@@ -5100,6 +5100,104 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
- }
- 
- /////////////////////////////////////////////////////////////////////////////
-+// Atomic functions
-+/////////////////////////////////////////////////////////////////////////////
-+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
-+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
-+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
-+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
-+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
-+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
-+
-+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
-+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
-+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
-+  }
-+
-+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
-+
-+#define DECL_ATOMIC_OP(NAME) \
-+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
-+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
-+
-+DECL_ATOMIC_OP(add)
-+DECL_ATOMIC_OP(sub)
-+DECL_ATOMIC_OP(and)
-+DECL_ATOMIC_OP(or)
-+DECL_ATOMIC_OP(xor)
-+DECL_ATOMIC_OP(xchg)
-+DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
-+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
-+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
-+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
-+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
-+
-+#undef DECL_ATOMIC_OP
-+#undef DECL_ATOMIC_OP_TYPE
-+#undef DECL_ATOMIC_OP_SPACE
-+
-+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
-+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
-+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
-+  }
-+
-+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-+
-+#define DECL_ATOMIC_OP(NAME) \
-+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-+  DECL_ATOMIC_OP_TYPE(NAME, int)
-+
-+DECL_ATOMIC_OP(inc)
-+DECL_ATOMIC_OP(dec)
-+
-+#undef DECL_ATOMIC_OP
-+#undef DECL_ATOMIC_OP_TYPE
-+#undef DECL_ATOMIC_OP_SPACE
-+
-+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
-+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
-+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
-+  }
-+
-+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-+
-+#define DECL_ATOMIC_OP(NAME) \
-+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-+  DECL_ATOMIC_OP_TYPE(NAME, int)
-+
-+DECL_ATOMIC_OP(cmpxchg)
-+
-+#undef DECL_ATOMIC_OP
-+#undef DECL_ATOMIC_OP_TYPE
-+#undef DECL_ATOMIC_OP_SPACE
-+
-+/////////////////////////////////////////////////////////////////////////////
- // Force the compilation to SIMD8 or SIMD16
- /////////////////////////////////////////////////////////////////////////////
- 
--- 
-1.7.10.4
-
diff --git a/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch b/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
deleted file mode 100644
index 96651e7..0000000
--- a/debian/patches/0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
+++ /dev/null
@@ -1,144 +0,0 @@
-From 5dda5e3155c9188b60ef0787402614b744a686d9 Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Wed, 3 Jul 2013 12:41:17 +0800
-Subject: [PATCH 06/10] Modify all the builtin function vect return to
- (vect_name)(e1, e2, e3)
-To: beignet at lists.freedesktop.org
-
-Some builtin functions has the prototype like:
-int3 function_name (int3 x) { return (x.s0, x.s1, x.s2);}
-which not comply with CL spec and will cause the clang IR
-be translated error.
-The vector declare should be (vect)(e1, e2, e3)
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- backend/src/ocl_stdlib.h |   70 +++++++++++++++++++++++-----------------------
- 1 file changed, 35 insertions(+), 35 deletions(-)
-
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 04984d8..bc9a0d4 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4175,11 +4175,11 @@ uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
-     return add_sat((uchar)x, (uchar)0);
- }
- 
--#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (name(x.s0), name(x.s1)); }
--#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (name(x.s0), name(x.s1), name(x.s2)); }
--#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
--#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
--#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
-+#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (int2)(name(x.s0), name(x.s1)); }
-+#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (int3)(name(x.s0), name(x.s1), name(x.s2)); }
-+#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (int4)(name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
-+#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (int8)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
-+#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (int16)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
- INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
- DEC2(isfinite);
- DEC3(isfinite);
-@@ -4216,11 +4216,11 @@ DEC16(signbit);
- #undef DEC8
- #undef DEC16
- 
--#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (name(x.s0, y.s0), name(x.s1, y.s1)); }
--#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
--#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
--#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
--#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
-+#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (int2)(name(x.s0, y.s0), name(x.s1, y.s1)); }
-+#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (int3)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
-+#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (int4)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
-+#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (int8)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
-+#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (int16)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
- INLINE_OVERLOADABLE int islessgreater(float x, float y) { return (x<y)||(x>y); }
- DEC2(islessgreater);
- DEC3(islessgreater);
-@@ -4338,11 +4338,11 @@ INLINE_OVERLOADABLE uint clz(uint x) {
-   return __gen_ocl_fbh(x);
- }
- 
--#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (clz(a.s0), clz(a.s1)); }
--#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (clz(a.s0), clz(a.s1), clz(a.s2)); }
--#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
--#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
--#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
-+#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (type##2)(clz(a.s0), clz(a.s1)); }
-+#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (type##3)(clz(a.s0), clz(a.s1), clz(a.s2)); }
-+#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (type##4)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
-+#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (type##8)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
-+#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (type##16)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
- #define DEC(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint) 
- DEC(2)
- DEC(3)
-@@ -4364,11 +4364,11 @@ INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
- INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
- INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
- INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
--#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
--#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
--#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
--#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
--#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
-+#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (type##2)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
-+#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (type##3)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
-+#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (type##4)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
-+#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (type##8)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
-+#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (type##16)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
- #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
- DEF(2)
- DEF(3)
-@@ -4390,11 +4390,11 @@ DEF(ushort)
- DEF(int)
- DEF(uint)
- #undef DEF
--#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
--#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
--#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
--#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
--#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.se, c.se), [...]
-+#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (type##2)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
-+#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (type##3)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
-+#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (type##4)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
-+#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (type##8)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
-+#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (type##16)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b. [...]
- #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
- DEF(2)
- DEF(3)
-@@ -4422,11 +4422,11 @@ DEF(ushort, 15)
- DEF(int, 31)
- DEF(uint, 31)
- #undef DEF
--#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
--#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
--#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
--#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
--#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
-+#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (type##2)(rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
-+#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (type##3)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
-+#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (type##4)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
-+#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (type##8)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
-+#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (type##16)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
- #define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
- DEF(2)
- DEF(3)
-@@ -4454,11 +4454,11 @@ INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 &
- INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
- INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
- INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
--#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (func(a.s0, b.s0), func(a.s1, b.s1)); }
--#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
--#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
--#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
--#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
-+#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (type##2)(func(a.s0, b.s0), func(a.s1, b.s1)); }
-+#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (type##3)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
-+#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (type##4)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
-+#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (type##8)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
-+#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (type##16)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
- #define DEF(func, n) DEC##n(func, char); DEC##n(func, uchar); DEC##n(func, short); DEC##n(func, ushort); DEC##n(func, int); DEC##n(func, uint)
- DEF(hadd, 2)
- DEF(hadd, 3)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0007-Add-atomic-test-case.patch b/debian/patches/0007-Add-atomic-test-case.patch
deleted file mode 100644
index 6a3ae8a..0000000
--- a/debian/patches/0007-Add-atomic-test-case.patch
+++ /dev/null
@@ -1,188 +0,0 @@
-From d83c5424c918bba806f39c72f27db6a788600dc9 Mon Sep 17 00:00:00 2001
-From: Yang Rong <rong.r.yang at intel.com>
-Date: Wed, 26 Jun 2013 15:29:23 +0800
-Subject: [PATCH 07/11] Add atomic test case.
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Yang Rong <rong.r.yang at intel.com>
----
- kernels/compiler_atomic_functions.cl |   55 ++++++++++++++++-----
- utests/CMakeLists.txt                |    1 +
- utests/compiler_atomic_functions.cpp |   87 ++++++++++++++++++++++++++++++++--
- 3 files changed, 127 insertions(+), 16 deletions(-)
-
-diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
-index 23f3e73..dd9ec56 100644
---- a/kernels/compiler_atomic_functions.cl
-+++ b/kernels/compiler_atomic_functions.cl
-@@ -1,14 +1,43 @@
--/* test OpenCL 1.1 Atomic Functions (section 6.11.1, 9.4) */
--__kernel void compiler_atomic_functions(global int *a, global int *b) {
--  atomic_add(a, *b);
--  atomic_sub(a, *b);
--  atomic_xchg(a, *b);
--  atomic_inc(a);
--  atomic_dec(a);
--  atomic_cmpxchg(a, b, 100);
--  atomic_min(a, *b);
--  atomic_max(a, *b);
--  atomic_and(a, *b);
--  atomic_or(a, *b);
--  atomic_xor(a, *b);
-+__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
-+  int lid = get_local_id(0);
-+  int i = lid % 12;
-+  atomic_xchg(&tmp[4], -1);
-+	switch(i) {
-+	  case 0: atomic_inc(&tmp[i]); break;
-+	  case 1: atomic_dec(&tmp[i]); break;
-+	  case 2: atomic_add(&tmp[i], src[lid]); break;
-+	  case 3: atomic_sub(&tmp[i], src[lid]); break;
-+	  case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid>>2))); break;
-+	  case 5: atomic_or (&tmp[i], src[lid]<<(lid>>2)); break;
-+	  case 6: atomic_xor(&tmp[i], src[lid]); break;
-+	  case 7: atomic_min(&tmp[i], -src[lid]); break;
-+    case 8: atomic_max(&tmp[i], src[lid]); break;
-+		case 9: atomic_min((__local unsigned int *)&tmp[i], -src[lid]); break;
-+		case 10: atomic_max((__local unsigned int *)&tmp[i], src[lid]); break;
-+	  case 11: atomic_cmpxchg(&(tmp[i]), 0, src[10]); break;
-+		default:  break;
-+	}
-+
-+	switch(i) {
-+	  case 0: atomic_inc(&dst[i]); break;
-+	  case 1: atomic_dec(&dst[i]); break;
-+	  case 2: atomic_add(&dst[i], src[lid]); break;
-+	  case 3: atomic_sub(&dst[i], src[lid]); break;
-+	  case 4: atomic_and(&dst[i], ~(src[lid]<<(lid>>2))); break;
-+	  case 5: atomic_or (&dst[i], src[lid]<<(lid>>2)); break;
-+	  case 6: atomic_xor(&dst[i], src[lid]); break;
-+	  case 7: atomic_min(&dst[i], -src[lid]); break;
-+    case 8: atomic_max(&dst[i], src[lid]); break;
-+		case 9: atomic_min((__global unsigned int *)&dst[i], -src[lid]); break;
-+		case 10: atomic_max((__global unsigned int *)&dst[i], src[lid]); break;
-+	  case 11: atomic_cmpxchg(&dst[i], 0, src[10]); break;
-+		default:  break;
-+	}
-+
-+	barrier(CLK_GLOBAL_MEM_FENCE);
-+
-+	if(get_global_id(0) == 0) {
-+	  for(i=0; i<12; i=i+1)
-+		  atomic_add(&dst[i], tmp[i]);
-+	}
- }
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index b75f3b4..c313acd 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -75,6 +75,7 @@ set (utests_sources
-   compiler_write_only_shorts.cpp
-   compiler_switch.cpp
-   compiler_math.cpp
-+  compiler_atomic_functions.cpp
-   compiler_insn_selection_min.cpp
-   compiler_insn_selection_max.cpp
-   compiler_insn_selection_masked_min_max.cpp
-diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
-index 20202da..71e8384 100644
---- a/utests/compiler_atomic_functions.cpp
-+++ b/utests/compiler_atomic_functions.cpp
-@@ -1,10 +1,91 @@
- #include "utest_helper.hpp"
-+#include <cmath>
-+#include <algorithm>
- 
--void compiler_atomic_functions(void)
-+#define GROUP_NUM 16
-+#define LOCAL_SIZE 64
-+static void cpu_compiler_atomic(int *dst, int *src)
- {
--  OCL_CREATE_KERNEL("compiler_atomic_functions");
-+  dst[4] = 0xffffffff;
-+  int tmp[16] = { 0 };
-+
-+  for(int j=0; j<LOCAL_SIZE; j++) {
-+    int i = j % 12;
-+
-+    switch(i) {
-+      case 0: tmp[i] += 1; break;
-+      case 1: tmp[i] -= 1; break;
-+      case 2: tmp[i] += src[j]; break;
-+      case 3: tmp[i] -= src[j]; break;
-+      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
-+      case 5: tmp[i] |= src[j]<<(j>>2); break;
-+      case 6: tmp[i] ^= src[j]; break;
-+      case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
-+      case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
-+      case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
-+      case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
-+      case 11:  tmp[i] = src[10]; break;
-+      default:  break;
-+    }
-+  }
-+
-+  for(int k=0; k<GROUP_NUM; k++) {
-+    for(int j=0; j<LOCAL_SIZE; j++) {
-+      int i = j % 12;
-+
-+      switch(i) {
-+        case 0: dst[i] += 1; break;
-+        case 1: dst[i] -= 1; break;
-+        case 2: dst[i] += src[j]; break;
-+        case 3: dst[i] -= src[j]; break;
-+        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
-+        case 5: dst[i] |= src[j]<<(j>>2); break;
-+        case 6: dst[i] ^= src[j]; break;
-+        case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
-+        case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
-+        case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
-+        case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
-+        case 11:  dst[i] = src[10]; break;
-+        default:  break;
-+      }
-+    }
-+  }
-+
-+  for(int i=0; i<12; i++)
-+    dst[i] += tmp[i];
- }
- 
--MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions);
-+static void compiler_atomic_functions(void)
-+{
-+  const size_t n = GROUP_NUM * LOCAL_SIZE;
-+  int cpu_dst[16] = {0}, cpu_src[256];
- 
-+  globals[0] = n;
-+  locals[0] = LOCAL_SIZE;
-+
-+  // Setup kernel and buffers
-+  OCL_CREATE_KERNEL("compiler_atomic_functions");
-+  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
-+  OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
-+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+  OCL_SET_ARG(1, 16 * sizeof(int), NULL);
-+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
-+
-+  OCL_MAP_BUFFER(1);
-+  for (uint32_t i = 0; i < locals[0]; ++i)
-+      cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
-+  cpu_compiler_atomic(cpu_dst, cpu_src);
-+  OCL_UNMAP_BUFFER(1);
-+  OCL_NDRANGE(1);
-+
-+  OCL_MAP_BUFFER(0);
-+
-+  // Check results
-+  for(int i=0; i<12; i++) {
-+    //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
-+    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
-+  }
-+  OCL_UNMAP_BUFFER(0);
-+}
- 
-+MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch b/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch
deleted file mode 100644
index e116290..0000000
--- a/debian/patches/0007-Add-the-vector3-support-for-builtin-abs-function.patch
+++ /dev/null
@@ -1,252 +0,0 @@
-From bebce26d3bb8daa227f73ae3353c544b3d8f36ed Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Wed, 3 Jul 2013 15:16:59 +0800
-Subject: [PATCH 07/10] Add the vector3 support for builtin abs function
-To: beignet at lists.freedesktop.org
-
-Add the forgetten abs vector3 for all the types.
-Because the kernel input alignment, improve the test
-case to match the alignment request.
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- backend/src/ocl_stdlib.h |    4 +++-
- kernels/compiler_abs.cl  |    1 +
- utests/compiler_abs.cpp  |   54 +++++++++++++++++++++++++++++++++++++---------
- 3 files changed, 48 insertions(+), 11 deletions(-)
-
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index bc9a0d4..4bfdf9a 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4481,6 +4481,7 @@ int __gen_ocl_abs(int x);
- #define ABS_I(I, CVT)  (CVT)__gen_ocl_abs(x.s##I)
- #define ABS_VEC1(CVT)  (CVT)__gen_ocl_abs(x)
- #define ABS_VEC2(CVT)  ABS_I(0, CVT), ABS_I(1, CVT)
-+#define ABS_VEC3(CVT)  ABS_I(0, CVT), ABS_I(1, CVT), ABS_I(2, CVT)
- #define ABS_VEC4(CVT)  ABS_VEC2(CVT), ABS_I(2, CVT), ABS_I(3, CVT)
- #define ABS_VEC8(CVT)  ABS_VEC4(CVT), ABS_I(4, CVT), ABS_I(5, CVT),\
- 	               ABS_I(6, CVT), ABS_I(7, CVT)
-@@ -4490,7 +4491,7 @@ int __gen_ocl_abs(int x);
- 
- #define DEC_1(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return ABS_VEC1(u##TYPE); }
- #define DEC_N(TYPE, N) INLINE_OVERLOADABLE u##TYPE##N abs(TYPE##N x) { return (u##TYPE##N)(ABS_VEC##N(u##TYPE)); };
--#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
-+#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 3) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
- 
- DEC(int)
- DEC(short)
-@@ -4509,6 +4510,7 @@ DEC(uchar)
- #undef ABS_I
- #undef ABS_VEC1
- #undef ABS_VEC2
-+#undef ABS_VEC3
- #undef ABS_VEC4
- #undef ABS_VEC8
- #undef ABS_VEC16
-diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
-index 9e77c2b..549575c 100644
---- a/kernels/compiler_abs.cl
-+++ b/kernels/compiler_abs.cl
-@@ -15,6 +15,7 @@
- #define COMPILER_ABS(TYPE, UTYPE)  \
-     COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
-     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
-     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
-     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
-     COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
-diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
-index 59d8365..a1b14b4 100644
---- a/utests/compiler_abs.cpp
-+++ b/utests/compiler_abs.cpp
-@@ -3,23 +3,26 @@
- 
- template <typename T, int N>
- struct cl_vec {
--    T ptr[N];
-+    T ptr[((N+1)/2)*2]; //align to 2 elements.
- 
-     typedef cl_vec<T, N> vec_type;
- 
-     cl_vec(void) {
--        memset(ptr, 0, sizeof(T) * N);
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-     }
-     cl_vec(vec_type & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-         memcpy (this->ptr, other.ptr, sizeof(T) * N);
-     }
- 
-     vec_type& operator= (vec_type & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-         memcpy (this->ptr, other.ptr, sizeof(T) * N);
-         return *this;
-     }
- 
-     template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-         memcpy (this->ptr, other.ptr, sizeof(T) * N);
-         return *this;
-     }
-@@ -56,6 +59,8 @@ template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
- template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
- {
-     int i = 0;
-+
-+    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
-     for (; i < N; i++) {
-         vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
-     }
-@@ -66,25 +71,34 @@ template <typename T> static void gen_rand_val (T & val)
-     val = static_cast<T>((rand() & 63) - 32);
- }
- 
-+template <typename T>
-+inline static void print_data (T& val)
-+{
-+    if (std::is_unsigned<T>::value)
-+        printf(" %u", val);
-+    else
-+        printf(" %d", val);
-+}
-+
- template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
--	cl_vec<U, N>* dst, int n)
-+        cl_vec<U, N>* dst, int n)
- {
-     U* val = reinterpret_cast<U *>(dst);
- 
--    n = n*N;
-+    n = n*((N+1)/2)*2;
- 
-     printf("\nRaw: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", ((T *)buf_data[0])[i]);
-+        print_data(((T *)buf_data[0])[i]);
-     }
- 
-     printf("\nCPU: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", val[i]);
-+        print_data(val[i]);
-     }
-     printf("\nGPU: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", ((U *)buf_data[1])[i]);
-+        print_data(((U *)buf_data[1])[i]);
-     }
- }
- 
-@@ -92,16 +106,16 @@ template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
- {
-     printf("\nRaw: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", ((T *)buf_data[0])[i]);
-+        print_data(((T *)buf_data[0])[i]);
-     }
- 
-     printf("\nCPU: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", dst[i]);
-+        print_data(dst[i]);
-     }
-     printf("\nGPU: \n");
-     for (int32_t i = 0; i < (int32_t) n; ++i) {
--        printf(" %d", ((U *)buf_data[1])[i]);
-+        print_data(((U *)buf_data[1])[i]);
-     }
- }
- 
-@@ -111,6 +125,8 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
-     U cpu_dst[16];
-     T cpu_src[16];
- 
-+    printf("sizeof T, is %u, sizeof U is %u\n", (int)sizeof(T), (int)sizeof(U));
-+
-     // Setup buffers
-     OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-     OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
-@@ -122,6 +138,12 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
-     // Run random tests
-     for (uint32_t pass = 0; pass < 8; ++pass) {
-         OCL_MAP_BUFFER(0);
-+
-+        /* Clear the dst buffer to avoid random data. */
-+        OCL_MAP_BUFFER(1);
-+        memset(buf_data[1], 0, sizeof(U) * n);
-+        OCL_UNMAP_BUFFER(1);
-+
-         for (int32_t i = 0; i < (int32_t) n; ++i) {
-             gen_rand_val(cpu_src[i]);
-         }
-@@ -166,54 +188,66 @@ ABS_TEST_TYPE(uchar, uchar)
- 
- 
- typedef cl_vec<int, 2> int2;
-+typedef cl_vec<int, 3> int3;
- typedef cl_vec<int, 4> int4;
- typedef cl_vec<int, 8> int8;
- typedef cl_vec<int, 16> int16;
- typedef cl_vec<unsigned int, 2> uint2;
-+typedef cl_vec<unsigned int, 3> uint3;
- typedef cl_vec<unsigned int, 4> uint4;
- typedef cl_vec<unsigned int, 8> uint8;
- typedef cl_vec<unsigned int, 16> uint16;
- ABS_TEST_TYPE(int2, uint2)
-+ABS_TEST_TYPE(int3, uint3)
- ABS_TEST_TYPE(int4, uint4)
- ABS_TEST_TYPE(int8, uint8)
- ABS_TEST_TYPE(int16, uint16)
- ABS_TEST_TYPE(uint2, uint2)
-+ABS_TEST_TYPE(uint3, uint3)
- ABS_TEST_TYPE(uint4, uint4)
- ABS_TEST_TYPE(uint8, uint8)
- ABS_TEST_TYPE(uint16, uint16)
- 
- 
- typedef cl_vec<char, 2> char2;
-+typedef cl_vec<char, 3> char3;
- typedef cl_vec<char, 4> char4;
- typedef cl_vec<char, 8> char8;
- typedef cl_vec<char, 16> char16;
- typedef cl_vec<unsigned char, 2> uchar2;
-+typedef cl_vec<unsigned char, 3> uchar3;
- typedef cl_vec<unsigned char, 4> uchar4;
- typedef cl_vec<unsigned char, 8> uchar8;
- typedef cl_vec<unsigned char, 16> uchar16;
- ABS_TEST_TYPE(char2, uchar2)
-+ABS_TEST_TYPE(char3, uchar3)
- ABS_TEST_TYPE(char4, uchar4)
- ABS_TEST_TYPE(char8, uchar8)
- ABS_TEST_TYPE(char16, uchar16)
- ABS_TEST_TYPE(uchar2, uchar2)
-+ABS_TEST_TYPE(uchar3, uchar3)
- ABS_TEST_TYPE(uchar4, uchar4)
- ABS_TEST_TYPE(uchar8, uchar8)
- ABS_TEST_TYPE(uchar16, uchar16)
- 
- 
- typedef cl_vec<short, 2> short2;
-+typedef cl_vec<short, 3> short3;
- typedef cl_vec<short, 4> short4;
- typedef cl_vec<short, 8> short8;
- typedef cl_vec<short, 16> short16;
- typedef cl_vec<unsigned short, 2> ushort2;
-+typedef cl_vec<unsigned short, 3> ushort3;
- typedef cl_vec<unsigned short, 4> ushort4;
- typedef cl_vec<unsigned short, 8> ushort8;
- typedef cl_vec<unsigned short, 16> ushort16;
- ABS_TEST_TYPE(short2, ushort2)
-+ABS_TEST_TYPE(short3, ushort3)
- ABS_TEST_TYPE(short4, ushort4)
- ABS_TEST_TYPE(short8, ushort8)
- ABS_TEST_TYPE(short16, ushort16)
- ABS_TEST_TYPE(ushort2, ushort2)
-+ABS_TEST_TYPE(ushort3, ushort3)
- ABS_TEST_TYPE(ushort4, ushort4)
- ABS_TEST_TYPE(ushort8, ushort8)
- ABS_TEST_TYPE(ushort16, ushort16)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch b/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch
deleted file mode 100644
index 35d5535..0000000
--- a/debian/patches/0008-Add-the-abs_diff-builtin-function-support.patch
+++ /dev/null
@@ -1,109 +0,0 @@
-From b1229e1adbea64a9d3d4bbb5f01740c3b3595485 Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Wed, 3 Jul 2013 15:17:05 +0800
-Subject: [PATCH 08/10] Add the abs_diff builtin function support
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- backend/src/ocl_stdlib.h |   84 ++++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 84 insertions(+)
-
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 4bfdf9a..9b80445 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4515,6 +4515,90 @@ DEC(uchar)
- #undef ABS_VEC8
- #undef ABS_VEC16
- 
-+
-+/* Char and short type abs diff */
-+/* promote char and short to int and will be no module overflow */
-+#define ABS_DIFF(CVT) (CVT)(abs((int)x - (int)y))
-+#define ABS_DIFF_I(CVT, I)  (CVT)(abs((int)x.s##I - (int)y.s##I))
-+
-+#define ABS_DIFF_VEC1(CVT)  ABS_DIFF(CVT)
-+#define ABS_DIFF_VEC2(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1)
-+#define ABS_DIFF_VEC3(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1), ABS_DIFF_I(CVT, 2)
-+#define ABS_DIFF_VEC4(CVT)  ABS_DIFF_VEC2(CVT), ABS_DIFF_I(CVT, 2), ABS_DIFF_I(CVT, 3)
-+#define ABS_DIFF_VEC8(CVT)  ABS_DIFF_VEC4(CVT), ABS_DIFF_I(CVT, 4), ABS_DIFF_I(CVT, 5), \
-+                            ABS_DIFF_I(CVT, 6), ABS_DIFF_I(CVT, 7)
-+#define ABS_DIFF_VEC16(CVT)  ABS_DIFF_VEC8(CVT), ABS_DIFF_I(CVT, 8), ABS_DIFF_I(CVT, 9), \
-+                            ABS_DIFF_I(CVT, A), ABS_DIFF_I(CVT, B), \
-+                            ABS_DIFF_I(CVT, C), ABS_DIFF_I(CVT, D), \
-+                            ABS_DIFF_I(CVT, E), ABS_DIFF_I(CVT, F)
-+
-+#define DEC_1(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
-+                           { return ABS_DIFF_VEC1(UTYPE); }
-+#define DEC_N(TYPE, UTYPE, N) INLINE_OVERLOADABLE UTYPE##N abs_diff(TYPE##N x, TYPE##N y) \
-+                              { return (UTYPE##N)(ABS_DIFF_VEC##N(UTYPE)); };
-+#define DEC(TYPE, UTYPE)  DEC_1(TYPE, UTYPE) DEC_N(TYPE, UTYPE, 2)  DEC_N(TYPE, UTYPE, 3 ) \
-+                          DEC_N(TYPE, UTYPE, 4) DEC_N(TYPE, UTYPE, 8) DEC_N(TYPE, UTYPE, 16)
-+DEC(char, uchar)
-+DEC(uchar, uchar)
-+DEC(short, ushort)
-+DEC(ushort, ushort)
-+
-+#undef DEC
-+#undef DEC_1
-+#undef DEC_N
-+#undef ABS_DIFF
-+#undef ABS_DIFF_I
-+#undef ABS_DIFF_VEC1
-+#undef ABS_DIFF_VEC2
-+#undef ABS_DIFF_VEC3
-+#undef ABS_DIFF_VEC4
-+#undef ABS_DIFF_VEC8
-+#undef ABS_DIFF_VEC16
-+
-+INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
-+    /* same signed will never overflow. */
-+    return y > x ? (y -x) : (x - y);
-+}
-+
-+INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
-+    /* same signed will never module overflow. */
-+    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
-+        return abs(x - y);
-+
-+    return (abs(x) + abs(y));
-+}
-+
-+#define ABS_DIFF_I(I)  abs_diff(x.s##I, y.s##I)
-+
-+#define ABS_DIFF_VEC2  ABS_DIFF_I(0), ABS_DIFF_I(1)
-+#define ABS_DIFF_VEC3  ABS_DIFF_I(0), ABS_DIFF_I(1), ABS_DIFF_I(2)
-+#define ABS_DIFF_VEC4  ABS_DIFF_VEC2, ABS_DIFF_I(2), ABS_DIFF_I(3)
-+#define ABS_DIFF_VEC8  ABS_DIFF_VEC4, ABS_DIFF_I(4), ABS_DIFF_I(5), \
-+                       ABS_DIFF_I(6), ABS_DIFF_I(7)
-+#define ABS_DIFF_VEC16  ABS_DIFF_VEC8, ABS_DIFF_I(8), ABS_DIFF_I(9), \
-+                            ABS_DIFF_I(A), ABS_DIFF_I(B), \
-+                            ABS_DIFF_I(C), ABS_DIFF_I(D), \
-+                            ABS_DIFF_I(E), ABS_DIFF_I(F)
-+
-+#define DEC_N(TYPE, N) INLINE_OVERLOADABLE uint##N abs_diff(TYPE##N x, TYPE##N y) \
-+				      { return (uint##N)(ABS_DIFF_VEC##N); };
-+#define DEC(TYPE)   DEC_N(TYPE, 2)  DEC_N(TYPE, 3 ) \
-+                           DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
-+DEC(int)
-+DEC(uint)
-+
-+#undef DEC
-+#undef DEC_1
-+#undef DEC_N
-+#undef ABS_DIFF
-+#undef ABS_DIFF_I
-+#undef ABS_DIFF_VEC1
-+#undef ABS_DIFF_VEC2
-+#undef ABS_DIFF_VEC3
-+#undef ABS_DIFF_VEC4
-+#undef ABS_DIFF_VEC8
-+#undef ABS_DIFF_VEC16
-+
- /////////////////////////////////////////////////////////////////////////////
- // Work Items functions (see 6.11.1 of OCL 1.1 spec)
- /////////////////////////////////////////////////////////////////////////////
--- 
-1.7.10.4
-
diff --git a/debian/patches/0008-support-built-in-function-rotate.patch b/debian/patches/0008-support-built-in-function-rotate.patch
deleted file mode 100644
index 9b917e8..0000000
--- a/debian/patches/0008-support-built-in-function-rotate.patch
+++ /dev/null
@@ -1,58 +0,0 @@
-From 2e008d20fb703db8afb84476ad599dca92d9f763 Mon Sep 17 00:00:00 2001
-From: Homer Hsing <homer.xing at intel.com>
-Date: Wed, 26 Jun 2013 15:51:51 +0800
-Subject: [PATCH 08/11] support built-in function "rotate"
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Homer Hsing <homer.xing at intel.com>
----
- backend/src/ocl_stdlib.h |   33 +++++++++++++++++++++++++++++++++
- 1 file changed, 33 insertions(+)
-
-diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 7a98e04..133e995 100644
---- a/backend/src/ocl_stdlib.h
-+++ b/backend/src/ocl_stdlib.h
-@@ -4355,6 +4355,39 @@ DEC(16)
- #undef DEC4
- #undef DEC8
- #undef DEC16
-+
-+INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
-+INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
-+INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
-+INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
-+INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
-+INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
-+#define DEF(type, n, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y < 0 ? n + y : y & m)); }
-+DEF(char, 8, 7)
-+DEF(uchar, 8, 7)
-+DEF(short, 16, 15)
-+DEF(ushort, 16, 15)
-+DEF(int, 32, 31)
-+DEF(uint, 32, 31)
-+#undef DEF
-+#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
-+#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
-+#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
-+#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
-+#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
-+#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
-+DEF(2)
-+DEF(3)
-+DEF(4)
-+DEF(8)
-+DEF(16)
-+#undef DEF
-+#undef DEC2
-+#undef DEC3
-+#undef DEC4
-+#undef DEC8
-+#undef DEC16
-+
- /////////////////////////////////////////////////////////////////////////////
- // Work Items functions (see 6.11.1 of OCL 1.1 spec)
- /////////////////////////////////////////////////////////////////////////////
--- 
-1.7.10.4
-
diff --git a/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch b/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch
deleted file mode 100644
index 49bc014..0000000
--- a/debian/patches/0009-Add-the-test-case-for-builtin-abs_diff-function.patch
+++ /dev/null
@@ -1,355 +0,0 @@
-From 7337895767f8dbd7a7637ffbd97efed0352e7e06 Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Wed, 3 Jul 2013 15:17:10 +0800
-Subject: [PATCH 09/10] Add the test case for builtin abs_diff() function
-To: beignet at lists.freedesktop.org
-
-All the integer value types check are supported.
-Please use the case named compiler_abs_diff_xxxx,
-where xxxx means the data type such as int2, char4
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- kernels/compiler_abs_diff.cl |   28 +++++
- utests/CMakeLists.txt        |    1 +
- utests/compiler_abs.cpp      |    2 -
- utests/compiler_abs_diff.cpp |  267 ++++++++++++++++++++++++++++++++++++++++++
- 4 files changed, 296 insertions(+), 2 deletions(-)
- create mode 100644 kernels/compiler_abs_diff.cl
- create mode 100644 utests/compiler_abs_diff.cpp
-
-diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
-new file mode 100644
-index 0000000..583ba2b
---- /dev/null
-+++ b/kernels/compiler_abs_diff.cl
-@@ -0,0 +1,28 @@
-+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
-+    kernel void compiler_abs_diff_##TYPE ( \
-+           global TYPE* x, global TYPE* y, global UTYPE* diff) { \
-+        int i = get_global_id(0); \
-+        diff[i] = abs_diff(x[i], y[i]);     \
-+    }
-+
-+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
-+    kernel void compiler_abs_diff_##TYPE##N ( \
-+           global TYPE##N* x, global TYPE##N* y, global UTYPE##N* diff) { \
-+        int i = get_global_id(0); \
-+        diff[i] = abs_diff(x[i], y[i]);     \
-+    }
-+
-+#define COMPILER_ABS(TYPE, UTYPE)  \
-+    COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
-+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
-+
-+COMPILER_ABS(int, uint)
-+COMPILER_ABS(uint, uint)
-+COMPILER_ABS(char, uchar)
-+COMPILER_ABS(uchar, uchar)
-+COMPILER_ABS(short, ushort)
-+COMPILER_ABS(ushort, ushort)
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index cc99370..8cdf10d 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -32,6 +32,7 @@ set (utests_sources
-   compiler_copy_buffer_row.cpp
-   compiler_fabs.cpp
-   compiler_abs.cpp
-+  compiler_abs_diff.cpp
-   compiler_fill_image.cpp
-   compiler_fill_image0.cpp
-   compiler_fill_image_3d.cpp
-diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
-index a1b14b4..9457b9b 100644
---- a/utests/compiler_abs.cpp
-+++ b/utests/compiler_abs.cpp
-@@ -125,8 +125,6 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
-     U cpu_dst[16];
-     T cpu_src[16];
- 
--    printf("sizeof T, is %u, sizeof U is %u\n", (int)sizeof(T), (int)sizeof(U));
--
-     // Setup buffers
-     OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-     OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
-diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
-new file mode 100644
-index 0000000..384a654
---- /dev/null
-+++ b/utests/compiler_abs_diff.cpp
-@@ -0,0 +1,267 @@
-+#include "utest_helper.hpp"
-+#include "string.h"
-+
-+template <typename T, int N>
-+struct cl_vec {
-+    T ptr[((N+1)/2)*2]; //align to 2 elements.
-+
-+    typedef cl_vec<T, N> vec_type;
-+
-+    cl_vec(void) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-+    }
-+    cl_vec(vec_type & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
-+    }
-+
-+    vec_type& operator= (vec_type & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
-+        return *this;
-+    }
-+
-+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
-+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
-+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
-+        return *this;
-+    }
-+
-+    bool operator== (vec_type & other) {
-+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
-+    }
-+
-+    void abs_diff(vec_type & other) {
-+        int i = 0;
-+        for (; i < N; i++) {
-+            T a = ptr[i];
-+            T b = other.ptr[i];
-+            T f = a > b ? (a - b) : (b - a);
-+            ptr[i] = f;
-+        }
-+    }
-+};
-+
-+template <typename T, typename U, int N> static void cpu (int global_id,
-+        cl_vec<T, N> *x, cl_vec<T, N> *y, cl_vec<U, N> *diff)
-+{
-+    cl_vec<T, N> v  = x[global_id];
-+    v.abs_diff(y[global_id]);
-+    diff[global_id] = v;
-+}
-+
-+template <typename T, typename U> static void cpu(int global_id, T *x, T *y, U *diff)
-+{
-+    T a = x[global_id];
-+    T b = y[global_id];
-+    U f = a > b ? (a - b) : (b - a);
-+    diff[global_id] = f;
-+}
-+
-+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
-+{
-+    int i = 0;
-+    for (; i < N; i++) {
-+        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
-+    }
-+}
-+
-+template <typename T> static void gen_rand_val (T & val)
-+{
-+    val = static_cast<T>((rand() & 63) - 32);
-+}
-+
-+template <typename T>
-+inline static void print_data (T& val)
-+{
-+    if (std::is_unsigned<T>::value)
-+        printf(" %u", val);
-+    else
-+        printf(" %d", val);
-+}
-+
-+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* x,
-+        cl_vec<T, N>* y, cl_vec<U, N>* diff, int n)
-+{
-+    U* val = reinterpret_cast<U *>(diff);
-+
-+    n = n*((N+1)/2)*2;
-+
-+    printf("\nRaw x: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((T *)buf_data[0])[i]);
-+    }
-+    printf("\nRaw y: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((T *)buf_data[1])[i]);
-+    }
-+
-+    printf("\nCPU diff: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(val[i]);
-+    }
-+    printf("\nGPU diff: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((U *)buf_data[2])[i]);
-+    }
-+}
-+
-+template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, int n)
-+{
-+    printf("\nRaw x: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((T *)buf_data[0])[i]);
-+    }
-+    printf("\nRaw y: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((T *)buf_data[1])[i]);
-+    }
-+
-+    printf("\nCPU diff: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(diff[i]);
-+    }
-+    printf("\nGPU diff: \n");
-+    for (int32_t i = 0; i < (int32_t) n; ++i) {
-+        print_data(((U *)buf_data[2])[i]);
-+    }
-+}
-+
-+template <typename T, typename U> static void compiler_abs_diff_with_type(void)
-+{
-+    const size_t n = 16;
-+    U cpu_diff[16];
-+    T cpu_x[16];
-+    T cpu_y[16];
-+
-+    // Setup buffers
-+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
-+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(U), NULL);
-+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
-+    globals[0] = 16;
-+    locals[0] = 16;
-+
-+    // Run random tests
-+    for (uint32_t pass = 0; pass < 8; ++pass) {
-+        OCL_MAP_BUFFER(0);
-+        OCL_MAP_BUFFER(1);
-+
-+        /* Clear the dst buffer to avoid random data. */
-+        OCL_MAP_BUFFER(2);
-+        memset(buf_data[2], 0, sizeof(U) * n);
-+        OCL_UNMAP_BUFFER(2);
-+
-+        for (int32_t i = 0; i < (int32_t) n; ++i) {
-+            gen_rand_val(cpu_x[i]);
-+            gen_rand_val(cpu_y[i]);
-+        }
-+
-+        memcpy(buf_data[0], cpu_x, sizeof(T) * n);
-+        memcpy(buf_data[1], cpu_y, sizeof(T) * n);
-+
-+        // Run the kernel on GPU
-+        OCL_NDRANGE(1);
-+
-+        // Run on CPU
-+        for (int32_t i = 0; i < (int32_t) n; ++i)
-+            cpu(i, cpu_x, cpu_y, cpu_diff);
-+
-+        // Compare
-+        OCL_MAP_BUFFER(2);
-+
-+//      dump_data(cpu_x, cpu_y, cpu_diff, n);
-+
-+        OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
-+
-+        OCL_UNMAP_BUFFER(0);
-+        OCL_UNMAP_BUFFER(1);
-+        OCL_UNMAP_BUFFER(2);
-+    }
-+}
-+
-+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) \
-+	static void compiler_abs_diff_##TYPE (void) \
-+        { \
-+           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#TYPE, SOURCE, NULL);  \
-+           compiler_abs_diff_with_type<TYPE, UTYPE>(); \
-+        } \
-+	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##TYPE);
-+
-+typedef unsigned char uchar;
-+typedef unsigned short ushort;
-+typedef unsigned int uint;
-+ABS_TEST_DIFF_TYPE(int, uint)
-+ABS_TEST_DIFF_TYPE(short, ushort)
-+ABS_TEST_DIFF_TYPE(char, uchar)
-+ABS_TEST_DIFF_TYPE(uint, uint)
-+ABS_TEST_DIFF_TYPE(ushort, ushort)
-+ABS_TEST_DIFF_TYPE(uchar, uchar)
-+
-+
-+typedef cl_vec<int, 2> int2;
-+typedef cl_vec<int, 3> int3;
-+typedef cl_vec<int, 4> int4;
-+typedef cl_vec<int, 8> int8;
-+typedef cl_vec<int, 16> int16;
-+typedef cl_vec<unsigned int, 2> uint2;
-+typedef cl_vec<unsigned int, 3> uint3;
-+typedef cl_vec<unsigned int, 4> uint4;
-+typedef cl_vec<unsigned int, 8> uint8;
-+typedef cl_vec<unsigned int, 16> uint16;
-+ABS_TEST_DIFF_TYPE(int2, uint2)
-+ABS_TEST_DIFF_TYPE(int3, uint3)
-+ABS_TEST_DIFF_TYPE(int4, uint4)
-+ABS_TEST_DIFF_TYPE(int8, uint8)
-+ABS_TEST_DIFF_TYPE(int16, uint16)
-+ABS_TEST_DIFF_TYPE(uint2, uint2)
-+ABS_TEST_DIFF_TYPE(uint3, uint3)
-+ABS_TEST_DIFF_TYPE(uint4, uint4)
-+ABS_TEST_DIFF_TYPE(uint8, uint8)
-+ABS_TEST_DIFF_TYPE(uint16, uint16)
-+
-+
-+typedef cl_vec<char, 2> char2;
-+typedef cl_vec<char, 3> char3;
-+typedef cl_vec<char, 4> char4;
-+typedef cl_vec<char, 8> char8;
-+typedef cl_vec<char, 16> char16;
-+typedef cl_vec<unsigned char, 2> uchar2;
-+typedef cl_vec<unsigned char, 3> uchar3;
-+typedef cl_vec<unsigned char, 4> uchar4;
-+typedef cl_vec<unsigned char, 8> uchar8;
-+typedef cl_vec<unsigned char, 16> uchar16;
-+ABS_TEST_DIFF_TYPE(char2, uchar2)
-+ABS_TEST_DIFF_TYPE(char3, uchar3)
-+ABS_TEST_DIFF_TYPE(char4, uchar4)
-+ABS_TEST_DIFF_TYPE(char8, uchar8)
-+ABS_TEST_DIFF_TYPE(char16, uchar16)
-+ABS_TEST_DIFF_TYPE(uchar2, uchar2)
-+ABS_TEST_DIFF_TYPE(uchar3, uchar3)
-+ABS_TEST_DIFF_TYPE(uchar4, uchar4)
-+ABS_TEST_DIFF_TYPE(uchar8, uchar8)
-+ABS_TEST_DIFF_TYPE(uchar16, uchar16)
-+
-+
-+typedef cl_vec<short, 2> short2;
-+typedef cl_vec<short, 3> short3;
-+typedef cl_vec<short, 4> short4;
-+typedef cl_vec<short, 8> short8;
-+typedef cl_vec<short, 16> short16;
-+typedef cl_vec<unsigned short, 2> ushort2;
-+typedef cl_vec<unsigned short, 3> ushort3;
-+typedef cl_vec<unsigned short, 4> ushort4;
-+typedef cl_vec<unsigned short, 8> ushort8;
-+typedef cl_vec<unsigned short, 16> ushort16;
-+ABS_TEST_DIFF_TYPE(short2, ushort2)
-+ABS_TEST_DIFF_TYPE(short3, ushort3)
-+ABS_TEST_DIFF_TYPE(short4, ushort4)
-+ABS_TEST_DIFF_TYPE(short8, ushort8)
-+ABS_TEST_DIFF_TYPE(short16, ushort16)
-+ABS_TEST_DIFF_TYPE(ushort2, ushort2)
-+ABS_TEST_DIFF_TYPE(ushort3, ushort3)
-+ABS_TEST_DIFF_TYPE(ushort4, ushort4)
-+ABS_TEST_DIFF_TYPE(ushort8, ushort8)
-+ABS_TEST_DIFF_TYPE(ushort16, ushort16)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0009-test-case-for-function-rotate.patch b/debian/patches/0009-test-case-for-function-rotate.patch
deleted file mode 100644
index eef926a..0000000
--- a/debian/patches/0009-test-case-for-function-rotate.patch
+++ /dev/null
@@ -1,87 +0,0 @@
-From ac4b81f5d44a0803f24d3fe2202ebf4c6bd7d523 Mon Sep 17 00:00:00 2001
-From: Homer Hsing <homer.xing at intel.com>
-Date: Wed, 26 Jun 2013 15:51:52 +0800
-Subject: [PATCH 09/11] test case for function "rotate"
-To: beignet at lists.freedesktop.org
-
-Signed-off-by: Homer Hsing <homer.xing at intel.com>
----
- kernels/compiler_rotate.cl |    5 +++++
- utests/CMakeLists.txt      |    1 +
- utests/compiler_rotate.cpp |   40 ++++++++++++++++++++++++++++++++++++++++
- 3 files changed, 46 insertions(+)
- create mode 100644 kernels/compiler_rotate.cl
- create mode 100644 utests/compiler_rotate.cpp
-
-diff --git a/kernels/compiler_rotate.cl b/kernels/compiler_rotate.cl
-new file mode 100644
-index 0000000..8d0dd0f
---- /dev/null
-+++ b/kernels/compiler_rotate.cl
-@@ -0,0 +1,5 @@
-+kernel void compiler_rotate(global int *src, global int *dst, global int *y) {
-+  int i = get_global_id(0);
-+  dst[i] = rotate(src[i], y[i]);
-+}
-+
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index c313acd..3326064 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -55,6 +55,7 @@ set (utests_sources
-   compiler_lower_return1.cpp
-   compiler_lower_return2.cpp
-   compiler_multiple_kernels.cpp
-+  compiler_rotate.cpp
-   compiler_saturate.cpp
-   compiler_saturate_sub.cpp
-   compiler_shift_right.cpp
-diff --git a/utests/compiler_rotate.cpp b/utests/compiler_rotate.cpp
-new file mode 100644
-index 0000000..bf52ca4
---- /dev/null
-+++ b/utests/compiler_rotate.cpp
-@@ -0,0 +1,40 @@
-+#include "utest_helper.hpp"
-+
-+int cpu(int src, int y) {
-+  return (src << y) | (src >> (32 - y));
-+}
-+
-+void compiler_rotate(void)
-+{
-+  const int n = 32;
-+  int src[n], y[n];
-+
-+  // Setup kernel and buffers
-+  OCL_CREATE_KERNEL("compiler_rotate");
-+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
-+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
-+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
-+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
-+  globals[0] = n;
-+  locals[0] = 16;
-+
-+  OCL_MAP_BUFFER(0);
-+  OCL_MAP_BUFFER(2);
-+  for (int i = 0; i < n; ++i) {
-+    src[i] = ((int*)buf_data[0])[i] = rand();
-+    y[i] = ((int*)buf_data[2])[i] = rand() & 31;
-+  }
-+  OCL_UNMAP_BUFFER(0);
-+  OCL_UNMAP_BUFFER(2);
-+
-+  OCL_NDRANGE(1);
-+
-+  OCL_MAP_BUFFER(1);
-+  for (int i = 0; i < n; ++i)
-+    OCL_ASSERT(((int*)buf_data[1])[i] == cpu(src[i], y[i]));
-+  OCL_UNMAP_BUFFER(1);
-+}
-+
-+MAKE_UTEST_FROM_FUNCTION(compiler_rotate);
--- 
-1.7.10.4
-
diff --git a/debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch b/debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
deleted file mode 100644
index 04dc32f..0000000
--- a/debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
+++ /dev/null
@@ -1,153 +0,0 @@
-From eddef9eb62ed9ea04e657687c8ed28f43f1e0584 Mon Sep 17 00:00:00 2001
-From: Ruiling Song <ruiling.song at intel.com>
-Date: Wed, 26 Jun 2013 15:52:12 +0800
-Subject: [PATCH 10/11] GBE: Add more support of char and short arithmetic
-To: beignet at lists.freedesktop.org
-
-add * / % support of char and short type.
-
-Signed-off-by: Ruiling Song <ruiling.song at intel.com>
----
- backend/src/backend/gen_insn_selection.cpp |   87 ++++++++++++++++++++--------
- backend/src/llvm/llvm_gen_backend.cpp      |    4 +-
- 2 files changed, 65 insertions(+), 26 deletions(-)
-
-diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index c64afd9..e98be3e 100644
---- a/backend/src/backend/gen_insn_selection.cpp
-+++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -1293,6 +1293,54 @@ namespace gbe
-           this->opcodes.push_back(ir::Opcode(op));
-     }
- 
-+    bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
-+    {
-+      using namespace ir;
-+      const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
-+      const Type type = insn.getType();
-+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
-+      GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-+      GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-+      const uint32_t simdWidth = sel.curr.execWidth;
-+      const RegisterFamily family = getFamily(type);
-+      uint32_t function = (op == OP_DIV)?
-+                          GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
-+                          GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
-+
-+      //bytes and shorts must be converted to int for DIV and REM per GEN restriction
-+      if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
-+        GenRegister tmp0, tmp1;
-+        ir::Register reg = sel.reg(FAMILY_DWORD);
-+
-+        tmp0 = GenRegister::udxgrf(simdWidth, reg);
-+        tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
-+        sel.MOV(tmp0, src0);
-+
-+        tmp1 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-+        tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
-+        sel.MOV(tmp1, src1);
-+
-+        sel.MATH(tmp0, function, tmp0, tmp1);
-+        GenRegister unpacked;
-+        if(family == FAMILY_WORD) {
-+          unpacked = GenRegister::unpacked_uw(reg);
-+        } else {
-+          unpacked = GenRegister::unpacked_ub(reg);
-+        }
-+        unpacked = GenRegister::retype(unpacked, getGenType(type));
-+        sel.MOV(dst, unpacked);
-+      } else if (type == TYPE_S32 || type == TYPE_U32 ) {
-+        sel.MATH(dst, function, src0, src1);
-+      } else if(type == TYPE_FLOAT) {
-+        GBE_ASSERT(op != OP_REM);
-+        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
-+      } else {
-+        NOT_IMPLEMENTED;
-+      }
-+      markAllChildren(dag);
-+      return true;
-+    }
-+
-     INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
-     {
-       using namespace ir;
-@@ -1301,29 +1349,20 @@ namespace gbe
-       const Type type = insn.getType();
-       GenRegister dst  = sel.selReg(insn.getDst(0), type);
- 
--      // Immediates not supported
--      if (opcode == OP_DIV || opcode == OP_POW) {
--        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
--        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
--        uint32_t function;
--        if (type == TYPE_S32 || type == TYPE_U32)
--          function = GEN_MATH_FUNCTION_INT_DIV_QUOTIENT;
--        else
--          function = opcode == OP_DIV ?
--                     GEN_MATH_FUNCTION_FDIV :
--                     GEN_MATH_FUNCTION_POW;
--        sel.MATH(dst, function, src0, src1);
--        markAllChildren(dag);
--        return true;
-+      if(opcode == OP_DIV || opcode == OP_REM) {
-+        return this->emitDivRemInst(sel, dag, opcode);
-       }
--      if (opcode == OP_REM) {
-+      // Immediates not supported
-+      if (opcode == OP_POW) {
-         GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-         GenRegister src1 = sel.selReg(insn.getSrc(1), type);
--        if (type == TYPE_U32 || type == TYPE_S32) {
--          sel.MATH(dst, GEN_MATH_FUNCTION_INT_DIV_REMAINDER, src0, src1);
--          markAllChildren(dag);
--        } else
-+
-+        if(type == TYPE_FLOAT) {
-+          sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
-+        } else {
-           NOT_IMPLEMENTED;
-+        }
-+        markAllChildren(dag);
-         return true;
-       }
- 
-@@ -1385,14 +1424,14 @@ namespace gbe
-         case OP_SHR: sel.SHR(dst, src0, src1); break;
-         case OP_ASR: sel.ASR(dst, src0, src1); break;
-         case OP_MUL:
--          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
--            sel.MUL(dst, src0, src1);
--          else if (type == TYPE_U32 || type == TYPE_S32) {
-+          if (type == TYPE_U32 || type == TYPE_S32) {
-             sel.pop();
-             return false;
-           }
--          else
--            NOT_IMPLEMENTED;
-+          else {
-+            GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
-+            sel.MUL(dst, src0, src1);
-+          }
-         break;
-         default: NOT_IMPLEMENTED;
-       }
-diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index 5db3fbe..fa052ce 100644
---- a/backend/src/llvm/llvm_gen_backend.cpp
-+++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1280,10 +1280,10 @@ namespace gbe
-       case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
-       case Instruction::Mul:
-       case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
--      case Instruction::URem:
-+      case Instruction::URem: ctx.REM(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
-       case Instruction::SRem:
-       case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
--      case Instruction::UDiv:
-+      case Instruction::UDiv: ctx.DIV(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
-       case Instruction::SDiv:
-       case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
-       case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
--- 
-1.7.10.4
-
diff --git a/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch b/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
deleted file mode 100644
index 0dd14f4..0000000
--- a/debian/patches/0010-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
+++ /dev/null
@@ -1,95 +0,0 @@
-From cddef6876d11c289a0bcfe75d5f11ffe982ba5e4 Mon Sep 17 00:00:00 2001
-From: Simon Richter <Simon.Richter at hogyros.de>
-Date: Wed, 19 Jun 2013 11:30:36 +0200
-Subject: [PATCH 10/10] Readd OpenCL 1.2 definitions required for ICD
-To: beignet at lists.freedesktop.org
-
-The definition for the ICD dispatch table requires a few additional
-definitions from OpenCL 1.2.
----
- include/CL/cl.h          |   15 +++++++++++++++
- include/CL/cl_platform.h |    2 ++
- src/cl_mem.h             |   12 ------------
- 3 files changed, 17 insertions(+), 12 deletions(-)
-
-diff --git a/include/CL/cl.h b/include/CL/cl.h
-index 4355e74..a7f25d1 100644
---- a/include/CL/cl.h
-+++ b/include/CL/cl.h
-@@ -67,6 +67,7 @@ typedef cl_uint             cl_channel_type;
- typedef cl_bitfield         cl_mem_flags;
- typedef cl_uint             cl_mem_object_type;
- typedef cl_uint             cl_mem_info;
-+typedef cl_bitfield         cl_mem_migration_flags;
- typedef cl_uint             cl_image_info;
- typedef cl_uint             cl_buffer_create_type;
- typedef cl_uint             cl_addressing_mode;
-@@ -75,8 +76,10 @@ typedef cl_uint             cl_sampler_info;
- typedef cl_bitfield         cl_map_flags;
- typedef cl_uint             cl_program_info;
- typedef cl_uint             cl_program_build_info;
-+typedef intptr_t            cl_device_partition_property;
- typedef cl_int              cl_build_status;
- typedef cl_uint             cl_kernel_info;
-+typedef cl_uint             cl_kernel_arg_info;
- typedef cl_uint             cl_kernel_work_group_info;
- typedef cl_uint             cl_event_info;
- typedef cl_uint             cl_command_type;
-@@ -87,6 +90,18 @@ typedef struct _cl_image_format {
-     cl_channel_type         image_channel_data_type;
- } cl_image_format;
- 
-+typedef struct _cl_image_desc {
-+    cl_mem_object_type      image_type;
-+    size_t                  image_width;
-+    size_t                  image_height;
-+    size_t                  image_depth;
-+    size_t                  image_array_size;
-+    size_t                  image_row_pitch;
-+    size_t                  image_slice_pitch;
-+    cl_uint                 num_mip_levels;
-+    cl_uint                 num_samples;
-+    cl_mem                  buffer;
-+} cl_image_desc;
- 
- typedef struct _cl_buffer_region {
-     size_t                  origin;
-diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
-index 043b048..9a2f17a 100644
---- a/include/CL/cl_platform.h
-+++ b/include/CL/cl_platform.h
-@@ -58,6 +58,8 @@ extern "C" {
-     #define CL_EXT_SUFFIX__VERSION_1_0
-     #define CL_API_SUFFIX__VERSION_1_1
-     #define CL_EXT_SUFFIX__VERSION_1_1
-+    #define CL_API_SUFFIX__VERSION_1_2
-+    #define CL_EXT_SUFFIX__VERSION_1_2
-     #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
- #endif
- 
-diff --git a/src/cl_mem.h b/src/cl_mem.h
-index 33ad174..66815fe 100644
---- a/src/cl_mem.h
-+++ b/src/cl_mem.h
-@@ -29,18 +29,6 @@
- #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
- #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
- #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
--typedef struct _cl_image_desc {
--    cl_mem_object_type      image_type;
--    size_t                  image_width;
--    size_t                  image_height;
--    size_t                  image_depth;
--    size_t                  image_array_size;
--    size_t                  image_row_pitch;
--    size_t                  image_slice_pitch;
--    cl_uint                 num_mip_levels;
--    cl_uint                 num_samples;
--    cl_mem                  buffer;
--} cl_image_desc;
- #endif
- 
- typedef enum cl_image_tiling {
--- 
-1.7.10.4
-
diff --git a/debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch b/debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch
deleted file mode 100644
index 9a2e59e..0000000
--- a/debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch
+++ /dev/null
@@ -1,336 +0,0 @@
-From 1400415e754d8362ed6a628f5e77c3da2417adae Mon Sep 17 00:00:00 2001
-From: Ruiling Song <ruiling.song at intel.com>
-Date: Wed, 26 Jun 2013 15:52:13 +0800
-Subject: [PATCH 11/11] utests: Add basic arithmetic test case
-To: beignet at lists.freedesktop.org
-
-test case for + - * / % of data type (u)int8/16/32
-remove duplicated cases.
-
-Signed-off-by: Ruiling Song <ruiling.song at intel.com>
----
- kernels/compiler_basic_arithmetic.cl |   53 ++++++++++++++++
- kernels/compiler_sub_bytes.cl        |    7 ---
- kernels/compiler_sub_shorts.cl       |    7 ---
- utests/CMakeLists.txt                |    3 +-
- utests/compiler_basic_arithmetic.cpp |  112 ++++++++++++++++++++++++++++++++++
- utests/compiler_sub_bytes.cpp        |   35 -----------
- utests/compiler_sub_shorts.cpp       |   36 -----------
- 7 files changed, 166 insertions(+), 87 deletions(-)
- create mode 100644 kernels/compiler_basic_arithmetic.cl
- delete mode 100644 kernels/compiler_sub_bytes.cl
- delete mode 100644 kernels/compiler_sub_shorts.cl
- create mode 100644 utests/compiler_basic_arithmetic.cpp
- delete mode 100644 utests/compiler_sub_bytes.cpp
- delete mode 100644 utests/compiler_sub_shorts.cpp
-
-diff --git a/kernels/compiler_basic_arithmetic.cl b/kernels/compiler_basic_arithmetic.cl
-new file mode 100644
-index 0000000..3e145d8
---- /dev/null
-+++ b/kernels/compiler_basic_arithmetic.cl
-@@ -0,0 +1,53 @@
-+#define DECL_KERNEL_SUB(type)\
-+__kernel void \
-+compiler_sub_##type(__global type *src0, __global type *src1, __global type *dst) \
-+{ \
-+  int id = (int)get_global_id(0); \
-+  dst[id] = src0[id] - src1[id]; \
-+}
-+
-+#define DECL_KERNEL_ADD(type)\
-+__kernel void \
-+compiler_add_##type(__global type *src0, __global type *src1, __global type *dst) \
-+{ \
-+  int id = (int)get_global_id(0); \
-+  dst[id] = src0[id] + src1[id]; \
-+}
-+
-+#define DECL_KERNEL_MUL(type)\
-+__kernel void \
-+compiler_mul_##type(__global type *src0, __global type *src1, __global type *dst) \
-+{ \
-+  int id = (int)get_global_id(0); \
-+  dst[id] = src0[id] * src1[id]; \
-+}
-+
-+#define DECL_KERNEL_DIV(type)\
-+__kernel void \
-+compiler_div_##type(__global type *src0, __global type *src1, __global type *dst) \
-+{ \
-+  int id = (int)get_global_id(0); \
-+  dst[id] = src0[id] / src1[id]; \
-+}
-+
-+#define DECL_KERNEL_REM(type)\
-+__kernel void \
-+compiler_rem_##type(__global type *src0, __global type *src1, __global type *dst) \
-+{ \
-+  int id = (int)get_global_id(0); \
-+  dst[id] = src0[id] % src1[id]; \
-+}
-+
-+#define DECL_KERNEL_FOR_ALL_TYPE(op) \
-+DECL_KERNEL_##op(char)               \
-+DECL_KERNEL_##op(uchar)              \
-+DECL_KERNEL_##op(short)              \
-+DECL_KERNEL_##op(ushort)             \
-+DECL_KERNEL_##op(int)                \
-+DECL_KERNEL_##op(uint)
-+
-+DECL_KERNEL_FOR_ALL_TYPE(SUB)
-+DECL_KERNEL_FOR_ALL_TYPE(ADD)
-+DECL_KERNEL_FOR_ALL_TYPE(MUL)
-+DECL_KERNEL_FOR_ALL_TYPE(DIV)
-+DECL_KERNEL_FOR_ALL_TYPE(REM)
-diff --git a/kernels/compiler_sub_bytes.cl b/kernels/compiler_sub_bytes.cl
-deleted file mode 100644
-index f058561..0000000
---- a/kernels/compiler_sub_bytes.cl
-+++ /dev/null
-@@ -1,7 +0,0 @@
--__kernel void
--compiler_sub_bytes(__global char *src0, __global char *src1, __global char *dst)
--{
--  int id = (int)get_global_id(0);
--  dst[id] = src0[id] - src1[id];
--}
--
-diff --git a/kernels/compiler_sub_shorts.cl b/kernels/compiler_sub_shorts.cl
-deleted file mode 100644
-index d26de7f..0000000
---- a/kernels/compiler_sub_shorts.cl
-+++ /dev/null
-@@ -1,7 +0,0 @@
--__kernel void
--compiler_sub_shorts(__global short *src0, __global short *src1, __global short *dst)
--{
--  int id = (int)get_global_id(0);
--  dst[id] = src0[id] - src1[id];
--}
--
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index 3326064..3740841 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -5,6 +5,7 @@ link_directories (${LLVM_LIBRARY_DIR})
- set (utests_sources
-   cl_create_kernel.cpp
-   utest_error.c
-+  compiler_basic_arithmetic.cpp
-   compiler_displacement_map_element.cpp
-   compiler_shader_toy.cpp
-   compiler_mandelbrot.cpp
-@@ -60,8 +61,6 @@ set (utests_sources
-   compiler_saturate_sub.cpp
-   compiler_shift_right.cpp
-   compiler_short_scatter.cpp
--  compiler_sub_bytes.cpp
--  compiler_sub_shorts.cpp
-   compiler_uint2_copy.cpp
-   compiler_uint3_copy.cpp
-   compiler_uint8_copy.cpp
-diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
-new file mode 100644
-index 0000000..dcdd084
---- /dev/null
-+++ b/utests/compiler_basic_arithmetic.cpp
-@@ -0,0 +1,112 @@
-+#include "utest_helper.hpp"
-+
-+enum eTestOP {
-+  TEST_OP_ADD =0,
-+  TEST_OP_SUB,
-+  TEST_OP_MUL,
-+  TEST_OP_DIV,
-+  TEST_OP_REM
-+};
-+
-+template <typename T, eTestOP op>
-+static void test_exec(const char* kernel_name)
-+{
-+  const size_t n = 160;
-+
-+  // Setup kernel and buffers
-+  OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
-+std::cout <<"kernel name: " << kernel_name << std::endl;
-+  buf_data[0] = (T*) malloc(sizeof(T) * n);
-+  buf_data[1] = (T*) malloc(sizeof(T) * n);
-+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
-+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[1])[i] = (T) rand();
-+  if(op == TEST_OP_DIV || op == TEST_OP_REM) {
-+    for (uint32_t i = 0; i < n; ++i) {
-+      if(((T*)buf_data[1])[i] == 0)
-+       ((T*)buf_data[1])[i] = (T) 1;
-+    }
-+  }
-+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
-+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[1]);
-+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
-+
-+  // Run the kernel
-+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
-+  globals[0] = n;
-+  locals[0] = 16;
-+  OCL_NDRANGE(1);
-+
-+  // Check result
-+  OCL_MAP_BUFFER(2);
-+  if(op == TEST_OP_SUB) {
-+    for (uint32_t i = 0; i < n; ++i)
-+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] - ((T*)buf_data[1])[i]));
-+  } else if(op == TEST_OP_ADD) {
-+    for (uint32_t i = 0; i < n; ++i)
-+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] + ((T*)buf_data[1])[i]));
-+  } else if(op == TEST_OP_MUL) {
-+    for (uint32_t i = 0; i < n; ++i)
-+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] * ((T*)buf_data[1])[i]));
-+  } else if(op == TEST_OP_DIV) {
-+    for (uint32_t i = 0; i < n; ++i)
-+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] / ((T*)buf_data[1])[i]));
-+  } else {
-+    for (uint32_t i = 0; i < n; ++i)
-+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] % ((T*)buf_data[1])[i]));
-+  }
-+  free(buf_data[0]);
-+  free(buf_data[1]);
-+  buf_data[0] = buf_data[1] = NULL;
-+}
-+
-+#define DECL_TEST_SUB(type, alias) \
-+static void compiler_sub_ ##alias(void)\
-+{\
-+  test_exec<type, TEST_OP_SUB>("compiler_sub_" # alias);\
-+}\
-+MAKE_UTEST_FROM_FUNCTION(compiler_sub_ ## alias)
-+
-+#define DECL_TEST_ADD(type, alias) \
-+static void compiler_add_ ##alias(void)\
-+{\
-+  test_exec<type, TEST_OP_ADD>("compiler_add_" # alias);\
-+}\
-+MAKE_UTEST_FROM_FUNCTION(compiler_add_ ## alias)
-+
-+#define DECL_TEST_MUL(type, alias) \
-+static void compiler_mul_ ##alias(void)\
-+{\
-+  test_exec<type, TEST_OP_MUL>("compiler_mul_" # alias);\
-+}\
-+MAKE_UTEST_FROM_FUNCTION(compiler_mul_ ## alias)
-+
-+#define DECL_TEST_DIV(type, alias) \
-+static void compiler_div_ ##alias(void)\
-+{\
-+  test_exec<type, TEST_OP_DIV>("compiler_div_" # alias);\
-+}\
-+MAKE_UTEST_FROM_FUNCTION(compiler_div_ ## alias)
-+
-+#define DECL_TEST_REM(type, alias) \
-+static void compiler_rem_ ##alias(void)\
-+{\
-+  test_exec<type, TEST_OP_REM>("compiler_rem_" # alias);\
-+}\
-+MAKE_UTEST_FROM_FUNCTION(compiler_rem_ ## alias)
-+
-+#define DECL_TEST_FOR_ALL_TYPE(op)\
-+DECL_TEST_##op(int8_t, char) \
-+DECL_TEST_##op(uint8_t, uchar) \
-+DECL_TEST_##op(int16_t, short) \
-+DECL_TEST_##op(uint16_t, ushort) \
-+DECL_TEST_##op(int32_t, int) \
-+DECL_TEST_##op(uint32_t, uint)
-+
-+DECL_TEST_FOR_ALL_TYPE(SUB)
-+DECL_TEST_FOR_ALL_TYPE(ADD)
-+DECL_TEST_FOR_ALL_TYPE(MUL)
-+DECL_TEST_FOR_ALL_TYPE(DIV)
-+DECL_TEST_FOR_ALL_TYPE(REM)
-+#undef DECL_TEST_FOR_ALL_TYPE
-diff --git a/utests/compiler_sub_bytes.cpp b/utests/compiler_sub_bytes.cpp
-deleted file mode 100644
-index 740a8fd..0000000
---- a/utests/compiler_sub_bytes.cpp
-+++ /dev/null
-@@ -1,35 +0,0 @@
--#include "utest_helper.hpp"
--
--static void compiler_sub_bytes(void)
--{
--  const size_t n = 16;
--
--  // Setup kernel and buffers
--  OCL_CREATE_KERNEL("compiler_sub_bytes");
--  buf_data[0] = (int8_t*) malloc(sizeof(int8_t) * n);
--  buf_data[1] = (int8_t*) malloc(sizeof(int8_t) * n);
--  for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[0])[i] = (int8_t) rand();
--  for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[1])[i] = (int8_t) rand();
--  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[0]);
--  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[1]);
--  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int8_t), NULL);
--
--  // Run the kernel
--  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
--  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
--  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
--  globals[0] = n;
--  locals[0] = 16;
--  OCL_NDRANGE(1);
--
--  // Check result
--  OCL_MAP_BUFFER(2);
--  for (uint32_t i = 0; i < n; ++i)
--    OCL_ASSERT(((int8_t*)buf_data[2])[i] == (int8_t)(((int8_t*)buf_data[0])[i] - ((int8_t*)buf_data[1])[i]));
--  free(buf_data[0]);
--  free(buf_data[1]);
--  buf_data[0] = buf_data[1] = NULL;
--}
--
--MAKE_UTEST_FROM_FUNCTION(compiler_sub_bytes);
--
-diff --git a/utests/compiler_sub_shorts.cpp b/utests/compiler_sub_shorts.cpp
-deleted file mode 100644
-index 7c24a56..0000000
---- a/utests/compiler_sub_shorts.cpp
-+++ /dev/null
-@@ -1,36 +0,0 @@
--#include "utest_helper.hpp"
--
--static void compiler_sub_shorts(void)
--{
--  const size_t n = 16;
--
--  // Setup kernel and buffers
--  OCL_CREATE_KERNEL("compiler_sub_shorts");
--  buf_data[0] = (int16_t*) malloc(sizeof(int16_t) * n);
--  buf_data[1] = (int16_t*) malloc(sizeof(int16_t) * n);
--  for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[0])[i] = (int16_t) rand();
--  for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[1])[i] = (int16_t) rand();
--  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[0]);
--  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[1]);
--  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int16_t), NULL);
--
--  // Run the kernel
--  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
--  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
--  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
--  globals[0] = n;
--  locals[0] = 16;
--  OCL_NDRANGE(1);
--
--  // Check result
--  OCL_MAP_BUFFER(2);
--  for (uint32_t i = 0; i < n; ++i)
--    OCL_ASSERT(((int16_t*)buf_data[2])[i] == (int16_t)(((int16_t*)buf_data[0])[i] - ((int16_t*)buf_data[1])[i]));
--  free(buf_data[0]);
--  free(buf_data[1]);
--  buf_data[0] = buf_data[1] = NULL;
--}
--
--MAKE_UTEST_FROM_FUNCTION(compiler_sub_shorts);
--
--
--- 
-1.7.10.4
-
diff --git a/debian/patches/series b/debian/patches/series
index 9f3609d..dfd681e 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -3,12 +3,3 @@ flags
 khronos
 deprecated-in-utest
 private
-0001-Add-vector-argument-test-case.patch
-0002-Fix-atomic-test-failed-in-GT1.patch
-0003-GBE-fixed-a-barrier-related-bug.patch
-0004-utests-increase-local-size-in-the-two-barrier-test-c.patch
-0005-Disable-error-message-output-in-release-version.patch
-0006-Modify-all-the-builtin-function-vect-return-to-vect_.patch
-0007-Add-the-vector3-support-for-builtin-abs-function.patch
-0008-Add-the-abs_diff-builtin-function-support.patch
-0009-Add-the-test-case-for-builtin-abs_diff-function.patch

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git