[Pkg-opencl-devel] [beignet] 23/47: Imported Debian patch 0.1+git20130626+41005e0-1

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 21:45:48 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit e55e903142601ede900c036407ce6f129040e077
Author: Simon Richter <sjr at debian.org>
Date:   Wed Jun 26 10:06:29 2013 +0200

    Imported Debian patch 0.1+git20130626+41005e0-1
---
 debian/changelog                                   |   6 +
 .../0001-Add-vector-argument-test-case.patch       |   8 +-
 ...d-OpenCL-1.2-definitions-required-for-ICD.patch |   4 +-
 ...tin-function-abs-and-the-according-test-.patch} |  39 +-
 ...PATCH-Refine-the-get_local_id-.-builtins.patch} |  12 +-
 ...help-functions.-Support-global-and-local-.patch | 611 ++++++++++++++++++++
 ...Add-all-atomic-built-in-functions-support.patch | 289 ++++++++++
 debian/patches/0007-Add-atomic-test-case.patch     | 188 ++++++
 ...-support-of-the-API-clGetCommandQueueInfo.patch | 185 ------
 ...e-test-case-for-clGetCommandQueueInfo-API.patch | 631 ---------------------
 .../0008-support-built-in-function-rotate.patch    |  58 ++
 .../0009-test-case-for-function-rotate.patch       |  87 +++
 ...ore-support-of-char-and-short-arithmetic.patch} | 128 +++--
 ...11-utests-Add-basic-arithmetic-test-case.patch} | 122 ++--
 debian/patches/series                              |  15 +-
 15 files changed, 1392 insertions(+), 991 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index f7f7c49..d5fb18e 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+beignet (0.1+git20130626+41005e0-1) unstable; urgency=low
+
+  * New upstream release
+
+ -- Simon Richter <sjr at debian.org>  Wed, 26 Jun 2013 10:06:29 +0200
+
 beignet (0.1+git20130625+97c3a9b-1) unstable; urgency=low
 
   * New upstream release
diff --git a/debian/patches/0001-Add-vector-argument-test-case.patch b/debian/patches/0001-Add-vector-argument-test-case.patch
index eac26c2..86a7636 100644
--- a/debian/patches/0001-Add-vector-argument-test-case.patch
+++ b/debian/patches/0001-Add-vector-argument-test-case.patch
@@ -1,7 +1,7 @@
-From 0ee7f97ae6e740ea80766d1126e0520583780d40 Mon Sep 17 00:00:00 2001
+From 5e262194e009622863d19e17c03cb44d0bd066cc Mon Sep 17 00:00:00 2001
 From: Yang Rong <rong.r.yang at intel.com>
 Date: Thu, 16 May 2013 12:36:35 +0800
-Subject: [PATCH 1/8] Add vector argument test case.
+Subject: [PATCH 01/11] Add vector argument test case.
 To: beignet at lists.freedesktop.org
 
 Signed-off-by: Yang Rong <rong.r.yang at intel.com>
@@ -26,10 +26,10 @@ index 0000000..0985dbd
 +  dst[id] = value.w;
 +}
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index c009d99..ea23f31 100644
+index df59feb..8a58ff4 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
-@@ -37,6 +37,7 @@ set (utests_sources
+@@ -39,6 +39,7 @@ set (utests_sources
    compiler_fill_image_3d_2.cpp
    compiler_function_argument0.cpp
    compiler_function_argument1.cpp
diff --git a/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch b/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
index c385dcd..bbda3ac 100644
--- a/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
+++ b/debian/patches/0002-Readd-OpenCL-1.2-definitions-required-for-ICD.patch
@@ -1,7 +1,7 @@
-From 3d63c833d4ebcafe8e46fd498c28d08cb3046cd8 Mon Sep 17 00:00:00 2001
+From 116a8feff8b91cedcd91c087ba14214ee939fa72 Mon Sep 17 00:00:00 2001
 From: Simon Richter <Simon.Richter at hogyros.de>
 Date: Wed, 19 Jun 2013 11:30:36 +0200
-Subject: [PATCH 2/8] Readd OpenCL 1.2 definitions required for ICD
+Subject: [PATCH 02/11] Readd OpenCL 1.2 definitions required for ICD
 To: beignet at lists.freedesktop.org
 
 The definition for the ICD dispatch table requires a few additional
diff --git a/debian/patches/0005-Add-the-builtin-function-abs-and-the-according-test-.patch b/debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch
similarity index 88%
rename from debian/patches/0005-Add-the-builtin-function-abs-and-the-according-test-.patch
rename to debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch
index 0f0490e..cf69bf3 100644
--- a/debian/patches/0005-Add-the-builtin-function-abs-and-the-according-test-.patch
+++ b/debian/patches/0003-Add-the-builtin-function-abs-and-the-according-test-.patch
@@ -1,7 +1,7 @@
-From 1e18f92e0de249a403d4a50842d778d61a84d053 Mon Sep 17 00:00:00 2001
+From 8b61fc4c2644e6748577d30567e6e4bf70aef436 Mon Sep 17 00:00:00 2001
 From: Junyan He <junyan.he at linux.intel.com>
 Date: Tue, 25 Jun 2013 15:50:54 +0800
-Subject: [PATCH 5/8] Add the builtin function abs() and the according test
+Subject: [PATCH 03/11] Add the builtin function abs() and the according test
  case
 To: beignet at lists.freedesktop.org
 
@@ -19,12 +19,12 @@ Signed-off-by: Junyan He <junyan.he at linux.intel.com>
  create mode 100644 utests/compiler_abs.cpp
 
 diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index b1c6093..4f5b337 100644
+index 8fb2a80..11efcb9 100644
 --- a/backend/src/backend/gen_insn_selection.cpp
 +++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -1216,7 +1216,15 @@ namespace gbe
-       const GenRegister dst = sel.selReg(insn.getDst(0));
-       const GenRegister src = sel.selReg(insn.getSrc(0));
+@@ -1224,7 +1224,15 @@ namespace gbe
+       const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode));
+       const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode));
        switch (opcode) {
 -        case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
 +        case ir::OP_ABS:
@@ -40,10 +40,10 @@ index b1c6093..4f5b337 100644
            if (dst.isdf()) {
              ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
 diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index b0e8c6c..71f41b4 100644
+index 08500ba..ba2192a 100644
 --- a/backend/src/llvm/llvm_gen_backend.cpp
 +++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1686,6 +1686,7 @@ namespace gbe
+@@ -1688,6 +1688,7 @@ namespace gbe
        case GEN_OCL_POW:
        case GEN_OCL_RCP:
        case GEN_OCL_ABS:
@@ -51,7 +51,7 @@ index b0e8c6c..71f41b4 100644
        case GEN_OCL_RNDZ:
        case GEN_OCL_RNDE:
        case GEN_OCL_RNDU:
-@@ -1842,13 +1843,20 @@ namespace gbe
+@@ -1844,6 +1845,13 @@ namespace gbe
              ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
              break;
            }
@@ -62,9 +62,10 @@ index b0e8c6c..71f41b4 100644
 +            ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
 +            break;
 +          }
+           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
+           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
            case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
-           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
-           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
+@@ -1852,7 +1860,7 @@ namespace gbe
            case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
            case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
            case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
@@ -74,7 +75,7 @@ index b0e8c6c..71f41b4 100644
            case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
            case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
 diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
-index 6cd7298..9cfad78 100644
+index fe19844..89a04ea 100644
 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
 +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
 @@ -19,7 +19,8 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
@@ -88,10 +89,10 @@ index 6cd7298..9cfad78 100644
  DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
  DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
 diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index 81a0193..eaf8f21 100644
+index 3b191ab..5ad829e 100644
 --- a/backend/src/ocl_stdlib.h
 +++ b/backend/src/ocl_stdlib.h
-@@ -4337,6 +4337,7 @@ INLINE uint get_global_id(uint dim) {
+@@ -4401,6 +4401,7 @@ INLINE uint get_global_id(uint dim) {
  /////////////////////////////////////////////////////////////////////////////
  // Math Functions (see 6.11.2 of OCL 1.1 spec)
  /////////////////////////////////////////////////////////////////////////////
@@ -99,7 +100,7 @@ index 81a0193..eaf8f21 100644
  PURE CONST float __gen_ocl_fabs(float x);
  PURE CONST float __gen_ocl_sin(float x);
  PURE CONST float __gen_ocl_cos(float x);
-@@ -4515,6 +4516,7 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+@@ -4579,6 +4580,7 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
  // XXX work-around PTX profile
  #define sqrt native_sqrt
  INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
@@ -107,7 +108,7 @@ index 81a0193..eaf8f21 100644
  INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
  INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
  INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-@@ -4860,6 +4862,7 @@ DECL_VECTOR_1OP(native_exp10, float);
+@@ -4924,6 +4926,7 @@ DECL_VECTOR_1OP(native_exp10, float);
  DECL_VECTOR_1OP(__gen_ocl_internal_expm1, float);
  DECL_VECTOR_1OP(__gen_ocl_internal_cbrt, float);
  DECL_VECTOR_1OP(__gen_ocl_internal_fabs, float);
@@ -115,7 +116,7 @@ index 81a0193..eaf8f21 100644
  DECL_VECTOR_1OP(__gen_ocl_internal_trunc, float);
  DECL_VECTOR_1OP(__gen_ocl_internal_round, float);
  DECL_VECTOR_1OP(__gen_ocl_internal_floor, float);
-@@ -4987,6 +4990,7 @@ INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(fl
+@@ -5051,6 +5054,7 @@ INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(fl
  INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
  
  // XXX workaround ptx profile
@@ -135,10 +136,10 @@ index 0000000..7030a26
 +}
 +
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index edfbda1..d63b31f 100644
+index 8a58ff4..b75f3b4 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
-@@ -32,6 +32,7 @@ set (utests_sources
+@@ -33,6 +33,7 @@ set (utests_sources
    compiler_double_2.cpp
    compiler_double_3.cpp
    compiler_fabs.cpp
diff --git a/debian/patches/0006-PATCH-Refine-the-get_local_id-.-builtins.patch b/debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch
similarity index 84%
rename from debian/patches/0006-PATCH-Refine-the-get_local_id-.-builtins.patch
rename to debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch
index 25f2f39..888ffc4 100644
--- a/debian/patches/0006-PATCH-Refine-the-get_local_id-.-builtins.patch
+++ b/debian/patches/0004-PATCH-Refine-the-get_local_id-.-builtins.patch
@@ -1,7 +1,7 @@
-From 8a2826a9a7b2085935f5396ae633645d35255b17 Mon Sep 17 00:00:00 2001
+From 3c0ee8cab035426e7e28425d2142e15051d83c90 Mon Sep 17 00:00:00 2001
 From: Zhigang Gong <zhigang.gong at linux.intel.com>
 Date: Tue, 25 Jun 2013 14:15:09 +0800
-Subject: [PATCH 6/8] [PATCH] Refine the get_local_id/... builtins.
+Subject: [PATCH 04/11] [PATCH] Refine the get_local_id/... builtins.
 To: beignet at lists.freedesktop.org
 
 As we could prepare correct value on runtime library side and give
@@ -19,10 +19,10 @@ Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
  2 files changed, 4 insertions(+), 6 deletions(-)
 
 diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
-index eaf8f21..dd70da9 100644
+index 5ad829e..227454d 100644
 --- a/backend/src/ocl_stdlib.h
 +++ b/backend/src/ocl_stdlib.h
-@@ -4315,11 +4315,9 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+@@ -4379,11 +4379,9 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
  #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
  INLINE unsigned NAME(unsigned int dim) {             \
    if (dim == 0) return __gen_ocl_##NAME##0();        \
@@ -38,10 +38,10 @@ index eaf8f21..dd70da9 100644
  
  DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
 diff --git a/src/cl_api.c b/src/cl_api.c
-index 3c78243..ebca294 100644
+index f7db4bc..bb09c07 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
-@@ -1570,7 +1570,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
+@@ -1581,7 +1581,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
  {
    size_t fixed_global_off[] = {0,0,0};
    size_t fixed_global_sz[] = {1,1,1};
diff --git a/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch b/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch
new file mode 100644
index 0000000..f011afd
--- /dev/null
+++ b/debian/patches/0005-Add-atomic-help-functions.-Support-global-and-local-.patch
@@ -0,0 +1,611 @@
+From 6e70caed68eb0ace38a8b42b105c0fd4c38e8f9e Mon Sep 17 00:00:00 2001
+From: Yang Rong <rong.r.yang at intel.com>
+Date: Wed, 26 Jun 2013 15:29:21 +0800
+Subject: [PATCH 05/11] Add atomic help functions. Support global and local
+ buffer.
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Yang Rong <rong.r.yang at intel.com>
+---
+ backend/src/backend/gen_context.cpp                |    9 +++
+ backend/src/backend/gen_context.hpp                |    1 +
+ backend/src/backend/gen_defs.hpp                   |   49 ++++++++++++--
+ backend/src/backend/gen_encoder.cpp                |   35 ++++++++++
+ backend/src/backend/gen_encoder.hpp                |    2 +
+ .../src/backend/gen_insn_gen7_schedule_info.hxx    |    2 +-
+ backend/src/backend/gen_insn_selection.cpp         |   61 +++++++++++++++++
+ backend/src/backend/gen_insn_selection.hpp         |    4 +-
+ backend/src/backend/gen_insn_selection.hxx         |    1 +
+ backend/src/ir/context.hpp                         |    6 ++
+ backend/src/ir/instruction.cpp                     |   70 +++++++++++++++++++-
+ backend/src/ir/instruction.hpp                     |   43 +++++++++++-
+ backend/src/ir/instruction.hxx                     |   13 ++++
+ 13 files changed, 283 insertions(+), 13 deletions(-)
+
+diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
+index 93d3932..41cab90 100644
+--- a/backend/src/backend/gen_context.cpp
++++ b/backend/src/backend/gen_context.cpp
+@@ -226,6 +226,15 @@ namespace gbe
+     }
+   }
+ 
++  void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
++    const GenRegister src = ra->genReg(insn.src(0));
++    const GenRegister dst = ra->genReg(insn.dst(0));
++    const uint32_t function = insn.extra.function;
++    const uint32_t bti = insn.extra.elem;
++
++    p->ATOMIC(dst, function, src, bti, insn.srcNum);
++  }
++
+   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
+     GenRegister src = ra->genReg(insn.src(0));
+     if(isScalarReg(src.reg()))
+diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
+index 804384d..5dfaef9 100644
+--- a/backend/src/backend/gen_context.hpp
++++ b/backend/src/backend/gen_context.hpp
+@@ -91,6 +91,7 @@ namespace gbe
+     void emitWriteFloat64Instruction(const SelectionInstruction &insn);
+     void emitUntypedReadInstruction(const SelectionInstruction &insn);
+     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
++    void emitAtomicInstruction(const SelectionInstruction &insn);
+     void emitByteGatherInstruction(const SelectionInstruction &insn);
+     void emitByteScatterInstruction(const SelectionInstruction &insn);
+     void emitSampleInstruction(const SelectionInstruction &insn);
+diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
+index d1ce6b2..61412c4 100644
+--- a/backend/src/backend/gen_defs.hpp
++++ b/backend/src/backend/gen_defs.hpp
+@@ -1,4 +1,4 @@
+-/* 
++/*
+  * Copyright © 2012 Intel Corporation
+  *
+  * This library is free software; you can redistribute it and/or
+@@ -21,7 +21,7 @@
+  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+  develop this 3D driver.
+- 
++
+  Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+@@ -29,11 +29,11 @@
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+- 
++
+  The above copyright notice and this permission notice (including the
+  next paragraph) shall be included in all copies or substantial
+  portions of the Software.
+- 
++
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+@@ -41,7 +41,7 @@
+  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+- 
++
+  **********************************************************************/
+  /*
+   * Authors:
+@@ -169,6 +169,28 @@ enum opcode {
+   GEN_OPCODE_NOP = 126,
+ };
+ 
++#define GEN_ATOMIC_SIMD16   0
++#define GEN_ATOMIC_SIMD8    1
++
++enum GenAtomicOpCode {
++  GEN_ATOMIC_OP_CMPWR8B   = 0,
++  GEN_ATOMIC_OP_AND       = 1,
++  GEN_ATOMIC_OP_OR        = 2,
++  GEN_ATOMIC_OP_XOR       = 3,
++  GEN_ATOMIC_OP_MOV       = 4,
++  GEN_ATOMIC_OP_INC       = 5,
++  GEN_ATOMIC_OP_DEC       = 6,
++  GEN_ATOMIC_OP_ADD       = 7,
++  GEN_ATOMIC_OP_SUB       = 8,
++  GEN_ATOMIC_OP_REVSUB    = 9,
++  GEN_ATOMIC_OP_IMAX      = 10,
++  GEN_ATOMIC_OP_IMIN      = 11,
++  GEN_ATOMIC_OP_UMAX      = 12,
++  GEN_ATOMIC_OP_UMIN      = 13,
++  GEN_ATOMIC_OP_CMPWR     = 14,
++  GEN_ATOMIC_OP_PREDEC    = 15
++};
++
+ /*! Gen SFID */
+ enum GenMessageTarget {
+   GEN_SFID_NULL                     = 0,
+@@ -772,7 +794,7 @@ struct GenInstruction
+     /*! Memory fence */
+     struct {
+       uint32_t bti:8;
+-      uint32_t ingored:5;
++      uint32_t pad:5;
+       uint32_t commit_enable:1;
+       uint32_t msg_type:4;
+       uint32_t pad2:1;
+@@ -783,6 +805,21 @@ struct GenInstruction
+       uint32_t end_of_thread:1;
+     } gen7_memory_fence;
+ 
++    /*! atomic messages */
++    struct {
++      uint32_t bti:8;
++      uint32_t aop_type:4;
++      uint32_t simd_mode:1;
++      uint32_t return_data:1;
++      uint32_t msg_type:4;
++      uint32_t category:1;
++      uint32_t header_present:1;
++      uint32_t response_length:5;
++      uint32_t msg_length:4;
++      uint32_t pad3:2;
++      uint32_t end_of_thread:1;
++    } gen7_atomic_op;
++
+     struct {
+       uint32_t src1_subreg_nr_high:1;
+       uint32_t src1_reg_nr:8;
+diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
+index e96678b..43658e8 100644
+--- a/backend/src/backend/gen_encoder.cpp
++++ b/backend/src/backend/gen_encoder.cpp
+@@ -558,6 +558,41 @@ namespace gbe
+                            response_length);
+   }
+ 
++  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
++    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
++    uint32_t msg_length = 0;
++    uint32_t response_length = 0;
++
++    if (this->curr.execWidth == 8) {
++      msg_length = srcNum;
++      response_length = 1;
++    } else if (this->curr.execWidth == 16) {
++      msg_length = 2*srcNum;
++      response_length = 2;
++    } else
++      NOT_IMPLEMENTED;
++
++    this->setHeader(insn);
++    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
++    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
++    this->setSrc1(insn, GenRegister::immud(0));
++
++    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
++    setMessageDescriptor(this, insn, sfid, msg_length, response_length);
++    insn->bits3.gen7_atomic_op.msg_type = GEN_UNTYPED_ATOMIC_READ;
++    insn->bits3.gen7_atomic_op.bti = bti;
++    insn->bits3.gen7_atomic_op.return_data = 1;
++    insn->bits3.gen7_atomic_op.aop_type = function;
++
++    if (this->curr.execWidth == 8)
++      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
++    else if (this->curr.execWidth == 16)
++      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
++    else
++      NOT_SUPPORTED;
++
++  }
++
+   GenInstruction *GenEncoder::next(uint32_t opcode) {
+      GenInstruction insn;
+      std::memset(&insn, 0, sizeof(GenInstruction));
+diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
+index 88a3e77..3ff8c97 100644
+--- a/backend/src/backend/gen_encoder.hpp
++++ b/backend/src/backend/gen_encoder.hpp
+@@ -136,6 +136,8 @@ namespace gbe
+     void NOP(void);
+     /*! Wait instruction (used for the barrier) */
+     void WAIT(void);
++    /*! Atomic instructions */
++    void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+     /*! Read 64-bits float arrays */
+     void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+     /*! Write 64-bits float arrays */
+diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+index a3b4621..f3f4a25 100644
+--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
++++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+@@ -21,4 +21,4 @@ DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
+ DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
+ DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
+ DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
+-
++DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
+diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
+index 11efcb9..c64afd9 100644
+--- a/backend/src/backend/gen_insn_selection.cpp
++++ b/backend/src/backend/gen_insn_selection.cpp
+@@ -168,12 +168,14 @@ namespace gbe
+   bool SelectionInstruction::isRead(void) const {
+     return this->opcode == SEL_OP_UNTYPED_READ ||
+            this->opcode == SEL_OP_READ_FLOAT64 ||
++           this->opcode == SEL_OP_ATOMIC       ||
+            this->opcode == SEL_OP_BYTE_GATHER;
+   }
+ 
+   bool SelectionInstruction::isWrite(void) const {
+     return this->opcode == SEL_OP_UNTYPED_WRITE ||
+            this->opcode == SEL_OP_WRITE_FLOAT64 ||
++           this->opcode == SEL_OP_ATOMIC        ||
+            this->opcode == SEL_OP_BYTE_SCATTER;
+   }
+ 
+@@ -456,6 +458,8 @@ namespace gbe
+     void NOP(void);
+     /*! Wait instruction (used for the barrier) */
+     void WAIT(void);
++    /*! Atomic instruction */
++    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+     /*! Read 64 bits float array */
+     void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+     /*! Write 64 bits float array */
+@@ -730,6 +734,23 @@ namespace gbe
+     insn->src(0) = src;
+   }
+ 
++  void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
++                                     uint32_t srcNum, Reg src0,
++                                     Reg src1, Reg src2, uint32_t bti) {
++    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
++    insn->dst(0) = dst;
++    insn->src(0) = src0;
++    if(srcNum > 1) insn->src(1) = src1;
++    if(srcNum > 2) insn->src(2) = src2;
++    insn->extra.function = function;
++    insn->extra.elem     = bti;
++    SelectionVector *vector = this->appendVector();
++
++    vector->regNum = srcNum;
++    vector->reg = &insn->src(0);
++    vector->isSrc = 1;
++  }
++
+   void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
+   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
+   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+@@ -916,6 +937,7 @@ namespace gbe
+   bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
+     if (insn.getDstNum() > 1 ||
+         insn.hasSideEffect() ||
++        insn.isMemberOf<ir::AtomicInstruction>() ||
+         insn.isMemberOf<ir::BranchInstruction>() ||
+         insn.isMemberOf<ir::LabelInstruction>())
+     return true;
+@@ -2033,6 +2055,44 @@ namespace gbe
+     DECL_CTOR(ConvertInstruction, 1, 1);
+   };
+ 
++  /*! Convert instruction pattern */
++  DECL_PATTERN(AtomicInstruction)
++  {
++    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn) const
++    {
++      using namespace ir;
++      const Opcode opcode = insn.getOpcode();
++      const AddressSpace space = insn.getAddressSpace();
++      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
++      const uint32_t srcNum = insn.getSrcNum();
++      const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
++      GenRegister src1 = src0, src2 = src0;
++      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
++      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
++      GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
++      GenAtomicOpCode aop = GEN_ATOMIC_OP_CMPWR8B;
++      switch (opcode) {
++        case OP_ATOMIC_ADD:     aop = GEN_ATOMIC_OP_ADD; break;
++        case OP_ATOMIC_SUB:     aop = GEN_ATOMIC_OP_SUB; break;
++        case OP_ATOMIC_AND:     aop = GEN_ATOMIC_OP_AND; break;
++        case OP_ATOMIC_OR :     aop = GEN_ATOMIC_OP_OR;  break;
++        case OP_ATOMIC_XOR:     aop = GEN_ATOMIC_OP_XOR; break;
++        case OP_ATOMIC_XCHG:    aop = GEN_ATOMIC_OP_MOV; break;
++        case OP_ATOMIC_UMIN:    aop = GEN_ATOMIC_OP_UMIN; break;
++        case OP_ATOMIC_UMAX:    aop = GEN_ATOMIC_OP_UMAX; break;
++        case OP_ATOMIC_IMIN:    aop = GEN_ATOMIC_OP_IMIN; break;
++        case OP_ATOMIC_IMAX:    aop = GEN_ATOMIC_OP_IMAX; break;
++        case OP_ATOMIC_INC:     aop = GEN_ATOMIC_OP_INC; break;
++        case OP_ATOMIC_DEC:     aop = GEN_ATOMIC_OP_DEC; break;
++        case OP_ATOMIC_CMPXCHG: aop = GEN_ATOMIC_OP_CMPWR; break;
++        default: NOT_SUPPORTED;
++      }
++      sel.ATOMIC(dst, aop, srcNum, src0, src1, src2, bti);
++      return true;
++    }
++    DECL_CTOR(AtomicInstruction, 1, 1);
++  };
++
+   /*! Select instruction pattern */
+   class SelectInstructionPattern : public SelectionPattern
+   {
+@@ -2379,6 +2439,7 @@ namespace gbe
+     this->insert<SelectInstructionPattern>();
+     this->insert<CompareInstructionPattern>();
+     this->insert<ConvertInstructionPattern>();
++    this->insert<AtomicInstructionPattern>();
+     this->insert<LabelInstructionPattern>();
+     this->insert<BranchInstructionPattern>();
+     this->insert<Int32x32MulInstructionPattern>();
+diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
+index 778eb1f..5ae6e42 100644
+--- a/backend/src/backend/gen_insn_selection.hpp
++++ b/backend/src/backend/gen_insn_selection.hpp
+@@ -92,9 +92,9 @@ namespace gbe
+     GenInstructionState state;
+     union {
+       struct {
+-        /*! Store bti for loads/stores and function for math and compares */
++        /*! Store bti for loads/stores and function for math, atomic and compares */
+         uint16_t function:8;
+-        /*! elemSize for byte scatters / gathers, elemNum for untyped msg */
++        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+         uint16_t elem:8;
+       };
+       struct {
+diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
+index cc2be08..f1a4701 100644
+--- a/backend/src/backend/gen_insn_selection.hxx
++++ b/backend/src/backend/gen_insn_selection.hxx
+@@ -20,6 +20,7 @@ DECL_SELECTION_IR(RSL, BinaryInstruction)
+ DECL_SELECTION_IR(ASR, BinaryInstruction)
+ DECL_SELECTION_IR(ADD, BinaryInstruction)
+ DECL_SELECTION_IR(MUL, BinaryInstruction)
++DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
+ DECL_SELECTION_IR(MACH, BinaryInstruction)
+ DECL_SELECTION_IR(CMP, CompareInstruction)
+ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
+diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
+index c286f1d..55e76f2 100644
+--- a/backend/src/ir/context.hpp
++++ b/backend/src/ir/context.hpp
+@@ -150,6 +150,12 @@ namespace ir {
+       this->append(insn);
+     }
+ 
++    /*! For all atomic functions */
++    void ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src) {
++      const Instruction insn = gbe::ir::ATOMIC(opcode, dst, space, src);
++      this->append(insn);
++    }
++
+     /*! LOAD with the destinations directly specified */
+     template <typename... Args>
+     void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, Args...values)
+diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
+index 67a4c12..578f5d2 100644
+--- a/backend/src/ir/instruction.cpp
++++ b/backend/src/ir/instruction.cpp
+@@ -1,4 +1,4 @@
+-/* 
++/*
+  * Copyright © 2012 Intel Corporation
+  *
+  * This library is free software; you can redistribute it and/or
+@@ -56,7 +56,7 @@ namespace ir {
+     };
+ 
+     /*! For regular n source instructions */
+-    template <typename T, uint32_t srcNum> 
++    template <typename T, uint32_t srcNum>
+     struct NSrcPolicy {
+       INLINE uint32_t getSrcNum(void) const { return srcNum; }
+       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+@@ -246,6 +246,37 @@ namespace ir {
+       Type srcType; //!< Type to convert from
+     };
+ 
++    class ALIGNED_INSTRUCTION AtomicInstruction :
++      public BasePolicy,
++      public TupleSrcPolicy<AtomicInstruction>,
++      public NDstPolicy<AtomicInstruction, 1>
++    {
++    public:
++      AtomicInstruction(Opcode opcode,
++                         Register dst,
++                         AddressSpace addrSpace,
++                         Tuple src)
++      {
++        this->opcode = opcode;
++        this->dst[0] = dst;
++        this->src = src;
++        this->addrSpace = addrSpace;
++        srcNum = 2;
++        if((opcode == OP_ATOMIC_INC) ||
++          (opcode == OP_ATOMIC_DEC))
++          srcNum = 1;
++        if(opcode == OP_ATOMIC_CMPXCHG)
++          srcNum = 3;
++      }
++      INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
++      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
++      INLINE void out(std::ostream &out, const Function &fn) const;
++      Register dst[1];
++      Tuple src;
++      AddressSpace addrSpace; //!< Address space
++      uint8_t srcNum;         //!<Source Number
++    };
++
+     class ALIGNED_INSTRUCTION BranchInstruction :
+       public BasePolicy,
+       public NDstPolicy<BranchInstruction, 0>
+@@ -738,6 +769,20 @@ namespace ir {
+       return true;
+     }
+ 
++    // We can convert anything to anything, but types and families must match
++    INLINE bool AtomicInstruction::wellFormed(const Function &fn, std::string &whyNot) const
++    {
++      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
++        return false;
++      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
++        return false;
++      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
++        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
++          return false;
++
++      return true;
++    }
++
+     /*! Loads and stores follow the same restrictions */
+     template <typename T>
+     INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
+@@ -883,6 +928,15 @@ namespace ir {
+       ternaryOrSelectOut(*this, out, fn);
+     }
+ 
++    INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
++      this->outOpcode(out);
++      out << "." << addrSpace;
++      out << " %" << this->getDst(fn, 0);
++      out << " {" << "%" << this->getSrc(fn, 0) << "}";
++      for (uint32_t i = 1; i < srcNum; ++i)
++        out << " %" << this->getSrc(fn, i);
++    }
++
+     INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
+       this->outOpcode(out);
+       out << "." << this->getDstType()
+@@ -1009,6 +1063,10 @@ START_INTROSPECTION(ConvertInstruction)
+ #include "ir/instruction.hxx"
+ END_INTROSPECTION(ConvertInstruction)
+ 
++START_INTROSPECTION(AtomicInstruction)
++#include "ir/instruction.hxx"
++END_INTROSPECTION(AtomicInstruction)
++
+ START_INTROSPECTION(SelectInstruction)
+ #include "ir/instruction.hxx"
+ END_INTROSPECTION(SelectInstruction)
+@@ -1180,7 +1238,7 @@ END_FUNCTION(Instruction, Register)
+   }
+ 
+   bool Instruction::hasSideEffect(void) const {
+-    return opcode == OP_STORE || 
++    return opcode == OP_STORE ||
+            opcode == OP_TYPED_WRITE ||
+            opcode == OP_SYNC;
+   }
+@@ -1197,6 +1255,7 @@ DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
+ DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
+ DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
+ DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
++DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+ DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
+ DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
+ DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+@@ -1304,6 +1363,11 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
+     return internal::ConvertInstruction(dstType, srcType, dst, src).convert();
+   }
+ 
++  // For all unary functions with given opcode
++  Instruction ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src) {
++    return internal::AtomicInstruction(opcode, dst, space, src).convert();
++  }
++
+   // BRA
+   Instruction BRA(LabelIndex labelIndex) {
+     return internal::BranchInstruction(OP_BRA, labelIndex).convert();
+diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
+index 0f3bd34..91d280d 100644
+--- a/backend/src/ir/instruction.hpp
++++ b/backend/src/ir/instruction.hpp
+@@ -1,4 +1,4 @@
+-/* 
++/*
+  * Copyright © 2012 Intel Corporation
+  *
+  * This library is free software; you can redistribute it and/or
+@@ -228,6 +228,19 @@ namespace ir {
+     static bool isClassOf(const Instruction &insn);
+   };
+ 
++  /*! Atomic instruction */
++  class AtomicInstruction : public Instruction {
++  public:
++    /*! Where the address register goes */
++    static const uint32_t addressIndex = 0;
++    /*! Address space that is manipulated here */
++    AddressSpace getAddressSpace(void) const;
++    /*! Return the register that contains the addresses */
++    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
++    /*! Return true if the given instruction is an instance of this class */
++    static bool isClassOf(const Instruction &insn);
++  };
++
+   /*! Store instruction. First source is the address. Next sources are the
+    *  values to store contiguously at the given address
+    */
+@@ -555,6 +568,34 @@ namespace ir {
+   Instruction GT(Type type, Register dst, Register src0, Register src1);
+   /*! cvt.{dstType <- srcType} dst src */
+   Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
++  /*! atomic dst addr.space {src1 {src2}} */
++  Instruction ATOMIC(Opcode opcode, Register dst, AddressSpace space, Tuple src);
++  /*! atomic_add dst addr.space src1 */
++  Instruction ATOMIC_ADD(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_sub dst addr.space src1 */
++  Instruction ATOMIC_SUB(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_and dst addr.space src1 */
++  Instruction ATOMIC_AND(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_or dst addr.space src1 */
++  Instruction ATOMIC_OR(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_xor dst addr.space src1 */
++  Instruction ATOMIC_XOR(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_xchg dst addr.space src1 */
++  Instruction ATOMIC_XCHG(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_imin dst addr.space src1 */
++  Instruction ATOMIC_IMIN(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_imax dst addr.space src1 */
++  Instruction ATOMIC_IMAX(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_umin dst addr.space src1 */
++  Instruction ATOMIC_UMIN(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_umax dst addr.space src1 */
++  Instruction ATOMIC_UMAX(Register dst, Register addr, AddressSpace space, Register src1);
++  /*! atomic_inc dst addr.space */
++  Instruction ATOMIC_INC(Register dst, Register addr, AddressSpace space);
++  /*! atomic_dec dst addr.space */
++  Instruction ATOMIC_DEC(Register dst, Register addr, AddressSpace space);
++  /*! atomic_cmpxchg dst addr.space src1 src2 */
++  Instruction ATOMIC_CMPXCHG(Register dst, Register addr, AddressSpace space, Register src1, Register src2);
+   /*! bra labelIndex */
+   Instruction BRA(LabelIndex labelIndex);
+   /*! (pred) bra labelIndex */
+diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
+index acfb45a..42c1e89 100644
+--- a/backend/src/ir/instruction.hxx
++++ b/backend/src/ir/instruction.hxx
+@@ -61,6 +61,19 @@ DECL_INSN(LT, CompareInstruction)
+ DECL_INSN(GE, CompareInstruction)
+ DECL_INSN(GT, CompareInstruction)
+ DECL_INSN(CVT, ConvertInstruction)
++DECL_INSN(ATOMIC_ADD, AtomicInstruction)
++DECL_INSN(ATOMIC_SUB, AtomicInstruction)
++DECL_INSN(ATOMIC_AND, AtomicInstruction)
++DECL_INSN(ATOMIC_OR, AtomicInstruction)
++DECL_INSN(ATOMIC_XOR, AtomicInstruction)
++DECL_INSN(ATOMIC_XCHG, AtomicInstruction)
++DECL_INSN(ATOMIC_UMIN, AtomicInstruction)
++DECL_INSN(ATOMIC_UMAX, AtomicInstruction)
++DECL_INSN(ATOMIC_IMIN, AtomicInstruction)
++DECL_INSN(ATOMIC_IMAX, AtomicInstruction)
++DECL_INSN(ATOMIC_INC, AtomicInstruction)
++DECL_INSN(ATOMIC_DEC, AtomicInstruction)
++DECL_INSN(ATOMIC_CMPXCHG, AtomicInstruction)
+ DECL_INSN(BRA, BranchInstruction)
+ DECL_INSN(RET, BranchInstruction)
+ DECL_INSN(LOADI, LoadImmInstruction)
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch b/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch
new file mode 100644
index 0000000..fac6999
--- /dev/null
+++ b/debian/patches/0006-Add-all-atomic-built-in-functions-support.patch
@@ -0,0 +1,289 @@
+From 6d471350375328273070c9b73b22e9ab042c3313 Mon Sep 17 00:00:00 2001
+From: Yang Rong <rong.r.yang at intel.com>
+Date: Wed, 26 Jun 2013 15:29:22 +0800
+Subject: [PATCH 06/11] Add all atomic built-in functions support.
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Yang Rong <rong.r.yang at intel.com>
+---
+ backend/src/llvm/llvm_gen_backend.cpp      |   76 +++++++++++++++++++++
+ backend/src/llvm/llvm_gen_ocl_function.hxx |   28 ++++++++
+ backend/src/ocl_stdlib.h                   |  100 +++++++++++++++++++++++++++-
+ 3 files changed, 203 insertions(+), 1 deletion(-)
+
+diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
+index ba2192a..5db3fbe 100644
+--- a/backend/src/llvm/llvm_gen_backend.cpp
++++ b/backend/src/llvm/llvm_gen_backend.cpp
+@@ -534,6 +534,8 @@ namespace gbe
+ 
+     // Emit unary instructions from gen native function
+     void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
++    // Emit unary instructions from gen native function
++    void emitAtomicInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
+ 
+     // These instructions are not supported at all
+     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
+@@ -693,10 +695,12 @@ namespace gbe
+           return doIt(uint64_t(0));
+         }
+       }
++
+       // NULL pointers
+       if(isa<ConstantPointerNull>(CPV)) {
+         return doIt(uint32_t(0));
+       }
++
+       // Floats and doubles
+       const Type::TypeID typeID = CPV->getType()->getTypeID();
+       switch (typeID) {
+@@ -1698,6 +1702,32 @@ namespace gbe
+       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+       case GEN_OCL_GET_IMAGE_DEPTH:
++      case GEN_OCL_ATOMIC_ADD0:
++      case GEN_OCL_ATOMIC_ADD1:
++      case GEN_OCL_ATOMIC_SUB0:
++      case GEN_OCL_ATOMIC_SUB1:
++      case GEN_OCL_ATOMIC_AND0:
++      case GEN_OCL_ATOMIC_AND1:
++      case GEN_OCL_ATOMIC_OR0:
++      case GEN_OCL_ATOMIC_OR1:
++      case GEN_OCL_ATOMIC_XOR0:
++      case GEN_OCL_ATOMIC_XOR1:
++      case GEN_OCL_ATOMIC_XCHG0:
++      case GEN_OCL_ATOMIC_XCHG1:
++      case GEN_OCL_ATOMIC_UMAX0:
++      case GEN_OCL_ATOMIC_UMAX1:
++      case GEN_OCL_ATOMIC_UMIN0:
++      case GEN_OCL_ATOMIC_UMIN1:
++      case GEN_OCL_ATOMIC_IMAX0:
++      case GEN_OCL_ATOMIC_IMAX1:
++      case GEN_OCL_ATOMIC_IMIN0:
++      case GEN_OCL_ATOMIC_IMIN1:
++      case GEN_OCL_ATOMIC_INC0:
++      case GEN_OCL_ATOMIC_INC1:
++      case GEN_OCL_ATOMIC_DEC0:
++      case GEN_OCL_ATOMIC_DEC1:
++      case GEN_OCL_ATOMIC_CMPXCHG0:
++      case GEN_OCL_ATOMIC_CMPXCHG1:
+         // No structure can be returned
+         this->newRegister(&I);
+         break;
+@@ -1782,6 +1812,26 @@ namespace gbe
+     ctx.ALU1(opcode, ir::TYPE_FLOAT, dst, src);
+   }
+ 
++  void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
++    CallSite::arg_iterator AI = CS.arg_begin();
++#if GBE_DEBUG
++    CallSite::arg_iterator AE = CS.arg_end();
++#endif /* GBE_DEBUG */
++    GBE_ASSERT(AI != AE);
++    unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
++    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
++    const ir::Register dst = this->getRegister(&I);
++
++    vector<ir::Register> src;
++    uint32_t srcNum = 0;
++    while(AI != AE) {
++      src.push_back(this->getRegister(*(AI++)));
++      srcNum++;
++    }
++    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
++    ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
++  }
++
+   void GenWriter::emitCallInst(CallInst &I) {
+     if (Function *F = I.getCalledFunction()) {
+       if (F->getIntrinsicID() != 0) {
+@@ -1870,6 +1920,32 @@ namespace gbe
+           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
+           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
+           case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
++          case GEN_OCL_ATOMIC_ADD0:
++          case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_ADD); break;
++          case GEN_OCL_ATOMIC_SUB0:
++          case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_SUB); break;
++          case GEN_OCL_ATOMIC_AND0:
++          case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_AND); break;
++          case GEN_OCL_ATOMIC_OR0:
++          case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_OR); break;
++          case GEN_OCL_ATOMIC_XOR0:
++          case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_XOR); break;
++          case GEN_OCL_ATOMIC_XCHG0:
++          case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_XCHG); break;
++          case GEN_OCL_ATOMIC_INC0:
++          case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_INC); break;
++          case GEN_OCL_ATOMIC_DEC0:
++          case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_DEC); break;
++          case GEN_OCL_ATOMIC_UMIN0:
++          case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_UMIN); break;
++          case GEN_OCL_ATOMIC_UMAX0:
++          case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_UMAX); break;
++          case GEN_OCL_ATOMIC_IMIN0:
++          case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_IMIN); break;
++          case GEN_OCL_ATOMIC_IMAX0:
++          case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_IMAX); break;
++          case GEN_OCL_ATOMIC_CMPXCHG0:
++          case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::OP_ATOMIC_CMPXCHG); break;
+           case GEN_OCL_GET_IMAGE_WIDTH:
+           case GEN_OCL_GET_IMAGE_HEIGHT:
+           case GEN_OCL_GET_IMAGE_DEPTH:
+diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
+index 89a04ea..2f79690 100644
+--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
++++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
+@@ -79,6 +79,34 @@ DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH,  __gen_ocl_get_image_depth)
+ DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE,  __gen_ocl_get_image_channel_data_type)
+ DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER,  __gen_ocl_get_image_channel_order)
+ 
++// atomic related functions.
++DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD0, _Z20__gen_ocl_atomic_addPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD1, _Z20__gen_ocl_atomic_addPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB0, _Z20__gen_ocl_atomic_subPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB1, _Z20__gen_ocl_atomic_subPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_AND0, _Z20__gen_ocl_atomic_andPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_AND1, _Z20__gen_ocl_atomic_andPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_OR0,  _Z19__gen_ocl_atomic_orPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_OR1,  _Z19__gen_ocl_atomic_orPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR0, _Z20__gen_ocl_atomic_xorPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR1, _Z20__gen_ocl_atomic_xorPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN0, _Z21__gen_ocl_atomic_uminPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN1, _Z21__gen_ocl_atomic_uminPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX0, _Z21__gen_ocl_atomic_umaxPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX1, _Z21__gen_ocl_atomic_umaxPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN0, _Z21__gen_ocl_atomic_iminPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN1, _Z21__gen_ocl_atomic_iminPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX0, _Z21__gen_ocl_atomic_imaxPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX1, _Z21__gen_ocl_atomic_imaxPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG0, _Z21__gen_ocl_atomic_xchgPU3AS1jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG1, _Z21__gen_ocl_atomic_xchgPU3AS3jj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_INC0, _Z20__gen_ocl_atomic_incPU3AS1j)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_INC1, _Z20__gen_ocl_atomic_incPU3AS3j)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC0, _Z20__gen_ocl_atomic_decPU3AS1j)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC1, _Z20__gen_ocl_atomic_decPU3AS3j)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG0, _Z24__gen_ocl_atomic_cmpxchgPU3AS1jjj)
++DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG1, _Z24__gen_ocl_atomic_cmpxchgPU3AS3jjj)
++
+ // saturation related functions.
+ DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
+ DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
+diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
+index 227454d..7a98e04 100644
+--- a/backend/src/ocl_stdlib.h
++++ b/backend/src/ocl_stdlib.h
+@@ -1,4 +1,4 @@
+-/* 
++/*
+  * Copyright © 2012 Intel Corporation
+  *
+  * This library is free software; you can redistribute it and/or
+@@ -5100,6 +5100,104 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
+ }
+ 
+ /////////////////////////////////////////////////////////////////////////////
++// Atomic functions
++/////////////////////////////////////////////////////////////////////////////
++OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
++OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
++OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
++OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
++OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
++OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
++
++#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
++  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
++    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
++  }
++
++#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
++
++#define DECL_ATOMIC_OP(NAME) \
++  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
++  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
++
++DECL_ATOMIC_OP(add)
++DECL_ATOMIC_OP(sub)
++DECL_ATOMIC_OP(and)
++DECL_ATOMIC_OP(or)
++DECL_ATOMIC_OP(xor)
++DECL_ATOMIC_OP(xchg)
++DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
++DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
++DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
++DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
++DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
++
++#undef DECL_ATOMIC_OP
++#undef DECL_ATOMIC_OP_TYPE
++#undef DECL_ATOMIC_OP_SPACE
++
++#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
++  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
++    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
++  }
++
++#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
++
++#define DECL_ATOMIC_OP(NAME) \
++  DECL_ATOMIC_OP_TYPE(NAME, uint) \
++  DECL_ATOMIC_OP_TYPE(NAME, int)
++
++DECL_ATOMIC_OP(inc)
++DECL_ATOMIC_OP(dec)
++
++#undef DECL_ATOMIC_OP
++#undef DECL_ATOMIC_OP_TYPE
++#undef DECL_ATOMIC_OP_SPACE
++
++#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
++  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
++    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
++  }
++
++#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
++  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
++
++#define DECL_ATOMIC_OP(NAME) \
++  DECL_ATOMIC_OP_TYPE(NAME, uint) \
++  DECL_ATOMIC_OP_TYPE(NAME, int)
++
++DECL_ATOMIC_OP(cmpxchg)
++
++#undef DECL_ATOMIC_OP
++#undef DECL_ATOMIC_OP_TYPE
++#undef DECL_ATOMIC_OP_SPACE
++
++/////////////////////////////////////////////////////////////////////////////
+ // Force the compilation to SIMD8 or SIMD16
+ /////////////////////////////////////////////////////////////////////////////
+ 
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0007-Add-atomic-test-case.patch b/debian/patches/0007-Add-atomic-test-case.patch
new file mode 100644
index 0000000..6a3ae8a
--- /dev/null
+++ b/debian/patches/0007-Add-atomic-test-case.patch
@@ -0,0 +1,188 @@
+From d83c5424c918bba806f39c72f27db6a788600dc9 Mon Sep 17 00:00:00 2001
+From: Yang Rong <rong.r.yang at intel.com>
+Date: Wed, 26 Jun 2013 15:29:23 +0800
+Subject: [PATCH 07/11] Add atomic test case.
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Yang Rong <rong.r.yang at intel.com>
+---
+ kernels/compiler_atomic_functions.cl |   55 ++++++++++++++++-----
+ utests/CMakeLists.txt                |    1 +
+ utests/compiler_atomic_functions.cpp |   87 ++++++++++++++++++++++++++++++++--
+ 3 files changed, 127 insertions(+), 16 deletions(-)
+
+diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
+index 23f3e73..dd9ec56 100644
+--- a/kernels/compiler_atomic_functions.cl
++++ b/kernels/compiler_atomic_functions.cl
+@@ -1,14 +1,43 @@
+-/* test OpenCL 1.1 Atomic Functions (section 6.11.1, 9.4) */
+-__kernel void compiler_atomic_functions(global int *a, global int *b) {
+-  atomic_add(a, *b);
+-  atomic_sub(a, *b);
+-  atomic_xchg(a, *b);
+-  atomic_inc(a);
+-  atomic_dec(a);
+-  atomic_cmpxchg(a, b, 100);
+-  atomic_min(a, *b);
+-  atomic_max(a, *b);
+-  atomic_and(a, *b);
+-  atomic_or(a, *b);
+-  atomic_xor(a, *b);
++__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
++  int lid = get_local_id(0);
++  int i = lid % 12;
++  atomic_xchg(&tmp[4], -1);
++	switch(i) {
++	  case 0: atomic_inc(&tmp[i]); break;
++	  case 1: atomic_dec(&tmp[i]); break;
++	  case 2: atomic_add(&tmp[i], src[lid]); break;
++	  case 3: atomic_sub(&tmp[i], src[lid]); break;
++	  case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid>>2))); break;
++	  case 5: atomic_or (&tmp[i], src[lid]<<(lid>>2)); break;
++	  case 6: atomic_xor(&tmp[i], src[lid]); break;
++	  case 7: atomic_min(&tmp[i], -src[lid]); break;
++    case 8: atomic_max(&tmp[i], src[lid]); break;
++		case 9: atomic_min((__local unsigned int *)&tmp[i], -src[lid]); break;
++		case 10: atomic_max((__local unsigned int *)&tmp[i], src[lid]); break;
++	  case 11: atomic_cmpxchg(&(tmp[i]), 0, src[10]); break;
++		default:  break;
++	}
++
++	switch(i) {
++	  case 0: atomic_inc(&dst[i]); break;
++	  case 1: atomic_dec(&dst[i]); break;
++	  case 2: atomic_add(&dst[i], src[lid]); break;
++	  case 3: atomic_sub(&dst[i], src[lid]); break;
++	  case 4: atomic_and(&dst[i], ~(src[lid]<<(lid>>2))); break;
++	  case 5: atomic_or (&dst[i], src[lid]<<(lid>>2)); break;
++	  case 6: atomic_xor(&dst[i], src[lid]); break;
++	  case 7: atomic_min(&dst[i], -src[lid]); break;
++    case 8: atomic_max(&dst[i], src[lid]); break;
++		case 9: atomic_min((__global unsigned int *)&dst[i], -src[lid]); break;
++		case 10: atomic_max((__global unsigned int *)&dst[i], src[lid]); break;
++	  case 11: atomic_cmpxchg(&dst[i], 0, src[10]); break;
++		default:  break;
++	}
++
++	barrier(CLK_GLOBAL_MEM_FENCE);
++
++	if(get_global_id(0) == 0) {
++	  for(i=0; i<12; i=i+1)
++		  atomic_add(&dst[i], tmp[i]);
++	}
+ }
+diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
+index b75f3b4..c313acd 100644
+--- a/utests/CMakeLists.txt
++++ b/utests/CMakeLists.txt
+@@ -75,6 +75,7 @@ set (utests_sources
+   compiler_write_only_shorts.cpp
+   compiler_switch.cpp
+   compiler_math.cpp
++  compiler_atomic_functions.cpp
+   compiler_insn_selection_min.cpp
+   compiler_insn_selection_max.cpp
+   compiler_insn_selection_masked_min_max.cpp
+diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
+index 20202da..71e8384 100644
+--- a/utests/compiler_atomic_functions.cpp
++++ b/utests/compiler_atomic_functions.cpp
+@@ -1,10 +1,91 @@
+ #include "utest_helper.hpp"
++#include <cmath>
++#include <algorithm>
+ 
+-void compiler_atomic_functions(void)
++#define GROUP_NUM 16
++#define LOCAL_SIZE 64
++static void cpu_compiler_atomic(int *dst, int *src)
+ {
+-  OCL_CREATE_KERNEL("compiler_atomic_functions");
++  dst[4] = 0xffffffff;
++  int tmp[16] = { 0 };
++
++  for(int j=0; j<LOCAL_SIZE; j++) {
++    int i = j % 12;
++
++    switch(i) {
++      case 0: tmp[i] += 1; break;
++      case 1: tmp[i] -= 1; break;
++      case 2: tmp[i] += src[j]; break;
++      case 3: tmp[i] -= src[j]; break;
++      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
++      case 5: tmp[i] |= src[j]<<(j>>2); break;
++      case 6: tmp[i] ^= src[j]; break;
++      case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
++      case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
++      case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
++      case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
++      case 11:  tmp[i] = src[10]; break;
++      default:  break;
++    }
++  }
++
++  for(int k=0; k<GROUP_NUM; k++) {
++    for(int j=0; j<LOCAL_SIZE; j++) {
++      int i = j % 12;
++
++      switch(i) {
++        case 0: dst[i] += 1; break;
++        case 1: dst[i] -= 1; break;
++        case 2: dst[i] += src[j]; break;
++        case 3: dst[i] -= src[j]; break;
++        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
++        case 5: dst[i] |= src[j]<<(j>>2); break;
++        case 6: dst[i] ^= src[j]; break;
++        case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
++        case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
++        case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
++        case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
++        case 11:  dst[i] = src[10]; break;
++        default:  break;
++      }
++    }
++  }
++
++  for(int i=0; i<12; i++)
++    dst[i] += tmp[i];
+ }
+ 
+-MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions);
++static void compiler_atomic_functions(void)
++{
++  const size_t n = GROUP_NUM * LOCAL_SIZE;
++  int cpu_dst[16] = {0}, cpu_src[256];
+ 
++  globals[0] = n;
++  locals[0] = LOCAL_SIZE;
++
++  // Setup kernel and buffers
++  OCL_CREATE_KERNEL("compiler_atomic_functions");
++  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
++  OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
++  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
++  OCL_SET_ARG(1, 16 * sizeof(int), NULL);
++  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
++
++  OCL_MAP_BUFFER(1);
++  for (uint32_t i = 0; i < locals[0]; ++i)
++      cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
++  cpu_compiler_atomic(cpu_dst, cpu_src);
++  OCL_UNMAP_BUFFER(1);
++  OCL_NDRANGE(1);
++
++  OCL_MAP_BUFFER(0);
++
++  // Check results
++  for(int i=0; i<12; i++) {
++    //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
++    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
++  }
++  OCL_UNMAP_BUFFER(0);
++}
+ 
++MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions)
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0007-Add-the-support-of-the-API-clGetCommandQueueInfo.patch b/debian/patches/0007-Add-the-support-of-the-API-clGetCommandQueueInfo.patch
deleted file mode 100644
index e43e1cc..0000000
--- a/debian/patches/0007-Add-the-support-of-the-API-clGetCommandQueueInfo.patch
+++ /dev/null
@@ -1,185 +0,0 @@
-From 1e1556a65ff1779dca6a5b30c204985ad14f0e4f Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Tue, 25 Jun 2013 18:15:25 +0800
-Subject: [PATCH 7/8] Add the support of the API: clGetCommandQueueInfo
-To: beignet at lists.freedesktop.org
-
-Though we support get the CL_QUEUE_PROPERTIES,
-but because the CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
-and CL_QUEUE_PROFILING_ENABLE will never be set when
-create the queue, it just return a all 0 bitfield now.
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- src/cl_api.c           |   63 ++++++++++++++++++++++++++++--------------------
- src/cl_command_queue.h |   15 ++++++------
- src/cl_context.c       |    1 +
- 3 files changed, 46 insertions(+), 33 deletions(-)
-
-diff --git a/src/cl_api.c b/src/cl_api.c
-index ebca294..bb09c07 100644
---- a/src/cl_api.c
-+++ b/src/cl_api.c
-@@ -46,6 +46,19 @@
- typedef intptr_t cl_device_partition_property;
- #endif
- 
-+#define FILL_GETINFO_RET(TYPE, ELT, VAL, RET) \
-+	do { \
-+	  if (param_value && param_value_size < sizeof(TYPE)*ELT) \
-+	      return CL_INVALID_VALUE;  \
-+	  if (param_value) { \
-+	      memcpy(param_value, (VAL), sizeof(TYPE)*ELT); \
-+	  } \
-+          \
-+	  if (param_value_size_ret) \
-+	      *param_value_size_ret = sizeof(TYPE)*ELT; \
-+	  return RET; \
-+	} while(0)
-+
- static cl_int
- cl_check_device_type(cl_device_type device_type)
- {
-@@ -341,7 +354,20 @@ clGetCommandQueueInfo(cl_command_queue       command_queue,
- {
-   cl_int err = CL_SUCCESS;
-   CHECK_QUEUE (command_queue);
--  NOT_IMPLEMENTED;
-+
-+  if (param_name == CL_QUEUE_CONTEXT) {
-+    FILL_GETINFO_RET (cl_context, 1, &command_queue->ctx, CL_SUCCESS);
-+  } else if (param_name == CL_QUEUE_DEVICE) {
-+    FILL_GETINFO_RET (cl_device_id, 1, &command_queue->ctx->device, CL_SUCCESS);
-+  } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
-+    cl_uint ref = command_queue->ref_n;
-+    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
-+  } else if (param_name == CL_QUEUE_PROPERTIES) {
-+    FILL_GETINFO_RET (cl_command_queue_properties, 1, &command_queue->props, CL_SUCCESS);
-+  } else {
-+    return CL_INVALID_VALUE;
-+  }
-+
- error:
-   return err;
- }
-@@ -734,19 +760,6 @@ clUnloadCompiler(void)
-   return 0;
- }
- 
--#define FILL_AND_RET(TYPE, ELT, VAL, RET) \
--	do { \
--	  if (param_value && param_value_size < sizeof(TYPE)*ELT) \
--	      return CL_INVALID_VALUE;  \
--	  if (param_value) { \
--	      memcpy(param_value, (VAL), sizeof(TYPE)*ELT); \
--	  } \
--          \
--	  if (param_value_size_ret) \
--	      *param_value_size_ret = sizeof(TYPE)*ELT; \
--	  return RET; \
--	} while(0)
--
- cl_int
- clGetProgramInfo(cl_program       program,
-                  cl_program_info  param_name,
-@@ -761,24 +774,24 @@ clGetProgramInfo(cl_program       program,
- 
-   if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
-     cl_uint ref = program->ref_n;
--    FILL_AND_RET (cl_uint, 1, (&ref), CL_SUCCESS);
-+    FILL_GETINFO_RET (cl_uint, 1, (&ref), CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_CONTEXT) {
-     cl_context context = program->ctx;
--    FILL_AND_RET (cl_context, 1, &context, CL_SUCCESS);
-+    FILL_GETINFO_RET (cl_context, 1, &context, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
-     cl_uint num_dev = 1; // Just 1 dev now.
--    FILL_AND_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
-+    FILL_GETINFO_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_DEVICES) {
-     cl_device_id dev_id = program->ctx->device;
--    FILL_AND_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
-+    FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_SOURCE) {
- 
-     if (!program->source)
--      FILL_AND_RET (char, 1, &ret_str, CL_SUCCESS);
--    FILL_AND_RET (char, (strlen(program->source) + 1),
-+      FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
-+    FILL_GETINFO_RET (char, (strlen(program->source) + 1),
-                    program->source, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
--    FILL_AND_RET (size_t, 1, (&program->bin_sz), CL_SUCCESS);
-+    FILL_GETINFO_RET (size_t, 1, (&program->bin_sz), CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_BINARIES) {
-     if (!param_value)
-       return CL_SUCCESS;
-@@ -825,15 +838,15 @@ clGetProgramBuildInfo(cl_program             program,
-       status = CL_BUILD_ERROR;
-     // TODO: Support CL_BUILD_IN_PROGRESS ?
- 
--    FILL_AND_RET (cl_build_status, 1, &status, CL_SUCCESS);
-+    FILL_GETINFO_RET (cl_build_status, 1, &status, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
-     if (program->is_built && program->build_opts)
-       ret_str = program->build_opts;
- 
--    FILL_AND_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
-+    FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
-   } else if (param_name == CL_PROGRAM_BUILD_LOG) {
-     // TODO: need to add logs in backend when compiling.
--    FILL_AND_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
-+    FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
-   } else {
-     return CL_INVALID_VALUE;
-   }
-@@ -842,8 +855,6 @@ error:
-     return err;
- }
- 
--#undef FILL_AND_RET
--
- cl_kernel
- clCreateKernel(cl_program   program,
-                const char * kernel_name,
-diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
-index 0e04ff3..4f6f987 100644
---- a/src/cl_command_queue.h
-+++ b/src/cl_command_queue.h
-@@ -30,13 +30,14 @@ struct intel_gpgpu;
- /* Basically, this is a (kind-of) batch buffer */
- struct _cl_command_queue {
-   DEFINE_ICD(dispatch)
--  uint64_t magic;              /* To identify it as a command queue */
--  volatile int ref_n;          /* We reference count this object */
--  cl_context ctx;              /* Its parent context */
--  cl_command_queue prev, next; /* We chain the command queues together */
--  cl_gpgpu gpgpu;              /* Setup all GEN commands */
--  cl_mem perf;                 /* Where to put the perf counters */
--  cl_mem fulsim_out;           /* Fulsim will output this buffer */
-+  uint64_t magic;                      /* To identify it as a command queue */
-+  volatile int ref_n;                  /* We reference count this object */
-+  cl_context ctx;                      /* Its parent context */
-+  cl_command_queue_properties  props;  /* Queue properties */
-+  cl_command_queue prev, next;         /* We chain the command queues together */
-+  cl_gpgpu gpgpu;                      /* Setup all GEN commands */
-+  cl_mem perf;                         /* Where to put the perf counters */
-+  cl_mem fulsim_out;                   /* Fulsim will output this buffer */
- };
- 
- /* Allocate and initialize a new command queue. Also insert it in the list of
-diff --git a/src/cl_context.c b/src/cl_context.c
-index fa4c7e0..0331151 100644
---- a/src/cl_context.c
-+++ b/src/cl_context.c
-@@ -196,6 +196,7 @@ cl_context_create_queue(cl_context ctx,
- 
-   /* We create the command queue and store it in the context list of queues */
-   TRY_ALLOC (queue, cl_command_queue_new(ctx));
-+  queue->props = properties;
- 
- exit:
-   if (errcode_ret)
--- 
-1.7.10.4
-
diff --git a/debian/patches/0008-Add-the-test-case-for-clGetCommandQueueInfo-API.patch b/debian/patches/0008-Add-the-test-case-for-clGetCommandQueueInfo-API.patch
deleted file mode 100644
index 5bfcf07..0000000
--- a/debian/patches/0008-Add-the-test-case-for-clGetCommandQueueInfo-API.patch
+++ /dev/null
@@ -1,631 +0,0 @@
-From bb741a356647c2959135844373c6d7287cedfd2f Mon Sep 17 00:00:00 2001
-From: Junyan He <junyan.he at linux.intel.com>
-Date: Tue, 25 Jun 2013 18:15:32 +0800
-Subject: [PATCH 8/8] Add the test case for clGetCommandQueueInfo API
-To: beignet at lists.freedesktop.org
-
-Because all the get clGetXXXInfo API have similar
-structure in function type, we will integrate them
-together, and rename the get_program_info.cpp to get_cl_info.cpp
-
-Signed-off-by: Junyan He <junyan.he at linux.intel.com>
----
- kernels/get_program_info.cl |   10 --
- utests/CMakeLists.txt       |    2 +-
- utests/get_cl_info.cpp      |  319 +++++++++++++++++++++++++++++++++++++++++++
- utests/get_program_info.cpp |  247 ---------------------------------
- 4 files changed, 320 insertions(+), 258 deletions(-)
- delete mode 100644 kernels/get_program_info.cl
- create mode 100644 utests/get_cl_info.cpp
- delete mode 100644 utests/get_program_info.cpp
-
-diff --git a/kernels/get_program_info.cl b/kernels/get_program_info.cl
-deleted file mode 100644
-index 8e0dd94..0000000
---- a/kernels/get_program_info.cl
-+++ /dev/null
-@@ -1,10 +0,0 @@
--kernel void get_program_info( __global int *ret ) {
--    uint x = get_work_dim();
--    size_t y = get_global_size(0);
--    y = get_global_id(0);
--    y = get_local_size(0);
--    y = get_local_id(0);
--    y = get_num_groups(0);
--    y = get_group_id(0);
--    y = get_global_offset(0);
--}
-diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index d63b31f..e12069c 100644
---- a/utests/CMakeLists.txt
-+++ b/utests/CMakeLists.txt
-@@ -86,7 +86,7 @@ set (utests_sources
-   compiler_get_image_info.cpp
-   compiler_vector_load_store.cpp
-   compiler_cl_finish.cpp
--  get_program_info.cpp
-+  get_cl_info.cpp
-   buildin_work_dim.cpp
-   builtin_global_size.cpp
-   runtime_createcontext.cpp
-diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
-new file mode 100644
-index 0000000..6d5e7bb
---- /dev/null
-+++ b/utests/get_cl_info.cpp
-@@ -0,0 +1,319 @@
-+#include <string.h>
-+#include <string>
-+#include <map>
-+#include <iostream>
-+#include <fstream>
-+#include <algorithm>
-+#include "utest_helper.hpp"
-+
-+using namespace std;
-+
-+/* ***************************************************** *
-+ * This file to test all the API like: clGetXXXXInfo     *
-+ * ***************************************************** */
-+#define NO_STANDARD_REF 0xFFFFF
-+
-+template <typename T = cl_uint>
-+struct Info_Result {
-+    T ret;
-+    T refer;
-+    int size;
-+    typedef T type_value;
-+
-+    void * get_ret(void) {
-+        return (void *)&ret;
-+    }
-+
-+    Info_Result(T other) {
-+        refer = other;
-+        size = sizeof(T);
-+    }
-+
-+    bool check_result (void) {
-+        if (ret != refer && refer != (T)NO_STANDARD_REF)
-+            return false;
-+
-+        return true;
-+    }
-+};
-+
-+template <>
-+struct Info_Result<char *> {
-+    char * ret;
-+    char * refer;
-+    int size;
-+    typedef char* type_value;
-+
-+    Info_Result(char *other, int sz) {
-+        size = sz;
-+        ret = (char *)malloc(sizeof(char) * sz);
-+        if (other) {
-+            refer = (char *)malloc(sizeof(char) * sz);
-+            memcpy(refer, other, sz);
-+        }
-+    }
-+
-+    ~Info_Result(void) {
-+        free(refer);
-+        free(ret);
-+    }
-+
-+    void * get_ret(void) {
-+        return (void *)ret;
-+    }
-+
-+    bool check_result (void) {
-+        if (refer && ::memcmp(ret, refer, size))
-+            return false;
-+
-+        return true;
-+    }
-+};
-+
-+template <> //Used for such as CL_PROGRAM_BINARIES
-+struct Info_Result<char **> {
-+    char ** ret;
-+    char ** refer;
-+    int *elt_size;
-+    int size;
-+    typedef char** type_value;
-+
-+    Info_Result(char **other, int *sz, int elt_num) {
-+        size = elt_num;
-+
-+        ret = (char **)malloc(elt_num * sizeof(char *));
-+        memset(ret, 0, (elt_num * sizeof(char *)));
-+        refer = (char **)malloc(elt_num * sizeof(char *));
-+        memset(refer, 0, (elt_num * sizeof(char *)));
-+        elt_size = (int *)malloc(elt_num * sizeof(int));
-+        memset(elt_size, 0, (elt_num * sizeof(int)));
-+        if (sz) {
-+            int i = 0;
-+            for (; i < elt_num; i++) {
-+                elt_size[i] = sz[i];
-+                ret[i] = (char *)malloc(sz[i] * sizeof(char));
-+
-+                if (other[i] && elt_size[i] > 0) {
-+                    refer[i] = (char *)malloc(sz[i] * sizeof(char));
-+                    memcpy(&refer[i], &other[i], sz[i]);
-+                }
-+                else
-+                    refer[i] = NULL;
-+            }
-+        }
-+    }
-+
-+    ~Info_Result(void) {
-+        int i = 0;
-+        for (; i < size; i++) {
-+            if (refer[i])
-+                free(refer[i]);
-+            free(ret[i]);
-+        }
-+        free(ret);
-+        free(refer);
-+        free(elt_size);
-+    }
-+
-+    void * get_ret(void) {
-+        return (void *)ret;
-+    }
-+
-+    bool check_result (void) {
-+        int i = 0;
-+        for (; i < size; i++) {
-+            if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
-+                return false;
-+        }
-+
-+        return true;
-+    }
-+};
-+
-+template <typename T1, typename T2>
-+struct Traits {
-+    static bool Is_Same(void) {
-+        return false;
-+    };
-+};
-+
-+template <typename T1>
-+struct Traits<T1, T1> {
-+    static bool Is_Same(void) {
-+        return true;
-+    };
-+};
-+
-+template <typename T>
-+Info_Result<T>* cast_as(void *info)
-+{
-+    Info_Result<T>* ret;
-+    ret = reinterpret_cast<Info_Result<T>*>(info);
-+    OCL_ASSERT((Traits<T, typename Info_Result<T>::type_value>::Is_Same()));
-+    return ret;
-+}
-+
-+
-+#define CALL_INFO_AND_RET(TYPE, FUNC, OBJ) \
-+    do { \
-+	cl_int ret; \
-+	size_t ret_size; \
-+	\
-+	Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
-+	ret = FUNC (OBJ, x.first, \
-+		info->size, info->get_ret(), &ret_size); \
-+	OCL_ASSERT((!ret)); \
-+	OCL_ASSERT((info->check_result())); \
-+	delete info; \
-+    } while(0)
-+
-+/* ***************************************************** *
-+ * clGetProgramInfo                                      *
-+ * ***************************************************** */
-+#define CALL_PROGINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetProgramInfo, program)
-+
-+void get_program_info(void)
-+{
-+    map<cl_program_info, void *> maps;
-+    int expect_value;
-+    char * expect_source;
-+    int sz;
-+    char *ker_path = (char *)malloc(4096 * sizeof(char));
-+    const char *kiss_path = getenv("OCL_KERNEL_PATH");
-+    string line;
-+    string source_code;
-+
-+    sprintf(ker_path, "%s/%s", kiss_path, "compiler_if_else.cl");
-+
-+    ifstream in(ker_path);
-+    while (getline(in,line)) {
-+        source_code = (source_code == "") ?
-+                      source_code + line : source_code + "\n" + line;
-+    }
-+    free(ker_path);
-+    //cout<< source_code;
-+    source_code = source_code + "\n";
-+
-+    expect_source = (char *)source_code.c_str();
-+
-+    OCL_CREATE_KERNEL("compiler_if_else");
-+
-+    /* First test for clGetProgramInfo. We just have 1 devices now */
-+    expect_value = 2;//One program, one kernel.
-+    maps.insert(make_pair(CL_PROGRAM_REFERENCE_COUNT,
-+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
-+    maps.insert(make_pair(CL_PROGRAM_CONTEXT,
-+                          (void *)(new Info_Result<cl_context>(ctx))));
-+    expect_value = 1;
-+    maps.insert(make_pair(CL_PROGRAM_NUM_DEVICES,
-+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
-+    maps.insert(make_pair(CL_PROGRAM_DEVICES,
-+                          (void *)(new Info_Result<cl_device_id>(device))));
-+    sz = (strlen(expect_source) + 1);
-+    maps.insert(make_pair(CL_PROGRAM_SOURCE,
-+                          (void *)(new Info_Result<char *>(expect_source, sz))));
-+    expect_value = NO_STANDARD_REF;
-+    maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
-+                          (void *)(new Info_Result<size_t>((size_t)expect_value))));
-+    sz = 4096; //big enough?
-+    expect_source = NULL;
-+    maps.insert(make_pair(CL_PROGRAM_BINARIES,
-+                          (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
-+
-+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
-+        switch (x.first) {
-+        case CL_PROGRAM_REFERENCE_COUNT:
-+        case CL_PROGRAM_NUM_DEVICES:
-+            CALL_PROGINFO_AND_RET(cl_uint);
-+            break;
-+        case CL_PROGRAM_CONTEXT:
-+            CALL_PROGINFO_AND_RET(cl_context);
-+            break;
-+        case CL_PROGRAM_DEVICES:
-+            CALL_PROGINFO_AND_RET(cl_device_id);
-+            break;
-+        case CL_PROGRAM_SOURCE:
-+            CALL_PROGINFO_AND_RET(char *);
-+            break;
-+        case CL_PROGRAM_BINARY_SIZES:
-+            CALL_PROGINFO_AND_RET(size_t);
-+            break;
-+        case CL_PROGRAM_BINARIES:
-+            CALL_PROGINFO_AND_RET(char **);
-+            break;
-+        default:
-+            break;
-+        }
-+    });
-+}
-+
-+MAKE_UTEST_FROM_FUNCTION(get_program_info);
-+
-+/* ***************************************************** *
-+ * clGetCommandQueueInfo                                 *
-+ * ***************************************************** */
-+#define CALL_QUEUEINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetCommandQueueInfo, queue)
-+
-+void get_queue_info(void)
-+{
-+    /* use the compiler_fabs case to test us. */
-+    const size_t n = 16;
-+    map<cl_program_info, void *> maps;
-+    int expect_ref;
-+    cl_command_queue_properties prop;
-+
-+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
-+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
-+    OCL_CREATE_KERNEL("compiler_fabs");
-+
-+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-+
-+    globals[0] = 16;
-+    locals[0] = 16;
-+
-+    OCL_MAP_BUFFER(0);
-+    for (int32_t i = 0; i < (int32_t) n; ++i)
-+        ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
-+    OCL_UNMAP_BUFFER(0);
-+
-+    // Run the kernel on GPU
-+    OCL_NDRANGE(1);
-+
-+    /* Do our test.*/
-+    maps.insert(make_pair(CL_QUEUE_CONTEXT,
-+                          (void *)(new Info_Result<cl_context>(ctx))));
-+    maps.insert(make_pair(CL_QUEUE_DEVICE,
-+                          (void *)(new Info_Result<cl_device_id>(device))));
-+
-+    expect_ref = 1;
-+    maps.insert(make_pair(CL_QUEUE_REFERENCE_COUNT,
-+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
-+
-+    prop = 0;
-+    maps.insert(make_pair(CL_QUEUE_PROPERTIES,
-+                          (void *)(new Info_Result<cl_command_queue_properties>(
-+                                       ((cl_command_queue_properties)prop)))));
-+
-+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
-+        switch (x.first) {
-+        case CL_QUEUE_CONTEXT:
-+            CALL_QUEUEINFO_AND_RET(cl_context);
-+            break;
-+        case CL_QUEUE_DEVICE:
-+            CALL_QUEUEINFO_AND_RET(cl_device_id);
-+            break;
-+        case CL_QUEUE_REFERENCE_COUNT:
-+            CALL_QUEUEINFO_AND_RET(cl_uint);
-+            break;
-+        case CL_QUEUE_PROPERTIES:
-+            CALL_QUEUEINFO_AND_RET(cl_command_queue_properties);
-+            break;
-+        default:
-+            break;
-+        }
-+    });
-+}
-+
-+MAKE_UTEST_FROM_FUNCTION(get_queue_info);
-+
-diff --git a/utests/get_program_info.cpp b/utests/get_program_info.cpp
-deleted file mode 100644
-index 20248e8..0000000
---- a/utests/get_program_info.cpp
-+++ /dev/null
-@@ -1,247 +0,0 @@
--#include <string.h>
--#include <string>
--#include <map>
--#include <iostream>
--#include <fstream>
--#include <algorithm>
--#include "utest_helper.hpp"
--
--using namespace std;
--
--/* ********************************************** *
-- * This file to test the API of:                  *
-- * clGetProgramInfo                               *
-- * ********************************************** */
--#define NO_STANDARD_REF 0xFFFFF
--
--template <typename T = cl_uint>
--struct Info_Result {
--    T ret;
--    T refer;
--    int size;
--    typedef T type_value;
--
--    void * get_ret(void) {
--        return (void *)&ret;
--    }
--
--    Info_Result(T other) {
--        refer = other;
--        size = sizeof(T);
--    }
--
--    bool check_result (void) {
--        if (ret != refer && refer != (T)NO_STANDARD_REF)
--            return false;
--
--        return true;
--    }
--};
--
--template <>
--struct Info_Result<char *> {
--    char * ret;
--    char * refer;
--    int size;
--    typedef char* type_value;
--
--    Info_Result(char *other, int sz) {
--        size = sz;
--        ret = (char *)malloc(sizeof(char) * sz);
--        if (other) {
--            refer = (char *)malloc(sizeof(char) * sz);
--            memcpy(refer, other, sz);
--        }
--    }
--
--    ~Info_Result(void) {
--        free(refer);
--        free(ret);
--    }
--
--    void * get_ret(void) {
--        return (void *)ret;
--    }
--
--    bool check_result (void) {
--        if (refer && ::memcmp(ret, refer, size))
--            return false;
--
--        return true;
--    }
--};
--
--template <> //Used for such as CL_PROGRAM_BINARIES
--struct Info_Result<char **> {
--    char ** ret;
--    char ** refer;
--    int *elt_size;
--    int size;
--    typedef char** type_value;
--
--    Info_Result(char **other, int *sz, int elt_num) {
--        size = elt_num;
--
--        ret = (char **)malloc(elt_num * sizeof(char *));
--        memset(ret, 0, (elt_num * sizeof(char *)));
--        refer = (char **)malloc(elt_num * sizeof(char *));
--        memset(refer, 0, (elt_num * sizeof(char *)));
--        elt_size = (int *)malloc(elt_num * sizeof(int));
--        memset(elt_size, 0, (elt_num * sizeof(int)));
--        if (sz) {
--            int i = 0;
--            for (; i < elt_num; i++) {
--                elt_size[i] = sz[i];
--                ret[i] = (char *)malloc(sz[i] * sizeof(char));
--
--                if (other[i] && elt_size[i] > 0) {
--                    refer[i] = (char *)malloc(sz[i] * sizeof(char));
--                    memcpy(&refer[i], &other[i], sz[i]);
--                }
--                else
--                    refer[i] = NULL;
--            }
--        }
--    }
--
--    ~Info_Result(void) {
--        int i = 0;
--        for (; i < size; i++) {
--            if (refer[i])
--                free(refer[i]);
--            free(ret[i]);
--        }
--        free(ret);
--        free(refer);
--        free(elt_size);
--    }
--
--    void * get_ret(void) {
--        return (void *)ret;
--    }
--
--    bool check_result (void) {
--        int i = 0;
--        for (; i < size; i++) {
--            if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
--                return false;
--        }
--
--        return true;
--    }
--};
--
--template <typename T1, typename T2>
--struct Traits {
--    static bool Is_Same(void) {
--        return false;
--    };
--};
--
--template <typename T1>
--struct Traits<T1, T1> {
--    static bool Is_Same(void) {
--        return true;
--    };
--};
--
--template <typename T>
--Info_Result<T>* cast_as(void *info)
--{
--    Info_Result<T>* ret;
--    ret = reinterpret_cast<Info_Result<T>*>(info);
--    OCL_ASSERT((Traits<T, typename Info_Result<T>::type_value>::Is_Same()));
--    return ret;
--}
--
--
--#define CALL_PROGINFO_AND_RET(TYPE) \
--    do { \
--	cl_int ret; \
--	size_t ret_size; \
--	\
--	Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
--	ret = clGetProgramInfo(program, x.first, \
--		info->size, info->get_ret(), &ret_size); \
--	OCL_ASSERT((!ret)); \
--	OCL_ASSERT((info->check_result())); \
--	delete info; \
--    } while(0)
--
--void get_program_info(void)
--{
--    map<cl_program_info, void *> maps;
--    int expect_value;
--    char * expect_source;
--    int sz;
--    char *ker_path = (char *)malloc(4096 * sizeof(char));
--    const char *kiss_path = getenv("OCL_KERNEL_PATH");
--    string line;
--    string source_code;
--
--    sprintf(ker_path, "%s/%s", kiss_path, "get_program_info.cl");
--
--    ifstream in(ker_path);
--    while (getline(in,line)) {
--        source_code = (source_code == "") ?
--                      source_code + line : source_code + "\n" + line;
--    }
--    free(ker_path);
--    //cout<< source_code;
--    source_code = source_code + "\n";
--
--    expect_source = (char *)source_code.c_str();
--
--    OCL_CREATE_KERNEL("get_program_info");
--
--    /* First test for clGetProgramInfo. We just have 1 devices now */
--    expect_value = 2;//One program, one kernel.
--    maps.insert(make_pair(CL_PROGRAM_REFERENCE_COUNT,
--                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
--    maps.insert(make_pair(CL_PROGRAM_CONTEXT,
--                          (void *)(new Info_Result<cl_context>(ctx))));
--    expect_value = 1;
--    maps.insert(make_pair(CL_PROGRAM_NUM_DEVICES,
--                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
--    maps.insert(make_pair(CL_PROGRAM_DEVICES,
--                          (void *)(new Info_Result<cl_device_id>(device))));
--    sz = (strlen(expect_source) + 1);
--    maps.insert(make_pair(CL_PROGRAM_SOURCE,
--                          (void *)(new Info_Result<char *>(expect_source, sz))));
--    expect_value = NO_STANDARD_REF;
--    maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
--                          (void *)(new Info_Result<size_t>((size_t)expect_value))));
--    sz = 4096; //big enough?
--    expect_source = NULL;
--    maps.insert(make_pair(CL_PROGRAM_BINARIES,
--                          (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
--
--    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
--        switch (x.first) {
--        case CL_PROGRAM_REFERENCE_COUNT:
--        case CL_PROGRAM_NUM_DEVICES:
--            CALL_PROGINFO_AND_RET(cl_uint);
--            break;
--        case CL_PROGRAM_CONTEXT:
--            CALL_PROGINFO_AND_RET(cl_context);
--            break;
--        case CL_PROGRAM_DEVICES:
--            CALL_PROGINFO_AND_RET(cl_device_id);
--            break;
--        case CL_PROGRAM_SOURCE:
--            CALL_PROGINFO_AND_RET(char *);
--            break;
--        case CL_PROGRAM_BINARY_SIZES:
--            CALL_PROGINFO_AND_RET(size_t);
--            break;
--        case CL_PROGRAM_BINARIES:
--            CALL_PROGINFO_AND_RET(char **);
--            break;
--        default:
--            break;
--        }
--    });
--}
--
--MAKE_UTEST_FROM_FUNCTION(get_program_info);
--
--- 
-1.7.10.4
-
diff --git a/debian/patches/0008-support-built-in-function-rotate.patch b/debian/patches/0008-support-built-in-function-rotate.patch
new file mode 100644
index 0000000..9b917e8
--- /dev/null
+++ b/debian/patches/0008-support-built-in-function-rotate.patch
@@ -0,0 +1,58 @@
+From 2e008d20fb703db8afb84476ad599dca92d9f763 Mon Sep 17 00:00:00 2001
+From: Homer Hsing <homer.xing at intel.com>
+Date: Wed, 26 Jun 2013 15:51:51 +0800
+Subject: [PATCH 08/11] support built-in function "rotate"
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Homer Hsing <homer.xing at intel.com>
+---
+ backend/src/ocl_stdlib.h |   33 +++++++++++++++++++++++++++++++++
+ 1 file changed, 33 insertions(+)
+
+diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
+index 7a98e04..133e995 100644
+--- a/backend/src/ocl_stdlib.h
++++ b/backend/src/ocl_stdlib.h
+@@ -4355,6 +4355,39 @@ DEC(16)
+ #undef DEC4
+ #undef DEC8
+ #undef DEC16
++
++INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
++INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
++INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
++INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
++INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
++INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
++#define DEF(type, n, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y < 0 ? n + y : y & m)); }
++DEF(char, 8, 7)
++DEF(uchar, 8, 7)
++DEF(short, 16, 15)
++DEF(ushort, 16, 15)
++DEF(int, 32, 31)
++DEF(uint, 32, 31)
++#undef DEF
++#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
++#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
++#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
++#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
++#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
++#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
++DEF(2)
++DEF(3)
++DEF(4)
++DEF(8)
++DEF(16)
++#undef DEF
++#undef DEC2
++#undef DEC3
++#undef DEC4
++#undef DEC8
++#undef DEC16
++
+ /////////////////////////////////////////////////////////////////////////////
+ // Work Items functions (see 6.11.1 of OCL 1.1 spec)
+ /////////////////////////////////////////////////////////////////////////////
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0009-test-case-for-function-rotate.patch b/debian/patches/0009-test-case-for-function-rotate.patch
new file mode 100644
index 0000000..eef926a
--- /dev/null
+++ b/debian/patches/0009-test-case-for-function-rotate.patch
@@ -0,0 +1,87 @@
+From ac4b81f5d44a0803f24d3fe2202ebf4c6bd7d523 Mon Sep 17 00:00:00 2001
+From: Homer Hsing <homer.xing at intel.com>
+Date: Wed, 26 Jun 2013 15:51:52 +0800
+Subject: [PATCH 09/11] test case for function "rotate"
+To: beignet at lists.freedesktop.org
+
+Signed-off-by: Homer Hsing <homer.xing at intel.com>
+---
+ kernels/compiler_rotate.cl |    5 +++++
+ utests/CMakeLists.txt      |    1 +
+ utests/compiler_rotate.cpp |   40 ++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 46 insertions(+)
+ create mode 100644 kernels/compiler_rotate.cl
+ create mode 100644 utests/compiler_rotate.cpp
+
+diff --git a/kernels/compiler_rotate.cl b/kernels/compiler_rotate.cl
+new file mode 100644
+index 0000000..8d0dd0f
+--- /dev/null
++++ b/kernels/compiler_rotate.cl
+@@ -0,0 +1,5 @@
++kernel void compiler_rotate(global int *src, global int *dst, global int *y) {
++  int i = get_global_id(0);
++  dst[i] = rotate(src[i], y[i]);
++}
++
+diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
+index c313acd..3326064 100644
+--- a/utests/CMakeLists.txt
++++ b/utests/CMakeLists.txt
+@@ -55,6 +55,7 @@ set (utests_sources
+   compiler_lower_return1.cpp
+   compiler_lower_return2.cpp
+   compiler_multiple_kernels.cpp
++  compiler_rotate.cpp
+   compiler_saturate.cpp
+   compiler_saturate_sub.cpp
+   compiler_shift_right.cpp
+diff --git a/utests/compiler_rotate.cpp b/utests/compiler_rotate.cpp
+new file mode 100644
+index 0000000..bf52ca4
+--- /dev/null
++++ b/utests/compiler_rotate.cpp
+@@ -0,0 +1,40 @@
++#include "utest_helper.hpp"
++
++int cpu(int src, int y) {
++  return (src << y) | (src >> (32 - y));
++}
++
++void compiler_rotate(void)
++{
++  const int n = 32;
++  int src[n], y[n];
++
++  // Setup kernel and buffers
++  OCL_CREATE_KERNEL("compiler_rotate");
++  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
++  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
++  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
++  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
++  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
++  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
++  globals[0] = n;
++  locals[0] = 16;
++
++  OCL_MAP_BUFFER(0);
++  OCL_MAP_BUFFER(2);
++  for (int i = 0; i < n; ++i) {
++    src[i] = ((int*)buf_data[0])[i] = rand();
++    y[i] = ((int*)buf_data[2])[i] = rand() & 31;
++  }
++  OCL_UNMAP_BUFFER(0);
++  OCL_UNMAP_BUFFER(2);
++
++  OCL_NDRANGE(1);
++
++  OCL_MAP_BUFFER(1);
++  for (int i = 0; i < n; ++i)
++    OCL_ASSERT(((int*)buf_data[1])[i] == cpu(src[i], y[i]));
++  OCL_UNMAP_BUFFER(1);
++}
++
++MAKE_UTEST_FROM_FUNCTION(compiler_rotate);
+-- 
+1.7.10.4
+
diff --git a/debian/patches/0003-GBE-Add-more-support-of-char-and-short-arithmetic.patch b/debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
similarity index 53%
rename from debian/patches/0003-GBE-Add-more-support-of-char-and-short-arithmetic.patch
rename to debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
index 22a0eec..04dc32f 100644
--- a/debian/patches/0003-GBE-Add-more-support-of-char-and-short-arithmetic.patch
+++ b/debian/patches/0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
@@ -1,40 +1,45 @@
-From 1525f53083d7623659e51a9d6f1e4835a83a6caf Mon Sep 17 00:00:00 2001
+From eddef9eb62ed9ea04e657687c8ed28f43f1e0584 Mon Sep 17 00:00:00 2001
 From: Ruiling Song <ruiling.song at intel.com>
-Date: Tue, 25 Jun 2013 15:38:48 +0800
-Subject: [PATCH 3/8] GBE: Add more support of char and short arithmetic
+Date: Wed, 26 Jun 2013 15:52:12 +0800
+Subject: [PATCH 10/11] GBE: Add more support of char and short arithmetic
 To: beignet at lists.freedesktop.org
 
 add * / % support of char and short type.
 
 Signed-off-by: Ruiling Song <ruiling.song at intel.com>
 ---
- backend/src/backend/gen_insn_selection.cpp |   67 +++++++++++++++++++++++-----
+ backend/src/backend/gen_insn_selection.cpp |   87 ++++++++++++++++++++--------
  backend/src/llvm/llvm_gen_backend.cpp      |    4 +-
- 2 files changed, 57 insertions(+), 14 deletions(-)
+ 2 files changed, 65 insertions(+), 26 deletions(-)
 
 diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
-index 1e5f514..b1c6093 100644
+index c64afd9..e98be3e 100644
 --- a/backend/src/backend/gen_insn_selection.cpp
 +++ b/backend/src/backend/gen_insn_selection.cpp
-@@ -1260,30 +1260,73 @@ namespace gbe
-       const Opcode opcode = insn.getOpcode();
-       const Type type = insn.getType();
-       GenRegister dst  = sel.selReg(insn.getDst(0), type);
+@@ -1293,6 +1293,54 @@ namespace gbe
+           this->opcodes.push_back(ir::Opcode(op));
+     }
+ 
++    bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
++    {
++      using namespace ir;
++      const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
++      const Type type = insn.getType();
++      GenRegister dst  = sel.selReg(insn.getDst(0), type);
++      GenRegister src0 = sel.selReg(insn.getSrc(0), type);
++      GenRegister src1 = sel.selReg(insn.getSrc(1), type);
 +      const uint32_t simdWidth = sel.curr.execWidth;
 +      const RegisterFamily family = getFamily(type);
++      uint32_t function = (op == OP_DIV)?
++                          GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
++                          GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
 +
 +      //bytes and shorts must be converted to int for DIV and REM per GEN restriction
-+      if((opcode == OP_DIV || opcode == OP_REM)
-+        && (family == FAMILY_WORD || family == FAMILY_BYTE)) {
-+        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-+        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-+        uint32_t function = (opcode == OP_DIV)?
-+                            GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
-+                            GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
-+        GenRegister tmp0 = src0;
-+        GenRegister tmp1 = src1;
-+        GenRegister tmp2 = dst;
-+        tmp0 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
++      if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
++        GenRegister tmp0, tmp1;
++        ir::Register reg = sel.reg(FAMILY_DWORD);
++
++        tmp0 = GenRegister::udxgrf(simdWidth, reg);
 +        tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
 +        sel.MOV(tmp0, src0);
 +
@@ -42,62 +47,71 @@ index 1e5f514..b1c6093 100644
 +        tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
 +        sel.MOV(tmp1, src1);
 +
-+        tmp2 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-+        tmp2 = GenRegister::retype(tmp2, GEN_TYPE_D);
-+
-+        sel.MATH(tmp2, function, tmp0, tmp1);
++        sel.MATH(tmp0, function, tmp0, tmp1);
 +        GenRegister unpacked;
 +        if(family == FAMILY_WORD) {
-+          unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
++          unpacked = GenRegister::unpacked_uw(reg);
 +        } else {
-+          unpacked = GenRegister::unpacked_ub(sel.reg(FAMILY_DWORD));
++          unpacked = GenRegister::unpacked_ub(reg);
 +        }
 +        unpacked = GenRegister::retype(unpacked, getGenType(type));
-+        sel.MOV(unpacked, tmp2);
 +        sel.MOV(dst, unpacked);
- 
-+        markAllChildren(dag);
-+        return true;
++      } else if (type == TYPE_S32 || type == TYPE_U32 ) {
++        sel.MATH(dst, function, src0, src1);
++      } else if(type == TYPE_FLOAT) {
++        GBE_ASSERT(op != OP_REM);
++        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
++      } else {
++        NOT_IMPLEMENTED;
 +      }
-       // Immediates not supported
-       if (opcode == OP_DIV || opcode == OP_POW) {
-         GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-         GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-         uint32_t function;
++      markAllChildren(dag);
++      return true;
++    }
++
+     INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+     {
+       using namespace ir;
+@@ -1301,29 +1349,20 @@ namespace gbe
+       const Type type = insn.getType();
+       GenRegister dst  = sel.selReg(insn.getDst(0), type);
+ 
+-      // Immediates not supported
+-      if (opcode == OP_DIV || opcode == OP_POW) {
+-        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+-        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+-        uint32_t function;
 -        if (type == TYPE_S32 || type == TYPE_U32)
-+        if (type == TYPE_S32 || type == TYPE_U32 ) {
-           function = GEN_MATH_FUNCTION_INT_DIV_QUOTIENT;
+-          function = GEN_MATH_FUNCTION_INT_DIV_QUOTIENT;
 -        else
-+          sel.MATH(dst, function, src0, src1);
-+        } else if(type == TYPE_FLOAT) {
-           function = opcode == OP_DIV ?
-                      GEN_MATH_FUNCTION_FDIV :
-                      GEN_MATH_FUNCTION_POW;
+-          function = opcode == OP_DIV ?
+-                     GEN_MATH_FUNCTION_FDIV :
+-                     GEN_MATH_FUNCTION_POW;
 -        sel.MATH(dst, function, src0, src1);
-+          sel.MATH(dst, function, src0, src1);
-+        } else {
-+          NOT_IMPLEMENTED;
-+        }
-         markAllChildren(dag);
-         return true;
+-        markAllChildren(dag);
+-        return true;
++      if(opcode == OP_DIV || opcode == OP_REM) {
++        return this->emitDivRemInst(sel, dag, opcode);
        }
-       if (opcode == OP_REM) {
+-      if (opcode == OP_REM) {
++      // Immediates not supported
++      if (opcode == OP_POW) {
          GenRegister src0 = sel.selReg(insn.getSrc(0), type);
          GenRegister src1 = sel.selReg(insn.getSrc(1), type);
 -        if (type == TYPE_U32 || type == TYPE_S32) {
-+        if(type == TYPE_S32 || type == TYPE_U32) {
-           sel.MATH(dst, GEN_MATH_FUNCTION_INT_DIV_REMAINDER, src0, src1);
+-          sel.MATH(dst, GEN_MATH_FUNCTION_INT_DIV_REMAINDER, src0, src1);
 -          markAllChildren(dag);
 -        } else
--          NOT_IMPLEMENTED;
++
++        if(type == TYPE_FLOAT) {
++          sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
 +        } else {
-+          GBE_ASSERTM(0, "Unsupported type in remainder operation!");
+           NOT_IMPLEMENTED;
 +        }
 +        markAllChildren(dag);
          return true;
        }
  
-@@ -1345,14 +1388,14 @@ namespace gbe
+@@ -1385,14 +1424,14 @@ namespace gbe
          case OP_SHR: sel.SHR(dst, src0, src1); break;
          case OP_ASR: sel.ASR(dst, src0, src1); break;
          case OP_MUL:
@@ -118,10 +132,10 @@ index 1e5f514..b1c6093 100644
          default: NOT_IMPLEMENTED;
        }
 diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
-index 5b7754c..b0e8c6c 100644
+index 5db3fbe..fa052ce 100644
 --- a/backend/src/llvm/llvm_gen_backend.cpp
 +++ b/backend/src/llvm/llvm_gen_backend.cpp
-@@ -1276,10 +1276,10 @@ namespace gbe
+@@ -1280,10 +1280,10 @@ namespace gbe
        case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
        case Instruction::Mul:
        case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
diff --git a/debian/patches/0004-utests-Add-basic-arithmetic-test-case.patch b/debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch
similarity index 79%
rename from debian/patches/0004-utests-Add-basic-arithmetic-test-case.patch
rename to debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch
index a3ae637..9a2e59e 100644
--- a/debian/patches/0004-utests-Add-basic-arithmetic-test-case.patch
+++ b/debian/patches/0011-utests-Add-basic-arithmetic-test-case.patch
@@ -1,7 +1,7 @@
-From c89dbb34332c104df22c8ea8c22bac0bcb0b5221 Mon Sep 17 00:00:00 2001
+From 1400415e754d8362ed6a628f5e77c3da2417adae Mon Sep 17 00:00:00 2001
 From: Ruiling Song <ruiling.song at intel.com>
-Date: Tue, 25 Jun 2013 15:38:49 +0800
-Subject: [PATCH 4/8] utests: Add basic arithmetic test case
+Date: Wed, 26 Jun 2013 15:52:13 +0800
+Subject: [PATCH 11/11] utests: Add basic arithmetic test case
 To: beignet at lists.freedesktop.org
 
 test case for + - * / % of data type (u)int8/16/32
@@ -9,14 +9,14 @@ remove duplicated cases.
 
 Signed-off-by: Ruiling Song <ruiling.song at intel.com>
 ---
- kernels/compiler_basic_arithmetic.cl |   73 +++++++++++++++++++
- kernels/compiler_sub_bytes.cl        |    7 --
- kernels/compiler_sub_shorts.cl       |    7 --
+ kernels/compiler_basic_arithmetic.cl |   53 ++++++++++++++++
+ kernels/compiler_sub_bytes.cl        |    7 ---
+ kernels/compiler_sub_shorts.cl       |    7 ---
  utests/CMakeLists.txt                |    3 +-
- utests/compiler_basic_arithmetic.cpp |  132 ++++++++++++++++++++++++++++++++++
- utests/compiler_sub_bytes.cpp        |   35 ---------
- utests/compiler_sub_shorts.cpp       |   36 ----------
- 7 files changed, 206 insertions(+), 87 deletions(-)
+ utests/compiler_basic_arithmetic.cpp |  112 ++++++++++++++++++++++++++++++++++
+ utests/compiler_sub_bytes.cpp        |   35 -----------
+ utests/compiler_sub_shorts.cpp       |   36 -----------
+ 7 files changed, 166 insertions(+), 87 deletions(-)
  create mode 100644 kernels/compiler_basic_arithmetic.cl
  delete mode 100644 kernels/compiler_sub_bytes.cl
  delete mode 100644 kernels/compiler_sub_shorts.cl
@@ -26,10 +26,10 @@ Signed-off-by: Ruiling Song <ruiling.song at intel.com>
 
 diff --git a/kernels/compiler_basic_arithmetic.cl b/kernels/compiler_basic_arithmetic.cl
 new file mode 100644
-index 0000000..2bc2c27
+index 0000000..3e145d8
 --- /dev/null
 +++ b/kernels/compiler_basic_arithmetic.cl
-@@ -0,0 +1,73 @@
+@@ -0,0 +1,53 @@
 +#define DECL_KERNEL_SUB(type)\
 +__kernel void \
 +compiler_sub_##type(__global type *src0, __global type *src1, __global type *dst) \
@@ -69,40 +69,20 @@ index 0000000..2bc2c27
 +  int id = (int)get_global_id(0); \
 +  dst[id] = src0[id] % src1[id]; \
 +}
-+DECL_KERNEL_SUB(char)
-+DECL_KERNEL_SUB(uchar)
-+DECL_KERNEL_SUB(short)
-+DECL_KERNEL_SUB(ushort)
-+DECL_KERNEL_SUB(int)
-+DECL_KERNEL_SUB(uint)
 +
-+DECL_KERNEL_ADD(char)
-+DECL_KERNEL_ADD(uchar)
-+DECL_KERNEL_ADD(short)
-+DECL_KERNEL_ADD(ushort)
-+DECL_KERNEL_ADD(int)
-+DECL_KERNEL_ADD(uint)
++#define DECL_KERNEL_FOR_ALL_TYPE(op) \
++DECL_KERNEL_##op(char)               \
++DECL_KERNEL_##op(uchar)              \
++DECL_KERNEL_##op(short)              \
++DECL_KERNEL_##op(ushort)             \
++DECL_KERNEL_##op(int)                \
++DECL_KERNEL_##op(uint)
 +
-+DECL_KERNEL_MUL(char)
-+DECL_KERNEL_MUL(uchar)
-+DECL_KERNEL_MUL(short)
-+DECL_KERNEL_MUL(ushort)
-+DECL_KERNEL_MUL(int)
-+DECL_KERNEL_MUL(uint)
-+
-+DECL_KERNEL_DIV(char)
-+DECL_KERNEL_DIV(uchar)
-+DECL_KERNEL_DIV(short)
-+DECL_KERNEL_DIV(ushort)
-+DECL_KERNEL_DIV(int)
-+DECL_KERNEL_DIV(uint)
-+
-+DECL_KERNEL_REM(char)
-+DECL_KERNEL_REM(uchar)
-+DECL_KERNEL_REM(short)
-+DECL_KERNEL_REM(ushort)
-+DECL_KERNEL_REM(int)
-+DECL_KERNEL_REM(uint)
++DECL_KERNEL_FOR_ALL_TYPE(SUB)
++DECL_KERNEL_FOR_ALL_TYPE(ADD)
++DECL_KERNEL_FOR_ALL_TYPE(MUL)
++DECL_KERNEL_FOR_ALL_TYPE(DIV)
++DECL_KERNEL_FOR_ALL_TYPE(REM)
 diff --git a/kernels/compiler_sub_bytes.cl b/kernels/compiler_sub_bytes.cl
 deleted file mode 100644
 index f058561..0000000
@@ -130,7 +110,7 @@ index d26de7f..0000000
 -}
 -
 diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
-index ea23f31..edfbda1 100644
+index 3326064..3740841 100644
 --- a/utests/CMakeLists.txt
 +++ b/utests/CMakeLists.txt
 @@ -5,6 +5,7 @@ link_directories (${LLVM_LIBRARY_DIR})
@@ -141,7 +121,7 @@ index ea23f31..edfbda1 100644
    compiler_displacement_map_element.cpp
    compiler_shader_toy.cpp
    compiler_mandelbrot.cpp
-@@ -56,8 +57,6 @@ set (utests_sources
+@@ -60,8 +61,6 @@ set (utests_sources
    compiler_saturate_sub.cpp
    compiler_shift_right.cpp
    compiler_short_scatter.cpp
@@ -152,10 +132,10 @@ index ea23f31..edfbda1 100644
    compiler_uint8_copy.cpp
 diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
 new file mode 100644
-index 0000000..5ab5f44
+index 0000000..dcdd084
 --- /dev/null
 +++ b/utests/compiler_basic_arithmetic.cpp
-@@ -0,0 +1,132 @@
+@@ -0,0 +1,112 @@
 +#include "utest_helper.hpp"
 +
 +enum eTestOP {
@@ -254,40 +234,20 @@ index 0000000..5ab5f44
 +}\
 +MAKE_UTEST_FROM_FUNCTION(compiler_rem_ ## alias)
 +
-+DECL_TEST_SUB(int8_t, char);
-+DECL_TEST_SUB(uint8_t, uchar);
-+DECL_TEST_SUB(int16_t, short);
-+DECL_TEST_SUB(uint16_t, ushort);
-+DECL_TEST_SUB(int32_t, int);
-+DECL_TEST_SUB(uint32_t, uint);
-+
-+DECL_TEST_ADD(int8_t, char);
-+DECL_TEST_ADD(uint8_t, uchar);
-+DECL_TEST_ADD(int16_t, short);
-+DECL_TEST_ADD(uint16_t, ushort);
-+DECL_TEST_ADD(int32_t, int);
-+DECL_TEST_ADD(uint32_t, uint);
-+
-+DECL_TEST_MUL(int8_t, char);
-+DECL_TEST_MUL(uint8_t, uchar);
-+DECL_TEST_MUL(int16_t, short);
-+DECL_TEST_MUL(uint16_t, ushort);
-+DECL_TEST_MUL(int32_t, int);
-+DECL_TEST_MUL(uint32_t, uint);
-+
-+DECL_TEST_DIV(int8_t, char);
-+DECL_TEST_DIV(uint8_t, uchar);
-+DECL_TEST_DIV(int16_t, short);
-+DECL_TEST_DIV(uint16_t, ushort);
-+DECL_TEST_DIV(int32_t, int);
-+DECL_TEST_DIV(uint32_t, uint);
++#define DECL_TEST_FOR_ALL_TYPE(op)\
++DECL_TEST_##op(int8_t, char) \
++DECL_TEST_##op(uint8_t, uchar) \
++DECL_TEST_##op(int16_t, short) \
++DECL_TEST_##op(uint16_t, ushort) \
++DECL_TEST_##op(int32_t, int) \
++DECL_TEST_##op(uint32_t, uint)
 +
-+DECL_TEST_REM(int8_t, char);
-+DECL_TEST_REM(uint8_t, uchar);
-+DECL_TEST_REM(int16_t, short);
-+DECL_TEST_REM(uint16_t, ushort);
-+DECL_TEST_REM(int32_t, int);
-+DECL_TEST_REM(uint32_t, uint);
++DECL_TEST_FOR_ALL_TYPE(SUB)
++DECL_TEST_FOR_ALL_TYPE(ADD)
++DECL_TEST_FOR_ALL_TYPE(MUL)
++DECL_TEST_FOR_ALL_TYPE(DIV)
++DECL_TEST_FOR_ALL_TYPE(REM)
++#undef DECL_TEST_FOR_ALL_TYPE
 diff --git a/utests/compiler_sub_bytes.cpp b/utests/compiler_sub_bytes.cpp
 deleted file mode 100644
 index 740a8fd..0000000
diff --git a/debian/patches/series b/debian/patches/series
index 6902ed0..90fcaf8 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -4,9 +4,12 @@ khronos
 deprecated-in-utest
 private
 0001-Add-vector-argument-test-case.patch
-0003-GBE-Add-more-support-of-char-and-short-arithmetic.patch
-0004-utests-Add-basic-arithmetic-test-case.patch
-0005-Add-the-builtin-function-abs-and-the-according-test-.patch
-0006-PATCH-Refine-the-get_local_id-.-builtins.patch
-0007-Add-the-support-of-the-API-clGetCommandQueueInfo.patch
-0008-Add-the-test-case-for-clGetCommandQueueInfo-API.patch
+0003-Add-the-builtin-function-abs-and-the-according-test-.patch
+0004-PATCH-Refine-the-get_local_id-.-builtins.patch
+0005-Add-atomic-help-functions.-Support-global-and-local-.patch
+0006-Add-all-atomic-built-in-functions-support.patch
+0007-Add-atomic-test-case.patch
+0008-support-built-in-function-rotate.patch
+0009-test-case-for-function-rotate.patch
+0010-GBE-Add-more-support-of-char-and-short-arithmetic.patch
+0011-utests-Add-basic-arithmetic-test-case.patch

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list