[Pkg-opencl-devel] [beignet] 40/66: Imported Upstream version 0.1+git20130703+84f63e8

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:06 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit b251309e3580d91a76e8c2484c4eef231eb60861
Author: Simon Richter <sjr at debian.org>
Date:   Wed Jul 3 09:52:32 2013 +0200

    Imported Upstream version 0.1+git20130703+84f63e8
---
 backend/src/backend/gen/gen_mesa_disasm.c          |   1 +
 backend/src/backend/gen_context.cpp                |  79 ++++++
 backend/src/backend/gen_context.hpp                |   1 +
 backend/src/backend/gen_defs.hpp                   |  50 +++-
 backend/src/backend/gen_encoder.cpp                |  44 ++++
 backend/src/backend/gen_encoder.hpp                |   3 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +-
 backend/src/backend/gen_insn_selection.cpp         | 160 +++++++++++--
 backend/src/backend/gen_insn_selection.hpp         |   4 +-
 backend/src/backend/gen_insn_selection.hxx         |   4 +
 backend/src/backend/program.cpp                    |   2 +-
 backend/src/ir/instruction.cpp                     |  80 ++++++-
 backend/src/ir/instruction.hpp                     |  42 +++-
 backend/src/ir/instruction.hxx                     |   4 +
 backend/src/llvm/llvm_gen_backend.cpp              | 131 +++++++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |  35 ++-
 backend/src/ocl_stdlib.h                           | 266 ++++++++++++++++++++-
 kernels/compiler_abs.cl                            |  27 +++
 kernels/compiler_atomic_functions.cl               |  55 ++++-
 kernels/compiler_basic_arithmetic.cl               |  53 ++++
 kernels/compiler_hadd.cl                           |   4 +
 kernels/compiler_mad_hi.cl                         |   4 +
 kernels/compiler_mul_hi.cl                         |   4 +
 kernels/compiler_rhadd.cl                          |   4 +
 kernels/compiler_rotate.cl                         |   5 +
 kernels/compiler_sub_bytes.cl                      |   7 -
 kernels/compiler_sub_shorts.cl                     |   7 -
 kernels/compiler_vect_compare.cl                   |   7 +
 src/cl_api.c                                       |  81 ++++---
 src/cl_context.c                                   |  14 +-
 src/cl_context.h                                   |   2 +
 src/cl_extensions.c                                |   2 -
 utests/CMakeLists.txt                              |  17 +-
 utests/compiler_abs.cpp                            | 219 +++++++++++++++++
 utests/compiler_atomic_functions.cpp               |  92 ++++++-
 utests/compiler_basic_arithmetic.cpp               | 112 +++++++++
 utests/compiler_hadd.cpp                           |  40 ++++
 utests/compiler_mad_hi.cpp                         |  46 ++++
 utests/compiler_mul_hi.cpp                         |  40 ++++
 utests/compiler_rhadd.cpp                          |  41 ++++
 utests/compiler_rotate.cpp                         |  40 ++++
 utests/compiler_sub_bytes.cpp                      |  35 ---
 utests/compiler_sub_shorts.cpp                     |  36 ---
 utests/compiler_vect_compare.cpp                   |  44 ++++
 utests/get_cl_info.cpp                             | 132 +++++++++-
 45 files changed, 1881 insertions(+), 197 deletions(-)

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 9a4e283..f65cc30 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -82,6 +82,7 @@ static const struct {
 
   [GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 93d3932..acd9c1d 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -179,7 +179,77 @@ namespace gbe
     const GenRegister src1 = ra->genReg(insn.src(1));
     const GenRegister src2 = ra->genReg(insn.src(2));
     switch (insn.opcode) {
+      case SEL_OP_MUL_HI:
+       {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = 0;
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+        p->curr.accWrEnable = 1;
+        p->MACH(src2, src0, src1);
+        p->curr.accWrEnable = 0;
+        p->pop();
+        p->MOV(dst, src2);
+        if (w == 16) {
+          p->push();
+          p->curr.predicate = GEN_PREDICATE_NONE;
+          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
+          p->curr.accWrEnable = 1;
+          p->MACH(src2, GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
+          p->curr.accWrEnable = 0;
+          p->pop();
+          p->curr.quarterControl = 1;
+          p->MOV(GenRegister::Qn(dst, 1), src2);
+        }
+        p->pop();
+        break;
+       }
       case SEL_OP_MAD:  p->MAD(dst, src0, src1, src2); break;
+      case SEL_OP_HADD:
+       {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = 0;
+        p->ADDC(dst, src0, src1);
+        p->SHR(dst, dst, GenRegister::immud(1));
+        p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+        p->OR(dst, dst, src2);
+        if (w == 16) {
+          p->curr.quarterControl = 1;
+          p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
+          p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
+          p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
+        }
+        p->pop();
+        break;
+       }
+      case SEL_OP_RHADD:
+       {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = 0;
+        p->ADDC(dst, src0, src1);
+        p->ADD(dst, dst, GenRegister::immud(1));
+        p->SHR(dst, dst, GenRegister::immud(1));
+        p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+        p->OR(dst, dst, src2);
+        if (w == 16) {
+          p->curr.quarterControl = 1;
+          p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
+          p->ADD(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
+          p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
+          p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
+        }
+        p->pop();
+        break;
+       }
       default: NOT_IMPLEMENTED;
     }
   }
@@ -226,6 +296,15 @@ namespace gbe
     }
   }
 
+  void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const uint32_t function = insn.extra.function;
+    const uint32_t bti = insn.extra.elem;
+
+    p->ATOMIC(dst, function, src, bti, insn.srcNum);
+  }
+
   void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
     GenRegister src = ra->genReg(insn.src(0));
     if(isScalarReg(src.reg()))
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 804384d..5dfaef9 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -91,6 +91,7 @@ namespace gbe
     void emitWriteFloat64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
+    void emitAtomicInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
     void emitByteScatterInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index d1ce6b2..5a9bb2d 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -21,7 +21,7 @@
  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
  Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
  develop this 3D driver.
- 
+
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
@@ -29,11 +29,11 @@
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:
- 
+
  The above copyright notice and this permission notice (including the
  next paragraph) shall be included in all copies or substantial
  portions of the Software.
- 
+
  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -41,7 +41,7 @@
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
+
  **********************************************************************/
  /*
   * Authors:
@@ -156,6 +156,7 @@ enum opcode {
   GEN_OPCODE_LZD = 74,
   GEN_OPCODE_FBH = 75,
   GEN_OPCODE_FBL = 76,
+  GEN_OPCODE_ADDC = 78,
   GEN_OPCODE_SAD2 = 80,
   GEN_OPCODE_SADA2 = 81,
   GEN_OPCODE_DP4 = 84,
@@ -169,6 +170,28 @@ enum opcode {
   GEN_OPCODE_NOP = 126,
 };
 
+#define GEN_ATOMIC_SIMD16   0
+#define GEN_ATOMIC_SIMD8    1
+
+enum GenAtomicOpCode {
+  GEN_ATOMIC_OP_CMPWR8B   = 0,
+  GEN_ATOMIC_OP_AND       = 1,
+  GEN_ATOMIC_OP_OR        = 2,
+  GEN_ATOMIC_OP_XOR       = 3,
+  GEN_ATOMIC_OP_MOV       = 4,
+  GEN_ATOMIC_OP_INC       = 5,
+  GEN_ATOMIC_OP_DEC       = 6,
+  GEN_ATOMIC_OP_ADD       = 7,
+  GEN_ATOMIC_OP_SUB       = 8,
+  GEN_ATOMIC_OP_REVSUB    = 9,
+  GEN_ATOMIC_OP_IMAX      = 10,
+  GEN_ATOMIC_OP_IMIN      = 11,
+  GEN_ATOMIC_OP_UMAX      = 12,
+  GEN_ATOMIC_OP_UMIN      = 13,
+  GEN_ATOMIC_OP_CMPWR     = 14,
+  GEN_ATOMIC_OP_PREDEC    = 15
+};
+
 /*! Gen SFID */
 enum GenMessageTarget {
   GEN_SFID_NULL                     = 0,
@@ -772,7 +795,7 @@ struct GenInstruction
     /*! Memory fence */
     struct {
       uint32_t bti:8;
-      uint32_t ingored:5;
+      uint32_t pad:5;
       uint32_t commit_enable:1;
       uint32_t msg_type:4;
       uint32_t pad2:1;
@@ -783,6 +806,21 @@ struct GenInstruction
       uint32_t end_of_thread:1;
     } gen7_memory_fence;
 
+    /*! atomic messages */
+    struct {
+      uint32_t bti:8;
+      uint32_t aop_type:4;
+      uint32_t simd_mode:1;
+      uint32_t return_data:1;
+      uint32_t msg_type:4;
+      uint32_t category:1;
+      uint32_t header_present:1;
+      uint32_t response_length:5;
+      uint32_t msg_length:4;
+      uint32_t pad3:2;
+      uint32_t end_of_thread:1;
+    } gen7_atomic_op;
+
     struct {
       uint32_t src1_subreg_nr_high:1;
       uint32_t src1_reg_nr:8;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index e96678b..25303b4 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -558,6 +558,41 @@ namespace gbe
                            response_length);
   }
 
+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+    GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+
+    if (this->curr.execWidth == 8) {
+      msg_length = srcNum;
+      response_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2*srcNum;
+      response_length = 2;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    setMessageDescriptor(this, insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_atomic_op.msg_type = GEN_UNTYPED_ATOMIC_READ;
+    insn->bits3.gen7_atomic_op.bti = bti;
+    insn->bits3.gen7_atomic_op.return_data = 1;
+    insn->bits3.gen7_atomic_op.aop_type = function;
+
+    if (this->curr.execWidth == 8)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+    else if (this->curr.execWidth == 16)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+    else
+      NOT_SUPPORTED;
+
+  }
+
   GenInstruction *GenEncoder::next(uint32_t opcode) {
      GenInstruction insn;
      std::memset(&insn, 0, sizeof(GenInstruction));
@@ -844,6 +879,13 @@ namespace gbe
   ALU2(MACH)
   ALU3(MAD)
 
+  void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
+    push();
+    curr.accWrEnable = 1;
+    alu2(this, GEN_OPCODE_ADDC, dest, src0, src1);
+    pop();
+  }
+
   void GenEncoder::ADD(GenRegister dest, GenRegister src0, GenRegister src1) {
      if (src0.type == GEN_TYPE_F ||
          (src0.file == GEN_IMMEDIATE_VALUE &&
@@ -1013,6 +1055,7 @@ namespace gbe
      if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) {
         insn->header.execution_size = GEN_WIDTH_8;
+        insn->header.quarter_control = GEN_COMPRESSION_Q1;
 
         if(this->curr.execWidth == 16) {
           GenInstruction *insn2 = this->next(GEN_OPCODE_MATH);
@@ -1023,6 +1066,7 @@ namespace gbe
           insn2->header.destreg_or_condmod = function;
           this->setHeader(insn2);
           insn2->header.execution_size = GEN_WIDTH_8;
+          insn2->header.quarter_control = GEN_COMPRESSION_Q2;
           this->setDst(insn2, new_dest);
           this->setSrc0(insn2, new_src0);
           this->setSrc1(insn2, new_src1);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 88a3e77..a7cbc89 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -107,6 +107,7 @@ namespace gbe
     ALU2(RSL)
     ALU2(ASR)
     ALU2(ADD)
+    ALU2(ADDC)
     ALU2(MUL)
     ALU1(FRC)
     ALU2(MAC)
@@ -136,6 +137,8 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Atomic instructions */
+    void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
     /*! Read 64-bits float arrays */
     void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Write 64-bits float arrays */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index a3b4621..f3f4a25 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -21,4 +21,4 @@ DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
-
+DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 8fb2a80..bbe392d 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -168,12 +168,14 @@ namespace gbe
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
            this->opcode == SEL_OP_READ_FLOAT64 ||
+           this->opcode == SEL_OP_ATOMIC       ||
            this->opcode == SEL_OP_BYTE_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
            this->opcode == SEL_OP_WRITE_FLOAT64 ||
+           this->opcode == SEL_OP_ATOMIC        ||
            this->opcode == SEL_OP_BYTE_SCATTER;
   }
 
@@ -431,8 +433,11 @@ namespace gbe
     ALU2(MACH)
     ALU1(LZD)
     ALU3(MAD)
+    ALU3(MUL_HI)
     ALU1(FBH)
     ALU1(FBL)
+    ALU3(HADD)
+    ALU3(RHADD)
 #undef ALU1
 #undef ALU2
 #undef ALU3
@@ -456,6 +461,8 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Atomic instruction */
+    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
     /*! Read 64 bits float array */
     void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Write 64 bits float array */
@@ -730,6 +737,23 @@ namespace gbe
     insn->src(0) = src;
   }
 
+  void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
+                                     uint32_t srcNum, Reg src0,
+                                     Reg src1, Reg src2, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    if(srcNum > 1) insn->src(1) = src1;
+    if(srcNum > 2) insn->src(2) = src2;
+    insn->extra.function = function;
+    insn->extra.elem     = bti;
+    SelectionVector *vector = this->appendVector();
+
+    vector->regNum = srcNum;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
   void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
@@ -1224,7 +1248,16 @@ namespace gbe
       const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode));
       const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode));
       switch (opcode) {
-        case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
+        case ir::OP_ABS:
+          if (insn.getType() == ir::TYPE_S32) {
+            const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
+            const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
+            sel.MOV(dst_, GenRegister::abs(src_));
+          } else {
+            GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
+            sel.MOV(dst, GenRegister::abs(src));
+          }
+          break;
         case ir::OP_MOV:
           if (dst.isdf()) {
             ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
@@ -1263,6 +1296,54 @@ namespace gbe
           this->opcodes.push_back(ir::Opcode(op));
     }
 
+    bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
+    {
+      using namespace ir;
+      const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
+      const Type type = insn.getType();
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+      GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+      GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+      const uint32_t simdWidth = sel.curr.execWidth;
+      const RegisterFamily family = getFamily(type);
+      uint32_t function = (op == OP_DIV)?
+                          GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
+                          GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
+
+      //bytes and shorts must be converted to int for DIV and REM per GEN restriction
+      if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
+        GenRegister tmp0, tmp1;
+        ir::Register reg = sel.reg(FAMILY_DWORD);
+
+        tmp0 = GenRegister::udxgrf(simdWidth, reg);
+        tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
+        sel.MOV(tmp0, src0);
+
+        tmp1 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
+        sel.MOV(tmp1, src1);
+
+        sel.MATH(tmp0, function, tmp0, tmp1);
+        GenRegister unpacked;
+        if(family == FAMILY_WORD) {
+          unpacked = GenRegister::unpacked_uw(reg);
+        } else {
+          unpacked = GenRegister::unpacked_ub(reg);
+        }
+        unpacked = GenRegister::retype(unpacked, getGenType(type));
+        sel.MOV(dst, unpacked);
+      } else if (type == TYPE_S32 || type == TYPE_U32 ) {
+        sel.MATH(dst, function, src0, src1);
+      } else if(type == TYPE_FLOAT) {
+        GBE_ASSERT(op != OP_REM);
+        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
+      } else {
+        NOT_IMPLEMENTED;
+      }
+      markAllChildren(dag);
+      return true;
+    }
+
     INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
     {
       using namespace ir;
@@ -1271,29 +1352,20 @@ namespace gbe
       const Type type = insn.getType();
       GenRegister dst  = sel.selReg(insn.getDst(0), type);
 
-      // Immediates not supported
-      if (opcode == OP_DIV || opcode == OP_POW) {
-        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-        uint32_t function;
-        if (type == TYPE_S32 || type == TYPE_U32)
-          function = GEN_MATH_FUNCTION_INT_DIV_QUOTIENT;
-        else
-          function = opcode == OP_DIV ?
-                     GEN_MATH_FUNCTION_FDIV :
-                     GEN_MATH_FUNCTION_POW;
-        sel.MATH(dst, function, src0, src1);
-        markAllChildren(dag);
-        return true;
+      if(opcode == OP_DIV || opcode == OP_REM) {
+        return this->emitDivRemInst(sel, dag, opcode);
       }
-      if (opcode == OP_REM) {
+      // Immediates not supported
+      if (opcode == OP_POW) {
         GenRegister src0 = sel.selReg(insn.getSrc(0), type);
         GenRegister src1 = sel.selReg(insn.getSrc(1), type);
-        if (type == TYPE_U32 || type == TYPE_S32) {
-          sel.MATH(dst, GEN_MATH_FUNCTION_INT_DIV_REMAINDER, src0, src1);
-          markAllChildren(dag);
-        } else
+
+        if(type == TYPE_FLOAT) {
+          sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
+        } else {
           NOT_IMPLEMENTED;
+        }
+        markAllChildren(dag);
         return true;
       }
 
@@ -1354,16 +1426,31 @@ namespace gbe
         case OP_SHL: sel.SHL(dst, src0, src1); break;
         case OP_SHR: sel.SHR(dst, src0, src1); break;
         case OP_ASR: sel.ASR(dst, src0, src1); break;
+        case OP_MUL_HI: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+            sel.MUL_HI(dst, src0, src1, temp);
+            break;
+          }
         case OP_MUL:
-          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
-            sel.MUL(dst, src0, src1);
-          else if (type == TYPE_U32 || type == TYPE_S32) {
+          if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
             return false;
           }
-          else
-            NOT_IMPLEMENTED;
+          else {
+            GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
+            sel.MUL(dst, src0, src1);
+          }
         break;
+        case OP_HADD: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+            sel.HADD(dst, src0, src1, temp);
+            break;
+          }
+        case OP_RHADD: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+            sel.RHADD(dst, src0, src1, temp);
+            break;
+          }
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -2025,6 +2112,28 @@ namespace gbe
     DECL_CTOR(ConvertInstruction, 1, 1);
   };
 
+  /*! Convert instruction pattern */
+  DECL_PATTERN(AtomicInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn) const
+    {
+      using namespace ir;
+      const AtomicOps atomicOp = insn.getAtomicOpcode();
+      const AddressSpace space = insn.getAddressSpace();
+      const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
+      const uint32_t srcNum = insn.getSrcNum();
+      const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
+      GenRegister src1 = src0, src2 = src0;
+      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+      sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
+      return true;
+    }
+    DECL_CTOR(AtomicInstruction, 1, 1);
+  };
+
   /*! Select instruction pattern */
   class SelectInstructionPattern : public SelectionPattern
   {
@@ -2371,6 +2480,7 @@ namespace gbe
     this->insert<SelectInstructionPattern>();
     this->insert<CompareInstructionPattern>();
     this->insert<ConvertInstructionPattern>();
+    this->insert<AtomicInstructionPattern>();
     this->insert<LabelInstructionPattern>();
     this->insert<BranchInstructionPattern>();
     this->insert<Int32x32MulInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 778eb1f..5ae6e42 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -92,9 +92,9 @@ namespace gbe
     GenInstructionState state;
     union {
       struct {
-        /*! Store bti for loads/stores and function for math and compares */
+        /*! Store bti for loads/stores and function for math, atomic and compares */
         uint16_t function:8;
-        /*! elemSize for byte scatters / gathers, elemNum for untyped msg */
+        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
         uint16_t elem:8;
       };
       struct {
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index cc2be08..c85d328 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -20,6 +20,7 @@ DECL_SELECTION_IR(RSL, BinaryInstruction)
 DECL_SELECTION_IR(ASR, BinaryInstruction)
 DECL_SELECTION_IR(ADD, BinaryInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
 DECL_SELECTION_IR(CMP, CompareInstruction)
 DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
@@ -41,5 +42,8 @@ DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_SELECTION_IR(MUL_HI, TernaryInstruction)
 DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
+DECL_SELECTION_IR(HADD, TernaryInstruction)
+DECL_SELECTION_IR(RHADD, TernaryInstruction)
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index e41e5b6..2a4feb9 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -141,12 +141,12 @@ namespace gbe {
     while (end != std::string::npos) {
       end = options.find(' ', start);
       std::string str = options.substr(start, end - start);
+      start = end + 1;
       if(str.size() == 0)
         continue;
       if(str == "-cl-opt-disable") bOpt = false;
       useless.push_back(str);
       args.push_back(str.c_str());
-      start = end + 1;
     }
 
     args.push_back("-emit-llvm");
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 67a4c12..21b82ce 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -56,7 +56,7 @@ namespace ir {
     };
 
     /*! For regular n source instructions */
-    template <typename T, uint32_t srcNum> 
+    template <typename T, uint32_t srcNum>
     struct NSrcPolicy {
       INLINE uint32_t getSrcNum(void) const { return srcNum; }
       INLINE Register getSrc(const Function &fn, uint32_t ID) const {
@@ -246,6 +246,40 @@ namespace ir {
       Type srcType; //!< Type to convert from
     };
 
+    class ALIGNED_INSTRUCTION AtomicInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<AtomicInstruction>,
+      public NDstPolicy<AtomicInstruction, 1>
+    {
+    public:
+      AtomicInstruction(AtomicOps atomicOp,
+                         Register dst,
+                         AddressSpace addrSpace,
+                         Tuple src)
+      {
+        this->opcode = OP_ATOMIC;
+        this->atomicOp = atomicOp;
+        this->dst[0] = dst;
+        this->src = src;
+        this->addrSpace = addrSpace;
+        srcNum = 2;
+        if((atomicOp == ATOMIC_OP_INC) ||
+          (atomicOp == ATOMIC_OP_DEC))
+          srcNum = 1;
+        if(atomicOp == ATOMIC_OP_CMPXCHG)
+          srcNum = 3;
+      }
+      INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+      INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Register dst[1];
+      Tuple src;
+      AddressSpace addrSpace; //!< Address space
+      uint8_t srcNum:2;     //!<Source Number
+      AtomicOps atomicOp:6;     //!<Source Number
+    };
+
     class ALIGNED_INSTRUCTION BranchInstruction :
       public BasePolicy,
       public NDstPolicy<BranchInstruction, 0>
@@ -738,6 +772,20 @@ namespace ir {
       return true;
     }
 
+    // We can convert anything to anything, but types and families must match
+    INLINE bool AtomicInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+        return false;
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+          return false;
+
+      return true;
+    }
+
     /*! Loads and stores follow the same restrictions */
     template <typename T>
     INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
@@ -883,6 +931,15 @@ namespace ir {
       ternaryOrSelectOut(*this, out, fn);
     }
 
+    INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << addrSpace;
+      out << " %" << this->getDst(fn, 0);
+      out << " {" << "%" << this->getSrc(fn, 0) << "}";
+      for (uint32_t i = 1; i < srcNum; ++i)
+        out << " %" << this->getSrc(fn, i);
+    }
+
     INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << "." << this->getDstType()
@@ -1009,6 +1066,10 @@ START_INTROSPECTION(ConvertInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(ConvertInstruction)
 
+START_INTROSPECTION(AtomicInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(AtomicInstruction)
+
 START_INTROSPECTION(SelectInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(SelectInstruction)
@@ -1180,9 +1241,10 @@ END_FUNCTION(Instruction, Register)
   }
 
   bool Instruction::hasSideEffect(void) const {
-    return opcode == OP_STORE || 
+    return opcode == OP_STORE ||
            opcode == OP_TYPED_WRITE ||
-           opcode == OP_SYNC;
+           opcode == OP_SYNC ||
+           opcode == OP_ATOMIC;
   }
 
 #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -1197,6 +1259,8 @@ DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
 DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
 DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
 DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
@@ -1265,6 +1329,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(ADDSAT)
   DECL_EMIT_FUNCTION(SUB)
   DECL_EMIT_FUNCTION(SUBSAT)
+  DECL_EMIT_FUNCTION(MUL_HI)
   DECL_EMIT_FUNCTION(DIV)
   DECL_EMIT_FUNCTION(REM)
   DECL_EMIT_FUNCTION(SHL)
@@ -1275,6 +1340,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(OR)
   DECL_EMIT_FUNCTION(XOR)
   DECL_EMIT_FUNCTION(AND)
+  DECL_EMIT_FUNCTION(HADD)
+  DECL_EMIT_FUNCTION(RHADD)
 
 #undef DECL_EMIT_FUNCTION
 
@@ -1304,6 +1371,11 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::ConvertInstruction(dstType, srcType, dst, src).convert();
   }
 
+  // For all unary functions with given opcode
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, src).convert();
+  }
+
   // BRA
   Instruction BRA(LabelIndex labelIndex) {
     return internal::BranchInstruction(OP_BRA, labelIndex).convert();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 0f3bd34..fc1c984 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -53,6 +53,23 @@ namespace ir {
     MEM_INVALID
   };
 
+  enum AtomicOps {
+    ATOMIC_OP_AND       = 1,
+    ATOMIC_OP_OR        = 2,
+    ATOMIC_OP_XOR       = 3,
+    ATOMIC_OP_XCHG      = 4,
+    ATOMIC_OP_INC       = 5,
+    ATOMIC_OP_DEC       = 6,
+    ATOMIC_OP_ADD       = 7,
+    ATOMIC_OP_SUB       = 8,
+    ATOMIC_OP_IMAX      = 10,
+    ATOMIC_OP_IMIN      = 11,
+    ATOMIC_OP_UMAX      = 12,
+    ATOMIC_OP_UMIN      = 13,
+    ATOMIC_OP_CMPXCHG   = 14,
+    ATOMIC_OP_INVALID
+  };
+
   /* Vote function per hardware thread */
   enum VotePredicate : uint8_t {
     VOTE_ALL = 0,
@@ -228,6 +245,21 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Atomic instruction */
+  class AtomicInstruction : public Instruction {
+  public:
+    /*! Where the address register goes */
+    static const uint32_t addressIndex = 0;
+    /*! Address space that is manipulated here */
+    AddressSpace getAddressSpace(void) const;
+    /*! Return the atomic function code */
+    AtomicOps getAtomicOpcode(void) const;
+    /*! Return the register that contains the addresses */
+    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Store instruction. First source is the address. Next sources are the
    *  values to store contiguously at the given address
    */
@@ -485,10 +517,16 @@ namespace ir {
   Instruction COS(Type type, Register dst, Register src);
   /*! sin.type dst src */
   Instruction SIN(Type type, Register dst, Register src);
+  /*! mul_hi.type dst src */
+  Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
   /*! fbh.type dst src */
   Instruction FBH(Type type, Register dst, Register src);
   /*! fbl.type dst src */
   Instruction FBL(Type type, Register dst, Register src);
+  /*! hadd.type dst src */
+  Instruction HADD(Type type, Register dst, Register src0, Register src1);
+  /*! rhadd.type dst src */
+  Instruction RHADD(Type type, Register dst, Register src0, Register src1);
   /*! tan.type dst src */
   Instruction RCP(Type type, Register dst, Register src);
   /*! abs.type dst src */
@@ -555,6 +593,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! cvt.{dstType <- srcType} dst src */
   Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
+  /*! atomic dst addr.space {src1 {src2}} */
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
   /*! bra labelIndex */
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index acfb45a..0e1c575 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -61,6 +61,7 @@ DECL_INSN(LT, CompareInstruction)
 DECL_INSN(GE, CompareInstruction)
 DECL_INSN(GT, CompareInstruction)
 DECL_INSN(CVT, ConvertInstruction)
+DECL_INSN(ATOMIC, AtomicInstruction)
 DECL_INSN(BRA, BranchInstruction)
 DECL_INSN(RET, BranchInstruction)
 DECL_INSN(LOADI, LoadImmInstruction)
@@ -71,5 +72,8 @@ DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
 DECL_INSN(FBL, UnaryInstruction)
+DECL_INSN(HADD, BinaryInstruction)
+DECL_INSN(RHADD, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 08500ba..8385e21 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -534,6 +534,8 @@ namespace gbe
 
     // Emit unary instructions from gen native function
     void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
+    // Emit unary instructions from gen native function
+    void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
 
     // These instructions are not supported at all
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
@@ -693,10 +695,12 @@ namespace gbe
           return doIt(uint64_t(0));
         }
       }
+
       // NULL pointers
       if(isa<ConstantPointerNull>(CPV)) {
         return doIt(uint32_t(0));
       }
+
       // Floats and doubles
       const Type::TypeID typeID = CPV->getType()->getTypeID();
       switch (typeID) {
@@ -1276,10 +1280,10 @@ namespace gbe
       case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
       case Instruction::Mul:
       case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
-      case Instruction::URem:
+      case Instruction::URem: ctx.REM(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
       case Instruction::SRem:
       case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
-      case Instruction::UDiv:
+      case Instruction::UDiv: ctx.DIV(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
       case Instruction::SDiv:
       case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
       case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
@@ -1480,7 +1484,12 @@ namespace gbe
         if (srcType == ir::TYPE_BOOL) {
           const ir::RegisterFamily family = getFamily(dstType);
           const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
-          const ir::ImmediateIndex one = ctx.newIntegerImmediate(1, dstType);
+          ir::ImmediateIndex one;
+          if (I.getOpcode() == Instruction::SExt
+              && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32))
+            one = ctx.newIntegerImmediate(-1, dstType);
+          else
+            one = ctx.newIntegerImmediate(1, dstType);
           const ir::Register zeroReg = ctx.reg(family);
           const ir::Register oneReg = ctx.reg(family);
           ctx.LOADI(dstType, zeroReg, zero);
@@ -1688,6 +1697,7 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
+      case GEN_OCL_FABS:
       case GEN_OCL_RNDZ:
       case GEN_OCL_RNDE:
       case GEN_OCL_RNDU:
@@ -1697,6 +1707,32 @@ namespace gbe
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
       case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
       case GEN_OCL_GET_IMAGE_DEPTH:
+      case GEN_OCL_ATOMIC_ADD0:
+      case GEN_OCL_ATOMIC_ADD1:
+      case GEN_OCL_ATOMIC_SUB0:
+      case GEN_OCL_ATOMIC_SUB1:
+      case GEN_OCL_ATOMIC_AND0:
+      case GEN_OCL_ATOMIC_AND1:
+      case GEN_OCL_ATOMIC_OR0:
+      case GEN_OCL_ATOMIC_OR1:
+      case GEN_OCL_ATOMIC_XOR0:
+      case GEN_OCL_ATOMIC_XOR1:
+      case GEN_OCL_ATOMIC_XCHG0:
+      case GEN_OCL_ATOMIC_XCHG1:
+      case GEN_OCL_ATOMIC_UMAX0:
+      case GEN_OCL_ATOMIC_UMAX1:
+      case GEN_OCL_ATOMIC_UMIN0:
+      case GEN_OCL_ATOMIC_UMIN1:
+      case GEN_OCL_ATOMIC_IMAX0:
+      case GEN_OCL_ATOMIC_IMAX1:
+      case GEN_OCL_ATOMIC_IMIN0:
+      case GEN_OCL_ATOMIC_IMIN1:
+      case GEN_OCL_ATOMIC_INC0:
+      case GEN_OCL_ATOMIC_INC1:
+      case GEN_OCL_ATOMIC_DEC0:
+      case GEN_OCL_ATOMIC_DEC1:
+      case GEN_OCL_ATOMIC_CMPXCHG0:
+      case GEN_OCL_ATOMIC_CMPXCHG1:
         // No structure can be returned
         this->newRegister(&I);
         break;
@@ -1739,6 +1775,8 @@ namespace gbe
         this->newRegister(&I);
         break;
       }
+      case GEN_OCL_MUL_HI_INT:
+      case GEN_OCL_MUL_HI_UINT:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -1755,6 +1793,8 @@ namespace gbe
       case GEN_OCL_USUB_SAT_SHORT:
       case GEN_OCL_USUB_SAT_INT:
       case GEN_OCL_USUB_SAT_LONG:
+      case GEN_OCL_HADD:
+      case GEN_OCL_RHADD:
         this->newRegister(&I);
         break;
       default:
@@ -1781,6 +1821,26 @@ namespace gbe
     ctx.ALU1(opcode, ir::TYPE_FLOAT, dst, src);
   }
 
+  void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
+    CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+    CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
+    GBE_ASSERT(AI != AE);
+    unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
+    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+    const ir::Register dst = this->getRegister(&I);
+
+    vector<ir::Register> src;
+    uint32_t srcNum = 0;
+    while(AI != AE) {
+      src.push_back(this->getRegister(*(AI++)));
+      srcNum++;
+    }
+    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
+    ctx.ATOMIC(opcode, dst, addrSpace, srcTuple);
+  }
+
   void GenWriter::emitCallInst(CallInst &I) {
     if (Function *F = I.getCalledFunction()) {
       if (F->getIntrinsicID() != 0) {
@@ -1846,13 +1906,20 @@ namespace gbe
           }
           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+          case GEN_OCL_ABS:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
+            break;
+          }
           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
           case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_ABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
+          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
           case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
           case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
           case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
@@ -1862,6 +1929,32 @@ namespace gbe
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
           case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+          case GEN_OCL_ATOMIC_ADD0:
+          case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
+          case GEN_OCL_ATOMIC_SUB0:
+          case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_SUB); break;
+          case GEN_OCL_ATOMIC_AND0:
+          case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_AND); break;
+          case GEN_OCL_ATOMIC_OR0:
+          case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_OR); break;
+          case GEN_OCL_ATOMIC_XOR0:
+          case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XOR); break;
+          case GEN_OCL_ATOMIC_XCHG0:
+          case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XCHG); break;
+          case GEN_OCL_ATOMIC_INC0:
+          case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_INC); break;
+          case GEN_OCL_ATOMIC_DEC0:
+          case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_DEC); break;
+          case GEN_OCL_ATOMIC_UMIN0:
+          case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMIN); break;
+          case GEN_OCL_ATOMIC_UMAX0:
+          case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMAX); break;
+          case GEN_OCL_ATOMIC_IMIN0:
+          case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMIN); break;
+          case GEN_OCL_ATOMIC_IMAX0:
+          case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMAX); break;
+          case GEN_OCL_ATOMIC_CMPXCHG0:
+          case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_CMPXCHG); break;
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
           case GEN_OCL_GET_IMAGE_DEPTH:
@@ -2049,6 +2142,22 @@ namespace gbe
             ctx.TYPED_WRITE(srcTuple, srcType, coordType);
             break;
           }
+          case GEN_OCL_MUL_HI_INT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_MUL_HI_UINT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_SADD_SAT_CHAR:
           case GEN_OCL_SADD_SAT_SHORT:
           case GEN_OCL_SADD_SAT_INT:
@@ -2093,6 +2202,20 @@ namespace gbe
             ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_HADD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_RHADD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index fe19844..f448a50 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -19,7 +19,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
 DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
-DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
+DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
 DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
 DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
 DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
@@ -78,6 +78,34 @@ DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH,  __gen_ocl_get_image_depth)
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE,  __gen_ocl_get_image_channel_data_type)
 DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER,  __gen_ocl_get_image_channel_order)
 
+// atomic related functions.
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD0, _Z20__gen_ocl_atomic_addPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD1, _Z20__gen_ocl_atomic_addPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB0, _Z20__gen_ocl_atomic_subPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB1, _Z20__gen_ocl_atomic_subPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND0, _Z20__gen_ocl_atomic_andPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND1, _Z20__gen_ocl_atomic_andPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR0,  _Z19__gen_ocl_atomic_orPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR1,  _Z19__gen_ocl_atomic_orPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR0, _Z20__gen_ocl_atomic_xorPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR1, _Z20__gen_ocl_atomic_xorPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN0, _Z21__gen_ocl_atomic_uminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN1, _Z21__gen_ocl_atomic_uminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX0, _Z21__gen_ocl_atomic_umaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX1, _Z21__gen_ocl_atomic_umaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN0, _Z21__gen_ocl_atomic_iminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN1, _Z21__gen_ocl_atomic_iminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX0, _Z21__gen_ocl_atomic_imaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX1, _Z21__gen_ocl_atomic_imaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG0, _Z21__gen_ocl_atomic_xchgPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG1, _Z21__gen_ocl_atomic_xchgPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC0, _Z20__gen_ocl_atomic_incPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC1, _Z20__gen_ocl_atomic_incPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC0, _Z20__gen_ocl_atomic_decPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC1, _Z20__gen_ocl_atomic_decPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG0, _Z24__gen_ocl_atomic_cmpxchgPU3AS1jjj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG1, _Z24__gen_ocl_atomic_cmpxchgPU3AS3jjj)
+
 // saturation related functions.
 DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
 DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
@@ -98,5 +126,10 @@ DECL_LLVM_GEN_FUNCTION(USUB_SAT_INT, _Z12ocl_usub_satjj)
 DECL_LLVM_GEN_FUNCTION(USUB_SAT_LONG, _Z12ocl_usub_satmm)
 
 // integer built-in functions
+DECL_LLVM_GEN_FUNCTION(MUL_HI_INT, _Z16__gen_ocl_mul_hiii)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UINT, _Z16__gen_ocl_mul_hijj)
 DECL_LLVM_GEN_FUNCTION(FBH, __gen_ocl_fbh)
 DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
+DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
+DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
+DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 3b191ab..04984d8 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -4355,6 +4355,164 @@ DEC(16)
 #undef DEC4
 #undef DEC8
 #undef DEC16
+
+OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
+OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
+#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+DEF(2)
+DEF(3)
+DEF(4)
+DEF(8)
+DEF(16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+#undef DEF
+#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.se, c.se),  [...]
+#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+DEF(2)
+DEF(3)
+DEF(4)
+DEF(8)
+DEF(16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
+INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
+INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
+INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
+INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
+INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
+DEF(char, 7)
+DEF(uchar, 7)
+DEF(short, 15)
+DEF(ushort, 15)
+DEF(int, 31)
+DEF(uint, 31)
+#undef DEF
+#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
+#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
+#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
+#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
+#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
+#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
+DEF(2)
+DEF(3)
+DEF(4)
+DEF(8)
+DEF(16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+PURE CONST uint __gen_ocl_hadd(uint x, uint y);
+PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
+#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
+#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
+DEC
+#undef DEF
+#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
+DEC
+#undef DEF
+#undef DEC
+INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y) >> 1) : __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
+INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (func(a.s0, b.s0), func(a.s1, b.s1)); }
+#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
+#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
+#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
+#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
+#define DEF(func, n) DEC##n(func, char); DEC##n(func, uchar); DEC##n(func, short); DEC##n(func, ushort); DEC##n(func, int); DEC##n(func, uint)
+DEF(hadd, 2)
+DEF(hadd, 3)
+DEF(hadd, 4)
+DEF(hadd, 8)
+DEF(hadd, 16)
+DEF(rhadd, 2)
+DEF(rhadd, 3)
+DEF(rhadd, 4)
+DEF(rhadd, 8)
+DEF(rhadd, 16)
+#undef DEF
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+int __gen_ocl_abs(int x);
+#define ABS_I(I, CVT)  (CVT)__gen_ocl_abs(x.s##I)
+#define ABS_VEC1(CVT)  (CVT)__gen_ocl_abs(x)
+#define ABS_VEC2(CVT)  ABS_I(0, CVT), ABS_I(1, CVT)
+#define ABS_VEC4(CVT)  ABS_VEC2(CVT), ABS_I(2, CVT), ABS_I(3, CVT)
+#define ABS_VEC8(CVT)  ABS_VEC4(CVT), ABS_I(4, CVT), ABS_I(5, CVT),\
+	               ABS_I(6, CVT), ABS_I(7, CVT)
+#define ABS_VEC16(CVT) ABS_VEC8(CVT), ABS_I(8, CVT), ABS_I(9, CVT), \
+	               ABS_I(A, CVT), ABS_I(B, CVT), ABS_I(C, CVT), \
+	               ABS_I(D, CVT), ABS_I(E, CVT), ABS_I(F, CVT)
+
+#define DEC_1(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return ABS_VEC1(u##TYPE); }
+#define DEC_N(TYPE, N) INLINE_OVERLOADABLE u##TYPE##N abs(TYPE##N x) { return (u##TYPE##N)(ABS_VEC##N(u##TYPE)); };
+#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
+
+DEC(int)
+DEC(short)
+DEC(char)
+#undef DEC_1
+#undef DEC_N
+/* For unsigned types, do nothing. */
+#define DEC_1(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
+#define DEC_N(TYPE, N) INLINE_OVERLOADABLE TYPE##N abs(TYPE##N x) { return x; }
+DEC(uint)
+DEC(ushort)
+DEC(uchar)
+#undef DEC
+#undef DEC_1
+#undef DEC_N
+#undef ABS_I
+#undef ABS_VEC1
+#undef ABS_VEC2
+#undef ABS_VEC4
+#undef ABS_VEC8
+#undef ABS_VEC16
+
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
@@ -4379,11 +4537,9 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
 INLINE unsigned NAME(unsigned int dim) {             \
   if (dim == 0) return __gen_ocl_##NAME##0();        \
-  else if (dim > 0 && dim < get_work_dim()) {        \
-    if (dim == 1) return __gen_ocl_##NAME##1();      \
-    else if (dim == 2) return __gen_ocl_##NAME##2(); \
-  }                                                  \
-  return OTHER_RET;                                  \
+  else if (dim == 1) return __gen_ocl_##NAME##1();   \
+  else if (dim == 2) return __gen_ocl_##NAME##2();   \
+  else return OTHER_RET;                             \
 }
 
 DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
@@ -5098,6 +5254,104 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
 }
 
 /////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+/////////////////////////////////////////////////////////////////////////////
 // Force the compilation to SIMD8 or SIMD16
 /////////////////////////////////////////////////////////////////////////////
 
diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
new file mode 100644
index 0000000..9e77c2b
--- /dev/null
+++ b/kernels/compiler_abs.cl
@@ -0,0 +1,27 @@
+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    kernel void compiler_abs_##TYPE ( \
+           global TYPE* src, global UTYPE* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = abs(src[i]);     \
+    }
+
+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
+    kernel void compiler_abs_##TYPE##N ( \
+           global TYPE##N* src, global UTYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = abs(src[i]);     \
+    }
+
+#define COMPILER_ABS(TYPE, UTYPE)  \
+    COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+
+COMPILER_ABS(int, uint)
+COMPILER_ABS(uint, uint)
+COMPILER_ABS(char, uchar)
+COMPILER_ABS(uchar, uchar)
+COMPILER_ABS(short, ushort)
+COMPILER_ABS(ushort, ushort)
diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
index 23f3e73..61ce2f4 100644
--- a/kernels/compiler_atomic_functions.cl
+++ b/kernels/compiler_atomic_functions.cl
@@ -1,14 +1,43 @@
-/* test OpenCL 1.1 Atomic Functions (section 6.11.1, 9.4) */
-__kernel void compiler_atomic_functions(global int *a, global int *b) {
-  atomic_add(a, *b);
-  atomic_sub(a, *b);
-  atomic_xchg(a, *b);
-  atomic_inc(a);
-  atomic_dec(a);
-  atomic_cmpxchg(a, b, 100);
-  atomic_min(a, *b);
-  atomic_max(a, *b);
-  atomic_and(a, *b);
-  atomic_or(a, *b);
-  atomic_xor(a, *b);
+__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
+  int lid = get_local_id(0);
+  int i = lid % 12;
+  atomic_xchg(&tmp[4], -1);
+  switch(i) {
+    case 0: atomic_inc(&tmp[i]); break;
+    case 1: atomic_dec(&tmp[i]); break;
+    case 2: atomic_add(&tmp[i], src[lid]); break;
+    case 3: atomic_sub(&tmp[i], src[lid]); break;
+    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
+    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
+    case 6: atomic_xor(&tmp[i], src[lid]); break;
+    case 7: atomic_min(&tmp[i], -src[lid]); break;
+    case 8: atomic_max(&tmp[i], src[lid]); break;
+    case 9: atomic_min((__local unsigned int *)&tmp[i], -src[lid]); break;
+    case 10: atomic_max((__local unsigned int *)&tmp[i], src[lid]); break;
+    case 11: atomic_cmpxchg(&(tmp[i]), 0, src[10]); break;
+    default:  break;
+  }
+
+  switch(i) {
+    case 0: atomic_inc(&dst[i]); break;
+    case 1: atomic_dec(&dst[i]); break;
+    case 2: atomic_add(&dst[i], src[lid]); break;
+    case 3: atomic_sub(&dst[i], src[lid]); break;
+    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
+    case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
+    case 6: atomic_xor(&dst[i], src[lid]); break;
+    case 7: atomic_min(&dst[i], -src[lid]); break;
+    case 8: atomic_max(&dst[i], src[lid]); break;
+    case 9: atomic_min((__global unsigned int *)&dst[i], -src[lid]); break;
+    case 10: atomic_max((__global unsigned int *)&dst[i], src[lid]); break;
+    case 11: atomic_cmpxchg(&dst[i], 0, src[10]); break;
+    default:  break;
+  }
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  if(get_global_id(0) == 0) {
+    for(i=0; i<12; i=i+1)
+      atomic_add(&dst[i], tmp[i]);
+  }
 }
diff --git a/kernels/compiler_basic_arithmetic.cl b/kernels/compiler_basic_arithmetic.cl
new file mode 100644
index 0000000..3e145d8
--- /dev/null
+++ b/kernels/compiler_basic_arithmetic.cl
@@ -0,0 +1,53 @@
+#define DECL_KERNEL_SUB(type)\
+__kernel void \
+compiler_sub_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] - src1[id]; \
+}
+
+#define DECL_KERNEL_ADD(type)\
+__kernel void \
+compiler_add_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] + src1[id]; \
+}
+
+#define DECL_KERNEL_MUL(type)\
+__kernel void \
+compiler_mul_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] * src1[id]; \
+}
+
+#define DECL_KERNEL_DIV(type)\
+__kernel void \
+compiler_div_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] / src1[id]; \
+}
+
+#define DECL_KERNEL_REM(type)\
+__kernel void \
+compiler_rem_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] % src1[id]; \
+}
+
+#define DECL_KERNEL_FOR_ALL_TYPE(op) \
+DECL_KERNEL_##op(char)               \
+DECL_KERNEL_##op(uchar)              \
+DECL_KERNEL_##op(short)              \
+DECL_KERNEL_##op(ushort)             \
+DECL_KERNEL_##op(int)                \
+DECL_KERNEL_##op(uint)
+
+DECL_KERNEL_FOR_ALL_TYPE(SUB)
+DECL_KERNEL_FOR_ALL_TYPE(ADD)
+DECL_KERNEL_FOR_ALL_TYPE(MUL)
+DECL_KERNEL_FOR_ALL_TYPE(DIV)
+DECL_KERNEL_FOR_ALL_TYPE(REM)
diff --git a/kernels/compiler_hadd.cl b/kernels/compiler_hadd.cl
new file mode 100644
index 0000000..fe50195
--- /dev/null
+++ b/kernels/compiler_hadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_hadd(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = hadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_mad_hi.cl b/kernels/compiler_mad_hi.cl
new file mode 100644
index 0000000..82b09c7
--- /dev/null
+++ b/kernels/compiler_mad_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mad_hi(global int *src1, global int *src2, global int *src3, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mad_hi(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_mul_hi.cl b/kernels/compiler_mul_hi.cl
new file mode 100644
index 0000000..28ce0a5
--- /dev/null
+++ b/kernels/compiler_mul_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mul_hi(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mul_hi(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_rhadd.cl b/kernels/compiler_rhadd.cl
new file mode 100644
index 0000000..4024ace
--- /dev/null
+++ b/kernels/compiler_rhadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_rhadd(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = rhadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_rotate.cl b/kernels/compiler_rotate.cl
new file mode 100644
index 0000000..8d0dd0f
--- /dev/null
+++ b/kernels/compiler_rotate.cl
@@ -0,0 +1,5 @@
+kernel void compiler_rotate(global int *src, global int *dst, global int *y) {
+  int i = get_global_id(0);
+  dst[i] = rotate(src[i], y[i]);
+}
+
diff --git a/kernels/compiler_sub_bytes.cl b/kernels/compiler_sub_bytes.cl
deleted file mode 100644
index f058561..0000000
--- a/kernels/compiler_sub_bytes.cl
+++ /dev/null
@@ -1,7 +0,0 @@
-__kernel void
-compiler_sub_bytes(__global char *src0, __global char *src1, __global char *dst)
-{
-  int id = (int)get_global_id(0);
-  dst[id] = src0[id] - src1[id];
-}
-
diff --git a/kernels/compiler_sub_shorts.cl b/kernels/compiler_sub_shorts.cl
deleted file mode 100644
index d26de7f..0000000
--- a/kernels/compiler_sub_shorts.cl
+++ /dev/null
@@ -1,7 +0,0 @@
-__kernel void
-compiler_sub_shorts(__global short *src0, __global short *src1, __global short *dst)
-{
-  int id = (int)get_global_id(0);
-  dst[id] = src0[id] - src1[id];
-}
-
diff --git a/kernels/compiler_vect_compare.cl b/kernels/compiler_vect_compare.cl
new file mode 100644
index 0000000..ae43ec6
--- /dev/null
+++ b/kernels/compiler_vect_compare.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_vect_compare(__global int4 *src, __global int4 *dst)
+{
+  int4 test = (int4)(0,0,0,0);
+
+  dst[get_global_id(0)] = test < src[get_global_id(0)];
+}
diff --git a/src/cl_api.c b/src/cl_api.c
index f7db4bc..dc52f0a 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -279,25 +279,30 @@ clGetContextInfo(cl_context      context,
                  void *          param_value,
                  size_t *        param_value_size_ret)
 {
-  switch (param_name) {
-    case CL_CONTEXT_DEVICES:
-      if (param_value) {
-        if (param_value_size < sizeof(cl_device_id))
-          return CL_INVALID_VALUE;
-          cl_device_id *device_list = (cl_device_id*)param_value;
-          device_list[0] = context->device;
-          if (param_value_size_ret)
-            *param_value_size_ret = sizeof(cl_device_id);
-          return CL_SUCCESS;
-        }
-        if (param_value_size_ret) {
-          *param_value_size_ret = sizeof(cl_device_id);
-          return CL_SUCCESS;
-        }
-    default:
-      NOT_IMPLEMENTED;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  if (param_name == CL_CONTEXT_DEVICES) {
+    FILL_GETINFO_RET (cl_device_id, 1, &context->device, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
+    cl_uint n = 1;
+    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
+    cl_uint ref = context->ref_n;
+    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_PROPERTIES) {
+    if(context->prop_len > 0) {
+      FILL_GETINFO_RET (cl_context_properties, context->prop_len, context->prop_user, CL_SUCCESS);
+    } else {
+      cl_context_properties n = 0;
+      FILL_GETINFO_RET (cl_context_properties, 1, &n, CL_SUCCESS);
+    }
+  } else {
+    return CL_INVALID_VALUE;
   }
-  return 0;
+
+error:
+  return err;
 }
 
 cl_command_queue
@@ -372,19 +377,6 @@ error:
   return err;
 }
 
-cl_int
-clSetCommandQueueProperty(cl_command_queue               command_queue,
-                          cl_command_queue_properties    properties,
-                          cl_bool                        enable,
-                          cl_command_queue_properties *  old_properties)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_QUEUE (command_queue);
-  NOT_IMPLEMENTED;
-error:
-  return err;
-}
-
 cl_mem
 clCreateBuffer(cl_context    context,
                cl_mem_flags  flags,
@@ -927,8 +919,29 @@ clGetKernelInfo(cl_kernel        kernel,
                 void *           param_value,
                 size_t *         param_value_size_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err;
+
+  CHECK_KERNEL(kernel);
+
+  if (param_name == CL_KERNEL_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &kernel->program->ctx, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_PROGRAM) {
+    FILL_GETINFO_RET (cl_program, 1, &kernel->program, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_NUM_ARGS) {
+    cl_uint n = kernel->arg_n;
+    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
+    cl_int ref = kernel->ref_n;
+    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
+    const char * n = cl_kernel_get_name(kernel);
+    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
 }
 
 cl_int
@@ -1581,7 +1594,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
 {
   size_t fixed_global_off[] = {0,0,0};
   size_t fixed_global_sz[] = {1,1,1};
-  size_t fixed_local_sz[] = {16,1,1};
+  size_t fixed_local_sz[] = {1,1,1};
   cl_int err = CL_SUCCESS;
   cl_uint i;
 
diff --git a/src/cl_context.c b/src/cl_context.c
index 0331151..338706b 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -34,10 +34,11 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <assert.h>
+#include <string.h>
 
 static cl_int
 cl_context_properties_process(const cl_context_properties *prop,
-                              struct _cl_context_prop *cl_props)
+                              struct _cl_context_prop *cl_props, cl_uint * prop_len)
 {
   cl_int err = CL_SUCCESS;
 
@@ -81,6 +82,7 @@ cl_context_properties_process(const cl_context_properties *prop,
       goto error;
     }
     prop += 2;
+    *prop_len += 2;
   }
 exit:
 error:
@@ -101,13 +103,13 @@ cl_create_context(const cl_context_properties *  properties,
   struct _cl_context_prop props;
   cl_context ctx = NULL;
   cl_int err = CL_SUCCESS;
-
+  cl_uint prop_len = 0;
   /* XXX */
   FATAL_IF (pfn_notify != NULL || user_data != NULL, "Unsupported call back");
   FATAL_IF (num_devices != 1, "Only one device is supported");
 
   /* Check that we are getting the right platform */
-  if (UNLIKELY(((err = cl_context_properties_process(properties, &props)) != CL_SUCCESS)))
+  if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
     goto error;
 
   /* We are good */
@@ -116,6 +118,11 @@ cl_create_context(const cl_context_properties *  properties,
     goto error;
   }
 
+  if(properties != NULL && prop_len > 0) {
+    TRY_ALLOC (ctx->prop_user, CALLOC_ARRAY(cl_context_properties, prop_len));
+    memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
+  }
+  ctx->prop_len = prop_len;
   /* Attach the device to the context */
   ctx->device = *devices;
 
@@ -171,6 +178,7 @@ cl_context_delete(cl_context ctx)
   assert(ctx->programs == NULL);
   assert(ctx->buffers == NULL);
   assert(ctx->drv);
+  cl_free(ctx->prop_user);
   cl_driver_delete(ctx->drv);
   ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(ctx);
diff --git a/src/cl_context.h b/src/cl_context.h
index 5dff2ef..80bf777 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -68,6 +68,8 @@ struct _cl_context {
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
+  cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
+  cl_uint                 prop_len;  /* count of the properties */
 };
 
 /* Implement OpenCL function */
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 052b589..1ff81c1 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -26,8 +26,6 @@ void check_basic_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++)
-    //It seems we only support this mandatory extension.
-    if (id == EXT_ID(khr_byte_addressable_store))
       extensions->extensions[id].base.ext_enabled = 1;
 }
 
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index df59feb..3fe0065 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -5,6 +5,7 @@ link_directories (${LLVM_LIBRARY_DIR})
 set (utests_sources
   cl_create_kernel.cpp
   utest_error.c
+  compiler_basic_arithmetic.cpp
   compiler_displacement_map_element.cpp
   compiler_shader_toy.cpp
   compiler_mandelbrot.cpp
@@ -29,10 +30,8 @@ set (utests_sources
   compiler_copy_image.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
-  compiler_double.cpp
-  compiler_double_2.cpp
-  compiler_double_3.cpp
   compiler_fabs.cpp
+  compiler_abs.cpp
   compiler_fill_image.cpp
   compiler_fill_image0.cpp
   compiler_fill_image_3d.cpp
@@ -46,19 +45,22 @@ set (utests_sources
   compiler_global_constant.cpp
   compiler_global_constant_2.cpp
   compiler_group_size.cpp
+  compiler_hadd.cpp
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
   compiler_lower_return0.cpp
   compiler_lower_return1.cpp
   compiler_lower_return2.cpp
+  compiler_mad_hi.cpp
+  compiler_mul_hi.cpp
   compiler_multiple_kernels.cpp
+  compiler_rhadd.cpp
+  compiler_rotate.cpp
   compiler_saturate.cpp
   compiler_saturate_sub.cpp
   compiler_shift_right.cpp
   compiler_short_scatter.cpp
-  compiler_sub_bytes.cpp
-  compiler_sub_shorts.cpp
   compiler_uint2_copy.cpp
   compiler_uint3_copy.cpp
   compiler_uint8_copy.cpp
@@ -73,6 +75,7 @@ set (utests_sources
   compiler_write_only_shorts.cpp
   compiler_switch.cpp
   compiler_math.cpp
+  compiler_atomic_functions.cpp
   compiler_insn_selection_min.cpp
   compiler_insn_selection_max.cpp
   compiler_insn_selection_masked_min_max.cpp
@@ -85,6 +88,7 @@ set (utests_sources
   compiler_volatile.cpp
   compiler_copy_image1.cpp
   compiler_get_image_info.cpp
+  compiler_vect_compare.cpp
   compiler_vector_load_store.cpp
   compiler_cl_finish.cpp
   get_cl_info.cpp
@@ -92,6 +96,9 @@ set (utests_sources
   builtin_global_size.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
+  compiler_double.cpp
+  compiler_double_2.cpp
+  compiler_double_3.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
new file mode 100644
index 0000000..59d8365
--- /dev/null
+++ b/utests/compiler_abs.cpp
@@ -0,0 +1,219 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+    T ptr[N];
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * N);
+    }
+    cl_vec(vec_type & other) {
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void abs(void) {
+        int i = 0;
+        for (; i < N; i++) {
+            T f = ptr[i];
+            f = f < 0 ? -f : f;
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.abs();
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
+{
+    T f = src[global_id];
+    f = f < 0 ? -f : f;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
+	cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*N;
+
+    printf("\nRaw: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", ((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", ((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
+{
+    printf("\nRaw: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", ((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        printf(" %d", ((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void compiler_abs_with_type(void)
+{
+    const size_t n = 16;
+    U cpu_dst[16];
+    T cpu_src[16];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+        }
+
+        memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+//      dump_data(cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define ABS_TEST_TYPE(TYPE, UTYPE) \
+	static void compiler_abs_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_abs.cl", "compiler_abs_"#TYPE, SOURCE, NULL);  \
+           compiler_abs_with_type<TYPE, UTYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION(compiler_abs_##TYPE);
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+ABS_TEST_TYPE(int, uint)
+ABS_TEST_TYPE(short, ushort)
+ABS_TEST_TYPE(char, uchar)
+ABS_TEST_TYPE(uint, uint)
+ABS_TEST_TYPE(ushort, ushort)
+ABS_TEST_TYPE(uchar, uchar)
+
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_TYPE(int2, uint2)
+ABS_TEST_TYPE(int4, uint4)
+ABS_TEST_TYPE(int8, uint8)
+ABS_TEST_TYPE(int16, uint16)
+ABS_TEST_TYPE(uint2, uint2)
+ABS_TEST_TYPE(uint4, uint4)
+ABS_TEST_TYPE(uint8, uint8)
+ABS_TEST_TYPE(uint16, uint16)
+
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_TYPE(char2, uchar2)
+ABS_TEST_TYPE(char4, uchar4)
+ABS_TEST_TYPE(char8, uchar8)
+ABS_TEST_TYPE(char16, uchar16)
+ABS_TEST_TYPE(uchar2, uchar2)
+ABS_TEST_TYPE(uchar4, uchar4)
+ABS_TEST_TYPE(uchar8, uchar8)
+ABS_TEST_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_TYPE(short2, ushort2)
+ABS_TEST_TYPE(short4, ushort4)
+ABS_TEST_TYPE(short8, ushort8)
+ABS_TEST_TYPE(short16, ushort16)
+ABS_TEST_TYPE(ushort2, ushort2)
+ABS_TEST_TYPE(ushort4, ushort4)
+ABS_TEST_TYPE(ushort8, ushort8)
+ABS_TEST_TYPE(ushort16, ushort16)
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
index 20202da..571e0c6 100644
--- a/utests/compiler_atomic_functions.cpp
+++ b/utests/compiler_atomic_functions.cpp
@@ -1,10 +1,96 @@
 #include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+#include <string.h>
 
-void compiler_atomic_functions(void)
+#define GROUP_NUM 16
+#define LOCAL_SIZE 64
+static void cpu_compiler_atomic(int *dst, int *src)
 {
-  OCL_CREATE_KERNEL("compiler_atomic_functions");
+  dst[4] = 0xffffffff;
+  int tmp[16] = { 0 };
+
+  for(int j=0; j<LOCAL_SIZE; j++) {
+    int i = j % 12;
+
+    switch(i) {
+      case 0: tmp[i] += 1; break;
+      case 1: tmp[i] -= 1; break;
+      case 2: tmp[i] += src[j]; break;
+      case 3: tmp[i] -= src[j]; break;
+      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
+      case 5: tmp[i] |= src[j]<<(j>>2); break;
+      case 6: tmp[i] ^= src[j]; break;
+      case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+      case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+      case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
+      case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
+      case 11:  tmp[i] = src[10]; break;
+      default:  break;
+    }
+  }
+
+  for(int k=0; k<GROUP_NUM; k++) {
+    for(int j=0; j<LOCAL_SIZE; j++) {
+      int i = j % 12;
+
+      switch(i) {
+        case 0: dst[i] += 1; break;
+        case 1: dst[i] -= 1; break;
+        case 2: dst[i] += src[j]; break;
+        case 3: dst[i] -= src[j]; break;
+        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
+        case 5: dst[i] |= src[j]<<(j>>2); break;
+        case 6: dst[i] ^= src[j]; break;
+        case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+        case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+        case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
+        case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
+        case 11:  dst[i] = src[10]; break;
+        default:  break;
+      }
+    }
+  }
+
+  for(int i=0; i<12; i++)
+    dst[i] += tmp[i];
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions);
+static void compiler_atomic_functions(void)
+{
+  const size_t n = GROUP_NUM * LOCAL_SIZE;
+  int cpu_dst[16] = {0}, cpu_src[256];
+
+  globals[0] = n;
+  locals[0] = LOCAL_SIZE;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_atomic_functions");
+  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+
+  OCL_MAP_BUFFER(0);
+  memset(buf_data[0], 0, 16 * sizeof(int));
+  OCL_UNMAP_BUFFER(0);
 
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < locals[0]; ++i)
+      cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
+  cpu_compiler_atomic(cpu_dst, cpu_src);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for(int i=0; i<12; i++) {
+    //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
 
+MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions)
diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
new file mode 100644
index 0000000..dcdd084
--- /dev/null
+++ b/utests/compiler_basic_arithmetic.cpp
@@ -0,0 +1,112 @@
+#include "utest_helper.hpp"
+
+enum eTestOP {
+  TEST_OP_ADD =0,
+  TEST_OP_SUB,
+  TEST_OP_MUL,
+  TEST_OP_DIV,
+  TEST_OP_REM
+};
+
+template <typename T, eTestOP op>
+static void test_exec(const char* kernel_name)
+{
+  const size_t n = 160;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
+std::cout <<"kernel name: " << kernel_name << std::endl;
+  buf_data[0] = (T*) malloc(sizeof(T) * n);
+  buf_data[1] = (T*) malloc(sizeof(T) * n);
+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[1])[i] = (T) rand();
+  if(op == TEST_OP_DIV || op == TEST_OP_REM) {
+    for (uint32_t i = 0; i < n; ++i) {
+      if(((T*)buf_data[1])[i] == 0)
+       ((T*)buf_data[1])[i] = (T) 1;
+    }
+  }
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[1]);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(2);
+  if(op == TEST_OP_SUB) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] - ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_ADD) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] + ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_MUL) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] * ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_DIV) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] / ((T*)buf_data[1])[i]));
+  } else {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] % ((T*)buf_data[1])[i]));
+  }
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = buf_data[1] = NULL;
+}
+
+#define DECL_TEST_SUB(type, alias) \
+static void compiler_sub_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_SUB>("compiler_sub_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_ ## alias)
+
+#define DECL_TEST_ADD(type, alias) \
+static void compiler_add_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_ADD>("compiler_add_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_add_ ## alias)
+
+#define DECL_TEST_MUL(type, alias) \
+static void compiler_mul_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_MUL>("compiler_mul_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_mul_ ## alias)
+
+#define DECL_TEST_DIV(type, alias) \
+static void compiler_div_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_DIV>("compiler_div_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_div_ ## alias)
+
+#define DECL_TEST_REM(type, alias) \
+static void compiler_rem_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_REM>("compiler_rem_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_rem_ ## alias)
+
+#define DECL_TEST_FOR_ALL_TYPE(op)\
+DECL_TEST_##op(int8_t, char) \
+DECL_TEST_##op(uint8_t, uchar) \
+DECL_TEST_##op(int16_t, short) \
+DECL_TEST_##op(uint16_t, ushort) \
+DECL_TEST_##op(int32_t, int) \
+DECL_TEST_##op(uint32_t, uint)
+
+DECL_TEST_FOR_ALL_TYPE(SUB)
+DECL_TEST_FOR_ALL_TYPE(ADD)
+DECL_TEST_FOR_ALL_TYPE(MUL)
+DECL_TEST_FOR_ALL_TYPE(DIV)
+DECL_TEST_FOR_ALL_TYPE(REM)
+#undef DECL_TEST_FOR_ALL_TYPE
diff --git a/utests/compiler_hadd.cpp b/utests/compiler_hadd.cpp
new file mode 100644
index 0000000..9723702
--- /dev/null
+++ b/utests/compiler_hadd.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_hadd(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_hadd");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a += src2[i];
+    a >>= 1;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_hadd);
diff --git a/utests/compiler_mad_hi.cpp b/utests/compiler_mad_hi.cpp
new file mode 100644
index 0000000..6f66e7f
--- /dev/null
+++ b/utests/compiler_mad_hi.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+
+void compiler_mad_hi(void)
+{
+  const int n = 32;
+  int src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mad_hi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+    src3[i] = ((int*)buf_data[2])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a *= src2[i];
+    a >>= 32;
+    a += src3[i];
+    OCL_ASSERT(((int*)buf_data[3])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mad_hi);
diff --git a/utests/compiler_mul_hi.cpp b/utests/compiler_mul_hi.cpp
new file mode 100644
index 0000000..5ea6389
--- /dev/null
+++ b/utests/compiler_mul_hi.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_mul_hi(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mul_hi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a *= src2[i];
+    a >>= 32;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mul_hi);
diff --git a/utests/compiler_rhadd.cpp b/utests/compiler_rhadd.cpp
new file mode 100644
index 0000000..b25c788
--- /dev/null
+++ b/utests/compiler_rhadd.cpp
@@ -0,0 +1,41 @@
+#include "utest_helper.hpp"
+
+void compiler_rhadd(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_rhadd");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a += src2[i];
+    a ++;
+    a >>= 1;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rhadd);
diff --git a/utests/compiler_rotate.cpp b/utests/compiler_rotate.cpp
new file mode 100644
index 0000000..bf52ca4
--- /dev/null
+++ b/utests/compiler_rotate.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+int cpu(int src, int y) {
+  return (src << y) | (src >> (32 - y));
+}
+
+void compiler_rotate(void)
+{
+  const int n = 32;
+  int src[n], y[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_rotate");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src[i] = ((int*)buf_data[0])[i] = rand();
+    y[i] = ((int*)buf_data[2])[i] = rand() & 31;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[1])[i] == cpu(src[i], y[i]));
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rotate);
diff --git a/utests/compiler_sub_bytes.cpp b/utests/compiler_sub_bytes.cpp
deleted file mode 100644
index 740a8fd..0000000
--- a/utests/compiler_sub_bytes.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "utest_helper.hpp"
-
-static void compiler_sub_bytes(void)
-{
-  const size_t n = 16;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_bytes");
-  buf_data[0] = (int8_t*) malloc(sizeof(int8_t) * n);
-  buf_data[1] = (int8_t*) malloc(sizeof(int8_t) * n);
-  for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[0])[i] = (int8_t) rand();
-  for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[1])[i] = (int8_t) rand();
-  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[0]);
-  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[1]);
-  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int8_t), NULL);
-
-  // Run the kernel
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
-  globals[0] = n;
-  locals[0] = 16;
-  OCL_NDRANGE(1);
-
-  // Check result
-  OCL_MAP_BUFFER(2);
-  for (uint32_t i = 0; i < n; ++i)
-    OCL_ASSERT(((int8_t*)buf_data[2])[i] == (int8_t)(((int8_t*)buf_data[0])[i] - ((int8_t*)buf_data[1])[i]));
-  free(buf_data[0]);
-  free(buf_data[1]);
-  buf_data[0] = buf_data[1] = NULL;
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_bytes);
-
diff --git a/utests/compiler_sub_shorts.cpp b/utests/compiler_sub_shorts.cpp
deleted file mode 100644
index 7c24a56..0000000
--- a/utests/compiler_sub_shorts.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "utest_helper.hpp"
-
-static void compiler_sub_shorts(void)
-{
-  const size_t n = 16;
-
-  // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_shorts");
-  buf_data[0] = (int16_t*) malloc(sizeof(int16_t) * n);
-  buf_data[1] = (int16_t*) malloc(sizeof(int16_t) * n);
-  for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[0])[i] = (int16_t) rand();
-  for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[1])[i] = (int16_t) rand();
-  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[0]);
-  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[1]);
-  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int16_t), NULL);
-
-  // Run the kernel
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
-  globals[0] = n;
-  locals[0] = 16;
-  OCL_NDRANGE(1);
-
-  // Check result
-  OCL_MAP_BUFFER(2);
-  for (uint32_t i = 0; i < n; ++i)
-    OCL_ASSERT(((int16_t*)buf_data[2])[i] == (int16_t)(((int16_t*)buf_data[0])[i] - ((int16_t*)buf_data[1])[i]));
-  free(buf_data[0]);
-  free(buf_data[1]);
-  buf_data[0] = buf_data[1] = NULL;
-}
-
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_shorts);
-
-
diff --git a/utests/compiler_vect_compare.cpp b/utests/compiler_vect_compare.cpp
new file mode 100644
index 0000000..e9e45be
--- /dev/null
+++ b/utests/compiler_vect_compare.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+typedef struct {
+  int x;
+  int y;
+  int z;
+  int w;
+} int4;
+
+void compiler_vect_compare(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_vect_compare");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int4), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int4), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) {
+    ((int4*)buf_data[0])[i].x = i & 0x1;
+    ((int4*)buf_data[0])[i].y = i & 0x2;
+    ((int4*)buf_data[0])[i].z = i & 0x4;
+    ((int4*)buf_data[0])[i].w = i & 0x8;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i) {
+    OCL_ASSERT(((int4*)buf_data[1])[i].x == (int)((i&0x1)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].y == (int)((i&0x2)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].z == (int)((i&0x4)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].w == (int)((i&0x8)?0xffffffff:0));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_vect_compare);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index bb31032..4c6f404 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -44,7 +44,7 @@ struct Info_Result<char *> {
     int size;
     typedef char* type_value;
 
-    Info_Result(char *other, int sz) {
+    Info_Result(const char *other, int sz): refer(NULL) {
         size = sz;
         ret = (char *)malloc(sizeof(char) * sz);
         if (other) {
@@ -362,3 +362,133 @@ void get_program_build_info(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_program_build_info);
+
+/* ***************************************************** *
+ * clGetContextInfo                                      *
+ * ***************************************************** */
+#define CALL_CONTEXTINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetContextInfo, ctx)
+
+void get_context_info(void)
+{
+    /* use the compiler_fabs case to test us. */
+    const size_t n = 16;
+    map<cl_context_info, void *> maps;
+    int expect_ref;
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+    globals[0] = 16;
+    locals[0] = 16;
+
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+        ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    /* Do our test.*/
+    expect_ref = 1;
+    maps.insert(make_pair(CL_CONTEXT_NUM_DEVICES,
+                          (void *)(new Info_Result<cl_uint>(expect_ref))));
+    maps.insert(make_pair(CL_CONTEXT_DEVICES,
+                          (void *)(new Info_Result<cl_device_id>(device))));
+    // reference count seems depends on the implementation
+    expect_ref = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_CONTEXT_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+    maps.insert(make_pair(CL_CONTEXT_PROPERTIES,
+                          (void *)(new Info_Result<char*>(
+                                       (const char*)NULL, 100*sizeof(cl_context_properties)))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_context_info, void *> x) {
+        switch (x.first) {
+        case CL_CONTEXT_NUM_DEVICES:
+            CALL_CONTEXTINFO_AND_RET(cl_uint);
+            break;
+        case CL_CONTEXT_DEVICES:
+            CALL_CONTEXTINFO_AND_RET(cl_device_id);
+            break;
+        case CL_CONTEXT_REFERENCE_COUNT:
+            CALL_CONTEXTINFO_AND_RET(cl_uint);
+            break;
+        case CL_CONTEXT_PROPERTIES:
+            CALL_CONTEXTINFO_AND_RET(char*);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_context_info);
+
+/* ***************************************************** *
+ * clGetKernelInfo                                      *
+ * ***************************************************** */
+#define CALL_KERNELINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetKernelInfo, kernel)
+
+void get_kernel_info(void)
+{
+    /* use the compiler_fabs case to test us. */
+    const size_t n = 16;
+    map<cl_kernel_info, void *> maps;
+    int expect_ref;
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+    // Run the kernel on GPU
+
+    maps.insert(make_pair(CL_KERNEL_PROGRAM,
+                          (void *)(new Info_Result<cl_program>(program))));
+    maps.insert(make_pair(CL_KERNEL_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(ctx))));
+    // reference count seems depends on the implementation
+    expect_ref = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_KERNEL_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+    expect_ref = 2;
+    maps.insert(make_pair(CL_KERNEL_NUM_ARGS,
+                          (void *)(new Info_Result<cl_uint>(expect_ref))));
+
+    const char * expected_name = "compiler_fabs";
+    maps.insert(make_pair(CL_KERNEL_FUNCTION_NAME,
+                          (void *)(new Info_Result<char*>(expected_name, strlen(expected_name)+1))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_kernel_info, void *> x) {
+        switch (x.first) {
+        case CL_KERNEL_PROGRAM:
+            CALL_KERNELINFO_AND_RET(cl_program);
+            break;
+        case CL_KERNEL_CONTEXT:
+            CALL_KERNELINFO_AND_RET(cl_context);
+            break;
+        case CL_KERNEL_REFERENCE_COUNT:
+            CALL_KERNELINFO_AND_RET(cl_uint);
+            break;
+        case CL_KERNEL_NUM_ARGS:
+            CALL_KERNELINFO_AND_RET(cl_uint);
+            break;
+        case CL_KERNEL_FUNCTION_NAME:
+            CALL_KERNELINFO_AND_RET(char*);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_kernel_info);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list