[Pkg-opencl-devel] [beignet] 51/66: Imported Upstream version 0.2+git20130816+cc09b26

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:08 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 1a4f8602d9416f95483cd941ecacabef0684e335
Author: Simon Richter <sjr at debian.org>
Date:   Fri Aug 16 12:07:35 2013 +0200

    Imported Upstream version 0.2+git20130816+cc09b26
---
 backend/src/backend/gen_context.cpp                | 508 ++++++++++++++++-----
 backend/src/backend/gen_context.hpp                |   9 +
 backend/src/backend/gen_encoder.cpp                |   6 -
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   5 +
 backend/src/backend/gen_insn_scheduling.cpp        |  21 +-
 backend/src/backend/gen_insn_selection.cpp         | 225 +++++++--
 backend/src/backend/gen_insn_selection.hxx         |  21 +-
 backend/src/backend/gen_reg_allocation.cpp         |   3 +-
 backend/src/backend/gen_register.hpp               |   7 +
 backend/src/gen_builtin_vector.py                  |   4 +-
 backend/src/ir/instruction.cpp                     |   4 +-
 backend/src/ir/instruction.hpp                     |   2 +
 backend/src/ir/instruction.hxx                     |   1 +
 backend/src/llvm/llvm_gen_backend.cpp              |   9 +
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   1 +
 backend/src/ocl_stdlib.tmpl.h                      | 167 ++++++-
 kernels/builtin_acos_asin.cl                       |  10 +
 kernels/compiler_async_copy.cl                     |  16 +
 kernels/compiler_async_stride_copy.cl              |  16 +
 kernels/compiler_event.cl                          |   6 +
 kernels/compiler_load_bool_imm.cl                  |  12 +
 kernels/compiler_long_asr.cl                       |   7 +
 kernels/compiler_long_cmp.cl                       |  29 ++
 kernels/compiler_long_convert.cl                   |   7 +
 kernels/compiler_long_mult.cl                      |   7 +
 kernels/compiler_long_shl.cl                       |   7 +
 kernels/compiler_long_shr.cl                       |   7 +
 kernels/compiler_upsample_long.cl                  |   4 +
 kernels/compiler_vector_load_store.cl              |   6 +-
 src/CMakeLists.txt                                 |   1 +
 src/cl_api.c                                       | 456 +++++++++---------
 src/cl_command_queue.c                             |   7 +
 src/cl_command_queue.h                             |   3 +
 src/cl_command_queue_gen7.c                        |   2 -
 src/cl_context.h                                   |   2 +
 src/cl_driver.h                                    |  31 ++
 src/cl_driver_defs.c                               |   5 +
 src/cl_enqueue.c                                   | 336 ++++++++++++++
 src/cl_enqueue.h                                   |  63 +++
 src/cl_event.c                                     | 388 +++++++++++++++-
 src/cl_event.h                                     |  66 ++-
 src/cl_internals.h                                 |   1 +
 src/cl_utils.h                                     |  14 +-
 src/intel/intel_gpgpu.c                            | 109 ++++-
 utests/CMakeLists.txt                              |  12 +
 utests/builtin_acos_asin.cpp                       |  87 ++++
 utests/compiler_async_copy.cpp                     |  39 ++
 utests/compiler_async_stride_copy.cpp              |  45 ++
 utests/compiler_load_bool_imm.cpp                  |  29 ++
 utests/compiler_long_asr.cpp                       |  41 ++
 utests/compiler_long_cmp.cpp                       | 117 +++++
 utests/compiler_long_convert.cpp                   |  67 +++
 utests/compiler_long_mult.cpp                      |  49 ++
 utests/compiler_long_shl.cpp                       |  41 ++
 utests/compiler_long_shr.cpp                       |  41 ++
 utests/compiler_smoothstep.cpp                     |   2 +-
 utests/compiler_upsample_long.cpp                  |  38 ++
 utests/compiler_vector_load_store.cpp              |   4 +-
 utests/runtime_event.cpp                           |  61 +++
 utests/utest_helper.hpp                            |  10 +-
 60 files changed, 2873 insertions(+), 421 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 621e7be..1a012fe 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -151,13 +151,146 @@ namespace gbe
     }
   }
 
+  void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction &insn) {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
+    switch (insn.opcode) {
+      case SEL_OP_LOAD_DF_IMM:
+        p->LOAD_DF_IMM(dst, tmp, src.value.df);
+        break;
+      case SEL_OP_MOV_DF:
+        p->MOV_DF(dst, src, tmp);
+        break;
+      case SEL_OP_CONVI_TO_I64: {
+        GenRegister middle;
+        if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
+          middle = tmp;
+          middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
+          p->MOV(middle, src);
+        } else {
+          middle = src;
+        }
+        int execWidth = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int nib = 0; nib < execWidth / 4; nib ++) {
+          p->curr.chooseNib(nib);
+          p->MOV(dst.bottom_half(), middle);
+          if(middle.is_signed_int())
+            p->ASR(dst.top_half(), middle, GenRegister::immud(31));
+          else
+            p->MOV(dst.top_half(), GenRegister::immd(0));
+          dst = GenRegister::suboffset(dst, 4);
+          middle = GenRegister::suboffset(middle, 4);
+        }
+        p->pop();
+        break;
+      }
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction &insn) {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister tmp = ra->genReg(insn.dst(1));
+    switch (insn.opcode) {
+      case SEL_OP_I64ADD: {
+        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
+                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        loadBottomHalf(x, src0);
+        loadBottomHalf(y, src1);
+        addWithCarry(x, x, y);
+        storeBottomHalf(dst, x);
+        loadTopHalf(x, src0);
+        p->ADD(x, x, y);
+        loadTopHalf(y, src1);
+        p->ADD(x, x, y);
+        storeTopHalf(dst, x);
+        break;
+      }
+      case SEL_OP_I64SUB: {
+        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
+                    y = GenRegister::suboffset(x, p->curr.execWidth);
+        loadBottomHalf(x, src0);
+        loadBottomHalf(y, src1);
+        subWithBorrow(x, x, y);
+        storeBottomHalf(dst, x);
+        loadTopHalf(x, src0);
+        subWithBorrow(x, x, y);
+        loadTopHalf(y, src1);
+        subWithBorrow(x, x, y);
+        storeTopHalf(dst, x);
+        break;
+      }
+      case SEL_OP_MUL_HI: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->push();
+          p->curr.predicate = GEN_PREDICATE_NONE;
+          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+          p->curr.accWrEnable = 1;
+          p->MACH(tmp, src0, src1);
+          p->pop();
+          p->curr.quarterControl = i;
+          p->MOV(dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
+     case SEL_OP_HADD: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->curr.quarterControl = i;
+          p->ADDC(dst, src0, src1);
+          p->SHR(dst, dst, GenRegister::immud(1));
+          p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(dst, dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
+      case SEL_OP_RHADD: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->curr.quarterControl = i;
+          p->ADDC(dst, src0, src1);
+          p->ADD(dst, dst, GenRegister::immud(1));
+          p->SHR(dst, dst, GenRegister::immud(1));
+          p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(dst, dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
   void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
-      case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
-      case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_SEL_INT64:
         {
@@ -248,10 +381,219 @@ namespace gbe
       case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
       case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
       case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_LONG:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          int execWidth = p->curr.execWidth;
+          p->push();
+          p->curr.execWidth = 8;
+          for (int nib = 0; nib < execWidth / 4; nib ++) {
+            p->curr.chooseNib(nib);
+            p->MOV(xdst.top_half(), xsrc0.bottom_half());
+            p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
+            xdst = GenRegister::suboffset(xdst, 4);
+            xsrc0 = GenRegister::suboffset(xsrc0, 4);
+            xsrc1 = GenRegister::suboffset(xsrc1, 4);
+          }
+          p->pop();
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
 
+  void GenContext::collectShifter(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    for (int nib = 0; nib < execWidth / 4; nib ++) {
+      p->AND(dest, src.bottom_half(), GenRegister::immud(63));
+      dest = GenRegister::suboffset(dest, 4);
+      src = GenRegister::suboffset(src, 4);
+    }
+    p->pop();
+  }
+
+  void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    GenRegister zero = GenRegister::immud(0);
+    switch(insn.opcode) {
+      case SEL_OP_I64SHL:
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHR(b, e, GenRegister::negate(a));
+        p->SHL(c, e, a);
+        p->SHL(d, f, a);
+        p->OR(e, d, b);
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, zero);
+        p->pop();
+        storeBottomHalf(dest, c);
+        storeTopHalf(dest, d);
+        break;
+      case SEL_OP_I64SHR:
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHL(b, f, GenRegister::negate(a));
+        p->SHR(c, f, a);
+        p->SHR(d, e, a);
+        p->OR(e, d, b);
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, zero);
+        p->pop();
+        storeBottomHalf(dest, d);
+        storeTopHalf(dest, c);
+        break;
+      case SEL_OP_I64ASR:
+        f.type = GEN_TYPE_D;
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHL(b, f, GenRegister::negate(a));
+        p->ASR(c, f, a);
+        p->SHR(d, e, a);
+        p->OR(e, d, b);
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->MOV(GenRegister::flag(1, 1), GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.physicalFlag = 1, p->curr.flag = 1, p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, GenRegister::immd(-1));
+        p->pop();
+        storeBottomHalf(dest, d);
+        storeTopHalf(dest, c);
+        break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
+    p->push();
+    p->curr.execWidth = 1;
+    p->MOV(dest, GenRegister::flag(flag, subFlag));
+    p->pop();
+  }
+
+  void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister tmp0 = ra->genReg(insn.dst(0));
+    GenRegister tmp1 = ra->genReg(insn.dst(1));
+    GenRegister tmp2 = ra->genReg(insn.dst(2));
+    tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+    tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+    int flag = p->curr.flag, subFlag = p->curr.subFlag;
+    GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
+                f2 = GenRegister::suboffset(f1, 1),
+                f3 = GenRegister::suboffset(f1, 2);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.flag = 0, p->curr.subFlag = 1;
+    loadTopHalf(tmp0, src0);
+    loadTopHalf(tmp1, src1);
+    switch(insn.extra.function) {
+      case GEN_CONDITIONAL_L:
+      case GEN_CONDITIONAL_LE:
+      case GEN_CONDITIONAL_G:
+      case GEN_CONDITIONAL_GE:
+        {
+          int cmpTopHalf = insn.extra.function;
+          if(insn.extra.function == GEN_CONDITIONAL_LE)
+            cmpTopHalf = GEN_CONDITIONAL_L;
+          if(insn.extra.function == GEN_CONDITIONAL_GE)
+            cmpTopHalf = GEN_CONDITIONAL_G;
+          p->CMP(cmpTopHalf, tmp0, tmp1);
+        }
+        saveFlag(f1, 0, 1);
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f2, 0, 1);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(insn.extra.function, tmp0, tmp1);
+        saveFlag(f3, 0, 1);
+        p->AND(f2, f2, f3);
+        p->OR(f1, f1, f2);
+        break;
+      case GEN_CONDITIONAL_EQ:
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f1, 0, 1);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f2, 0, 1);
+        p->AND(f1, f1, f2);
+        break;
+      case GEN_CONDITIONAL_NEQ:
+        p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+        saveFlag(f1, 0, 1);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+        saveFlag(f2, 0, 1);
+        p->OR(f1, f1, f2);
+        break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    saveFlag(f2, flag, subFlag);
+    p->AND(f1, f1, f2);
+    p->MOV(GenRegister::flag(flag, subFlag), f1);
+    p->pop();
+  }
+
   void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
     int execWidth = p->curr.execWidth;
     src = src.top_half();
@@ -352,113 +694,59 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+    GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+    int execWidth = p->curr.execWidth;
+    p->push();
+    p->curr.execWidth = 8;
+    for(int i = 0; i < execWidth; i += 8) {
+      p->MUL(acc, src0, src1);
+      p->curr.accWrEnable = 1;
+      p->MACH(high, src0, src1);
+      p->curr.accWrEnable = 0;
+      p->MOV(low, acc);
+      src0 = GenRegister::suboffset(src0, 8);
+      src1 = GenRegister::suboffset(src1, 8);
+      high = GenRegister::suboffset(high, 8);
+      low = GenRegister::suboffset(low, 8);
+    }
+    p->pop();
+  }
+
+  void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    I32FullMult(GenRegister::null(), e, b, c);
+    I32FullMult(GenRegister::null(), f, a, d);
+    p->ADD(e, e, f);
+    I32FullMult(f, a, b, d);
+    p->ADD(e, e, f);
+    p->pop();
+    storeTopHalf(dest, e);
+    storeBottomHalf(dest, a);
+  }
+
   void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     const GenRegister src2 = ra->genReg(insn.src(2));
     switch (insn.opcode) {
-      case SEL_OP_I64ADD:
-        {
-          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
-                      y = GenRegister::suboffset(x, p->curr.execWidth);
-          loadBottomHalf(x, src0);
-          loadBottomHalf(y, src1);
-          addWithCarry(x, x, y);
-          storeBottomHalf(dst, x);
-          loadTopHalf(x, src0);
-          p->ADD(x, x, y);
-          loadTopHalf(y, src1);
-          p->ADD(x, x, y);
-          storeTopHalf(dst, x);
-        }
-        break;
-      case SEL_OP_I64SUB:
-        {
-          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
-                      y = GenRegister::suboffset(x, p->curr.execWidth);
-          loadBottomHalf(x, src0);
-          loadBottomHalf(y, src1);
-          subWithBorrow(x, x, y);
-          storeBottomHalf(dst, x);
-          loadTopHalf(x, src0);
-          subWithBorrow(x, x, y);
-          loadTopHalf(y, src1);
-          subWithBorrow(x, x, y);
-          storeTopHalf(dst, x);
-        }
-        break;
-      case SEL_OP_MUL_HI:
-       {
-        int w = p->curr.execWidth;
-        p->push();
-        p->curr.execWidth = 8;
-        p->curr.quarterControl = 0;
-        p->push();
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
-        p->curr.accWrEnable = 1;
-        p->MACH(src2, src0, src1);
-        p->curr.accWrEnable = 0;
-        p->pop();
-        p->MOV(dst, src2);
-        if (w == 16) {
-          p->push();
-          p->curr.predicate = GEN_PREDICATE_NONE;
-          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
-          p->curr.accWrEnable = 1;
-          p->MACH(src2, GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
-          p->curr.accWrEnable = 0;
-          p->pop();
-          p->curr.quarterControl = 1;
-          p->MOV(GenRegister::Qn(dst, 1), src2);
-        }
-        p->pop();
-        break;
-       }
       case SEL_OP_MAD:  p->MAD(dst, src0, src1, src2); break;
-      case SEL_OP_HADD:
-       {
-        int w = p->curr.execWidth;
-        p->push();
-        p->curr.execWidth = 8;
-        p->curr.quarterControl = 0;
-        p->ADDC(dst, src0, src1);
-        p->SHR(dst, dst, GenRegister::immud(1));
-        p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
-        p->OR(dst, dst, src2);
-        if (w == 16) {
-          p->curr.quarterControl = 1;
-          p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
-          p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
-          p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
-          p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
-        }
-        p->pop();
-        break;
-       }
-      case SEL_OP_RHADD:
-       {
-        int w = p->curr.execWidth;
-        p->push();
-        p->curr.execWidth = 8;
-        p->curr.quarterControl = 0;
-        p->ADDC(dst, src0, src1);
-        p->ADD(dst, dst, GenRegister::immud(1));
-        p->SHR(dst, dst, GenRegister::immud(1));
-        p->SHL(src2, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
-        p->OR(dst, dst, src2);
-        if (w == 16) {
-          p->curr.quarterControl = 1;
-          p->ADDC(GenRegister::Qn(dst, 1), GenRegister::Qn(src0, 1), GenRegister::Qn(src1, 1));
-          p->ADD(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
-          p->SHR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::immud(1));
-          p->SHL(GenRegister::Qn(src2, 1), GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
-          p->OR(GenRegister::Qn(dst, 1), GenRegister::Qn(dst, 1), GenRegister::Qn(src2, 1));
-        }
-        p->pop();
-        break;
-       }
       default: NOT_IMPLEMENTED;
     }
   }
@@ -553,11 +841,12 @@ namespace gbe
   }
 
   void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
+    p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
     p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->curr.execWidth = 8;
       p->curr.noMask = 1;
-      p->EOT(0);
+      p->EOT(112);
     p->pop();
   }
 
@@ -601,10 +890,10 @@ namespace gbe
   void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
     const uint32_t elemNum = insn.extra.elem;
     const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
-    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
-    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const GenRegister tempAddr = ra->genReg(insn.dst(0));
+    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize + 1));
+    const GenRegister tmp = ra->genReg(insn.dst(1));
     const GenRegister src = ra->genReg(insn.src(0));
-    const GenRegister tempAddr = ra->genReg(insn.src(1));
     const uint32_t bti = insn.extra.function;
     p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
   }
@@ -621,11 +910,12 @@ namespace gbe
   //  then follow the real destination registers.
   //  For SIMD16, we allocate elemNum temporary registers from dst(0).
   void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
-    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister src = ra->genReg(insn.dst(0));
     const uint32_t elemNum = insn.extra.elem;
-    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
-    const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+    const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
+    const GenRegister data = ra->genReg(insn.src(1));
     const uint32_t bti = insn.extra.function;
+    p->MOV(src, addr);
     p->WRITE64(src, data, bti, elemNum);
   }
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 694ae98..8b481d0 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -76,6 +76,7 @@ namespace gbe
       return this->liveness->getLiveOut(bb);
     }
 
+    void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
     void storeTopHalf(GenRegister dest, GenRegister src);
 
@@ -84,12 +85,18 @@ namespace gbe
 
     void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
     void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+    void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+    void saveFlag(GenRegister dest, int flag, int subFlag);
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
     void emitUnaryInstruction(const SelectionInstruction &insn);
+    void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
     void emitBinaryInstruction(const SelectionInstruction &insn);
+    void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
     void emitTernaryInstruction(const SelectionInstruction &insn);
+    void emitI64ShiftInstruction(const SelectionInstruction &insn);
+    void emitI64CompareInstruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
     void emitIndirectMoveInstruction(const SelectionInstruction &insn);
@@ -111,8 +118,10 @@ namespace gbe
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+    void emitI64MULInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 4d6aa34..1a459e1 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -412,12 +412,6 @@ namespace gbe
       curr.noMask = originMask;
       this->UNTYPED_WRITE(msg, bti, elemNum);
     }
-    /* Let's restore the original message(addr) register. */
-    /* XXX could be optimized if we don't allocate the address to the header
-       position of the message. */
-    curr.predicate = GEN_PREDICATE_NONE;
-    curr.noMask = GEN_MASK_DISABLE;
-    ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4));
     pop();
   }
 
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index da8f2a2..2204837 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,9 +1,13 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        4,        2)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        4,        2)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Shift,        20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Compare,      20,        4,        2)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
 DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
@@ -24,3 +28,4 @@ DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
 DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
+DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 0b720b7..f1f5775 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -165,11 +165,15 @@ namespace gbe
       }
       return false;
     }
+    /*! Owns the tracker */
+    SelectionScheduler &scheduler;
     /*! Add a new dependency "node0 depends on node set for register reg" */
-    INLINE void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
+    INLINE  void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
       if (this->ignoreDependency(reg) == false) {
         const uint32_t index = this->getIndex(reg);
         this->addDependency(node0, index);
+        if (reg.isdf() || reg.isint64())
+          this->addDependency(node0, index + 1);
       }
     }
     /*! Add a new dependency "node set for register reg depends on node0" */
@@ -177,6 +181,8 @@ namespace gbe
       if (this->ignoreDependency(reg) == false) {
         const uint32_t index = this->getIndex(reg);
         this->addDependency(index, node0);
+        if (reg.isdf() || reg.isint64())
+          this->addDependency(index + 1, node0);
       }
     }
     /*! Make the node located at insnID a barrier */
@@ -187,8 +193,6 @@ namespace gbe
     static const uint32_t MAX_FLAG_REGISTER = 8u;
     /*! Maximum number of *physical* accumulators registers */
     static const uint32_t MAX_ACC_REGISTER = 1u;
-    /*! Owns the tracker */
-    SelectionScheduler &scheduler;
     /*! Stores the last node that wrote to a register / memory ... */
     vector<ScheduleDAGNode*> nodes;
     /*! Stores the nodes per instruction */
@@ -328,11 +332,13 @@ namespace gbe
       if (this->ignoreDependency(dst) == false) {
         const uint32_t index = this->getIndex(dst);
         this->nodes[index] = node;
+        if (dst.isdf() || dst.isint64())
+          this->nodes[index + 1] = node;
       }
     }
 
     // Track writes in predicates
-    if (insn.opcode == SEL_OP_CMP) {
+    if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
       const uint32_t index = this->getIndex(getFlag(insn));
       this->nodes[index] = node;
     }
@@ -454,7 +460,7 @@ namespace gbe
         tracker.addDependency(node, insn.dst(dstID));
 
       // write-after-write for predicate
-      if (insn.opcode == SEL_OP_CMP)
+      if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP)
         tracker.addDependency(node, getFlag(insn));
 
       // write-after-write for accumulators
@@ -467,6 +473,11 @@ namespace gbe
         tracker.addDependency(node, index);
       }
 
+      // write-after-write in scratch memory
+      if (insn.opcode == SEL_OP_SPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(node, index);
+      }
 
       // Consider barriers and wait are writing memory (local and global)
     if (insn.opcode == SEL_OP_BARRIER ||
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1a3af68..8e4cd8f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -410,13 +410,19 @@ namespace gbe
 
 #define ALU1(OP) \
   INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
+#define ALU1WithTemp(OP) \
+  INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
 #define ALU2(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
+#define ALU2WithTemp(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
+#define I64Shift(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
     ALU1(MOV)
-    ALU2(MOV_DF)
-    ALU2(LOAD_DF_IMM)
+    ALU1WithTemp(MOV_DF)
+    ALU1WithTemp(LOAD_DF_IMM)
     ALU1(LOAD_INT64_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
@@ -435,8 +441,8 @@ namespace gbe
     ALU2(RSL)
     ALU2(ASR)
     ALU2(ADD)
-    ALU3(I64ADD)
-    ALU3(I64SUB)
+    ALU2WithTemp(I64ADD)
+    ALU2WithTemp(I64SUB)
     ALU2(MUL)
     ALU1(FRC)
     ALU1(RNDD)
@@ -444,16 +450,28 @@ namespace gbe
     ALU2(MACH)
     ALU1(LZD)
     ALU3(MAD)
-    ALU3(MUL_HI)
+    ALU2WithTemp(MUL_HI)
     ALU1(FBH)
     ALU1(FBL)
-    ALU3(HADD)
-    ALU3(RHADD)
+    ALU2WithTemp(HADD)
+    ALU2WithTemp(RHADD)
     ALU2(UPSAMPLE_SHORT)
     ALU2(UPSAMPLE_INT)
+    ALU2(UPSAMPLE_LONG)
+    ALU1WithTemp(CONVI_TO_I64)
+    I64Shift(I64SHL)
+    I64Shift(I64SHR)
+    I64Shift(I64ASR)
 #undef ALU1
+#undef ALU1WithTemp
 #undef ALU2
+#undef ALU2WithTemp
 #undef ALU3
+#undef I64Shift
+    /*! Shift a 64-bit integer */
+    void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    /*! Compare 64-bit integer */
+    void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
     /*! Encode a barrier instruction */
     void BARRIER(GenRegister src);
     /*! Encode a barrier instruction */
@@ -479,7 +497,7 @@ namespace gbe
     /*! Read 64 bits float/int array */
     void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
     /*! Write 64 bits float/int array */
-    void WRITE64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -494,8 +512,12 @@ namespace gbe
     void MATH(Reg dst, uint32_t function, Reg src);
     /*! Encode unary instructions */
     void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
+    /*! Encode unary with temp reg instructions */
+    void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
     /*! Encode binary instructions */
     void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
+    /*! Encode binary with temp reg instructions */
+    void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
@@ -504,6 +526,8 @@ namespace gbe
     void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+    /*! Multiply 64-bit integers */
+    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
@@ -629,12 +653,15 @@ namespace gbe
 
     for (auto &block : blockList)
       for (auto &insn : block.insnList) {
+        // spill / unspill instructions should be skipped when doing register spilling
+        if(insn.opcode == SEL_OP_SPILL_REG || insn.opcode == SEL_OP_UNSPILL_REG) continue;
+
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
 
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister selReg = insn.src(srcID);
           const ir::Register reg = selReg.reg();
-          if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+          if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
             GBE_ASSERT(srcID < 5);
             SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
             unspill->state  = GenInstructionState(simdWidth);
@@ -653,7 +680,7 @@ namespace gbe
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const GenRegister selReg = insn.dst(dstID);
           const ir::Register reg = selReg.reg();
-          if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+          if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
             GBE_ASSERT(dstID < 5);
             SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
             spill->state  = GenInstructionState(simdWidth);
@@ -825,28 +852,29 @@ namespace gbe
   /* elemNum contains all the temporary register and the
      real destination registers.*/
   void Selection::Opaque::READ64(Reg addr,
-                                       Reg tempAddr,
-                                       const GenRegister *dst,
-                                       uint32_t elemNum,
-                                       uint32_t valueNum,
-                                       uint32_t bti)
+                                 Reg tempAddr,
+                                 const GenRegister *dst,
+                                 uint32_t elemNum,
+                                 uint32_t valueNum,
+                                 uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 2);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
+    /* The temporary addr register will be modified, so list it among the dst registers. */
+    insn->dst(0) = tempAddr;
     // Regular instruction to encode
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-      insn->dst(elemID) = dst[elemID];
+      insn->dst(elemID + 1) = dst[elemID];
     insn->src(0) = addr;
-    insn->src(1) = tempAddr;
     insn->extra.function = bti;
     insn->extra.elem = valueNum;
 
     // Only the temporary registers need contiguous allocation
     dstVector->regNum = elemNum - valueNum;
     dstVector->isSrc = 0;
-    dstVector->reg = &insn->dst(0);
+    dstVector->reg = &insn->dst(1);
 
     // Source cannot be scalar (yet)
     srcVector->regNum = 1;
@@ -883,24 +911,27 @@ namespace gbe
   /* elemNum contains all the temporary register and the
      real data registers.*/
   void Selection::Opaque::WRITE64(Reg addr,
-                                        const GenRegister *src,
-                                        uint32_t elemNum,
-                                        uint32_t valueNum,
-                                        uint32_t bti)
+                                  const GenRegister *src,
+                                  uint32_t srcNum,
+                                  const GenRegister *dst,
+                                  uint32_t dstNum,
+                                  uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, elemNum+1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
     SelectionVector *vector = this->appendVector();
 
     // Regular instruction to encode
     insn->src(0) = addr;
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-      insn->src(elemID+1) = src[elemID];
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->src(elemID + 1) = src[elemID];
+    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
     insn->extra.function = bti;
-    insn->extra.elem = valueNum;
+    insn->extra.elem = srcNum;
 
     // Only the addr + temporary registers need to be contiguous.
-    vector->regNum = (elemNum - valueNum) + 1;
-    vector->reg = &insn->src(0);
+    vector->regNum = dstNum;
+    vector->reg = &insn->dst(0);
     vector->isSrc = 1;
   }
 
@@ -977,12 +1008,28 @@ namespace gbe
     insn->extra.function = function;
   }
 
+  void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 6; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
     insn->src(0) = src;
   }
 
+  void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 2, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    insn->dst(1) = temp;
+  }
+
   void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 2);
     insn->dst(0) = dst;
@@ -990,6 +1037,14 @@ namespace gbe
     insn->src(1) = src1;
   }
 
+  void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 2, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->dst(1) = temp;
+  }
+
   void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 3);
     insn->dst(0) = dst;
@@ -998,6 +1053,24 @@ namespace gbe
     insn->src(2) = src2;
   }
 
+  void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i=0; i<3; i++)
+      insn->dst(i) = tmp[i];
+    insn->extra.function = conditional;
+  }
+
+  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 6; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
   // Boiler plate to initialize the selection library at c++ pre-main
   static SelectionLibrary *selLib = NULL;
   static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
@@ -1524,9 +1597,33 @@ namespace gbe
             sel.ADD(dst, src0, GenRegister::negate(src1));
           sel.pop();
           break;
-        case OP_SHL: sel.SHL(dst, src0, src1); break;
-        case OP_SHR: sel.SHR(dst, src0, src1); break;
-        case OP_ASR: sel.ASR(dst, src0, src1); break;
+        case OP_SHL:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64SHL(dst, src0, src1, tmp);
+          } else
+            sel.SHL(dst, src0, src1);
+          break;
+        case OP_SHR:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64SHR(dst, src0, src1, tmp);
+          } else
+            sel.SHR(dst, src0, src1);
+          break;
+        case OP_ASR:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64ASR(dst, src0, src1, tmp);
+          } else
+            sel.ASR(dst, src0, src1);
+          break;
         case OP_MUL_HI: {
             GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
             sel.MUL_HI(dst, src0, src1, temp);
@@ -1536,12 +1633,14 @@ namespace gbe
           if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
             return false;
-          }
-          else {
-            GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
+          } else if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64MUL(dst, src0, src1, tmp);
+          } else
             sel.MUL(dst, src0, src1);
-          }
-        break;
+          break;
         case OP_HADD: {
             GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
             sel.HADD(dst, src0, src1, temp);
@@ -1558,6 +1657,9 @@ namespace gbe
         case OP_UPSAMPLE_INT:
           sel.UPSAMPLE_INT(dst, src0, src1);
           break;
+        case OP_UPSAMPLE_LONG:
+          sel.UPSAMPLE_LONG(dst, src0, src1);
+          break;
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -1666,7 +1768,6 @@ namespace gbe
       sel.push();
         sel.curr.predicate = GEN_PREDICATE_NONE;
         sel.curr.execWidth = simdWidth;
-        sel.curr.physicalFlag = 0;
         sel.SEL_CMP(genCmp, tmp, src0, src1);
       sel.pop();
 
@@ -1861,15 +1962,32 @@ namespace gbe
       const Type type = insn.getType();
       const Immediate imm = insn.getImmediate();
       const GenRegister dst = sel.selReg(insn.getDst(0), type);
+      GenRegister flagReg;
 
       sel.push();
       if (sel.isScalarOrBool(insn.getDst(0)) == true) {
         sel.curr.execWidth = 1;
+        if(type == TYPE_BOOL) {
+          if(imm.data.b) {
+            if(sel.curr.predicate == GEN_PREDICATE_NONE)
+              flagReg = GenRegister::immuw(0xffff);
+            else {
+              if(sel.curr.physicalFlag)
+                flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
+              else
+                flagReg = sel.selReg(Register(sel.curr.flagIndex), TYPE_U16);
+            }
+          } else
+            flagReg = GenRegister::immuw(0x0);
+        }
         sel.curr.predicate = GEN_PREDICATE_NONE;
         sel.curr.noMask = 1;
       }
 
       switch (type) {
+        case TYPE_BOOL:
+          sel.MOV(dst, flagReg);
+        break;
         case TYPE_U32:
         case TYPE_S32:
         case TYPE_FLOAT:
@@ -1985,7 +2103,7 @@ namespace gbe
         dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
       for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
         dst[dstID] = sel.selReg(insn.getValue(valueID));
-      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti);
+      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
     }
 
     void emitByteGather(Selection::Opaque &sel,
@@ -2085,13 +2203,16 @@ namespace gbe
       addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
       // The first 16 DWORD register space is for temporary usage at encode stage.
       uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
-      GenRegister src[valueNum + tmpRegNum];
+      GenRegister src[valueNum];
+      GenRegister dst[tmpRegNum + 1];
+      /* dst 0 is for the temporary address register. */
+      dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
       for (srcID = 0; srcID < tmpRegNum; ++srcID)
-        src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+        dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
 
-      for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
-        src[srcID] = sel.selReg(insn.getValue(valueID));
-      sel.WRITE64(addr, src, valueNum + tmpRegNum, valueNum, bti);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        src[valueID] = sel.selReg(insn.getValue(valueID));
+      sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -2187,7 +2308,13 @@ namespace gbe
       sel.push();
         sel.curr.physicalFlag = 0;
         sel.curr.flagIndex = uint16_t(dst);
-        sel.CMP(genCmp, src0, src1);
+        if (type == TYPE_S64 || type == TYPE_U64) {
+          GenRegister tmp[3];
+          for(int i=0; i<3; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.I64CMP(genCmp, src0, src1, tmp);
+        } else
+          sel.CMP(genCmp, src0, src1);
       sel.pop();
       return true;
     }
@@ -2223,6 +2350,14 @@ namespace gbe
       } else if (dst.isdf()) {
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r));
+      } else if (dst.isint64()) {
+        switch(src.type) {
+          case GEN_TYPE_F:
+          case GEN_TYPE_DF:
+            NOT_IMPLEMENTED;
+          default:
+            sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+        }
       } else
         sel.MOV(dst, src);
       return true;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index eeca9af..32c7a05 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,7 +1,7 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
-DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
-DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
+DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
 DECL_SELECTION_IR(NOT, UnaryInstruction)
 DECL_SELECTION_IR(LZD, UnaryInstruction)
@@ -23,13 +23,18 @@ DECL_SELECTION_IR(SHL, BinaryInstruction)
 DECL_SELECTION_IR(RSR, BinaryInstruction)
 DECL_SELECTION_IR(RSL, BinaryInstruction)
 DECL_SELECTION_IR(ASR, BinaryInstruction)
+DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
+DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
+DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
 DECL_SELECTION_IR(ADD, BinaryInstruction)
-DECL_SELECTION_IR(I64ADD, TernaryInstruction)
-DECL_SELECTION_IR(I64SUB, TernaryInstruction)
+DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
 DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
 DECL_SELECTION_IR(CMP, CompareInstruction)
+DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
 DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
 DECL_SELECTION_IR(MAD, TernaryInstruction)
 DECL_SELECTION_IR(JMPI, JumpInstruction)
@@ -51,10 +56,12 @@ DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
 DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
-DECL_SELECTION_IR(MUL_HI, TernaryInstruction)
+DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
 DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
-DECL_SELECTION_IR(HADD, TernaryInstruction)
-DECL_SELECTION_IR(RHADD, TernaryInstruction)
+DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index ccbc0da..a765917 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -136,6 +136,7 @@ namespace gbe
       const uint32_t offset = GEN_REG_SIZE + curbeOffset + subOffset;
       RA.insert(std::make_pair(reg, offset));
       this->intervals[reg].minID = 0;
+      this->intervals[reg].maxID = 0;
     }
   }
 
@@ -456,7 +457,7 @@ namespace gbe
 
           // Compare instructions update the flags so we must copy it back to
           // the GRF
-          if (insn.opcode == SEL_OP_CMP) {
+          if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP) {
             SelectionInstruction *mov1 = selection.create(SEL_OP_MOV,1,1);
             mov1->state = mov0->state;
             mov1->dst(0) = mov0->src(0);
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 2cad4c0..ea1bc06 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -270,9 +270,16 @@ namespace gbe
       GenRegister r = *this;
       r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
       r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      r.vstride = GEN_VERTICAL_STRIDE_16;
       return r;
     }
 
+    INLINE bool is_signed_int(void) const {
+      if ((type == GEN_TYPE_B || type == GEN_TYPE_W || type == GEN_TYPE_D || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
     INLINE bool isdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
         return true;
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index b073682..35e3a2a 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -260,9 +260,7 @@ class builtinProto():
             # XXX FIXME now skip all double vector, as we don't
             # defined those scalar version's prototype.
             if ptype[0].find('double') != -1 or \
-               vtype[0].find('double') != -1 or \
-               ptype[0].find('long') != -1 or \
-               vtype[0].find('long') != -1 :
+               vtype[0].find('double') != -1:
                 return
 
             if (n == 0):
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 45095db..48e83b4 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -861,7 +861,8 @@ namespace ir {
         return false;
       if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
         return false;
-      CHECK_TYPE(this->type, allButBool);
+      // Immediates of all types are supported now, so disable this check
+      //CHECK_TYPE(this->type, allButBool);
       return true;
     }
 
@@ -1334,6 +1335,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(MUL_HI)
   DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
   DECL_EMIT_FUNCTION(UPSAMPLE_INT)
+  DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
   DECL_EMIT_FUNCTION(DIV)
   DECL_EMIT_FUNCTION(REM)
   DECL_EMIT_FUNCTION(SHL)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 48e6963..40a3d40 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -523,6 +523,8 @@ namespace ir {
   Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
   /*! upsample_int.type dst src */
   Instruction UPSAMPLE_INT(Type type, Register dst, Register src0, Register src1);
+  /*! upsample_long.type dst src */
+  Instruction UPSAMPLE_LONG(Type type, Register dst, Register src0, Register src1);
   /*! fbh.type dst src */
   Instruction FBH(Type type, Register dst, Register src);
   /*! fbl.type dst src */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b9f0e73..c15e912 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -79,3 +79,4 @@ DECL_INSN(HADD, BinaryInstruction)
 DECL_INSN(RHADD, BinaryInstruction)
 DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 18448cf..12d809d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1827,6 +1827,7 @@ namespace gbe
       case GEN_OCL_MUL_HI_UINT:
       case GEN_OCL_UPSAMPLE_SHORT:
       case GEN_OCL_UPSAMPLE_INT:
+      case GEN_OCL_UPSAMPLE_LONG:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -2223,6 +2224,14 @@ namespace gbe
             ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
             break;
           }
+          case GEN_OCL_UPSAMPLE_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_LONG(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_SADD_SAT_CHAR:
           case GEN_OCL_SADD_SAT_SHORT:
           case GEN_OCL_SADD_SAT_INT:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8e940bc..b712860 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -135,3 +135,4 @@ DECL_LLVM_GEN_FUNCTION(HADD, __gen_ocl_hadd)
 DECL_LLVM_GEN_FUNCTION(RHADD, __gen_ocl_rhadd)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 84f15ca..8d4220c 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -123,6 +123,7 @@ typedef size_t __event_t;
 #define FLT_MIN_10_EXP -37
 #define FLT_MIN_EXP -125
 #define FLT_RADIX 2
+#define FLT_ONE 1.0000000000e+00         /* 0x3F800000 */
 #define FLT_MAX 0x1.fffffep127f
 #define FLT_MIN 0x1.0p-126f
 #define FLT_EPSILON 0x1.0p-23f
@@ -310,6 +311,14 @@ INLINE_OVERLOADABLE uint clz(uint x) {
   return __gen_ocl_fbh(x);
 }
 
+INLINE_OVERLOADABLE long clz(long x) {
+  return 0;
+}
+
+INLINE_OVERLOADABLE ulong clz(ulong x) {
+  return 0;
+}
+
 OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
 OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
 INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
@@ -318,6 +327,12 @@ INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
 INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
 INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
 INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE long mul_hi(long x, long y) {
+  return 0;
+}
+INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
+  return 0;
+}
 
 #define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
 DEF(char)
@@ -326,6 +341,8 @@ DEF(short)
 DEF(ushort)
 DEF(int)
 DEF(uint)
+DEF(long)
+DEF(ulong)
 #undef DEF
 
 INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
@@ -366,12 +383,27 @@ INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
   return x;
 }
 
-/* XXX not implemented. */
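+/* Saturating multiply-add: the product and sum are evaluated in a wider type and
+ * clamped to the destination range, e.g. mad_sat((int)0x10000, (int)0x10000, 5)
+ * yields 0x7FFFFFFF and mad_sat(0xFFFFFFFFu, 2u, 1u) yields 0xFFFFFFFFu. */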
 INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
-  return 0;
+  long x = (long)a * (long)b + (long)c;
+  if (x > 0x7FFFFFFF)
+    x = 0x7FFFFFFF;
+  else if (x < -0x7FFFFFFF-1)
+    x = -0x7FFFFFFF-1;
+  return (int)x;
 }
 
 INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+  ulong x = (ulong)a * (ulong)b + (ulong)c;
+  if (x > 0xFFFFFFFFu)
+    x = 0xFFFFFFFFu;
+  return (uint)x;
+}
+
+INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
+  return 0;
+}
+
+INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
   return 0;
 }
 
@@ -389,13 +421,26 @@ DEF(ushort, 15)
 DEF(int, 31)
 DEF(uint, 31)
 #undef DEF
+INLINE_OVERLOADABLE long rotate(long x, long y) {
+  return 0;
+}
+INLINE_OVERLOADABLE ulong rotate(ulong x, ulong y) {
+  return 0;
+}
 
 OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
 OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
 INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
 INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
 INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
 INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE long upsample(int hi, uint lo) {
+  return __gen_ocl_upsample((long)hi, (long)lo);
+}
+INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
+  return __gen_ocl_upsample((long)hi, (long)lo);
+}
 
 PURE CONST uint __gen_ocl_hadd(uint x, uint y);
 PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
@@ -411,6 +456,18 @@ INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 &
 INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
 INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
 INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+INLINE_OVERLOADABLE long hadd(long x, long y) {
+  return 0;
+}
+INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
+  return 0;
+}
+INLINE_OVERLOADABLE long rhadd(long x, long y) {
+  return 0;
+}
+INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
+  return 0;
+}
 
 int __gen_ocl_abs(int x);
 #define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
@@ -418,11 +475,13 @@ DEC(int)
 DEC(short)
 DEC(char)
 #undef DEC
+INLINE_OVERLOADABLE ulong abs(long x) { return x < 0 ? -x : x; }
 /* For unsigned types, do nothing. */
 #define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
 DEC(uint)
 DEC(ushort)
 DEC(uchar)
+DEC(ulong)
 #undef DEC
 
 /* Char and short type abs diff */
@@ -448,6 +507,13 @@ INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
     return (abs(x) + abs(y));
 }
 
+INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
+  return 0;
+}
+INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
+  return 0;
+}
+
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
@@ -561,7 +627,36 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
   float y = native_exp(-2 * x);
   return (1 - y) / (1 + y);
 }
+
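+/* Lets asin() inspect the raw IEEE-754 encoding of a float: the union below and
+ * GET_FLOAT_WORD reinterpret the value's bit pattern as an int so that the
+ * magnitude tests can be done with integer comparisons. */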
+typedef union
+{
+  float value;
+  int word;
+} ieee_float_shape_type;
+
+#ifndef GET_FLOAT_WORD
+#define GET_FLOAT_WORD(i,d)         \
+do {                                \
+  ieee_float_shape_type gf_u;       \
+  gf_u.value = (d);                 \
+  (i) = gf_u.word;                  \
+} while (0)
+#endif
+
 INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  int hx, ix;
+  GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix == 0x3f800000) {
+    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
+  }
+  if(ix > 0x3f800000) {            /* |x|>= 1 */
+    return (x-x) / (x-x);          /* asin(|x|>1) is NaN */
+  }
+  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
+    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x != 0 */
+  }
+  /* 1 > |x| >= 2**-27 */
   float sum = x, c = x, m = 1.0;
   int n = 1;
   do
@@ -733,6 +828,8 @@ DECL_MIN_MAX_CLAMP(char)
 DECL_MIN_MAX_CLAMP(uint)
 DECL_MIN_MAX_CLAMP(unsigned short)
 DECL_MIN_MAX_CLAMP(unsigned char)
+DECL_MIN_MAX_CLAMP(long)
+DECL_MIN_MAX_CLAMP(ulong)
 #undef DECL_MIN_MAX_CLAMP
 
 #define BODY \
@@ -1196,6 +1293,72 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
 }
 
 /////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
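+/* The copy is distributed over the whole work group: every work item starts at its
+ * linearized local id and strides by the work-group size, the trailing 'if' picks up
+ * the remainder when num is not a multiple of the group size, and the returned
+ * event_t is a dummy value (wait_group_events() simply issues a barrier). */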
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+  uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+  uint count = num / size;  \
+  uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1);  \
+  offset = offset * get_local_size(0) + get_local_id(0); \
+  for(uint i=0; i<count; i+=1) { \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+    offset += size;                                 \
+  } \
+  if(offset < num) \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+  return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst,  const global TYPE *src, \
+										    size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst,  const local TYPE *src, \
+										    size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst,  const global TYPE *src, \
+										            size_t num, size_t src_stride, event_t event) { \
+  BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst,  const local TYPE *src, \
+										            size_t num, size_t dst_stride, event_t event) { \
+  BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+INLINE void wait_group_events (int num_events, event_t *event_list) {
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
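+/* prefetch() is accepted for every supported type but is currently a no-op. */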
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
 // Atomic functions
 /////////////////////////////////////////////////////////////////////////////
 OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
diff --git a/kernels/builtin_acos_asin.cl b/kernels/builtin_acos_asin.cl
new file mode 100644
index 0000000..bba2d21
--- /dev/null
+++ b/kernels/builtin_acos_asin.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_acos_asin(__global float *dst, __global float *src, __global int *max_func) {
+  int i = get_global_id(0);
+  float x = src[i];
+
+  dst[i * (*max_func) + 0] = acos(x);
+  dst[i * (*max_func) + 1] = acosh(x);
+  dst[i * (*max_func) + 2] = asin(x);
+  dst[i * (*max_func) + 3] = asinh(x);
+  dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
new file mode 100644
index 0000000..a2432a4
--- /dev/null
+++ b/kernels/compiler_async_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_copy(__global int2 *dst, __global int2 *src, __local int2 *localBuffer, int copiesPerWorkItem)
+{
+  event_t event;
+  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+  int i;
+  event = async_work_group_copy((__local int2*)localBuffer, (__global const int2*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 );
+  wait_group_events( 1, &event );
+
+  for(i=0; i<copiesPerWorkItem; i++)
+    localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (int2)(3, 3);
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  event = async_work_group_copy((__global int2*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const int2*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 );
+  wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
new file mode 100644
index 0000000..a926588
--- /dev/null
+++ b/kernels/compiler_async_stride_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride)
+{
+  event_t event;
+  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+  int i;
+  event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  wait_group_events( 1, &event );
+
+  for(i=0; i<copiesPerWorkItem; i++)
+    localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_event.cl b/kernels/compiler_event.cl
new file mode 100644
index 0000000..a901b05
--- /dev/null
+++ b/kernels/compiler_event.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_event(__global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] += value;
+}
diff --git a/kernels/compiler_load_bool_imm.cl b/kernels/compiler_load_bool_imm.cl
new file mode 100644
index 0000000..fda49b9
--- /dev/null
+++ b/kernels/compiler_load_bool_imm.cl
@@ -0,0 +1,12 @@
+__kernel void
+compiler_load_bool_imm(__global int *dst, __local int *localBuffer, int copiesPerWorkItem )
+{
+  int i;
+  for(i=0; i<copiesPerWorkItem; i++)
+    localBuffer[get_local_id(0)*copiesPerWorkItem+i] = copiesPerWorkItem;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=0; i<copiesPerWorkItem; i++)
+    dst[get_global_id(0)*copiesPerWorkItem + i] = localBuffer[get_local_id(0)*copiesPerWorkItem+i];
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_long_asr.cl b/kernels/compiler_long_asr.cl
new file mode 100644
index 0000000..901630b
--- /dev/null
+++ b/kernels/compiler_long_asr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_asr(global long *src, global long *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] >> i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_cmp.cl b/kernels/compiler_long_cmp.cl
new file mode 100644
index 0000000..90dfb60
--- /dev/null
+++ b/kernels/compiler_long_cmp.cl
@@ -0,0 +1,29 @@
+kernel void compiler_long_cmp_l(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] < src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_le(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] <= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_g(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] > src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_ge(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] >= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_eq(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] == src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_neq(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] != src2[i]) ? 3 : 4;
+}
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
new file mode 100644
index 0000000..f22914f
--- /dev/null
+++ b/kernels/compiler_long_convert.cl
@@ -0,0 +1,7 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_long_convert(global char *src1, global short *src2, global int *src3, global long *dst1, global long *dst2, global long *dst3) {
+  int i = get_global_id(0);
+  dst1[i] = src1[i];
+  dst2[i] = src2[i];
+  dst3[i] = src3[i];
+}
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  if(i < 3)
+    dst[i] = src1[i] + src2[i];
+  else
+    dst[i] = src1[i] * src2[i];
+}
diff --git a/kernels/compiler_long_shl.cl b/kernels/compiler_long_shl.cl
new file mode 100644
index 0000000..3786b77
--- /dev/null
+++ b/kernels/compiler_long_shl.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shl(global long *src, global long *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] << i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_shr.cl b/kernels/compiler_long_shr.cl
new file mode 100644
index 0000000..d4e859c
--- /dev/null
+++ b/kernels/compiler_long_shr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shr(global ulong *src, global ulong *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] >> i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
new file mode 100644
index 0000000..16f806b
--- /dev/null
+++ b/kernels/compiler_upsample_long.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_int(global int *src1, global uint *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 320194e..964f5e7 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -25,11 +25,11 @@ __kernel void test_##type ##n(__global type *pin, \
   TEST_TYPE(int,n)   \
   TEST_TYPE(uint,n)  \
   TEST_TYPE(float,n) \
-  TEST_TYPE(double,n)
+  TEST_TYPE(double,n)\
+  TEST_TYPE(long,n)  \
+  TEST_TYPE(ulong,n)
 
 #if 0
-  TEST_TYPE(long,n)
-  TEST_TYPE(ulong,n)
   TEST_TYPE(half,n)
 #endif
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cea78c0..58d23cb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set(OPENCL_SRC
     cl_program.c
     cl_sampler.c
     cl_event.c
+    cl_enqueue.c
     cl_image.c
     cl_mem.c
     cl_platform_id.c
diff --git a/src/cl_api.c b/src/cl_api.c
index 146c010..4f048ee 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -18,9 +18,11 @@
  */
 
 #include "cl_platform_id.h"
-#include "cl_device_id.h" 
+#include "cl_device_id.h"
 #include "cl_context.h"
 #include "cl_command_queue.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
 #include "cl_program.h"
 #include "cl_kernel.h"
 #include "cl_mem.h"
@@ -36,6 +38,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
+#include <unistd.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
@@ -59,6 +62,23 @@ typedef intptr_t cl_device_partition_property;
 	  return RET; \
 	} while(0)
 
+inline cl_int
+handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
+              cl_event* event, enqueue_data* data, cl_command_type type)
+{
+  cl_int status = cl_event_wait_events(num, wait_list);
+  cl_event e;
+  if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
+    e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+    if(event != NULL)
+      *event = e;
+    if(status == CL_ENQUEUE_EXECUTE_DEFER) {
+      cl_event_new_enqueue_callback(e, data, num, wait_list);
+    }
+  }
+  return status;
+}
+
 static cl_int
 cl_check_device_type(cl_device_type device_type)
 {
@@ -987,8 +1007,20 @@ cl_int
 clWaitForEvents(cl_uint          num_events,
                 const cl_event * event_list)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  cl_context ctx = NULL;
+
+  if(num_events > 0 && event_list)
+    ctx = event_list[0]->ctx;
+
+  TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
+
+  while(cl_event_wait_events(num_events, event_list) == CL_ENQUEUE_EXECUTE_DEFER) {
+    usleep(8000);       /* sleep 8 ms and let the other threads make progress */
+  }
+
+error:
+  return err;
 }
 
 cl_int
@@ -998,38 +1030,94 @@ clGetEventInfo(cl_event      event,
                void *        param_value,
                size_t *      param_value_size_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  CHECK_EVENT(event);
+
+  if (param_name == CL_EVENT_COMMAND_QUEUE) {
+    if(event->queue == NULL) {
+      param_value_size_ret = 0;
+      param_value = NULL;
+      return err;
+    }
+    FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+    FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+    cl_event_update_status(event);
+    FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+    cl_uint ref = event->ref_n;
+    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+
 }
 
 cl_event
 clCreateUserEvent(cl_context context,
                   cl_int *   errcode_ret)
 {
-  NOT_IMPLEMENTED;
-  return NULL;
+  cl_int err = CL_SUCCESS;
+  cl_event event = NULL;
+  CHECK_CONTEXT(context);
+
+  TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
+
+error:
+  if(errcode_ret)
+    *errcode_ret = err;
+  return event;
 }
 
 cl_int
 clRetainEvent(cl_event  event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  cl_event_add_ref(event);
+
+error:
+  return err;
 }
 
 cl_int
 clReleaseEvent(cl_event  event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  cl_event_delete(event);
+
+error:
+  return err;
 }
 
 cl_int
 clSetUserEventStatus(cl_event    event,
                      cl_int      execution_status)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  if(execution_status > CL_COMPLETE) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if(event->status != CL_SUBMITTED) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  cl_event_set_status(event, execution_status);
+error:
+  return err;
 }
 
 cl_int
@@ -1038,8 +1126,20 @@ clSetEventCallback(cl_event     event,
                    void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
                    void *       user_data)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  if((pfn_notify == NULL) ||
+    (command_exec_callback_type > CL_SUBMITTED) ||
+    (command_exec_callback_type < CL_COMPLETE)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+
+error:
+  return err;
+
 }
 
 cl_int
@@ -1087,8 +1187,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
                     cl_event *       event)
 {
   cl_int err = CL_SUCCESS;
-  void* src_ptr;
-
+  enqueue_data *data, defer_enqueue_data = { 0 };
   CHECK_QUEUE(command_queue);
   CHECK_MEM(buffer);
   if (command_queue->ctx != buffer->ctx) {
@@ -1109,14 +1208,20 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
      goto error;
   }
 
-  if (!(src_ptr = cl_mem_map_auto(buffer))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
-  memcpy(ptr, (char*)src_ptr + offset, size);
+  data = &defer_enqueue_data;
+  data->type    = EnqueueReadBuffer;
+  data->mem_obj = buffer;
+  data->ptr     = ptr;
+  data->offset  = offset;
+  data->size    = size;
 
-  err = cl_mem_unmap_auto(buffer);
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
 
 error:
   return err;
@@ -1154,7 +1259,7 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
                      cl_event *          event)
 {
   cl_int err = CL_SUCCESS;
-  void* dst_ptr;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_MEM(buffer);
@@ -1176,16 +1281,22 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
     goto error;
   }
 
-  if (!(dst_ptr = cl_mem_map_auto(buffer))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
-  memcpy((char*)dst_ptr + offset, ptr, size);
+  data = &no_wait_data;
+  data->type      = EnqueueWriteBuffer;
+  data->mem_obj   = buffer;
+  data->const_ptr = ptr;
+  data->offset    = offset;
+  data->size      = size;
 
-  err = cl_mem_unmap_auto(buffer);
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
 
-error:
+ error:
   return err;
 }
 
@@ -1257,7 +1368,7 @@ clEnqueueReadImage(cl_command_queue      command_queue,
                    cl_event *            event)
 {
   cl_int err = CL_SUCCESS;
-  void* src_ptr;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(image);
@@ -1304,35 +1415,22 @@ clEnqueueReadImage(cl_command_queue      command_queue,
      goto error;
   }
 
-  if (!(src_ptr = cl_mem_map_auto(image))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  src_ptr = (char*)src_ptr + offset;
-
-  if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
-      (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
-  {
-    memcpy(ptr, src_ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueReadImage;
+  data->mem_obj     = image;
+  data->ptr         = ptr;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = row_pitch;
+  data->slice_pitch = slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
-  else {
-    cl_uint y, z;
-    for (z = 0; z < region[2]; z++) {
-      const char* src = src_ptr;
-      char* dst = ptr;
-      for (y = 0; y < region[1]; y++) {
-	memcpy(dst, src, image->bpp*region[0]);
-	src += image->row_pitch;
-	dst += row_pitch;
-      }
-      src_ptr = (char*)src_ptr + image->slice_pitch;
-      ptr = (char*)ptr + slice_pitch;
-    }
-  }
-
-  err = cl_mem_unmap_auto(image);
 
 error:
   return err;
@@ -1352,7 +1450,7 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
                     cl_event *           event)
 {
   cl_int err = CL_SUCCESS;
-  void* dst_ptr;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(image);
@@ -1399,35 +1497,22 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
     goto error;
   }
 
-  if (!(dst_ptr = cl_mem_map_auto(image))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  dst_ptr = (char*)dst_ptr + offset;
-
-  if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
-      (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
-  {
-    memcpy(dst_ptr, ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueWriteImage;
+  data->mem_obj     = image;
+  data->const_ptr   = ptr;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = row_pitch;
+  data->slice_pitch = slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
-  else {
-    cl_uint y, z;
-    for (z = 0; z < region[2]; z++) {
-      const char* src = ptr;
-      char* dst = dst_ptr;
-      for (y = 0; y < region[1]; y++) {
-	memcpy(dst, src, image->bpp*region[0]);
-	src += row_pitch;
-	dst += image->row_pitch;
-      }
-      ptr = (char*)ptr + slice_pitch;
-      dst_ptr = (char*)dst_ptr + image->slice_pitch;
-    }
-  }
-
-  err = cl_mem_unmap_auto(image);
 
 error:
   return err;
@@ -1490,10 +1575,8 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
                    cl_event *        event,
                    cl_int *          errcode_ret)
 {
-  void *ptr = NULL;
-  void *mem_ptr = NULL;
   cl_int err = CL_SUCCESS;
-  int slot = -1;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_MEM(buffer);
@@ -1519,73 +1602,25 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     goto error;
   }
 
-  if (!(ptr = cl_mem_map_auto(buffer))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
 
-  ptr = (char*)ptr + offset;
+  data = &no_wait_data;
+  data->type        = EnqueueMapBuffer;
+  data->mem_obj     = buffer;
+  data->offset      = offset;
+  data->size        = size;
+  data->map_flags   = map_flags;
 
-  if(buffer->flags & CL_MEM_USE_HOST_PTR) {
-    assert(buffer->host_ptr);
-    memcpy(buffer->host_ptr + offset, ptr, size);
-    mem_ptr = buffer->host_ptr + offset;
-  } else {
-    mem_ptr = ptr;
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
-  /* Record the mapped address. */
-  if (!buffer->mapped_ptr_sz) {
-    buffer->mapped_ptr_sz = 16;
-    buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
-    if (!buffer->mapped_ptr) {
-      cl_mem_unmap_auto (buffer);
-      err = CL_OUT_OF_HOST_MEMORY;
-      ptr = NULL;
-      goto error;
-    }
-
-    memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-    slot = 0;
-  } else {
-    int i = 0;
-    for (; i < buffer->mapped_ptr_sz; i++) {
-      if (buffer->mapped_ptr[i].ptr == NULL) {
-        slot = i;
-        break;
-      }
-    }
-
-    if (i == buffer->mapped_ptr_sz) {
-      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
-          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
-      if (!new_ptr) {
-        cl_mem_unmap_auto (buffer);
-        err = CL_OUT_OF_HOST_MEMORY;
-        ptr = NULL;
-        goto error;
-      }
-      memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      memcpy(new_ptr, buffer->mapped_ptr,
-             buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
-      slot = buffer->mapped_ptr_sz;
-      buffer->mapped_ptr_sz *= 2;
-      free(buffer->mapped_ptr);
-      buffer->mapped_ptr = new_ptr;
-    }
-  }
-
-  assert(slot != -1);
-  buffer->mapped_ptr[slot].ptr = mem_ptr;
-  buffer->mapped_ptr[slot].v_ptr = ptr;
-  buffer->mapped_ptr[slot].size = size;
-  buffer->map_ref++;
-
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  return mem_ptr;
+  return data->ptr;
 }
 
 void *
@@ -1602,8 +1637,8 @@ clEnqueueMapImage(cl_command_queue   command_queue,
                   cl_event *         event,
                   cl_int *           errcode_ret)
 {
-  void *ptr = NULL;
   cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_IMAGE(image);
@@ -1638,18 +1673,27 @@ clEnqueueMapImage(cl_command_queue   command_queue,
     goto error;
   }
 
-  if (!(ptr = cl_mem_map_auto(image))) {
-    err = CL_MAP_FAILURE;
-    goto error;
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueMapImage;
+  data->mem_obj     = image;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = *image_row_pitch;
+  data->slice_pitch = *image_slice_pitch;
+  data->map_flags   = map_flags;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
-  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
-  ptr = (char*)ptr + offset;
-
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  return ptr;
+  return data->ptr; //TODO: map and unmap first
 }
 
 cl_int
@@ -1661,9 +1705,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
                         cl_event *        event)
 {
   cl_int err = CL_SUCCESS;
-  int i;
-  size_t mapped_size = 0;
-  void * v_ptr = NULL;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_MEM(memobj);
@@ -1672,54 +1714,17 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
     goto error;
   }
 
-  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
-  INVALID_VALUE_IF(!mapped_ptr);
-  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
-    if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
-      memobj->mapped_ptr[i].ptr = NULL;
-      mapped_size = memobj->mapped_ptr[i].size;
-      v_ptr = memobj->mapped_ptr[i].v_ptr;
-      memobj->mapped_ptr[i].size = 0;
-      memobj->mapped_ptr[i].v_ptr = NULL;
-      memobj->map_ref--;
-      break;
-    }
-  }
-  /* can not find a mapped address? */
-  INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
-
-  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-    assert(mapped_ptr >= memobj->host_ptr &&
-      mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
-    /* Sync the data. */
-    memcpy(v_ptr, mapped_ptr, mapped_size);
-  } else {
-    assert(v_ptr == mapped_ptr);
-  }
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
 
-  cl_mem_unmap_auto(memobj);
+  data = &no_wait_data;
+  data->type        = EnqueueUnmapMemObject;
+  data->mem_obj     = memobj;
+  data->ptr         = mapped_ptr;
 
-  /* shrink the mapped slot. */
-  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
-    int j = 0;
-    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
-	sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
-    if (!new_ptr) {
-      /* Just do nothing. */
-      goto error;
-    }
-    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
-
-    for (i = 0; i < memobj->mapped_ptr_sz; i++) {
-      if (memobj->mapped_ptr[i].ptr) {
-        new_ptr[j] = memobj->mapped_ptr[i];
-        j++;
-        assert(j < memobj->mapped_ptr_sz/2);
-      }
-    }
-    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
-    free(memobj->mapped_ptr);
-    memobj->mapped_ptr = new_ptr;
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
 error:
@@ -1742,6 +1747,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   size_t fixed_local_sz[] = {1,1,1};
   cl_int err = CL_SUCCESS;
   cl_uint i;
+  enqueue_data *data, no_wait_data = { 0 };
 
   CHECK_QUEUE(command_queue);
   CHECK_KERNEL(kernel);
@@ -1774,8 +1780,8 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     }
 
   /* Local sizes must be non-null and divide global sizes */
-  if (local_work_size != NULL) 
-    for (i = 0; i < work_dim; ++i) 
+  if (local_work_size != NULL)
+    for (i = 0; i < work_dim; ++i)
       if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
         err = CL_INVALID_WORK_GROUP_SIZE;
         goto error;
@@ -1789,9 +1795,9 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   }
 
   /* XXX No event right now */
-  FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
-  FATAL_IF(event_wait_list != NULL, "Events are not supported");
-  FATAL_IF(event != NULL, "Events are not supported");
+  //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
+  //FATAL_IF(event_wait_list != NULL, "Events are not supported");
+  //FATAL_IF(event != NULL, "Events are not supported");
 
   if (local_work_size != NULL)
     for (i = 0; i < work_dim; ++i)
@@ -1810,6 +1816,17 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
                                   fixed_global_off,
                                   fixed_global_sz,
                                   fixed_local_sz);
+  if(err != CL_SUCCESS)
+    goto error;
+
+  data = &no_wait_data;
+  data->type = EnqueueNDRangeKernel;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
 
 error:
   return err;
@@ -1855,8 +1872,12 @@ clEnqueueWaitForEvents(cl_command_queue  command_queue,
                        cl_uint           num_events,
                        const cl_event *  event_list)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+  err = clWaitForEvents(num_events, event_list);
+
+error:
+  return err;
 }
 
 cl_int
@@ -1864,6 +1885,7 @@ clEnqueueBarrier(cl_command_queue  command_queue)
 {
   NOT_IMPLEMENTED;
   return 0;
+  //return clFinish(command_queue);
 }
 
 #define EXTFUNC(x)                      \
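
As a minimal host-side sketch of the deferred execution that the event support above enables (illustration only, not part of the patch; ctx, queue, buf, host_buf and size are assumed to already exist, and error handling is omitted): a transfer enqueued behind an incomplete user event is recorded and only executed once that event is completed.

    cl_event user = clCreateUserEvent(ctx, NULL);      /* starts in the CL_SUBMITTED state */
    cl_event done;
    /* The wait list holds an incomplete event, so the read is deferred. */
    clEnqueueReadBuffer(queue, buf, CL_FALSE, 0, size, host_buf, 1, &user, &done);
    clSetUserEventStatus(user, CL_COMPLETE);           /* triggers the deferred copy */
    clWaitForEvents(1, &done);
    clReleaseEvent(user);
    clReleaseEvent(done);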
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index b296dd7..e82f75c 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -413,6 +413,13 @@ error:
 }
 
 LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+  cl_gpgpu_flush(queue->gpgpu);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
   cl_gpgpu_sync(queue->gpgpu);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 4f6f987..135d659 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -65,6 +65,9 @@ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
 /* Fulsim will dump this buffer (mostly to check its consistency) */
 cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
 
+/* Flush for the command queue */
+extern cl_int cl_command_queue_flush(cl_command_queue);
+
 /* Wait for the completion of the command queue */
 extern cl_int cl_command_queue_finish(cl_command_queue);
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index e58433f..1d415d4 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -266,8 +266,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
-  cl_gpgpu_flush(gpgpu);
-
 error:
   return err;
 }
diff --git a/src/cl_context.h b/src/cl_context.h
index 80bf777..718d589 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -62,10 +62,12 @@ struct _cl_context {
   cl_program programs;              /* All programs currently allocated */
   cl_mem buffers;                   /* All memory object currently allocated */
   cl_sampler samplers;              /* All sampler object currently allocated */
+  cl_event   events;                /* All event object currently allocated */
   pthread_mutex_t queue_lock;       /* To allocate and deallocate queues */
   pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
   pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
+  pthread_mutex_t event_lock;       /* To allocate and deallocate events */
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
   cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 673985d..1a0ec38 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -46,6 +46,9 @@ typedef struct _cl_driver *cl_driver;
 /* Encapsulates the gpgpu stream of commands */
 typedef struct _cl_gpgpu *cl_gpgpu;
 
+/* Encapsulates an event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
 typedef struct _cl_context_prop *cl_context_prop;
 typedef struct _cl_sampler *cl_sampler;
 
@@ -86,6 +89,13 @@ typedef enum cl_cache_control {
   cc_llc_l3   = 0x3
 } cl_cache_control;
 
+typedef enum gpu_command_status {
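+  /* Values match the OpenCL execution states:
+   * CL_QUEUED = 3, CL_SUBMITTED = 2, CL_RUNNING = 1, CL_COMPLETE = 0. */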
+  command_queued    = 3,
+  command_submitted = 2,
+  command_running   = 1,
+  command_complete  = 0
+} gpu_command_status;
+
 /* Use this structure to bind kernels in the gpgpu state */
 typedef struct cl_gpgpu_kernel {
   const char *name;        /* kernel name and bo name */
@@ -179,6 +189,27 @@ extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
 typedef void (cl_gpgpu_flush_cb)(cl_gpgpu);
 extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
 
+/* Create a new event attached to a gpgpu command stream */
+typedef cl_gpgpu_event (cl_gpgpu_event_new_cb)(cl_gpgpu);
+extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
+
+/* Update (and optionally wait for) the execution status of an event */
+typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
+extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
+
+/* Mark an event as pending on the current batch buffer */
+typedef void (cl_gpgpu_event_pending_cb)(cl_gpgpu, cl_gpgpu_event);
+extern cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending;
+
+/* Resume (flush) the batch buffer a pending event is attached to */
+typedef void (cl_gpgpu_event_resume_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume;
+
+/* Delete a gpgpu event */
+typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
+
+
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 9aa926e..e7412de 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -63,4 +63,9 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
+LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
+LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
+LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
+LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
new file mode 100644
index 0000000..a112cc4
--- /dev/null
+++ b/src/cl_enqueue.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+#include "cl_enqueue.h"
+#include "cl_image.h"
+#include "cl_driver.h"
+#include "cl_utils.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+
+cl_int cl_enqueue_read_buffer(enqueue_data* data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+
+  if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy(data->ptr, (char*)src_ptr + data->offset, data->size);
+
+  err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_write_buffer(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy((char*)dst_ptr + data->offset, data->const_ptr, data->size);
+
+  err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+
+  cl_mem image = data->mem_obj;
+  const size_t* origin = data->origin;
+  const size_t* region = data->region;
+
+  if (!(src_ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  src_ptr = (char*)src_ptr + offset;
+
+  if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+  {
+    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = src_ptr;
+      char* dst = data->ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, image->bpp*region[0]);
+        src += image->row_pitch;
+        dst += data->row_pitch;
+      }
+      src_ptr = (char*)src_ptr + image->slice_pitch;
+      data->ptr = (char*)data->ptr + data->slice_pitch;
+    }
+  }
+
+ err = cl_mem_unmap_auto(image);
+
+error:
+  return err;
+
+}
+
+cl_int cl_enqueue_write_image(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  cl_mem image = data->mem_obj;
+  const size_t *origin = data->origin;
+  const size_t *region = data->region;
+
+  if (!(dst_ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  dst_ptr = (char*)dst_ptr + offset;
+
+  if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+  {
+    memcpy(dst_ptr, data->ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = data->const_ptr;
+      char* dst = dst_ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, image->bpp*region[0]);
+        src += data->row_pitch;
+        dst += image->row_pitch;
+      }
+      data->ptr = (char*)data->ptr + data->slice_pitch;
+      dst_ptr = (char*)dst_ptr + image->slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(image);
+
+error:
+  return err;
+
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
+
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+  void *mem_ptr = NULL;
+  cl_int slot = -1;
+  cl_mem buffer = data->mem_obj;
+
+  if (!(ptr = cl_mem_map_auto(buffer))) {
+    err = CL_MAP_FAILURE;
+  }
+
+  ptr = (char*)ptr + data->offset;
+
+  if(buffer->flags & CL_MEM_USE_HOST_PTR) {
+    assert(buffer->host_ptr);
+    memcpy(buffer->host_ptr + data->offset, ptr, data->size);
+    mem_ptr = buffer->host_ptr + data->offset;
+  } else {
+    mem_ptr = ptr;
+  }
+
+  /* Record the mapped address. */
+  if (!buffer->mapped_ptr_sz) {
+    buffer->mapped_ptr_sz = 16;
+    buffer->mapped_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz);
+    if (!buffer->mapped_ptr) {
+      cl_mem_unmap_auto (buffer);
+      err = CL_OUT_OF_HOST_MEMORY;
+      ptr = NULL;
+      goto error;
+    }
+
+    memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+    slot = 0;
+  } else {
+    int i = 0;
+    for (; i < buffer->mapped_ptr_sz; i++) {
+      if (buffer->mapped_ptr[i].ptr == NULL) {
+        slot = i;
+        break;
+      }
+    }
+
+    if (i == buffer->mapped_ptr_sz) {
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
+      if (!new_ptr) {
+        cl_mem_unmap_auto (buffer);
+        err = CL_OUT_OF_HOST_MEMORY;
+        ptr = NULL;
+        goto error;
+      }
+      memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      memcpy(new_ptr, buffer->mapped_ptr,
+             buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      slot = buffer->mapped_ptr_sz;
+      buffer->mapped_ptr_sz *= 2;
+      free(buffer->mapped_ptr);
+      buffer->mapped_ptr = new_ptr;
+    }
+  }
+
+  assert(slot != -1);
+  buffer->mapped_ptr[slot].ptr = mem_ptr;
+  buffer->mapped_ptr[slot].v_ptr = ptr;
+  buffer->mapped_ptr[slot].size = data->size;
+  buffer->map_ref++;
+
+  data->ptr = mem_ptr;
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_map_image(enqueue_data *data)
+{
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+
+  cl_mem image = data->mem_obj;
+  const size_t *origin = data->origin;
+
+  if (!(ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  data->ptr = (char*)ptr + offset;
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  int i;
+  size_t mapped_size = 0;
+  void * v_ptr = NULL;
+  void * mapped_ptr = data->ptr;
+  cl_mem memobj = data->mem_obj;
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  INVALID_VALUE_IF(!mapped_ptr);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+    if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+      memobj->mapped_ptr[i].ptr = NULL;
+      mapped_size = memobj->mapped_ptr[i].size;
+      v_ptr = memobj->mapped_ptr[i].v_ptr;
+      memobj->mapped_ptr[i].size = 0;
+      memobj->mapped_ptr[i].v_ptr = NULL;
+      memobj->map_ref--;
+      break;
+    }
+  }
+  /* Could not find the mapped address. */
+  INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
+
+  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mapped_ptr >= memobj->host_ptr &&
+      mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+    /* Sync the data. */
+    memcpy(v_ptr, mapped_ptr, mapped_size);
+  } else {
+    assert(v_ptr == mapped_ptr);
+  }
+
+  cl_mem_unmap_auto(memobj);
+
+  /* shrink the mapped slot. */
+  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+    int j = 0;
+    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+                             sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+    if (!new_ptr) {
+      /* Just do nothing. */
+      goto error;
+    }
+    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+
+    for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+      if (memobj->mapped_ptr[i].ptr) {
+        new_ptr[j] = memobj->mapped_ptr[i];
+        j++;
+        assert(j < memobj->mapped_ptr_sz/2);
+      }
+    }
+    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+    free(memobj->mapped_ptr);
+    memobj->mapped_ptr = new_ptr;
+  }
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_handle(enqueue_data* data)
+{
+  switch(data->type) {
+    case EnqueueReadBuffer:
+      return cl_enqueue_read_buffer(data);
+    case EnqueueWriteBuffer:
+      return cl_enqueue_write_buffer(data);
+    case EnqueueReadImage:
+      return cl_enqueue_read_image(data);
+    case EnqueueWriteImage:
+      return cl_enqueue_write_image(data);
+    case EnqueueMapBuffer:
+      return cl_enqueue_map_buffer(data);
+    case EnqueueMapImage:
+      return cl_enqueue_map_image(data);
+    case EnqueueUnmapMemObject:
+      return cl_enqueue_unmap_mem_object(data);
+    case EnqueueNDRangeKernel:
+      cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);   /* fall through to default */
+    default:
+      return CL_SUCCESS;
+  }
+}
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
new file mode 100644
index 0000000..7dc8ceb
--- /dev/null
+++ b/src/cl_enqueue.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#ifndef __CL_ENQUEUE_H__
+#define __CL_ENQUEUE_H__
+
+#include "cl_mem.h"
+#include "cl_command_queue.h"
+#include "cl_internals.h"
+#include "CL/cl.h"
+
+typedef enum {
+  EnqueueReadBuffer = 0,
+  EnqueueReadBufferRect,
+  EnqueueWriteBuffer,
+  EnqueueWriteBufferRect,
+  EnqueueCopyBuffer,
+  EnqueueCopyBufferRect,
+  EnqueueReadImage,
+  EnqueueWriteImage,
+  EnqueueCopyImage,
+  EnqueueCopyImageToBuffer,
+  EnqueueCopyBufferToImage,
+  EnqueueMapBuffer,
+  EnqueueMapImage,
+  EnqueueUnmapMemObject,
+  EnqueueNDRangeKernel,
+  EnqueueInvalid
+} enqueue_type;
+
+typedef struct _enqueue_data {
+  enqueue_type      type;          /* Command type */
+  cl_mem            mem_obj;       /* Enqueue's cl_mem */
+  cl_command_queue  queue;         /* Command queue */
+  size_t            offset;        /* Mem object's offset */
+  size_t            size;          /* Size */
+  size_t            origin[3];     /* Origin */
+  size_t            region[3];     /* Region */
+  size_t            row_pitch;     /* Row pitch */
+  size_t            slice_pitch;   /* Slice pitch */
+  cl_map_flags      map_flags;     /* Map flags */
+  const void *      const_ptr;     /* Const ptr for memory read */
+  void *            ptr;           /* ptr for write and return value */
+} enqueue_data;
+
+/* Do real enqueue commands */
+cl_int cl_enqueue_handle(enqueue_data* data);
+#endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index 6539b05..e882c7c 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -14,7 +14,389 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Author: Rong Yang <rong.r.yang at intel.com>
  */
-struct empty {int dummy;};
 
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+{
+  cl_event event = NULL;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
+  SET_ICD(event->dispatch)
+  event->magic = CL_MAGIC_EVENT_HEADER;
+  event->ref_n = 1;
+
+  /* Append the event in the context event list */
+  pthread_mutex_lock(&ctx->event_lock);
+    event->next = ctx->events;
+    if (ctx->events != NULL)
+      ctx->events->prev = event;
+    ctx->events = event;
+  pthread_mutex_unlock(&ctx->event_lock);
+  event->ctx   = ctx;
+  cl_context_add_ref(ctx);
+
+  /* Initialize all members and create GPGPU event object */
+  event->queue = queue;
+  event->type  = type;
+  event->gpgpu_event = NULL;
+  if(type == CL_COMMAND_USER) {
+    event->status = CL_SUBMITTED;
+  }
+  else {
+    event->status = CL_QUEUED;
+    event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+  }
+  cl_event_add_ref(event);       //dec when complete
+  event->user_cb = NULL;
+  event->enqueue_cb = NULL;
+  event->waits_head = NULL;
+  event->emplict = emplict;
+
+exit:
+  return event;
+error:
+  cl_event_delete(event);
+  event = NULL;
+  goto exit;
+}
+
+void cl_event_delete(cl_event event)
+{
+  if (UNLIKELY(event == NULL))
+    return;
+
+  if (atomic_dec(&event->ref_n) > 1)
+    return;
+
+  /* Call any user callbacks that have not been executed yet */
+  user_callback *cb = event->user_cb;
+  while(event->user_cb) {
+    cb = event->user_cb;
+    if(cb->executed == CL_FALSE) {
+      cb->pfn_notify(event, event->status, cb->user_data);
+    }
+    event->user_cb = cb->next;
+    cl_free(cb);
+  }
+
+  /* delete gpgpu event object */
+  if(event->gpgpu_event)
+    cl_gpgpu_event_delete(event->gpgpu_event);
+
+  /* Remove it from the list */
+  assert(event->ctx);
+  pthread_mutex_lock(&event->ctx->event_lock);
+    if (event->prev)
+      event->prev->next = event->next;
+    if (event->next)
+      event->next->prev = event->prev;
+    if (event->ctx->events == event)
+      event->ctx->events = event->next;
+  pthread_mutex_unlock(&event->ctx->event_lock);
+  cl_context_delete(event->ctx);
+
+  cl_free(event);
+}
+
+void cl_event_add_ref(cl_event event)
+{
+  assert(event);
+  atomic_inc(&event->ref_n);
+}
+
+cl_int cl_event_set_callback(cl_event event ,
+                                  cl_int command_exec_callback_type,
+                                  EVENT_NOTIFY pfn_notify,
+                                  void* user_data)
+{
+  assert(event);
+  assert(pfn_notify);
+
+  cl_int err = CL_SUCCESS;
+  user_callback *cb;
+  TRY_ALLOC(cb, CALLOC(user_callback));
+
+  cb->pfn_notify  = pfn_notify;
+  cb->user_data   = user_data;
+  cb->status      = command_exec_callback_type;
+  cb->executed    = CL_FALSE;
+
+  cb->next        = event->user_cb;
+  event->user_cb  = cb;
+
+exit:
+  return err;
+error:
+  err = CL_OUT_OF_HOST_MEMORY;
+  cl_free(cb);
+  goto exit;
+}
+
+cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event,cl_context ctx)
+{
+  cl_int err = CL_SUCCESS;
+  cl_int i;
+  /* check the event_wait_list and num_events_in_wait_list */
+  if((event_wait_list == NULL) &&
+     (num_events_in_wait_list > 0))
+    goto error;
+
+  if ((event_wait_list != NULL) &&
+      (num_events_in_wait_list == 0)){
+    goto error;
+  }
+
+  /* check the event and context */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    CHECK_EVENT(event_wait_list[i]);
+    if(event_wait_list[i]->status < CL_COMPLETE) {
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      goto exit;
+    }
+    if(event && *event == event_wait_list[i])
+      goto error;
+    if(event_wait_list[i]->ctx != ctx)
+      goto error;
+  }
+
+exit:
+  return err;
+error:
+  err = CL_INVALID_EVENT_WAIT_LIST;  //reset error
+  goto exit;
+}
+
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list,
+                          const cl_event *event_wait_list)
+{
+  cl_int i, j;
+  /* Check whether we need to wait on user events */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    /* Need to wait on a user event: return and defer the enqueue */
+    if((event_wait_list[i]->type == CL_COMMAND_USER) ||
+       (event_wait_list[i]->enqueue_cb &&
+       (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
+      for(j=0; j<num_events_in_wait_list; j++)
+        cl_event_add_ref(event_wait_list[j]);  //add a reference for the deferred enqueue's wait events
+      return CL_ENQUEUE_EXECUTE_DEFER;
+    }
+  }
+
+  /* No user events, or all user events finished: wait for all enqueued events to finish */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    //enqueue callback hasn't finished in another thread, so defer
+    if(event_wait_list[i]->enqueue_cb != NULL)
+      return CL_ENQUEUE_EXECUTE_DEFER;
+    cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+    cl_event_set_status(event_wait_list[i], CL_COMPLETE);  //Execute user's callback
+  }
+  return CL_ENQUEUE_EXECUTE_IMM;
+}
+
+void cl_event_new_enqueue_callback(cl_event event,
+                                            enqueue_data *data,
+                                            cl_uint num_events_in_wait_list,
+                                            const cl_event *event_wait_list)
+{
+  enqueue_callback *cb, *node;
+  user_event *user_events, *u_ev;
+  cl_int i;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
+  cb->num_events = num_events_in_wait_list;
+  cb->wait_list = event_wait_list;
+  cb->event = event;
+  cb->next = NULL;
+  cb->wait_user_events = NULL;
+
+  /* Find all user events that the events in event_wait_list wait on */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    if(event_wait_list[i]->type == CL_COMMAND_USER) {
+      /* Insert the enqueue_callback into the user event's wait list */
+      node = event_wait_list[i]->waits_head;
+      if(node == NULL)
+        event_wait_list[i]->waits_head = cb;
+      else {
+        while((node != cb) && node->next)
+          node = node->next;
+        if(node == cb)   //wait on dup user event
+          continue;
+        node->next = cb;
+      }
+      /* Insert the user event into the enqueue_callback's wait_user_events list */
+      TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+      u_ev->event = event_wait_list[i];
+      u_ev->next = cb->wait_user_events;
+      cb->wait_user_events = u_ev;
+    } else if(event_wait_list[i]->enqueue_cb != NULL) {
+      user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
+      while(user_events != NULL) {
+        /* Append the enqueue_callback to the tail of the user event's waits list */
+        node = user_events->event->waits_head;
+        while((node != cb) && node->next)
+          node = node->next;
+        if(node == cb) {  //wait on dup user event
+          user_events = user_events->next;
+          continue;
+        }
+        node->next = cb;
+
+        /* Insert the user event into the enqueue_callback's wait_user_events list */
+        TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+        u_ev->event = user_events->event;
+        u_ev->next = cb->wait_user_events;
+        cb->wait_user_events = u_ev;
+        user_events = user_events->next;
+      }
+    }
+  }
+  if(data->queue != NULL) {
+    assert(event->gpgpu_event);
+    cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
+    data->ptr = (void *)event->gpgpu_event;
+  }
+  cb->data = *data;
+  event->enqueue_cb = cb;
+
+exit:
+  return;
+error:
+  if(cb) {
+    while(cb->wait_user_events) {
+      u_ev = cb->wait_user_events;
+      cb->wait_user_events = cb->wait_user_events->next;
+      cl_free(u_ev);
+    }
+    cl_free(cb);
+  }
+  goto exit;
+}
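
cl_event_new_enqueue_callback() links the structures in both directions: the enqueue_callback keeps the user events it still waits on in wait_user_events, and every one of those user events keeps the callbacks blocked on it in waits_head. A small sketch of walking that list, assuming only the types declared in cl_event.h (the helper itself is hypothetical, not part of this patch):

#include "cl_event.h"   /* enqueue_callback, user_event */

/* Hypothetical debug helper: how many user events still block this
 * deferred enqueue?  Zero means the callback is ready to be replayed. */
static int pending_user_events(enqueue_callback *cb)
{
  int n = 0;
  user_event *u_ev;
  for (u_ev = cb->wait_user_events; u_ev != NULL; u_ev = u_ev->next)
    n++;
  return n;
}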
+
+void cl_event_set_status(cl_event event, cl_int status)
+{
+  user_callback *user_cb;
+  user_event    *u_ev, *u_ev_next;
+  cl_int ret, i;
+  cl_event evt;
+
+  pthread_mutex_lock(&event->ctx->event_lock);
+  if(status >= event->status) {
+    pthread_mutex_unlock(&event->ctx->event_lock);
+    return;
+  }
+  if(event->status <= CL_COMPLETE) {
+    event->status = status;    //enqueue already done, or in progress in another thread
+    pthread_mutex_unlock(&event->ctx->event_lock);
+    return;
+  }
+
+  if(status <= CL_COMPLETE) {
+    if(event->enqueue_cb) {
+      cl_enqueue_handle(&event->enqueue_cb->data);
+      event->status = status;  //Change the event status after the enqueue and before unlocking
+
+      pthread_mutex_unlock(&event->ctx->event_lock);
+      for(i=0; i<event->enqueue_cb->num_events; i++)
+        cl_event_delete(event->enqueue_cb->wait_list[i]);
+      pthread_mutex_lock(&event->ctx->event_lock);
+
+      cl_free(event->enqueue_cb);
+      event->enqueue_cb = NULL;
+    }
+  }
+  if(event->status >= status)  //may have been changed by another thread
+    event->status = status;
+  pthread_mutex_unlock(&event->ctx->event_lock);
+
+  if(event->status <= CL_COMPLETE)
+    cl_event_delete(event);
+
+  /* Call user callback */
+  user_cb = event->user_cb;
+  while(user_cb) {
+    if(user_cb->status >= status) {
+      user_cb->pfn_notify(event, event->status, user_cb->user_data);
+      user_cb->executed = CL_TRUE;
+    }
+    user_cb = user_cb->next;
+  }
+
+  if(event->type != CL_COMMAND_USER)
+    return;
+
+  /* Check all deferred enqueues */
+  enqueue_callback *cb, *enqueue_cb = event->waits_head;
+  while(enqueue_cb) {
+    /* Remove this user event in enqueue_cb */
+    while(enqueue_cb->wait_user_events &&
+          enqueue_cb->wait_user_events->event == event) {
+      u_ev = enqueue_cb->wait_user_events;
+      enqueue_cb->wait_user_events = enqueue_cb->wait_user_events->next;
+      cl_free(u_ev);
+    }
+
+    u_ev = enqueue_cb->wait_user_events;
+    while(u_ev) {
+      u_ev_next = u_ev->next;
+      if(u_ev_next && u_ev_next->event == event) {
+        u_ev->next = u_ev_next->next;
+        cl_free(u_ev_next);
+      } else
+        u_ev = u_ev_next;   /* advance, otherwise this loop never terminates */
+    }
+
+    /* Still waiting on other user events */
+    if(enqueue_cb->wait_user_events != NULL) {
+      enqueue_cb = enqueue_cb->next;
+      continue;
+    }
+
+    /* All user events are complete; now wait for the enqueued events */
+    ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list);
+    assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
+
+    cb = enqueue_cb;
+    enqueue_cb = enqueue_cb->next;
+
+    /* Call the pending operation */
+    evt = cb->event;
+    cl_event_set_status(cb->event, CL_COMPLETE);
+    if(cb->event->emplict == CL_FALSE) {
+      cl_event_delete(evt);
+    }
+  }
+  event->waits_head = NULL;
+}
+
+void cl_event_update_status(cl_event event)
+{
+  if(event->status <= CL_COMPLETE)
+    return;
+  if((event->gpgpu_event) &&
+     (cl_gpgpu_event_update_status(event->gpgpu_event, 0) == command_complete))
+    cl_event_set_status(event, CL_COMPLETE);
+}
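
The functions above are meant to be driven from the clEnqueue* entry points in src/cl_api.c (modified elsewhere in this commit). The following is a simplified, illustrative sketch of that control flow only; it assumes the command queue keeps a ctx pointer as declared in cl_command_queue.h, the function name and the CL_COMMAND_READ_BUFFER choice are placeholders, and error handling and reference counting are omitted.

#include "cl_event.h"
#include "cl_enqueue.h"
#include "cl_command_queue.h"

static cl_int enqueue_with_events(cl_command_queue queue, enqueue_data *data,
                                  cl_uint num_events, const cl_event *wait_list,
                                  cl_event *event_out)
{
  cl_int err = cl_event_check_waitlist(num_events, wait_list, event_out, queue->ctx);
  if (err != CL_SUCCESS)
    return err;

  cl_event e = cl_event_new(queue->ctx, queue, CL_COMMAND_READ_BUFFER,
                            event_out != NULL);
  if (cl_event_wait_events(num_events, wait_list) == CL_ENQUEUE_EXECUTE_IMM) {
    err = cl_enqueue_handle(data);                                  /* run the command now */
    cl_event_set_status(e, CL_COMPLETE);
  } else {
    cl_event_new_enqueue_callback(e, data, num_events, wait_list);  /* defer */
  }

  if (event_out)
    *event_out = e;
  else
    cl_event_delete(e);
  return err;
}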
diff --git a/src/cl_event.h b/src/cl_event.h
index 23378e8..c921cb2 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -20,9 +20,73 @@
 #ifndef __CL_EVENT_H__
 #define __CL_EVENT_H__
 
+#include <semaphore.h>
+
+#include "cl_enqueue.h"
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+
+#define CL_ENQUEUE_EXECUTE_IMM   0
+#define CL_ENQUEUE_EXECUTE_DEFER 1
+
+typedef struct _user_event {
+  cl_event            event;   /* The user event */
+  struct _user_event* next;    /* Next user event in list */
+} user_event;
+
+typedef struct _enqueue_callback {
+  cl_event           event;            /* The event associated with this enqueue callback */
+  enqueue_data       data;             /* Holds all of the enqueue callback's information */
+  cl_uint            num_events;       /* Number of events in the wait list */
+  const cl_event*    wait_list;        /* The event wait list this callback waits on */
+  user_event*        wait_user_events; /* Head of the user event list this callback waits on */
+  struct _enqueue_callback*  next;     /* The next enqueue callback in the wait list */
+} enqueue_callback;
+
+typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+
+typedef struct _user_callback {
+  cl_int            status;     /* The execution status */
+  cl_bool           executed;   /* Indicates whether the callback function has been called */
+  EVENT_NOTIFY      pfn_notify; /* Callback function */
+  void*             user_data;  /* Callback user data */
+  struct _user_callback*    next;       /* Next event callback in list */
+} user_callback;
+
 struct _cl_event {
   DEFINE_ICD(dispatch)
+  uint64_t           magic;       /* To identify it as an event object */
+  volatile int       ref_n;       /* We reference count this object */
+  cl_context         ctx;         /* The context associated with event */
+  cl_event           prev, next;  /* We chain the events together in the context */
+  cl_command_queue   queue;       /* The command queue associated with event */
+  cl_command_type    type;        /* The command type associated with event */
+  cl_int             status;      /* The execution status */
+  cl_gpgpu_event     gpgpu_event; /* The event object used to communicate with the hardware */
+  user_callback*     user_cb;     /* The event callback functions */
+  enqueue_callback*  enqueue_cb;  /* The deferred enqueue attached to this event */
+  enqueue_callback*  waits_head;  /* Head of the list of enqueues waiting on this event */
+  cl_bool            emplict;     /* Whether this event was explicitly created through the API */
 };
 
+/* Create a new event object */
+cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
+/* Unref the object and delete it if no more reference on it */
+void cl_event_delete(cl_event);
+/* Add one more reference to this object */
+void cl_event_add_ref(cl_event);
+/* Register a user callback function for a specific command execution status */
+cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Check the event wait list for enqueue commands */
+cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
+/* Wait for all events in the wait list to complete */
+cl_int cl_event_wait_events(cl_uint, const cl_event *);
+/* Create a new deferred (suspended) enqueue task */
+void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
+/* Set the event status and call all callbacks */
+void cl_event_set_status(cl_event, cl_int);
+/* Check and update event status */
+void cl_event_update_status(cl_event);
 #endif /* __CL_EVENT_H__ */
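
One way to see how the pieces in this header fit together: cl_event_set_callback() queues a user_callback on the event, and cl_event_set_status() later fires every queued callback whose requested status has been reached. A minimal registration sketch (the callback below is hypothetical, not part of the patch):

#include <stdio.h>
#include "cl_event.h"

/* Hypothetical callback matching the EVENT_NOTIFY signature. */
static void CL_CALLBACK notify_done(cl_event e, cl_int status, void *user_data)
{
  printf("event %p reached status %d\n", (void *)e, status);
}

/* Registration, e.g. from clSetEventCallback():
 *   err = cl_event_set_callback(event, CL_COMPLETE, notify_done, NULL);
 */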
 
diff --git a/src/cl_internals.h b/src/cl_internals.h
index b2b25b2..693de1d 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -28,6 +28,7 @@
 #define CL_MAGIC_PROGRAM_HEADER   0x34560ab12789cdefLL
 #define CL_MAGIC_QUEUE_HEADER     0x83650a12b79ce4dfLL
 #define CL_MAGIC_SAMPLER_HEADER   0x686a0ecba79ce33fLL
+#define CL_MAGIC_EVENT_HEADER     0x8324a9c810ebf90fLL
 #define CL_MAGIC_MEM_HEADER       0x381a27b9ce6504dfLL
 #define CL_MAGIC_DEAD_HEADER      0xdeaddeaddeaddeadLL
 
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 59b7a2b..bfe418d 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -147,6 +147,18 @@ do {                                                        \
   }                                                         \
 } while (0)
 
+#define CHECK_EVENT(EVENT)                                    \
+  do {                                                        \
+    if (UNLIKELY(EVENT == NULL)) {                            \
+      err = CL_INVALID_EVENT;                            \
+      goto error;                                             \
+    }                                                         \
+    if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) {    \
+      err = CL_INVALID_EVENT;                                 \
+      goto error;                                             \
+    }                                                         \
+  } while (0)
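
Like the other CHECK_* macros in this file, CHECK_EVENT assumes the caller declares a local cl_int err and provides an error: label. A minimal usage sketch under those assumptions (the function name is hypothetical):

#include "cl_event.h"   /* struct _cl_event, so EVENT->magic is visible */
#include "cl_utils.h"

static cl_int validate_event(cl_event ev)
{
  cl_int err = CL_SUCCESS;
  CHECK_EVENT(ev);   /* on failure: err = CL_INVALID_EVENT, goto error */
error:
  return err;
}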
+
 #define CHECK_SAMPLER(SAMPLER)                              \
 do {                                                        \
   if (UNLIKELY(SAMPLER == NULL)) {                          \
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index e553a55..1301b66 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -57,6 +57,12 @@ typedef struct surface_heap {
   char surface[256][sizeof(gen6_surface_state_t)];
 } surface_heap_t;
 
+typedef struct intel_event {
+  intel_batchbuffer_t *batch;
+  drm_intel_bo* buffer;
+  int status;
+} intel_event_t;
+
 #define MAX_IF_DESC    32
 
 /* We can bind only a limited number of buffers */
@@ -106,8 +112,8 @@ typedef struct intel_gpgpu intel_gpgpu_t;
 static void
 intel_gpgpu_sync(intel_gpgpu_t *gpgpu)
 {
-    if (gpgpu->batch->last_bo)
-	drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
+  if (gpgpu->batch->last_bo)
+    drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
 }
 
 static void
@@ -225,7 +231,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu) 
+intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
@@ -243,7 +249,7 @@ intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu) 
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
@@ -256,7 +262,7 @@ intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
 static const uint32_t gpgpu_l3_config_reg1[] = {
   0x00080040, 0x02040040, 0x00800040, 0x01000038,
   0x02000030, 0x01000038, 0x00000038, 0x00000040,
-  0x0A140091, 0x09100091, 0x08900091, 0x08900091 
+  0x0A140091, 0x09100091, 0x08900091, 0x08900091
 };
 
 static const uint32_t gpgpu_l3_config_reg2[] = {
@@ -369,6 +375,7 @@ intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
 static void
 intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
 {
+  intel_batchbuffer_emit_mi_flush(gpgpu->batch);
   intel_batchbuffer_flush(gpgpu->batch);
   intel_gpgpu_check_binded_buf_address(gpgpu);
 }
@@ -404,7 +411,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   /* surface state */
   if(gpgpu->surface_heap_b.bo)
     dri_bo_unreference(gpgpu->surface_heap_b.bo);
-  bo = dri_bo_alloc(bufmgr, 
+  bo = dri_bo_alloc(bufmgr,
                     "SURFACE_HEAP",
                     sizeof(surface_heap_t),
                     32);
@@ -416,7 +423,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   /* Interface descriptor remap table */
   if(gpgpu->idrt_b.bo)
     dri_bo_unreference(gpgpu->idrt_b.bo);
-  bo = dri_bo_alloc(bufmgr, 
+  bo = dri_bo_alloc(bufmgr,
                     "IDRT",
                     MAX_IF_DESC * sizeof(struct gen6_interface_descriptor),
                     32);
@@ -431,7 +438,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   /* sampler state */
   if (gpgpu->sampler_state_b.bo)
     dri_bo_unreference(gpgpu->sampler_state_b.bo);
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr, 
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr,
                     "SAMPLER_STATE",
                     GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
                     32);
@@ -478,7 +485,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
   ss1->ss2.width  = ss0->ss2.width  = 127;   /* bits 6:0 of sz */
   ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */
   ss0->ss3.depth  = 1023; /* bits 30:21 of sz */
-  ss1->ss3.depth  = 510;  /* bits 30:21 of sz */
+  ss1->ss3.depth  = 1023;  /* bits 30:21 of sz */
   ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
   heap->binding_table[0] = offsetof(surface_heap_t, surface);
   heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
@@ -830,6 +837,83 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static intel_event_t*
+intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
+{
+  intel_event_t *event = NULL;
+  TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
+
+  event->status = command_queued;
+  event->batch = NULL;
+  event->buffer = gpgpu->batch->buffer;
+  if(event->buffer != NULL)
+    drm_intel_bo_reference(event->buffer);
+
+exit:
+  return event;
+error:
+  cl_free(event);
+  event = NULL;
+  goto exit;
+}
+
+static int
+intel_gpgpu_event_update_status(intel_event_t *event, int wait)
+{
+  if(event->status == command_complete)
+    return event->status;
+
+  if (event->buffer &&
+      event->batch == NULL &&        //already flushed
+      !drm_intel_bo_busy(event->buffer)) {
+    event->status = command_complete;
+    drm_intel_bo_unreference(event->buffer);
+    event->buffer = NULL;
+    return event->status;
+  }
+
+  if(wait == 0)
+    return event->status;
+
+  if (event->buffer) {
+    drm_intel_bo_wait_rendering(event->buffer);
+    event->status = command_complete;
+    drm_intel_bo_unreference(event->buffer);
+    event->buffer = NULL;
+  }
+  return event->status;
+}
+
+static void
+intel_gpgpu_event_pending(intel_gpgpu_t *gpgpu, intel_event_t *event)
+{
+  assert(event->buffer);           //This must be a GPU enqueue command
+  assert(event->batch == NULL);    //This command has not been marked pending yet
+  event->batch = intel_batchbuffer_new(gpgpu->drv);
+  assert(event->batch);
+  *event->batch = *gpgpu->batch;
+  if(event->batch->buffer)
+    drm_intel_bo_reference(event->batch->buffer);
+}
+
+static void
+intel_gpgpu_event_resume(intel_event_t *event)
+{
+  assert(event->batch);           //This command has been marked pending
+  intel_batchbuffer_flush(event->batch);
+  intel_batchbuffer_delete(event->batch);
+  event->batch = NULL;
+}
+
+static void
+intel_gpgpu_event_delete(intel_event_t *event)
+{
+  assert(event->batch == NULL);   //This command must have been flushed.
+  if(event->buffer)
+    drm_intel_bo_unreference(event->buffer);
+  cl_free(event);
+}
+
 LOCAL void
 intel_set_gpgpu_callbacks(void)
 {
@@ -851,5 +935,10 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
+  cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+  cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
+  cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
+  cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
+  cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
 }
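
Taken together, the new hooks give a GPU command event this lifecycle: intel_gpgpu_event_new() references the current batch buffer object, intel_gpgpu_event_pending() stashes a copy of the batch when the command must wait on user events, intel_gpgpu_event_resume() flushes that stashed batch, intel_gpgpu_event_update_status() polls or blocks on the buffer object, and intel_gpgpu_event_delete() releases it. A sketch of the sequence as seen through the cl_driver callbacks registered above (illustrative only; the real callers are in cl_event.c and cl_enqueue.c, and error handling is omitted):

#include "cl_driver.h"   /* cl_gpgpu, cl_gpgpu_event and the callback pointers */

static void run_deferred_command(cl_gpgpu gpgpu)
{
  cl_gpgpu_event ev = cl_gpgpu_event_new(gpgpu);  /* snapshot the current batch bo */
  cl_gpgpu_event_pending(gpgpu, ev);              /* stash the batch, do not flush yet */
  /* ... later, once the user events it waits on have completed ... */
  cl_gpgpu_event_resume(ev);                      /* flush the stashed batch */
  cl_gpgpu_event_update_status(ev, 1);            /* 1 = block until the bo is idle */
  cl_gpgpu_event_delete(ev);
}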
 
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b205c67..97b7519 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -74,6 +74,7 @@ set (utests_sources
   compiler_uint16_copy.cpp
   compiler_uint3_unaligned_copy.cpp
   compiler_upsample_int.cpp
+  compiler_upsample_long.cpp
   compiler_unstructured_branch0.cpp
   compiler_unstructured_branch1.cpp
   compiler_unstructured_branch2.cpp
@@ -84,9 +85,12 @@ set (utests_sources
   compiler_switch.cpp
   compiler_math.cpp
   compiler_atomic_functions.cpp
+  compiler_async_copy.cpp
+  compiler_async_stride_copy.cpp
   compiler_insn_selection_min.cpp
   compiler_insn_selection_max.cpp
   compiler_insn_selection_masked_min_max.cpp
+	compiler_load_bool_imm.cpp
   compiler_global_memory_barrier.cpp
   compiler_local_memory_two_ptr.cpp
   compiler_local_memory_barrier.cpp
@@ -115,14 +119,22 @@ set (utests_sources
   builtin_global_id.cpp
   builtin_num_groups.cpp
   builtin_local_id.cpp
+  builtin_acos_asin.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
+	runtime_event.cpp
   compiler_double.cpp
   compiler_double_2.cpp
   compiler_double_3.cpp
   compiler_double_4.cpp
   compiler_long.cpp
   compiler_long_2.cpp
+  compiler_long_convert.cpp
+  compiler_long_shl.cpp
+  compiler_long_shr.cpp
+  compiler_long_asr.cpp
+  compiler_long_mult.cpp
+  compiler_long_cmp.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
new file mode 100644
index 0000000..0187226
--- /dev/null
+++ b/utests/builtin_acos_asin.cpp
@@ -0,0 +1,87 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+
+const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+  const float x = *src;
+
+  dst[0] = acos(x);
+  dst[1] = acosh(x);
+  dst[2] = asin(x);
+  dst[3] = asinh(x);
+  dst[4] = x;
+}
+
+static void builtin_acos_asin(void)
+{
+  // Setup kernel and buffers
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+  OCL_CREATE_KERNEL("builtin_acos_asin");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+#if udebug
+      if (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else if (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else if(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-3f){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else
+        printf("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+     if (isinf(cpu_data[index_cur]))
+       OCL_ASSERT(isinf(gpu_data[index_cur]));
+     else if (isnan(cpu_data[index_cur]))
+       OCL_ASSERT(isnan(gpu_data[index_cur]));
+     else
+     {
+       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+     }
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
new file mode 100644
index 0000000..9384f85
--- /dev/null
+++ b/utests/compiler_async_copy.cpp
@@ -0,0 +1,39 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_copy(void)
+{
+  const size_t n = 1024;
+  const size_t local_size = 32;
+  const int copiesPerWorkItem = 5;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_async_copy");
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(int) * 2, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(int)*2, NULL);
+  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; ++i)
+      ((int*)buf_data[1])[i] = rand();
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = local_size;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  int *dst = (int*)buf_data[0];
+  int *src = (int*)buf_data[1];
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * 2; i++)
+    OCL_ASSERT(dst[i] == src[i] + 3);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy);
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
new file mode 100644
index 0000000..132f917
--- /dev/null
+++ b/utests/compiler_async_stride_copy.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_stride_copy(void)
+{
+  const size_t n = 1024;
+  const size_t local_size = 128;
+  const int copiesPerWorkItem = 5;
+  const int stride = 3;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_async_stride_copy");
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL);
+  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+  OCL_SET_ARG(4, sizeof(int), &stride);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
+      ((char*)buf_data[1])[i] = rand() & 0xff;
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = local_size;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  char *dst = (char*)buf_data[0];
+  char *src = (char*)buf_data[1];
+  for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
+    OCL_ASSERT(dst[i + 0] == src[i + 0] + 3);
+    OCL_ASSERT(dst[i + 1] == src[i + 1] + 3);
+    OCL_ASSERT(dst[i + 2] == src[i + 2] + 3);
+    OCL_ASSERT(dst[i + 3] == src[i + 3] + 3);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy);
diff --git a/utests/compiler_load_bool_imm.cpp b/utests/compiler_load_bool_imm.cpp
new file mode 100644
index 0000000..d060daf
--- /dev/null
+++ b/utests/compiler_load_bool_imm.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_load_bool_imm(void)
+{
+  const size_t n = 1024;
+  const size_t local_size = 16;
+  const int copiesPerWorkItem = 5;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_load_bool_imm");
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, local_size*copiesPerWorkItem*sizeof(int), NULL); // 16 x int
+  OCL_SET_ARG(2, sizeof(int), &copiesPerWorkItem); // 16 x int
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = local_size;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  int *dst = (int*)buf_data[0];
+  for (uint32_t i = 0; i < n * copiesPerWorkItem; i++)
+    OCL_ASSERT(dst[i] == copiesPerWorkItem);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_load_bool_imm);
diff --git a/utests/compiler_long_asr.cpp b/utests/compiler_long_asr.cpp
new file mode 100644
index 0000000..0a70a23
--- /dev/null
+++ b/utests/compiler_long_asr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_asr(void)
+{
+  const size_t n = 64;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_asr");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = (int64_t)1 << 63;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  int64_t *dest = ((int64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == src[i] >> i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_asr);
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
new file mode 100644
index 0000000..3775556
--- /dev/null
+++ b/utests/compiler_long_cmp.cpp
@@ -0,0 +1,117 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_cmp(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+  src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+  src1[2] = -1ll, src2[2] = 0;
+  src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+  src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+  src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+  src1[6] = 0, src2[6] = -1ll;
+  src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+  for(size_t i=8; i<n; i++) {
+    src1[i] = i;
+    src2[i] = i;
+  }
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_l");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_le");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] <= src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_g");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] > src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_ge");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] >= src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_eq");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] == src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_neq");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] != src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_cmp);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
new file mode 100644
index 0000000..18e13ee
--- /dev/null
+++ b/utests/compiler_long_convert.cpp
@@ -0,0 +1,67 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_convert(void)
+{
+  const size_t n = 16;
+  char src1[n];
+  short src2[n];
+  int src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_convert");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[4], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[5], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = -i;
+    src2[i] = -i;
+    src3[i] = -i;
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  memcpy(buf_data[2], src3, sizeof(src3));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(3);
+  OCL_MAP_BUFFER(4);
+  OCL_MAP_BUFFER(5);
+  int64_t *dst1 = ((int64_t *)buf_data[3]);
+  int64_t *dst2 = ((int64_t *)buf_data[4]);
+  int64_t *dst3 = ((int64_t *)buf_data[5]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx %lx %lx\n", dst1[i], dst2[i], dst3[i]);
+    OCL_ASSERT(dst1[i] == -(int64_t)i);
+    OCL_ASSERT(dst2[i] == -(int64_t)i);
+    OCL_ASSERT(dst3[i] == -(int64_t)i);
+  }
+  OCL_UNMAP_BUFFER(3);
+  OCL_UNMAP_BUFFER(4);
+  OCL_UNMAP_BUFFER(5);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert);
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_mult");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = 0x77665544FFEEDDCCLL;
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 3)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    else
+      OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
diff --git a/utests/compiler_long_shl.cpp b/utests/compiler_long_shl.cpp
new file mode 100644
index 0000000..c8e4624
--- /dev/null
+++ b/utests/compiler_long_shl.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shl(void)
+{
+  const size_t n = 64;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_shl");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = 1;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  int64_t *dest = ((int64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == ((int64_t)1) << i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shl);
diff --git a/utests/compiler_long_shr.cpp b/utests/compiler_long_shr.cpp
new file mode 100644
index 0000000..e9fea6a
--- /dev/null
+++ b/utests/compiler_long_shr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shr(void)
+{
+  const size_t n = 64;
+  uint64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_shr");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = (uint64_t)1 << 63;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  uint64_t *dest = ((uint64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == src[i] >> i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shr);
diff --git a/utests/compiler_smoothstep.cpp b/utests/compiler_smoothstep.cpp
index 760063b..363ea7e 100644
--- a/utests/compiler_smoothstep.cpp
+++ b/utests/compiler_smoothstep.cpp
@@ -34,7 +34,7 @@ void compiler_smoothstep(void)
   OCL_MAP_BUFFER(2);
   for (int i = 0; i < n; ++i) {
     float a = 0.1f * (rand() & 15) - 0.75f;
-    float b = a + 0.1f * (rand() & 15);
+    float b = a + 0.1f * (rand() & 15) + 0.1f;
     float c = 0.1f * (rand() & 15) - 0.75f;
     src1[i] = ((float*)buf_data[0])[i] = a;
     src2[i] = ((float*)buf_data[1])[i] = b;
diff --git a/utests/compiler_upsample_long.cpp b/utests/compiler_upsample_long.cpp
new file mode 100644
index 0000000..b125ff4
--- /dev/null
+++ b/utests/compiler_upsample_long.cpp
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include "utest_helper.hpp"
+
+void compiler_upsample_long(void)
+{
+  const int n = 32;
+  int src1[n];
+  unsigned int src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_upsample_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(unsigned int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((unsigned int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int64_t*)buf_data[2])[i] == (((int64_t)(src1[i]) << 32) | src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_long);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 7deb7cb..f8a3dcb 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -59,5 +59,5 @@ test_all_vector(int32_t, int)
 test_all_vector(uint32_t, uint)
 test_all_vector(float, float)
 test_all_vector(double, double)
-//test_all_vector(int64_t, long)
-//test_all_vector(uint64_t, ulong)
+test_all_vector(int64_t, long)
+test_all_vector(uint64_t, ulong)
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
new file mode 100644
index 0000000..1ec8692
--- /dev/null
+++ b/utests/runtime_event.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE  32*1024
+void runtime_event(void)
+{
+  const size_t n = BUFFERSIZE;
+  cl_int cpu_src[BUFFERSIZE];
+  cl_event ev[3];
+  cl_int status = 0;
+  cl_int value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_event");
+  OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+
+  for(cl_uint i=0; i<BUFFERSIZE; i++)
+    cpu_src[i] = 3;
+
+  OCL_CREATE_USER_EVENT(ev[0]);
+
+  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(int), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 32;
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status >= CL_SUBMITTED);
+  }
+
+  OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+  clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
+  OCL_FINISH();
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status <= CL_COMPLETE);
+  }
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clReleaseEvent(ev[i]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_event);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 0365040..e7f43fc 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -88,7 +88,13 @@ extern EGLSurface  eglSurface;
   } while (0)
 
 #define OCL_CREATE_BUFFER(BUFFER, FLAGS, SIZE, DATA) \
-    OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+        OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+
+#define OCL_CREATE_USER_EVENT(EVENT) \
+    OCL_CALL2(clCreateUserEvent, EVENT, ctx)
+
+#define OCL_SET_USER_EVENT_STATUS(EVENT, STATUS) \
+    OCL_CALL(clSetUserEventStatus, EVENT, STATUS)
 
 #define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
     OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git


