[Pkg-opencl-devel] [beignet] 49/66: Imported Upstream version 0.2+git20130807+c4d1f40

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:07 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 56724b85bacbd1b966cfa5b9ed9392b444c49f72
Author: Simon Richter <sjr at debian.org>
Date:   Wed Aug 7 16:14:06 2013 +0200

    Imported Upstream version 0.2+git20130807+c4d1f40
---
 backend/src/backend/context.cpp                    |  18 ++
 backend/src/backend/context.hpp                    |   3 +
 backend/src/backend/gen/gen_mesa_disasm.c          |  44 ++-
 backend/src/backend/gen_context.cpp                | 299 +++++++++++++++++++-
 backend/src/backend/gen_context.hpp                |  21 +-
 backend/src/backend/gen_defs.hpp                   |  28 ++
 backend/src/backend/gen_encoder.cpp                | 307 ++++++++++++---------
 backend/src/backend/gen_encoder.hpp                |  17 +-
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   6 +-
 backend/src/backend/gen_insn_scheduling.cpp        |  39 ++-
 backend/src/backend/gen_insn_selection.cpp         | 201 +++++++++++---
 backend/src/backend/gen_insn_selection.hpp         |   6 +
 backend/src/backend/gen_insn_selection.hxx         |  13 +-
 backend/src/backend/gen_reg_allocation.cpp         |  66 ++++-
 backend/src/backend/gen_register.hpp               |  76 +++++
 backend/src/backend/program.cpp                    |   8 +
 backend/src/backend/program.h                      |   4 +
 backend/src/backend/program.hpp                    |   3 +
 backend/src/ir/instruction.cpp                     |   2 +
 backend/src/llvm/llvm_gen_backend.cpp              |   2 -
 backend/src/ocl_stdlib.tmpl.h                      | 241 +++++++++++-----
 kernels/builtin_shuffle2.cl                        |  13 +
 kernels/compiler_double_4.cl                       |   5 +
 kernels/compiler_long.cl                           |   7 +
 kernels/compiler_long_2.cl                         |  20 ++
 kernels/compiler_vector_load_store.cl              |   6 +-
 src/cl_command_queue_gen7.c                        |  10 +-
 src/cl_driver.h                                    |   4 +
 src/cl_driver_defs.c                               |   1 +
 src/intel/intel_gpgpu.c                            |  52 +++-
 utests/CMakeLists.txt                              |   4 +
 utests/builtin_shuffle2.cpp                        |  45 +++
 utests/compiler_double_4.cpp                       |  40 +++
 utests/compiler_long.cpp                           |  58 ++++
 utests/compiler_long_2.cpp                         |  51 ++++
 utests/compiler_vector_load_store.cpp              |  12 +-
 36 files changed, 1432 insertions(+), 300 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 48160de..5484869 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -268,6 +268,15 @@ namespace gbe
     }
   }
 
+  static int
+  alignScratchSize(int size){
+    int i = 0;
+
+    for(; i < size; i+=1024)
+      ;
+
+    return i;
+  }
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -284,6 +293,7 @@ namespace gbe
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
+    this->scratchOffset = 0;
   }
 
   Context::~Context(void) {
@@ -306,6 +316,8 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL)
+      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+    if(this->kernel != NULL)
       this->kernel->ctx = this;
     return this->kernel;
   }
@@ -337,6 +349,12 @@ namespace gbe
     return offset + GEN_REG_SIZE;
   }
 
+  uint32_t Context::allocateScratchMem(uint32_t size) {
+    uint32_t offset = scratchOffset;
+    scratchOffset += size;
+    return offset;
+  }
+
   void Context::buildStack(void) {
     const auto &stackUse = dag->getUse(ir::ocl::stackptr);
     if (stackUse.size() == 0)  // no stack is used if stackptr is unused
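
Note: the two hunks above add a simple bump allocator for scratch memory:
allocateScratchMem() hands out consecutive offsets, and alignScratchSize()
rounds the final offset up to the next 1KB boundary before it is stored in
kernel->scratchSize. A minimal standalone sketch of the same arithmetic, with
illustrative names that are not part of this patch:

    #include <cassert>
    #include <cstdint>

    // Bump-allocator sketch mirroring Context::allocateScratchMem() and
    // alignScratchSize() from the hunks above (illustrative only).
    struct ScratchAllocator {
      uint32_t offset = 0;                 // next free scratch slot
      uint32_t allocate(uint32_t size) {   // returns the start offset
        const uint32_t start = offset;
        offset += size;
        return start;
      }
      uint32_t alignedTotal() const {      // round up to a multiple of 1024
        return (offset + 1023u) & ~1023u;
      }
    };

    int main() {
      ScratchAllocator s;
      assert(s.allocate(64) == 0);
      assert(s.allocate(64) == 64);
      assert(s.alignedTotal() == 1024);    // 128 bytes round up to 1KB
      return 0;
    }
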
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index c205388..50c0e70 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -91,6 +91,8 @@ namespace gbe
     /* allocate a new entry for a specific image's information */
     /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+    /*! Allocate 'size' bytes of scratch memory and return its start offset */
+    uint32_t allocateScratchMem(uint32_t size);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -126,6 +128,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
+    uint32_t scratchOffset;               //!< Offset of the next scratch memory allocation
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index f65cc30..bfb865a 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -83,6 +83,7 @@ static const struct {
   [GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
@@ -372,6 +373,28 @@ static const char *data_port_data_cache_category[] = {
   "scratch",
 };
 
+static const char *data_port_scratch_block_size[] = {
+  "1 register",
+  "2 registers",
+  "Reserve",
+  "4 registers",
+};
+
+static const char *data_port_scratch_invalidate[] = {
+  "no invalidate",
+  "invalidate cache line",
+};
+
+static const char *data_port_scratch_channel_mode[] = {
+  "Oword",
+  "Dword",
+};
+
+static const char *data_port_scratch_msg_type[] = {
+  "Scratch Read",
+  "Scratch Write",
+};
+
 static const char *data_port_data_cache_msg_type[] = {
   [0] = "OWord Block Read",
   [1] = "Unaligned OWord Block Read",
@@ -1154,12 +1177,21 @@ int gen_disasm (FILE *file, const void *opaque_insn)
                 inst->bits3.sampler_gen7.simd_mode);
         break;
       case GEN_SFID_DATAPORT_DATA_CACHE:
-        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                inst->bits3.gen7_untyped_rw.bti,
-                inst->bits3.gen7_untyped_rw.rgba,
-                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
-                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
-                data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        if(inst->bits3.gen7_untyped_rw.category == 0) {
+          format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                  inst->bits3.gen7_untyped_rw.bti,
+                  inst->bits3.gen7_untyped_rw.rgba,
+                  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        } else {
+          format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                  inst->bits3.gen7_scratch_rw.offset,
+                  data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
+                  data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
+                  data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
+                  data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+        }
         break;
       case GEN_SFID_MESSAGE_GATEWAY:
         format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index e33d8da..621e7be 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -118,7 +118,7 @@ namespace gbe
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->SHR(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(10));
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
       p->curr.execWidth = this->simdWidth;
       p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
       p->curr.execWidth = 1;
@@ -146,6 +146,7 @@ namespace gbe
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
       case SEL_OP_RNDE: p->RNDE(dst, src); break;
       case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
+      case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -158,9 +159,85 @@ namespace gbe
       case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
       case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
+      case SEL_OP_SEL_INT64:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          int execWidth = p->curr.execWidth;
+          p->push();
+          p->curr.execWidth = 8;
+          for (int nib = 0; nib < execWidth / 4; nib ++) {
+            p->curr.chooseNib(nib);
+            p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+            p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+            xdst = GenRegister::suboffset(xdst, 4);
+            xsrc0 = GenRegister::suboffset(xsrc0, 4);
+            xsrc1 = GenRegister::suboffset(xsrc1, 4);
+          }
+          p->pop();
+        }
+        break;
       case SEL_OP_AND:  p->AND(dst, src0, src1); break;
       case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
       case SEL_OP_XOR:  p->XOR(dst, src0, src1); break;
+      case SEL_OP_I64AND:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          int execWidth = p->curr.execWidth;
+          p->push();
+          p->curr.execWidth = 8;
+          for (int nib = 0; nib < execWidth / 4; nib ++) {
+            p->curr.chooseNib(nib);
+            p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+            p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+            xdst = GenRegister::suboffset(xdst, 4),
+            xsrc0 = GenRegister::suboffset(xsrc0, 4),
+            xsrc1 = GenRegister::suboffset(xsrc1, 4);
+          }
+          p->pop();
+        }
+        break;
+      case SEL_OP_I64OR:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          int execWidth = p->curr.execWidth;
+          p->push();
+          p->curr.execWidth = 8;
+          for (int nib = 0; nib < execWidth / 4; nib ++) {
+            p->curr.chooseNib(nib);
+            p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+            p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+            xdst = GenRegister::suboffset(xdst, 4),
+            xsrc0 = GenRegister::suboffset(xsrc0, 4),
+            xsrc1 = GenRegister::suboffset(xsrc1, 4);
+          }
+          p->pop();
+        }
+        break;
+      case SEL_OP_I64XOR:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          int execWidth = p->curr.execWidth;
+          p->push();
+          p->curr.execWidth = 8;
+          for (int nib = 0; nib < execWidth / 4; nib ++) {
+            p->curr.chooseNib(nib);
+            p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
+            p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
+            xdst = GenRegister::suboffset(xdst, 4),
+            xsrc0 = GenRegister::suboffset(xsrc0, 4),
+            xsrc1 = GenRegister::suboffset(xsrc1, 4);
+          }
+          p->pop();
+        }
+        break;
       case SEL_OP_SHR:  p->SHR(dst, src0, src1); break;
       case SEL_OP_SHL:  p->SHL(dst, src0, src1); break;
       case SEL_OP_RSR:  p->RSR(dst, src0, src1); break;
@@ -175,12 +252,142 @@ namespace gbe
     }
   }
 
+  void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    src = src.top_half();
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    if (execWidth == 16) {
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+    }
+    p->pop();
+  }
+
+  void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    dest = dest.top_half();
+    p->push();
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->curr.nibControl = 1;
+    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->curr.nibControl = 0;
+      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->curr.nibControl = 1;
+      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+    }
+    p->pop();
+  }
+
+  void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    src = src.bottom_half();
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    if (execWidth == 16) {
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+    }
+    p->pop();
+  }
+
+  void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    dest = dest.bottom_half();
+    p->push();
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->curr.nibControl = 1;
+    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->curr.nibControl = 0;
+      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->curr.nibControl = 1;
+      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+    }
+    p->pop();
+  }
+
+  void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->ADDC(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->ADDC(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();
+  }
+
+  void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->SUBB(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->SUBB(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();
+  }
+
   void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     const GenRegister src2 = ra->genReg(insn.src(2));
     switch (insn.opcode) {
+      case SEL_OP_I64ADD:
+        {
+          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+                      y = GenRegister::suboffset(x, p->curr.execWidth);
+          loadBottomHalf(x, src0);
+          loadBottomHalf(y, src1);
+          addWithCarry(x, x, y);
+          storeBottomHalf(dst, x);
+          loadTopHalf(x, src0);
+          p->ADD(x, x, y);
+          loadTopHalf(y, src1);
+          p->ADD(x, x, y);
+          storeTopHalf(dst, x);
+        }
+        break;
+      case SEL_OP_I64SUB:
+        {
+          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+                      y = GenRegister::suboffset(x, p->curr.execWidth);
+          loadBottomHalf(x, src0);
+          loadBottomHalf(y, src1);
+          subWithBorrow(x, x, y);
+          storeBottomHalf(dst, x);
+          loadTopHalf(x, src0);
+          subWithBorrow(x, x, y);
+          loadTopHalf(y, src1);
+          subWithBorrow(x, x, y);
+          storeTopHalf(dst, x);
+        }
+        break;
       case SEL_OP_MUL_HI:
        {
         int w = p->curr.execWidth;
@@ -354,12 +561,52 @@ namespace gbe
     p->pop();
   }
 
-  void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(0));
+  void GenContext::emitSpillRegInstruction(const SelectionInstruction &insn) {
+    uint32_t simdWidth = p->curr.execWidth;
+    uint32_t scratchOffset = insn.extra.scratchOffset;
+    const uint32_t header = insn.extra.scratchMsgHeader;
+    p->push();
+
+    const GenRegister msg = GenRegister::ud8grf(header, 0);
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
+    GenRegister payload = src;
+    payload.nr = header + 1;
+    payload.subnr = 0;
+
+    p->MOV(payload, src);
+    uint32_t regType = insn.src(0).type;
+    uint32_t size = typeSize(regType);
+    assert(size <= 4);
+    uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1;
+    this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    p->pop();
+  }
+
+  void GenContext::emitUnSpillRegInstruction(const SelectionInstruction &insn) {
+    uint32_t scratchOffset = insn.extra.scratchOffset;
+    const GenRegister dst = insn.dst(0);
+    uint32_t regType = dst.type;
+    uint32_t simdWidth = p->curr.execWidth;
+    const uint32_t header = insn.extra.scratchMsgHeader;
+    uint32_t size = typeSize(regType);
+    assert(size <= 4);
+    uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1;
+    const GenRegister msg = GenRegister::ud8grf(header, 0);
+    this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+  }
+
+  //  For SIMD8, we allocate 2*elemNum temporary registers starting at dst(0),
+  //  followed by the real destination registers.
+  //  For SIMD16, we allocate elemNum temporary registers starting at dst(0).
+  void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
     const uint32_t elemNum = insn.extra.elem;
-    p->READ_FLOAT64(dst, src, bti, elemNum);
+    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister tempAddr = ra->genReg(insn.src(1));
+    const uint32_t bti = insn.extra.function;
+    p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
   }
 
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
@@ -370,11 +617,16 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
-  void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+  //  For SIMD8, we allocate 2*elemNum temporary registers starting at dst(0),
+  //  followed by the real destination registers.
+  //  For SIMD16, we allocate elemNum temporary registers starting at dst(0).
+  void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
-    const uint32_t bti = insn.extra.function;
     const uint32_t elemNum = insn.extra.elem;
-    p->WRITE_FLOAT64(src, bti, elemNum);
+    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+    const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+    const uint32_t bti = insn.extra.function;
+    p->WRITE64(src, data, bti, elemNum);
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
@@ -421,6 +673,37 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));
+    p->pop();
+
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_WRITE(header, offset/32, size, reg_num, channel_mode);
+    p->pop();
+  }
+
+  void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));
+    p->pop();
+
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_READ(dst, header, offset/32, size, reg_num, channel_mode);
+    p->pop();
+  }
+
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
     const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
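
Note: the SEL_OP_I64ADD/I64SUB cases above implement 64-bit arithmetic with
32-bit ALU operations: the bottom halves are combined with ADDC/SUBB (which
latch the carry/borrow into acc0), and that carry is then folded into the sum
of the top halves. A scalar reference model of the same decomposition, with
illustrative names only:

    #include <cassert>
    #include <cstdint>

    // Scalar sketch of the I64ADD lowering: add the low halves, capture the
    // carry, then add the high halves plus the carry (not the GPU code path).
    static uint64_t add64_via_32(uint32_t lo0, uint32_t hi0,
                                 uint32_t lo1, uint32_t hi1) {
      const uint64_t lo = (uint64_t)lo0 + lo1;   // ADDC: low sum + carry out
      const uint32_t carry = (uint32_t)(lo >> 32);
      const uint32_t hi = hi0 + hi1 + carry;     // two ADDs on the top halves
      return ((uint64_t)hi << 32) | (uint32_t)lo;
    }

    int main() {
      const uint64_t a = 0x00000001ffffffffull, b = 1;
      assert(add64_via_32((uint32_t)a, (uint32_t)(a >> 32),
                          (uint32_t)b, (uint32_t)(b >> 32)) == a + b);
      return 0;
    }
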
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 5dfaef9..694ae98 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -27,6 +27,7 @@
 
 #include "backend/context.hpp"
 #include "backend/program.h"
+#include "backend/gen_register.hpp"
 #include "ir/function.hpp"
 #include "ir/liveness.hpp"
 #include "sys/map.hpp"
@@ -40,6 +41,7 @@ namespace gbe
   class Selection;            // Performs instruction selection
   class SelectionInstruction; // Pre-RA Gen instruction
   class SelectionReg;         // Pre-RA Gen register
+  class GenRegister;
 
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
@@ -73,6 +75,16 @@ namespace gbe
     INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
       return this->liveness->getLiveOut(bb);
     }
+
+    void loadTopHalf(GenRegister dest, GenRegister src);
+    void storeTopHalf(GenRegister dest, GenRegister src);
+
+    void loadBottomHalf(GenRegister dest, GenRegister src);
+    void storeBottomHalf(GenRegister dest, GenRegister src);
+
+    void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
+    void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
     void emitUnaryInstruction(const SelectionInstruction &insn);
@@ -87,8 +99,8 @@ namespace gbe
     void emitBarrierInstruction(const SelectionInstruction &insn);
     void emitFenceInstruction(const SelectionInstruction &insn);
     void emitMathInstruction(const SelectionInstruction &insn);
-    void emitReadFloat64Instruction(const SelectionInstruction &insn);
-    void emitWriteFloat64Instruction(const SelectionInstruction &insn);
+    void emitRead64Instruction(const SelectionInstruction &insn);
+    void emitWrite64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
     void emitAtomicInstruction(const SelectionInstruction &insn);
@@ -96,8 +108,11 @@ namespace gbe
     void emitByteScatterInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
+    void emitSpillRegInstruction(const SelectionInstruction &insn);
+    void emitUnSpillRegInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
-
+    void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+    void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 5a9bb2d..e3959ff 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -157,6 +157,7 @@ enum opcode {
   GEN_OPCODE_FBH = 75,
   GEN_OPCODE_FBL = 76,
   GEN_OPCODE_ADDC = 78,
+  GEN_OPCODE_SUBB = 79,
   GEN_OPCODE_SAD2 = 80,
   GEN_OPCODE_SADA2 = 81,
   GEN_OPCODE_DP4 = 84,
@@ -242,6 +243,8 @@ enum GenMessageTarget {
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
 #define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
+#define GEN_TYPE_UL  8
+#define GEN_TYPE_L   9
 
 #define GEN_ARF_NULL                  0x00
 #define GEN_ARF_ADDRESS               0x10
@@ -316,6 +319,15 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER          12//1100: Byte Scattered Write
 #define GEN_UNTYPED_WRITE         13//1101: Untyped Surface Write
 
+/* Data port data cache scratch messages*/
+#define GEN_SCRATCH_READ                  0
+#define GEN_SCRATCH_WRITE                 1
+#define GEN_SCRATCH_CHANNEL_MODE_OWORD    0
+#define GEN_SCRATCH_CHANNEL_MODE_DWORD    1
+#define GEN_SCRATCH_BLOCK_SIZE_1          0
+#define GEN_SCRATCH_BLOCK_SIZE_2          1
+#define GEN_SCRATCH_BLOCK_SIZE_4          3
+
 /* Data port render cache Message Type*/
 #define GEN_MBLOCK_READ           4  //0100: Media Block Read
 #define GEN_TYPED_READ            5  //0101: Typed Surface Read
@@ -762,6 +774,22 @@ struct GenInstruction
       uint32_t end_of_thread:1;
     } gen7_byte_rw;
 
+    /*! Data port Scratch Read/ write */
+    struct {
+      uint32_t offset:12;
+      uint32_t block_size:2;
+      uint32_t ignored0:1;
+      uint32_t invalidate_after_read:1;
+      uint32_t channel_mode:1;
+      uint32_t msg_type:1;
+      uint32_t category:1;
+      uint32_t header_present:1;
+      uint32_t response_length:5;
+      uint32_t msg_length:4;
+      uint32_t pad2:2;
+      uint32_t end_of_thread:1;
+    } gen7_scratch_rw;
+
     /*! Data port OBlock read / write */
     struct {
       uint32_t bti:8;
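
Note: the gen7_scratch_rw bitfield above describes the scratch read/write
message descriptor that setScratchMessage() fills in later in this commit: a
12-bit offset, a 2-bit block size, the channel mode and message type, and the
category bit that the disassembler uses to tell scratch messages from untyped
ones. A hedged sketch of packing those fields by hand, assuming the bitfield
is allocated from the low bit upwards (the helper name is illustrative):

    #include <cstdint>
    #include <cstdio>

    // Packs a scratch message descriptor following the gen7_scratch_rw
    // layout above, low bits first (illustrative helper, not upstream API).
    static uint32_t packScratchDesc(uint32_t offset, uint32_t blockSize,
                                    uint32_t channelMode, uint32_t msgType,
                                    uint32_t msgLength, uint32_t respLength) {
      uint32_t d = 0;
      d |= (offset      & 0xfffu) << 0;   // offset:12
      d |= (blockSize   & 0x3u)   << 12;  // block_size:2
      // bit 14: ignored0, bit 15: invalidate_after_read (left as 0 here)
      d |= (channelMode & 0x1u)   << 16;  // channel_mode:1 (0=OWord, 1=DWord)
      d |= (msgType     & 0x1u)   << 17;  // msg_type:1 (0=read, 1=write)
      d |= 1u                     << 18;  // category:1 (1 selects scratch)
      d |= 1u                     << 19;  // header_present:1
      d |= (respLength  & 0x1fu)  << 20;  // response_length:5
      d |= (msgLength   & 0xfu)   << 25;  // msg_length:4
      return d;
    }

    int main() {
      // e.g. a DWord scratch write of one register at offset 0
      printf("0x%08x\n", packScratchDesc(0, 0, 1, 1, 2, 0));
      return 0;
    }
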
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index f84c6dd..4d6aa34 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -356,103 +356,69 @@ namespace gbe
     0
   };
 
-  static int dst_type(int exec_width) {
-    if (exec_width == 8)
-      return GEN_TYPE_UD;
-    if (exec_width == 16)
-      return GEN_TYPE_UW;
-    NOT_IMPLEMENTED;
-    return 0;
-  }
-
-  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
-    int w = curr.execWidth;
-    dst = GenRegister::h2(dst);
-    dst.type = GEN_TYPE_UD;
-    src.type = GEN_TYPE_UD;
-    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
-    GenRegister imm4 = GenRegister::immud(4);
-    GenInstruction *insn;
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::uw16grf(r.nr, 0));
-    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
-    push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(dst, r);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
-    pop();
-    ADD(src, src, imm4);
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::uw16grf(r.nr, 0));
-    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+  void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
+    src = GenRegister::retype(src, GEN_TYPE_UD);
+    addr = GenRegister::retype(addr, GEN_TYPE_UD);
+    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
+    uint32_t originSimdWidth = curr.execWidth;
+    uint32_t originPredicate = curr.predicate;
+    uint32_t originMask = curr.noMask;
     push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(GenRegister::suboffset(dst, 1), r);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
+          channels < originSimdWidth; channels += 8, currQuarter++) {
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = GEN_MASK_DISABLE;
+      curr.execWidth = 8;
+      /* XXX The following instruction is illegal, but it behaves as SIMD1x4
+         mode, which is what we want here. */
+      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
+      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
+      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
+      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
+      // Let's use SIMD16 to read the data for 8 doubles in one message.
+      curr.execWidth = 16;
+      this->UNTYPED_READ(tmp, addr, bti, elemNum);
+      if (originSimdWidth == 16)
+        curr.quarterControl = currQuarter;
+      curr.predicate = originPredicate;
+      curr.noMask = originMask;
+      // Back to simd8 for correct predication flag.
+      curr.execWidth = 8;
+      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
+    }
     pop();
   }
 
-  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
-    int w = curr.execWidth;
-    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
-    r.type = GEN_TYPE_UD;
-    GenRegister hdr = GenRegister::h2(r);
-    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
-    src.hstride = GEN_HORIZONTAL_STRIDE_2;
-    GenRegister data = GenRegister::offset(r, w / 8);
-    GenRegister imm4 = GenRegister::immud(4);
-    MOV(r, GenRegister::ud8grf(msg.nr, 0));
-    push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(data, src);
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
-    pop();
-    GenInstruction *insn;
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
-    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
-
-    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
+  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+    msg = GenRegister::retype(msg, GEN_TYPE_UD);
+    int originSimdWidth = curr.execWidth;
+    int originPredicate = curr.predicate;
+    int originMask = curr.noMask;
     push();
-    curr.quarterControl = 0;
-    curr.nibControl = 0;
-    MOV(data, GenRegister::suboffset(src, 1));
-    if (w == 8)
-      curr.nibControl = 1;
-    else
-      curr.quarterControl = 1;
-    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+    for (uint32_t half = 0; half < 2; half++) {
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = GEN_MASK_DISABLE;
+      curr.execWidth = 8;
+      MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+      if (originSimdWidth == 16) {
+        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+        curr.execWidth = 16;
+      }
+      if (half == 1)
+        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
+      curr.predicate = originPredicate;
+      curr.noMask = originMask;
+      this->UNTYPED_WRITE(msg, bti, elemNum);
+    }
+    /* Restore the original message (address) register. */
+    /* XXX This could be optimized if we did not allocate the address at the
+       header position of the message. */
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = GEN_MASK_DISABLE;
+    ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(-4));
     pop();
-    insn = next(GEN_OPCODE_SEND);
-    setHeader(insn);
-    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
-    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
-    setSrc1(insn, GenRegister::immud(0));
-    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
   }
 
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
@@ -470,7 +436,7 @@ namespace gbe
       NOT_IMPLEMENTED;
 
     this->setHeader(insn);
-    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
     setDPUntypedRW(this,
@@ -601,24 +567,64 @@ namespace gbe
      return &this->store.back();
   }
 
-  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (dst.isdf() && src.isdf()) {
+  INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                            GenRegister src0, GenRegister src1 = GenRegister::null()) {
        int w = p->curr.execWidth;
        p->push();
-       p->curr.quarterControl = 0;
        p->curr.nibControl = 0;
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
-       p->setSrc0(insn, src);
+       p->setSrc0(insn, src0);
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, src1);
        if (w == 8)
          p->curr.nibControl = 1; // second 1/8 mask
-       else // w == 16
-         p->curr.quarterControl = 1; // second 1/4 mask
        insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, GenRegister::suboffset(dst, w / 2));
-       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+       p->pop();
+  }
+
+  // Double register accesses are a little special.
+  // Per the Gen spec, the only supported mode is SIMD8, and it only
+  // handles four doubles at a time.
+  // We need to lower SIMD16 to two SIMD8 operations, and SIMD8
+  // to two SIMD1x4 operations.
+  INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                           GenRegister src0, GenRegister src1 = GenRegister::null()) {
+      if (p->curr.execWidth == 8)
+        _handleDouble(p, opcode, dst, src0, src1);
+      else if (p->curr.execWidth == 16) {
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        _handleDouble(p, opcode, dst, src0, src1);
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+        if (!GenRegister::isNull(src1))
+          src1 = GenRegister::offset(src1, 2);
+        _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+        p->pop();
+      }
+  }
+
+  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
+     if (dst.isdf() && src.isdf()) {
+       handleDouble(p, opcode, dst, src);
+     } else if (dst.isint64() && src.isint64()) { // handle int64
+       int execWidth = p->curr.execWidth;
+       p->push();
+       p->curr.execWidth = 8;
+       for (int nib = 0; nib < execWidth / 4; nib ++) {
+         p->curr.chooseNib(nib);
+         p->MOV(dst.bottom_half(), src.bottom_half());
+         p->MOV(dst.top_half(), src.top_half());
+         dst = GenRegister::suboffset(dst, 4);
+         src = GenRegister::suboffset(src, 4);
+       }
        p->pop();
      } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
@@ -653,25 +659,7 @@ namespace gbe
                    GenRegister src1)
   {
     if (dst.isdf() && src0.isdf() && src1.isdf()) {
-       int w = p->curr.execWidth;
-       p->push();
-       p->curr.quarterControl = 0;
-       p->curr.nibControl = 0;
-       GenInstruction *insn = p->next(opcode);
-       p->setHeader(insn);
-       p->setDst(insn, dst);
-       p->setSrc0(insn, src0);
-       p->setSrc1(insn, src1);
-       if (w == 8)
-         p->curr.nibControl = 1; // second 1/8 mask
-       else // w == 16
-         p->curr.quarterControl = 1; // second 1/4 mask
-       insn = p->next(opcode);
-       p->setHeader(insn);
-       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
-       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
-       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
-       p->pop();
+       handleDouble(p, opcode, dst, src0, src1);
     } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
@@ -808,7 +796,16 @@ namespace gbe
     r.width = GEN_WIDTH_1;
     r.hstride = GEN_HORIZONTAL_STRIDE_0;
     push();
+    uint32_t width = curr.execWidth;
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
     MOV(dest, r);
+    if (width == 16) {
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r);
+    }
     pop();
   }
 
@@ -836,17 +833,25 @@ namespace gbe
     MOV(dest, src0);
   }
 
+  void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
+    GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
+    int execWidth = curr.execWidth;
+    push();
+    curr.execWidth = 8;
+    for(int nib = 0; nib < execWidth/4; nib ++) {
+      curr.chooseNib(nib);
+      MOV(dest.top_half(), u1);
+      MOV(dest.bottom_half(), u0);
+      dest = GenRegister::suboffset(dest, 4);
+    }
+    pop();
+  }
+
   void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
     int w = curr.execWidth;
     if (src0.isdf()) {
-      push();
-      curr.execWidth = 16;
-      MOV(dest, src0);
-      if (w == 16) {
-        curr.quarterControl = 1;
-        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
-      }
-      pop();
+      GBE_ASSERT(0); // MOV_DF is called from the convert instruction;
+                     // we should never convert a df to a df.
     } else {
       GenRegister r0 = GenRegister::h2(r);
       push();
@@ -903,6 +908,13 @@ namespace gbe
   ALU2(MACH)
   ALU3(MAD)
 
+  void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
+    push();
+    curr.accWrEnable = 1;
+    alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
+    pop();
+  }
+
   void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
     push();
     curr.accWrEnable = 1;
@@ -1150,6 +1162,49 @@ namespace gbe
      this->setSrc0(insn, msg);
      setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
   }
+  static void setScratchMessage(GenEncoder *p,
+                                   GenInstruction *insn,
+                                   uint32_t offset,
+                                   uint32_t block_size,
+                                   uint32_t channel_mode,
+                                   uint32_t msg_type,
+                                   uint32_t msg_length,
+                                   uint32_t response_length)
+  {
+     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+     setMessageDescriptor(p, insn, sfid, msg_length, response_length, true);
+     insn->bits3.gen7_scratch_rw.block_size = block_size;
+     insn->bits3.gen7_scratch_rw.msg_type = msg_type;
+     insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
+     insn->bits3.gen7_scratch_rw.offset = offset;
+     insn->bits3.gen7_scratch_rw.category = 1;
+  }
+
+  void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)
+  {
+     assert(src_num == 1 || src_num ==2);
+     uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+     this->setSrc0(insn, msg);
+     this->setSrc1(insn, GenRegister::immud(0));
+     // Here src_num is the number of registers to write out, in units of 32-byte registers.
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
+  }
+
+  void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)
+  {
+     assert(dst_num == 1 || dst_num ==2);
+     uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, src);
+     this->setSrc1(insn, GenRegister::immud(0));
+     // Here dst_num is the number of registers written back, in units of 32-byte registers.
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
+  }
 
   void GenEncoder::EOT(uint32_t msg) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
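
Note: LOAD_INT64_IMM above materialises a 64-bit immediate with two 32-bit
MOVs, one into the bottom dword and one into the top dword of each 64-bit
lane. The split itself is just the following (illustrative sketch, not
upstream code):

    #include <cassert>
    #include <cstdint>

    // Sketch of the immediate split done by LOAD_INT64_IMM: the low dword
    // goes to the bottom half of the destination, the high dword to the top.
    static void splitInt64(int64_t value, int32_t &lo, int32_t &hi) {
      lo = (int32_t)value;          // MOV(dest.bottom_half(), immd(lo))
      hi = (int32_t)(value >> 32);  // MOV(dest.top_half(),    immd(hi))
    }

    int main() {
      int32_t lo, hi;
      splitInt64(INT64_C(-2), lo, hi);
      assert(lo == -2 && hi == -1);  // the sign extends into the top half
      const uint64_t back = (uint64_t)(uint32_t)lo
                          | ((uint64_t)(uint32_t)hi << 32);
      assert((int64_t)back == INT64_C(-2));
      return 0;
    }
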
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d3a7165..bbf240c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -92,6 +92,7 @@ namespace gbe
     ALU1(MOV)
     ALU1(FBH)
     ALU1(FBL)
+    ALU2(SUBB)
     ALU2(UPSAMPLE_SHORT)
     ALU2(UPSAMPLE_INT)
     ALU1(RNDZ)
@@ -118,11 +119,13 @@ namespace gbe
     ALU2(LINE)
     ALU2(PLN)
     ALU3(MAD)
-    ALU2(MOV_DF);
+    //ALU2(MOV_DF);
 #undef ALU1
 #undef ALU2
 #undef ALU3
+    void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+    void LOAD_INT64_IMM(GenRegister dest, int64_t value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
     /*! Memory fence message (to order loads and stores between threads) */
@@ -141,10 +144,10 @@ namespace gbe
     void WAIT(void);
     /*! Atomic instructions */
     void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
-    /*! Read 64-bits float arrays */
-    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
-    /*! Write 64-bits float arrays */
-    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Read 64-bits float/int arrays */
+    void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Write 64-bits float/int arrays */
+    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
@@ -153,6 +156,10 @@ namespace gbe
     void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
     void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    /*! for scratch memory read */
+    void SCRATCH_READ(GenRegister msg, GenRegister dst, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
+    /*! for scratch memory write */
+    void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
     /*! Send instruction for the sampler */
     void SAMPLE(GenRegister dest,
                 GenRegister msg,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index f3f4a25..da8f2a2 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,13 +12,15 @@ DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
 DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
-DECL_GEN7_SCHEDULE(ReadFloat64,     80,        1,        1)
-DECL_GEN7_SCHEDULE(WriteFloat64,    80,        1,        1)
+DECL_GEN7_SCHEDULE(Read64,          80,        1,        1)
+DECL_GEN7_SCHEDULE(Write64,         80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
+DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
+DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
 DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index cb990be..0b720b7 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -283,19 +283,24 @@ namespace gbe
   uint32_t DependencyTracker::getIndex(GenRegister reg) const {
     // Non GRF physical register
     if (reg.physical) {
-      GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
-      const uint32_t file = reg.nr & 0xf0;
-      const uint32_t nr = reg.nr & 0x0f;
-      if (file == GEN_ARF_FLAG) {
-        const uint32_t subnr = reg.subnr / sizeof(uint16_t);
-        GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
-        return grfNum + 2*nr + subnr;
-      } else if (file == GEN_ARF_ACCUMULATOR) {
-        GBE_ASSERT(nr < MAX_ACC_REGISTER);
-        return grfNum + MAX_FLAG_REGISTER + nr;
+      //GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
+      if(reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+        const uint32_t file = reg.nr & 0xf0;
+        const uint32_t nr = reg.nr & 0x0f;
+        if (file == GEN_ARF_FLAG) {
+          const uint32_t subnr = reg.subnr / sizeof(uint16_t);
+          GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
+          return grfNum + 2*nr + subnr;
+        } else if (file == GEN_ARF_ACCUMULATOR) {
+          GBE_ASSERT(nr < MAX_ACC_REGISTER);
+          return grfNum + MAX_FLAG_REGISTER + nr;
+        } else {
+          NOT_SUPPORTED;
+          return 0;
+        }
       } else {
-        NOT_SUPPORTED;
-        return 0;
+          const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+          return simdWidth == 8 ? reg.nr : reg.nr / 2;
       }
     }
     // We directly manipulate physical GRFs here
@@ -344,6 +349,10 @@ namespace gbe
       this->nodes[index] = node;
     }
 
+    if(insn.opcode == SEL_OP_SPILL_REG) {
+      const uint32_t index = this->getIndex(0xff);
+      this->nodes[index] = node;
+    }
     // Consider barriers and wait write to memory
     if (insn.opcode == SEL_OP_BARRIER ||
         insn.opcode == SEL_OP_FENCE ||
@@ -424,6 +433,11 @@ namespace gbe
         const uint32_t index = tracker.getIndex(insn.extra.function);
         tracker.addDependency(node, index);
       }
+      // Read-after-write dependency on scratch memory
+      if (insn.opcode == SEL_OP_UNSPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(node, index);
+      }
 
       // Consider barriers and wait are reading memory (local and global)
     if (insn.opcode == SEL_OP_BARRIER ||
@@ -453,6 +467,7 @@ namespace gbe
         tracker.addDependency(node, index);
       }
 
+
       // Consider barriers and wait are writing memory (local and global)
     if (insn.opcode == SEL_OP_BARRIER ||
         insn.opcode == SEL_OP_FENCE ||
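
Note: the scheduler changes above model scratch memory as one extra synthetic
resource: SEL_OP_SPILL_REG records its node under the tracker slot obtained
from getIndex(0xff), and a later SEL_OP_UNSPILL_REG adds a dependency on
whatever node is stored there, creating a read-after-write edge between a
spill and the matching unspill. A toy model of that bookkeeping, with made-up
names:

    #include <cassert>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Toy dependency tracker: writers record themselves under a resource
    // index, readers depend on the last writer. kScratch stands in for the
    // single synthetic "scratch memory" slot used by the scheduler above.
    struct ToyTracker {
      static constexpr unsigned kScratch = 0xff;      // illustrative constant
      std::unordered_map<unsigned, int> lastWriter;   // resource -> node id
      std::vector<std::pair<int, int>> edges;         // (writer, reader)

      void write(unsigned res, int node) { lastWriter[res] = node; }
      void read(unsigned res, int node) {
        auto it = lastWriter.find(res);
        if (it != lastWriter.end()) edges.push_back({it->second, node});
      }
    };

    int main() {
      ToyTracker t;
      t.write(ToyTracker::kScratch, /*spill node*/ 3);    // SEL_OP_SPILL_REG
      t.read(ToyTracker::kScratch, /*unspill node*/ 7);   // SEL_OP_UNSPILL_REG
      assert(t.edges.size() == 1 && t.edges[0] == std::make_pair(3, 7));
      return 0;
    }
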
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d4be8bf..1a3af68 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -128,6 +128,8 @@ namespace gbe
       case TYPE_U16: return GEN_TYPE_UW;
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
+      case TYPE_S64: return GEN_TYPE_L;
+      case TYPE_U64: return GEN_TYPE_UL;
       case TYPE_FLOAT: return GEN_TYPE_F;
       case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
@@ -167,14 +169,14 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
-           this->opcode == SEL_OP_READ_FLOAT64 ||
+           this->opcode == SEL_OP_READ64 ||
            this->opcode == SEL_OP_ATOMIC       ||
            this->opcode == SEL_OP_BYTE_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
-           this->opcode == SEL_OP_WRITE_FLOAT64 ||
+           this->opcode == SEL_OP_WRITE64 ||
            this->opcode == SEL_OP_ATOMIC        ||
            this->opcode == SEL_OP_BYTE_SCATTER;
   }
@@ -313,6 +315,8 @@ namespace gbe
     INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
     /*! Implement public class */
     INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+    /*! spill a register (insert spill/unspill instructions) */
+    INLINE void spillReg(ir::Register reg, uint32_t registerPool);
     /*! Implement public class */
     INLINE uint32_t getRegNum(void) const { return file.regNum(); }
     /*! Implements public interface */
@@ -413,19 +417,26 @@ namespace gbe
     ALU1(MOV)
     ALU2(MOV_DF)
     ALU2(LOAD_DF_IMM)
+    ALU1(LOAD_INT64_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU2(SEL)
+    ALU2(SEL_INT64)
     ALU1(NOT)
     ALU2(AND)
     ALU2(OR)
     ALU2(XOR)
+    ALU2(I64AND)
+    ALU2(I64OR)
+    ALU2(I64XOR)
     ALU2(SHR)
     ALU2(SHL)
     ALU2(RSR)
     ALU2(RSL)
     ALU2(ASR)
     ALU2(ADD)
+    ALU3(I64ADD)
+    ALU3(I64SUB)
     ALU2(MUL)
     ALU1(FRC)
     ALU1(RNDD)
@@ -465,10 +476,10 @@ namespace gbe
     void WAIT(void);
     /*! Atomic instruction */
     void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
-    /*! Read 64 bits float array */
-    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
-    /*! Write 64 bits float array */
-    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    /*! Read 64 bits float/int array */
+    void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
+    /*! Write 64 bits float/int array */
+    void WRITE64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -609,6 +620,57 @@ namespace gbe
     return vector;
   }
 
+  void Selection::Opaque::spillReg(ir::Register spilledReg, uint32_t registerPool) {
+    assert(registerPool != 0);
+    const uint32_t simdWidth = ctx.getSimdWidth();
+    const uint32_t dstStart = registerPool + 1;
+    const uint32_t srcStart = registerPool + 1;
+    uint32_t ptr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)*simdWidth);
+
+    for (auto &block : blockList)
+      for (auto &insn : block.insnList) {
+        const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const GenRegister selReg = insn.src(srcID);
+          const ir::Register reg = selReg.reg();
+          if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+            GBE_ASSERT(srcID < 5);
+            SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
+            unspill->state  = GenInstructionState(simdWidth);
+            unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, srcStart+srcID, 0,
+                                          selReg.type, selReg.vstride, selReg.width, selReg.hstride);
+            GenRegister src = insn.src(srcID);
+            // change nr/subnr, keep other register settings
+            src.nr = srcStart+srcID; src.subnr=0; src.physical=1;
+            insn.src(srcID) = src;
+            unspill->extra.scratchOffset = ptr;
+            unspill->extra.scratchMsgHeader = registerPool;
+            insn.prepend(*unspill);
+          }
+        }
+
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const GenRegister selReg = insn.dst(dstID);
+          const ir::Register reg = selReg.reg();
+          if(selReg.file == GEN_GENERAL_REGISTER_FILE && reg == spilledReg) {
+            GBE_ASSERT(dstID < 5);
+            SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
+            spill->state  = GenInstructionState(simdWidth);
+            spill->src(0) =GenRegister(GEN_GENERAL_REGISTER_FILE, dstStart + dstID, 0,
+                                              selReg.type, selReg.vstride, selReg.width, selReg.hstride);
+            GenRegister dst = insn.dst(dstID);
+            // change nr/subnr, keep other register settings
+            dst.physical =1; dst.nr = dstStart+dstID; dst.subnr = 0;
+            insn.dst(dstID)= dst;
+            spill->extra.scratchOffset = ptr;
+            spill->extra.scratchMsgHeader = registerPool;
+            insn.append(*spill);
+          }
+        }
+      }
+  }
+
   ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
     SelectionBlock *block = insn->parent;
     const uint32_t simdWidth = ctx.getSimdWidth();
@@ -760,12 +822,16 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
-  void Selection::Opaque::READ_FLOAT64(Reg addr,
+  /* elemNum counts both the temporary registers and the
+     real destination registers. */
+  void Selection::Opaque::READ64(Reg addr,
+                                       Reg tempAddr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
+                                       uint32_t valueNum,
                                        uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 2);
     SelectionVector *srcVector = this->appendVector();
     SelectionVector *dstVector = this->appendVector();
 
@@ -773,11 +839,12 @@ namespace gbe
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->dst(elemID) = dst[elemID];
     insn->src(0) = addr;
+    insn->src(1) = tempAddr;
     insn->extra.function = bti;
-    insn->extra.elem = elemNum;
+    insn->extra.elem = valueNum;
 
-    // Sends require contiguous allocation
-    dstVector->regNum = elemNum;
+    // Only the temporary registers need contiguous allocation
+    dstVector->regNum = elemNum - valueNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
 
@@ -807,19 +874,21 @@ namespace gbe
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
-
     // Source cannot be scalar (yet)
     srcVector->regNum = 1;
     srcVector->isSrc = 1;
     srcVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+  /* elemNum counts both the temporary registers and the
+     real data registers. */
+  void Selection::Opaque::WRITE64(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
+                                        uint32_t valueNum,
                                         uint32_t bti)
   {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, elemNum+1);
     SelectionVector *vector = this->appendVector();
 
     // Regular instruction to encode
@@ -827,10 +896,10 @@ namespace gbe
     for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
       insn->src(elemID+1) = src[elemID];
     insn->extra.function = bti;
-    insn->extra.elem = elemNum;
+    insn->extra.elem = valueNum;
 
-    // Sends require contiguous allocation for the sources
-    vector->regNum = elemNum+1;
+    // Only the addr + temporary registers need to be contiguous.
+    vector->regNum = (elemNum - valueNum) + 1;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
   }
@@ -1172,6 +1241,9 @@ namespace gbe
   ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID) {
     return this->opaque->replaceDst(insn, regID);
   }
+  void Selection::spillReg(ir::Register reg, uint32_t registerPool) {
+    this->opaque->spillReg(reg, registerPool);
+  }
 
   SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
     return this->opaque->create(opcode, dstNum, srcNum);
@@ -1185,7 +1257,7 @@ namespace gbe
     using namespace ir;
     const auto &childInsn = cast<LoadImmInstruction>(insn);
     const auto &imm = childInsn.getImmediate();
-    if(imm.type != TYPE_DOUBLE)
+    if(imm.type != TYPE_DOUBLE && imm.type != TYPE_S64 && imm.type != TYPE_U64)
       return true;
     return false;
   }
@@ -1408,17 +1480,44 @@ namespace gbe
 
       // Output the binary instruction
       switch (opcode) {
-        case OP_ADD: sel.ADD(dst, src0, src1); break;
+        case OP_ADD:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64ADD(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, src1);
+          break;
         case OP_ADDSAT:
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
             sel.ADD(dst, src0, src1);
           sel.pop();
           break;
-        case OP_XOR: sel.XOR(dst, src0, src1); break;
-        case OP_OR:  sel.OR(dst, src0,  src1); break;
-        case OP_AND: sel.AND(dst, src0, src1); break;
-        case OP_SUB: sel.ADD(dst, src0, GenRegister::negate(src1)); break;
+        case OP_XOR:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64XOR(dst, src0, src1);
+          else
+            sel.XOR(dst, src0, src1);
+          break;
+        case OP_OR:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64OR(dst, src0, src1);
+          else
+            sel.OR(dst, src0, src1);
+          break;
+        case OP_AND:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64AND(dst, src0, src1);
+          else
+            sel.AND(dst, src0, src1);
+          break;
+        case OP_SUB:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64SUB(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, GenRegister::negate(src1));
+          break;
         case OP_SUBSAT:
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
@@ -1782,6 +1881,8 @@ namespace gbe
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
         case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+        case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.s64)); break;
+        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.data.u64)); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -1832,6 +1933,8 @@ namespace gbe
     using namespace ir;
     switch (type) {
       case TYPE_DOUBLE:
+      case TYPE_S64:
+      case TYPE_U64:
         return GEN_BYTE_SCATTER_QWORD;
       case TYPE_FLOAT:
       case TYPE_U32:
@@ -1864,20 +1967,25 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
-    void emitReadFloat64(Selection::Opaque &sel,
+    void emitRead64(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister addr,
                          uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      vector<GenRegister> dst(valueNum);
-      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      if (sel.ctx.getSimdWidth() == 16)
-        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+      uint32_t dstID;
+      /* XXX Only scalar loads are supported right now. */
+      GBE_ASSERT(valueNum == 1);
+
+      // The first 16 DWORDs of register space are temporaries used at the encode stage.
+      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      GenRegister dst[valueNum + tmpRegNum];
+      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
+        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
+      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
+        dst[dstID] = sel.selReg(insn.getValue(valueID));
+      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, valueNum + tmpRegNum, valueNum, bti);
     }
 
     void emitByteGather(Selection::Opaque &sel,
@@ -1932,7 +2040,7 @@ namespace gbe
       if (insn.getAddressSpace() == MEM_CONSTANT)
         this->emitIndirectMove(sel, insn, address);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
+        this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
@@ -1963,7 +2071,7 @@ namespace gbe
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
-    void emitWriteFloat64(Selection::Opaque &sel,
+    void emitWrite64(Selection::Opaque &sel,
                           const ir::StoreInstruction &insn,
                           uint32_t bti) const
     {
@@ -1971,15 +2079,19 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       const uint32_t addrID = ir::StoreInstruction::addressIndex;
       GenRegister addr;
-      vector<GenRegister> value(valueNum);
-
+      uint32_t srcID;
+      /* XXX support scalar only right now. */
+      GBE_ASSERT(valueNum == 1);
       addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
-      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      if (sel.ctx.getSimdWidth() == 16)
-        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
-      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+      // The first 16 DWORDs of register space are temporaries used at the encode stage.
+      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+      GenRegister src[valueNum + tmpRegNum];
+      for (srcID = 0; srcID < tmpRegNum; ++srcID)
+        src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+
+      for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
+        src[srcID] = sel.selReg(insn.getValue(valueID));
+      sel.WRITE64(addr, src, valueNum + tmpRegNum, valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -2012,7 +2124,7 @@ namespace gbe
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
       if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitWriteFloat64(sel, insn, bti);
+        this->emitWrite64(sel, insn, bti);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, bti);
       else {
@@ -2191,7 +2303,10 @@ namespace gbe
         sel.curr.physicalFlag = 0;
         sel.curr.flagIndex = uint16_t(pred);
         sel.curr.noMask = 0;
-        sel.SEL(tmp, src0, src1);
+        if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
+          sel.SEL_INT64(tmp, src0, src1);
+        else
+          sel.SEL(tmp, src0, src1);
       sel.pop();
 
       // Update the destination register properly now
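
To make the elemNum/valueNum split used by READ64/WRITE64 above concrete, the following is a minimal, self-contained C++ sketch of the bookkeeping emitRead64 performs before calling READ64 (read64_layout is a made-up name, not a beignet function):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Sketch only (not beignet code): the DWORD temporaries come first in the dst
    // array and are the only part that must be allocated contiguously.
    static void read64_layout(uint32_t simdWidth, uint32_t valueNum) {
      // Two DWORD temporaries per value at SIMD8, one at SIMD16 (16 DWORDs either way).
      const uint32_t tmpRegNum  = (simdWidth == 8) ? valueNum * 2 : valueNum;
      const uint32_t elemNum    = valueNum + tmpRegNum;  // what READ64 receives
      const uint32_t contiguous = elemNum - valueNum;    // becomes dstVector->regNum
      assert(contiguous == tmpRegNum);
      std::printf("SIMD%u: %u temporaries + %u values = %u dst slots (%u contiguous)\n",
                  simdWidth, tmpRegNum, valueNum, elemNum, contiguous);
    }

    int main(void) {
      read64_layout(8, 1);   // 2 temporaries + 1 value
      read64_layout(16, 1);  // 1 temporary  + 1 value
      return 0;
    }
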
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 5ae6e42..79b73e2 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -107,6 +107,10 @@ namespace gbe
         /*! offset (0 to 7) */
         uint16_t offset:5;
       };
+      struct {
+        uint16_t scratchOffset;
+        uint16_t scratchMsgHeader;
+      };
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
@@ -197,6 +201,8 @@ namespace gbe
     ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID);
     /*! Replace a destination to the returned temporary register */
     ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
+    /*! Spill a register (insert spill/unspill instructions) */
+    void spillReg(ir::Register reg, uint32_t registerPool);
     /*! Create a new selection instruction */
     SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
     /*! List of emitted blocks */
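
The scratchOffset/scratchMsgHeader pair added to SelectionInstruction::extra above shares storage with the fields other opcodes use (function/elem, offset, ...); only the spill/unspill instructions created in spillReg() interpret it. A tiny standalone C++ sketch of the idiom (field names illustrative; anonymous structs inside a union are a compiler extension beignet already relies on):

    #include <cstdint>
    #include <cstdio>

    // Sketch only: one union, interpreted per opcode.
    struct ExtraLike {
      union {
        struct { uint16_t function; uint16_t elem; };                   // e.g. READ64/WRITE64
        struct { uint16_t scratchOffset; uint16_t scratchMsgHeader; };  // SPILL/UNSPILL
      };
    };

    int main(void) {
      ExtraLike e;
      e.scratchOffset    = 0;    // scratch byte offset of this spill slot
      e.scratchMsgHeader = 112;  // GRF used as the scratch message header (the registerPool)
      std::printf("sizeof(ExtraLike) = %zu\n", sizeof(ExtraLike));  // both views share the same bytes
      return 0;
    }
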
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 33c3937..eeca9af 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -2,6 +2,7 @@ DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
 DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
 DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
 DECL_SELECTION_IR(NOT, UnaryInstruction)
 DECL_SELECTION_IR(LZD, UnaryInstruction)
 DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -10,15 +11,21 @@ DECL_SELECTION_IR(RNDD, UnaryInstruction)
 DECL_SELECTION_IR(RNDU, UnaryInstruction)
 DECL_SELECTION_IR(FRC, UnaryInstruction)
 DECL_SELECTION_IR(SEL, BinaryInstruction)
+DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
 DECL_SELECTION_IR(AND, BinaryInstruction)
 DECL_SELECTION_IR(OR, BinaryInstruction)
 DECL_SELECTION_IR(XOR, BinaryInstruction)
+DECL_SELECTION_IR(I64AND, BinaryInstruction)
+DECL_SELECTION_IR(I64OR, BinaryInstruction)
+DECL_SELECTION_IR(I64XOR, BinaryInstruction)
 DECL_SELECTION_IR(SHR, BinaryInstruction)
 DECL_SELECTION_IR(SHL, BinaryInstruction)
 DECL_SELECTION_IR(RSR, BinaryInstruction)
 DECL_SELECTION_IR(RSL, BinaryInstruction)
 DECL_SELECTION_IR(ASR, BinaryInstruction)
 DECL_SELECTION_IR(ADD, BinaryInstruction)
+DECL_SELECTION_IR(I64ADD, TernaryInstruction)
+DECL_SELECTION_IR(I64SUB, TernaryInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
 DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
@@ -35,13 +42,15 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
 DECL_SELECTION_IR(FENCE, FenceInstruction)
 DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
 DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
-DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
-DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
+DECL_SELECTION_IR(READ64, Read64Instruction)
+DECL_SELECTION_IR(WRITE64, Write64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
+DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
 DECL_SELECTION_IR(MUL_HI, TernaryInstruction)
 DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
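
gen_insn_selection.hxx is an X-macro list: it is included with different definitions of DECL_SELECTION_IR to generate, for instance, the SEL_OP_* opcode enum used above. A minimal standalone illustration of the pattern (toy macros, not the actual beignet ones):

    #include <cstdio>

    // Toy X-macro list standing in for gen_insn_selection.hxx.
    #define OPCODES(X) \
      X(READ64)        \
      X(WRITE64)       \
      X(SPILL_REG)     \
      X(UNSPILL_REG)

    // First expansion: build the opcode enum.
    enum Opcode {
    #define X(name) SEL_OP_##name,
      OPCODES(X)
    #undef X
    };

    // Second expansion: build a parallel name table.
    static const char *opcodeName[] = {
    #define X(name) #name,
      OPCODES(X)
    #undef X
    };

    int main(void) {
      std::printf("%d -> %s\n", SEL_OP_SPILL_REG, opcodeName[SEL_OP_SPILL_REG]);
      return 0;
    }
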
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index e7c96ac..ccbc0da 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -31,6 +31,8 @@
 #include <algorithm>
 #include <climits>
 
+#define RESERVED_REG_NUM_FOR_SPILL 6
+
 namespace gbe
 {
   /////////////////////////////////////////////////////////////////////////////
@@ -94,6 +96,10 @@ namespace gbe
     vector<GenRegInterval*> starting;
     /*! Intervals sorting based on ending point positions */
     vector<GenRegInterval*> ending;
+    /*! registers that are spilled */
+    set<ir::Register> spilled;
+    /*! Reserved registers for register spill/reload */
+    uint32_t reservedReg;
     /*! Current vector to expire */
     uint32_t expiringID;
     /*! Use custom allocator */
@@ -259,6 +265,11 @@ namespace gbe
         continue;
       }
 
+      // Ignore registers that were already spilled
+      if(spilled.contains(reg)) {
+        this->expiringID++;
+        continue;
+      }
       // Ignore booleans that were allocated with flags
       // if (ctx.getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
       if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) {
@@ -473,33 +484,67 @@ namespace gbe
       auto it = vectorMap.find(reg);
       if (it != vectorMap.end()) {
         const SelectionVector *vector = it->second.first;
+        // If the first register was spilled, the whole SelectionVector was spilled; skip it
+        if(spilled.contains(vector->reg[0].reg()))
+          continue;
         const uint32_t simdWidth = ctx.getSimdWidth();
-        const uint32_t alignment = simdWidth * sizeof(uint32_t);
+
+        const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
+        const ir::RegisterFamily family = regData.family;
+        const uint32_t typeSize = familyVectorSize[family];
+        const uint32_t alignment = simdWidth*typeSize;
+
         const uint32_t size = vector->regNum * alignment;
+
         uint32_t grfOffset;
         while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
           const bool success = this->expireGRF(interval);
-          if (success == false) return false;
+          if (success == false) {
+            // Without spill support just return false; otherwise fall through and spill the register
+            if(reservedReg == 0) return false;
+            break;
+          }
+        }
+        if(grfOffset == 0) {
+          // Spill all the registers in the SelectionVector.
+          // The tricky part is that reservedReg+1 is used as the scratch write payload,
+          // so the first register has to be written to scratch memory first.
+          // spillReg() simply appends the scratch write insn after the def, so to spill
+          // the first register we must call spillReg() last, for vector->reg[0].
+          GBE_ASSERT(vector->regNum < RESERVED_REG_NUM_FOR_SPILL);
+          for(int i = vector->regNum-1; i >= 0; i--) {
+            spilled.insert(vector->reg[i].reg());
+            selection.spillReg(vector->reg[i].reg(), reservedReg);
+          }
+          continue;
         }
         for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
           const ir::Register reg = vector->reg[regID].reg();
-          GBE_ASSERT(RA.contains(reg) == false);
+          GBE_ASSERT(RA.contains(reg) == false
+                     && ctx.sel->getRegisterData(reg).family == family);
           RA.insert(std::make_pair(reg, grfOffset));
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
-      else if (this->createGenReg(interval) == false)
-        return false;
+      else if (this->createGenReg(interval) == false) {
+        if(reservedReg == 0) return false;
+        spilled.insert(reg);
+        selection.spillReg(reg, reservedReg);
+      }
     }
     return true;
   }
-
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
     const Kernel *kernel = ctx.getKernel();
     const Function &fn = ctx.getFunction();
     GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
-
+    if (ctx.getSimdWidth() == 8) {
+      reservedReg = ctx.allocate(RESERVED_REG_NUM_FOR_SPILL * GEN_REG_SIZE, GEN_REG_SIZE);
+      reservedReg /= GEN_REG_SIZE;
+    } else {
+      reservedReg = 0;
+    }
     // Allocate all the vectors first since they need to be contiguous
     this->allocateVector(selection);
     // schedulePreRegAllocation(ctx, selection);
@@ -684,6 +729,10 @@ namespace gbe
         int subreg = offst % 8;
         std::cout << "%" << vReg << " g" << reg << "." << subreg << "D" << std::endl;
     }
+    std::set<ir::Register>::iterator is;
+    std::cout << "## spilled registers:" << std::endl;
+    for(is = spilled.begin(); is != spilled.end(); is++)
+      std::cout << (int)*is << std::endl;
     std::cout << std::endl;
   }
 
@@ -698,6 +747,9 @@ namespace gbe
 
   INLINE GenRegister GenRegAllocator::Opaque::genReg(const GenRegister &reg) {
     if (reg.file == GEN_GENERAL_REGISTER_FILE) {
+      if(reg.physical == 1) {
+        return reg;
+      }
       GBE_ASSERT(RA.contains(reg.reg()) != false);
       const uint32_t grfOffset = RA.find(reg.reg())->second;
       const GenRegister dst = setGenReg(reg, grfOffset);
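
With the changes above, allocation falls back to spilling when no contiguous GRF block can be found and nothing more can be expired, but only when reservedReg was set aside (SIMD8). A self-contained C++ sketch of that control flow, with allocate/expire/spill as stand-ins for ctx.allocate(), expireGRF() and selection.spillReg():

    #include <cstdint>
    #include <cstdio>
    #include <functional>

    // Sketch only (not beignet code): the allocate-or-spill decision made for one
    // contiguous vector in allocateVector() above.
    static bool allocVectorLike(std::function<uint32_t()> allocate,
                                std::function<bool()>     expire,
                                std::function<void()>     spill,
                                bool spillSupported /* reservedReg != 0, i.e. SIMD8 */) {
      uint32_t grfOffset;
      while ((grfOffset = allocate()) == 0) {       // no free contiguous GRF block
        if (expire() == false) {                    // nothing left to expire
          if (!spillSupported) return false;        // SIMD16: hard failure
          break;                                    // SIMD8: fall through and spill
        }
      }
      if (grfOffset == 0) { spill(); return true; } // spill the whole vector, last reg first
      return true;                                  // normal allocation succeeded
    }

    int main(void) {
      // Toy run: allocation always fails, nothing can be expired, spilling enabled.
      bool ok = allocVectorLike([] { return 0u; }, [] { return false; },
                                [] { std::puts("spilled"); }, true);
      std::printf("ok=%d\n", ok);
      return 0;
    }
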
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index fedb743..2cad4c0 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -70,6 +70,8 @@ namespace gbe
   INLINE int typeSize(uint32_t type) {
     switch(type) {
       case GEN_TYPE_DF:
+      case GEN_TYPE_UL:
+      case GEN_TYPE_L:
         return 8;
       case GEN_TYPE_UD:
       case GEN_TYPE_D:
@@ -134,6 +136,28 @@ namespace gbe
     uint32_t predicate:4;
     uint32_t inversePredicate:1;
     uint32_t saturate:1;
+    void chooseNib(int nib) {
+      switch (nib) {
+        case 0:
+          quarterControl = 0;
+          nibControl = 0;
+          break;
+        case 1:
+          quarterControl = 0;
+          nibControl = 1;
+          break;
+        case 2:
+          quarterControl = 1;
+          nibControl = 0;
+          break;
+        case 3:
+          quarterControl = 1;
+          nibControl = 1;
+          break;
+        default:
+          NOT_IMPLEMENTED;
+      }
+    }
   };
 
   /*! This is a book-keeping structure used to encode both virtual and physical
@@ -200,6 +224,7 @@ namespace gbe
       int32_t d;
       uint32_t ud;
       uint16_t reg;
+      int64_t i64;
     } value;
 
     uint32_t nr:8;         //!< Just for some physical registers (acc, null)
@@ -222,12 +247,32 @@ namespace gbe
       return r;
     }
 
+    INLINE bool isint64(void) const {
+      if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
     INLINE bool isimmdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
         return true;
       return false;
     }
 
+    INLINE GenRegister top_half(void) const {
+      GenRegister r = bottom_half();
+      r.subnr += 4;
+      return r;
+    }
+
+    INLINE GenRegister bottom_half(void) const {
+      GBE_ASSERT(isint64());
+      GenRegister r = *this;
+      r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     INLINE bool isdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
         return true;
@@ -412,6 +457,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immint64(int64_t i) {
+      GenRegister immediate = imm(GEN_TYPE_L);
+      immediate.value.i64 = i;
+      return immediate;
+    }
+
     static INLINE GenRegister immdf(double df) {
       GenRegister immediate = imm(GEN_TYPE_DF);
       immediate.value.df = df;
@@ -553,6 +604,11 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_1);
     }
 
+    static INLINE bool isNull(GenRegister reg) {
+      return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+              && reg.nr == GEN_ARF_NULL);
+    }
+
     static INLINE GenRegister acc(void) {
       return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
                          GEN_ARF_ACCUMULATOR,
@@ -832,6 +888,26 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_2);
     }
 
+    static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
     static INLINE GenRegister mask(uint32_t subnr) {
       return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
     }
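
top_half()/bottom_half() above reinterpret one 64-bit register as two strided DWORD views: same storage, DWORD type, horizontal stride 2, and a 4-byte sub-register offset for the high halves. A small host-side C++ sketch of why that addressing works (assumes little-endian byte order, which holds on the Intel targets here):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Sketch only: a SIMD vector of int64 lanes is, byte-wise, an interleaved
    // array of low/high 32-bit halves.
    int main(void) {
      int64_t lanes[4] = {1, -1, 0x1122334455667788LL, INT64_MIN};
      uint32_t dw[8];
      std::memcpy(dw, lanes, sizeof(lanes));  // view the same bytes as DWORDs
      for (int i = 0; i < 4; ++i) {
        uint32_t lo = dw[2 * i];      // bottom_half(): DWORD type, hstride 2, offset 0
        uint32_t hi = dw[2 * i + 1];  // top_half(): same view shifted by 4 bytes (subnr += 4)
        std::printf("lane %d: lo=0x%08x hi=0x%08x\n", i, lo, hi);
      }
      return 0;
    }
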
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 26c22f3..35d3a7c 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -374,6 +374,12 @@ namespace gbe {
     return kernel->getStackSize();
   }
 
+  static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getScratchSize();
+  }
+
   static int32_t kernelUseSLM(gbe_kernel genKernel) {
     if (genKernel == NULL) return 0;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -443,6 +449,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
@@ -476,6 +483,7 @@ namespace gbe
       gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
       gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+      gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
       gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f36bfbf..d20e7af 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -198,6 +198,10 @@ extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
 typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
 extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
 
+/*! Get the scratch size (zero if no scratch is required) */
+typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
+extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
+
 /*! Get the curbe offset where to put the data. Returns -1 if not required */
 typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
 extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 2d67310..83aaab8 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -96,6 +96,8 @@ namespace gbe {
     INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
     /*! Return the size of the stack (zero if none) */
     INLINE uint32_t getStackSize(void) const { return this->stackSize; }
+    /*! Return the size of the scratch memory needed (zero if none) */
+    INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
     /*! Get the SIMD width for the kernel */
     INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
     /*! Says if SLM is needed for it */
@@ -135,6 +137,7 @@ namespace gbe {
     uint32_t curbeSize;        //!< Size of the data to push
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
+    uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 2a77454..45095db 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -664,6 +664,7 @@ namespace ir {
     static const Type allButBool[] = {TYPE_S8,  TYPE_U8,
                                       TYPE_S16, TYPE_U16,
                                       TYPE_S32, TYPE_U32,
+                                      TYPE_S64, TYPE_U64,
                                       TYPE_FLOAT, TYPE_DOUBLE};
     static const uint32_t allButBoolNum = ARRAY_ELEM_NUM(allButBool);
 
@@ -671,6 +672,7 @@ namespace ir {
     static const Type logicalType[] = {TYPE_S8,  TYPE_U8,
                                        TYPE_S16, TYPE_U16,
                                        TYPE_S32, TYPE_U32,
+                                       TYPE_S64, TYPE_U64,
                                        TYPE_BOOL};
     static const uint32_t logicalTypeNum = ARRAY_ELEM_NUM(logicalType);
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index c8c5484..18448cf 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2371,8 +2371,6 @@ namespace gbe
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
-      if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
-        OCL_SIMD_WIDTH = 8;
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
         ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index c972a3e..84f15ca 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -543,13 +543,13 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_p
 INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
   return __gen_ocl_pow(x, 0.3333333333f);
 }
-INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
-  *cosval = native_cos(x);
+#define BODY \
+  *cosval = native_cos(x); \
   return native_sin(x);
-}
-INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
-INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
-INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
+INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
+#undef BODY
 
 INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
   return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
@@ -562,7 +562,16 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
   return (1 - y) / (1 + y);
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-  return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
+  float sum = x, c = x, m = 1.0;
+  int n = 1;
+  do
+  {
+    c *= (2 * n - 1) * x * x;
+    m *= (2 * n);
+    sum += ( c / m / (2 * n + 1));
+    n++;
+  }while( n < 30);
+  return sum;
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
   return __gen_ocl_internal_asin(x) / M_PI_F;
@@ -726,23 +735,22 @@ DECL_MIN_MAX_CLAMP(unsigned short)
 DECL_MIN_MAX_CLAMP(unsigned char)
 #undef DECL_MIN_MAX_CLAMP
 
-INLINE_OVERLOADABLE float __gen_ocl_frexp(float x, int *exp) {
-  uint u = as_uint(x);
-  if ((u & 0x7FFFFFFFu) == 0) {
-    *exp = 0;
-    return x;
-  }
-  int e = (u >> 23) & 255;
-  if (e == 255)
-    return x;
-  *exp = e - 126;
-  u = (u & (0x807FFFFFu)) | 0x3F000000;
+#define BODY \
+  uint u = as_uint(x); \
+  if ((u & 0x7FFFFFFFu) == 0) { \
+    *exp = 0; \
+    return x; \
+  } \
+  int e = (u >> 23) & 255; \
+  if (e == 255) \
+    return x; \
+  *exp = e - 126; \
+  u = (u & (0x807FFFFFu)) | 0x3F000000; \
   return as_float(u);
-}
-
-INLINE_OVERLOADABLE float frexp(float x, global int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
-INLINE_OVERLOADABLE float frexp(float x, local int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
-INLINE_OVERLOADABLE float frexp(float x, private int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
+INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
 
 INLINE_OVERLOADABLE float nextafter(float x, float y) {
   uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
@@ -760,24 +768,22 @@ INLINE_OVERLOADABLE float nextafter(float x, float y) {
   return as_float(hx);
 }
 
-INLINE_OVERLOADABLE float __gen_ocl_modf(float x, float *i) {
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
-  if (ix > 0x7F800000) {
-    *i = nan(0u);
-    return nan(0u);
-  }
-  if (ix == 0x7F800000) {
-    *i = x;
-    return as_float(hx & 0x80000000u);
-  }
-  *i = __gen_ocl_rndz(x);
+#define BODY \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+  if (ix > 0x7F800000) { \
+    *i = nan(0u); \
+    return nan(0u); \
+  } \
+  if (ix == 0x7F800000) { \
+    *i = x; \
+    return as_float(hx & 0x80000000u); \
+  } \
+  *i = __gen_ocl_rndz(x); \
   return x - *i;
-}
-
-INLINE_OVERLOADABLE float modf(float x, global float *i) { return __gen_ocl_modf(x, (float *)i); }
-INLINE_OVERLOADABLE float modf(float x, local float *i) { return __gen_ocl_modf(x, (float *)i); }
-INLINE_OVERLOADABLE float modf(float x, private float *i) { return __gen_ocl_modf(x, (float *)i); }
-
+INLINE_OVERLOADABLE float modf(float x, global float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, local float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
 INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
 INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
 
@@ -810,32 +816,30 @@ INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
 INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
   return __gen_ocl_internal_fmax(x, y) - y;
 }
-INLINE_OVERLOADABLE float __gen_ocl_fract(float x, float *p) {
-  *p = __gen_ocl_internal_floor(x);
+#define BODY \
+  *p = __gen_ocl_internal_floor(x); \
   return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-}
-INLINE_OVERLOADABLE float fract(float x, global float *p) { return __gen_ocl_fract(x, (float *)p); }
-INLINE_OVERLOADABLE float fract(float x, local float *p) { return __gen_ocl_fract(x, (float *)p); }
-INLINE_OVERLOADABLE float fract(float x, private float *p) { return __gen_ocl_fract(x, (float *)p); }
-
-INLINE_OVERLOADABLE float __gen_ocl_remquo(float x, float y, int *quo) {
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF;
-  if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0)
-    return nan(0u);
-  float k = x / y;
-  int q =  __gen_ocl_rnde(k);
-  *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u);
-  float r = x - q * y;
-  uint hr = as_uint(r), ir = hr & 0x7FFFFFFF;
-  if (ir == 0)
-    hr = ir | (hx & 0x80000000u);
+INLINE_OVERLOADABLE float fract(float x, global float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, local float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+#define BODY \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF; \
+  if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0) \
+    return nan(0u); \
+  float k = x / y; \
+  int q =  __gen_ocl_rnde(k); \
+  *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u); \
+  float r = x - q * y; \
+  uint hr = as_uint(r), ir = hr & 0x7FFFFFFF; \
+  if (ir == 0) \
+    hr = ir | (hx & 0x80000000u); \
   return as_float(hr);
-}
-
-INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
-
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
+#undef BODY
 INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
 INLINE_OVERLOADABLE float ldexp(float x, int n) {
   return __gen_ocl_pow(2, n) * x;
@@ -955,6 +959,7 @@ DECL_UNTYPED_RW_ALL(uint)
 DECL_UNTYPED_RW_ALL(long)
 DECL_UNTYPED_RW_ALL(ulong)
 DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
 
 #undef DECL_UNTYPED_RW_ALL
 #undef DECL_UNTYPED_RW_ALL_SPACE
@@ -1055,6 +1060,114 @@ DEF(float)
 #undef DEC8
 #undef DEC16
 
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, uint2 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC2X(TYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, uint2 mask) { \
+    TYPE##2 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    return z; \
+  }
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, uint4 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC4X(TYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, uint4 mask) { \
+    TYPE##4 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    return z; \
+  }
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, uint8 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC8X(TYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, uint8 mask) { \
+    TYPE##8 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+    return z; \
+  }
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, uint16 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC16X(TYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, uint16 mask) { \
+    TYPE##16 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+    z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
+    z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
+    z.sa = mask.sa < 16 ? ((TYPE *)&x)[mask.sa] : ((TYPE *)&y)[mask.sa & 15]; \
+    z.sb = mask.sb < 16 ? ((TYPE *)&x)[mask.sb] : ((TYPE *)&y)[mask.sb & 15]; \
+    z.sc = mask.sc < 16 ? ((TYPE *)&x)[mask.sc] : ((TYPE *)&y)[mask.sc & 15]; \
+    z.sd = mask.sd < 16 ? ((TYPE *)&x)[mask.sd] : ((TYPE *)&y)[mask.sd & 15]; \
+    z.se = mask.se < 16 ? ((TYPE *)&x)[mask.se] : ((TYPE *)&y)[mask.se & 15]; \
+    z.sf = mask.sf < 16 ? ((TYPE *)&x)[mask.sf] : ((TYPE *)&y)[mask.sf & 15]; \
+    return z; \
+  }
+
+#define DEF(TYPE) \
+  DEC2(TYPE, TYPE##2, TYPE##4) \
+  DEC2(TYPE, TYPE##4, TYPE##8) \
+  DEC2(TYPE, TYPE##8, TYPE##16) \
+  DEC2X(TYPE) \
+  DEC4(TYPE, TYPE##2, TYPE##4) \
+  DEC4(TYPE, TYPE##4, TYPE##8) \
+  DEC4(TYPE, TYPE##8, TYPE##16) \
+  DEC4X(TYPE) \
+  DEC8(TYPE, TYPE##2, TYPE##4) \
+  DEC8(TYPE, TYPE##4, TYPE##8) \
+  DEC8(TYPE, TYPE##8, TYPE##16) \
+  DEC8X(TYPE) \
+  DEC16(TYPE, TYPE##2, TYPE##4) \
+  DEC16(TYPE, TYPE##4, TYPE##8) \
+  DEC16(TYPE, TYPE##8, TYPE##16) \
+  DEC16X(TYPE)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef DEF
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
 /////////////////////////////////////////////////////////////////////////////
 // Synchronization functions
 /////////////////////////////////////////////////////////////////////////////
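
The shuffle2() overloads added above select each result component from the concatenation of x and y: in the 16-wide case the macro picks x[mask] when mask < 16 and y[mask & 15] otherwise, while the narrower overloads concatenate their arguments and defer to shuffle(). A host-side C++ model of the 16-wide rule (shuffle2_16 is a made-up name, not an OpenCL builtin):

    #include <cstdio>

    // Sketch only: mirrors the DEC16X macro above for one component.
    static float shuffle2_16(const float x[16], const float y[16], unsigned m) {
      return m < 16 ? x[m] : y[m & 15];
    }

    int main(void) {
      float x[16] = {0}, y[16] = {0};
      y[0] = 3.5f; y[1] = 7.25f;
      // mask2 = (17, 16, ...) in kernels/builtin_shuffle2.cl: 17 -> y[1], 16 -> y[0]
      std::printf("%g %g\n", shuffle2_16(x, y, 17), shuffle2_16(x, y, 16));
      return 0;
    }
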
diff --git a/kernels/builtin_shuffle2.cl b/kernels/builtin_shuffle2.cl
new file mode 100644
index 0000000..1a122d4
--- /dev/null
+++ b/kernels/builtin_shuffle2.cl
@@ -0,0 +1,13 @@
+kernel void builtin_shuffle2(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+  int i = get_global_id(0);
+  float2 x = (float2)(src1[i], src2[i]);
+  float2 y = (float2)(1234, 5678);
+  uint4 mask = (uint4)(1, 0, 0, 0);
+  float4 v1 = shuffle2(x, y, mask);
+  float16 x2 = 0;
+  float16 y2 = (float16)(src1[i], src2[i], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  uint16 mask2 = (uint16)(17, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  float16 v2 = shuffle2(x2, y2, mask2);
+  dst1[i] = v1.s0 + v2.s0;
+  dst2[i] = v1.s1 + v2.s1;
+}
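
For reference, working through the macros above: with mask = (1, 0, 0, 0), v1 = shuffle2((float2)(src1[i], src2[i]), (float2)(1234, 5678), mask) gives v1.s0 = src2[i] and v1.s1 = src1[i]; with mask2 = (17, 16, 0, ...), v2.s0 = y2.s1 = src2[i] and v2.s1 = y2.s0 = src1[i]. The kernel therefore stores dst1[i] = 2*src2[i] and dst2[i] = 2*src1[i], which is exactly what utests/builtin_shuffle2.cpp below asserts.
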
diff --git a/kernels/compiler_double_4.cl b/kernels/compiler_double_4.cl
new file mode 100644
index 0000000..e5e46f9
--- /dev/null
+++ b/kernels/compiler_double_4.cl
@@ -0,0 +1,5 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_4(global double *src1, global double *src2, global double *dst) {
+  int i = get_global_id(0);
+  dst[i] = src1[i] + src2[i];
+}
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
new file mode 100644
index 0000000..3087292
--- /dev/null
+++ b/kernels/compiler_long.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  if(i < 5)
+    dst[i] = src1[i] + src2[i];
+  if(i > 5)
+    dst[i] = src1[i] - src2[i];
+}
diff --git a/kernels/compiler_long_2.cl b/kernels/compiler_long_2.cl
new file mode 100644
index 0000000..92be93a
--- /dev/null
+++ b/kernels/compiler_long_2.cl
@@ -0,0 +1,20 @@
+kernel void compiler_long_2(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  switch(i) {
+    case 0:
+      dst[i] = 0xFEDCBA9876543210UL;
+      break;
+    case 1:
+      dst[i] = src1[i] & src2[i];
+      break;
+    case 2:
+      dst[i] = src1[i] | src2[i];
+      break;
+    case 3:
+      dst[i] = src1[i] ^ src2[i];
+      break;
+    case 4:
+      dst[i] = src1[i] ? 0x1122334455667788L : 0x8877665544332211UL;
+      break;
+  }
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
index 30f0e1e..320194e 100644
--- a/kernels/compiler_vector_load_store.cl
+++ b/kernels/compiler_vector_load_store.cl
@@ -1,5 +1,5 @@
 /* test OpenCL 1.1 Vector Data Load/Store Functions (section 6.11.7) */
-
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
 #define OFFSET2(type)  (type ##2) {(type)1, (type)2}
 #define OFFSET3(type)  (type ##3) {(type)1, (type)2, (type)3}
 #define OFFSET4(type)  (type ##4) {(type)1, (type)2, (type)3, (type)4}
@@ -24,10 +24,10 @@ __kernel void test_##type ##n(__global type *pin, \
   TEST_TYPE(ushort,n)\
   TEST_TYPE(int,n)   \
   TEST_TYPE(uint,n)  \
-  TEST_TYPE(float,n)
+  TEST_TYPE(float,n) \
+  TEST_TYPE(double,n)
 
 #if 0
-  TEST_TYPE(double,n)
   TEST_TYPE(long,n)
   TEST_TYPE(ulong,n)
   TEST_TYPE(half,n)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 048595c..e58433f 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -180,10 +180,17 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   assert(offset >= 0);
   stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit;
-  stack_sz *= device->max_thread_per_unit;
   cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
 }
 
+static void
+cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
+{
+  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
+
+  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
+}
+
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
@@ -232,6 +239,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind all samplers */
   cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
 
+  cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
   cl_gpgpu_states_setup(gpgpu, &kernel);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 212beb3..673985d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -135,6 +135,10 @@ extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
 
+/* Setup scratch */
+typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
+
 /* Configure internal state */
 typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
 extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 4952288..9aa926e 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -50,6 +50,7 @@ LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
 LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
 LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
+LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 2791fbe..e553a55 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -89,7 +89,9 @@ struct intel_gpgpu
   struct { drm_intel_bo *bo; } curbe_b;
   struct { drm_intel_bo *bo; } sampler_state_b;
   struct { drm_intel_bo *bo; } perf_b;
+  struct { drm_intel_bo *bo; } scratch_b;
 
+  uint32_t per_thread_scratch;
   struct {
     uint32_t num_cs_entries;
     uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
@@ -127,6 +129,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
     drm_intel_bo_unreference(gpgpu->perf_b.bo);
   if (gpgpu->stack_b.bo)
     drm_intel_bo_unreference(gpgpu->stack_b.bo);
+  if (gpgpu->scratch_b.bo)
+    drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+
   intel_batchbuffer_delete(gpgpu->batch);
   cl_free(gpgpu);
 }
@@ -199,18 +204,23 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
   BEGIN_BATCH(gpgpu->batch, 8);
   OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
 
-  gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
-    intel_batchbuffer_alloc_space(gpgpu->batch,0);
-
-  memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
-  vfe->vfe1.gpgpu_mode = 1;
-  vfe->vfe1.bypass_gateway_ctl = 1;
-  vfe->vfe1.reset_gateway_timer = 1;
-  vfe->vfe1.max_threads = gpgpu->max_threads - 1;
-  vfe->vfe1.urb_entries = 64;
-  vfe->vfe3.curbe_size = 480;
-  vfe->vfe4.scoreboard_mask = 0;
-  intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t));
+  if(gpgpu->per_thread_scratch > 0) {
+    OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              gpgpu->per_thread_scratch/1024 - 1);
+  }
+  else {
+    OUT_BATCH(gpgpu->batch, 0);
+  }
+  /* max_threads | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (64 << 8) | 0xc4);
+  OUT_BATCH(gpgpu->batch, 0);
+  /* curbe_size */
+  OUT_BATCH(gpgpu->batch, 480);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -537,6 +547,23 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u
 }
 
 static void
+intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bo* old = gpgpu->scratch_b.bo;
+  uint32_t total = per_thread_size * gpgpu->max_threads;
+
+  gpgpu->per_thread_scratch = per_thread_size;
+
+  if(old && old->size < total) {
+    drm_intel_bo_unreference(old);
+    old = NULL;
+  }
+
+  if(!old)
+    gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+}
+static void
 intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
@@ -823,5 +850,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+  cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
 }
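
intel_gpgpu_set_scratch() above keeps the existing scratch BO when it is already large enough and only reallocates when the total (per-thread size times max threads) grows. A toy C++ sketch of that keep-or-grow policy (plain malloc/free stand in for the libdrm buffer-object calls):

    #include <cstdio>
    #include <cstdlib>

    // Sketch only: a pseudo buffer object with just a size.
    struct Buf { size_t size; };

    static Buf *scratch_bo = nullptr;

    static void set_scratch(size_t per_thread_size, size_t max_threads) {
      size_t total = per_thread_size * max_threads;
      if (scratch_bo && scratch_bo->size < total) {  // existing BO too small: drop it
        std::free(scratch_bo);
        scratch_bo = nullptr;
      }
      if (!scratch_bo) {                             // (re)allocate at the new size
        scratch_bo = static_cast<Buf *>(std::malloc(sizeof(Buf)));
        scratch_bo->size = total;
      }
      std::printf("scratch: per-thread %zu, total %zu\n", per_thread_size, scratch_bo->size);
    }

    int main(void) {
      set_scratch(1024, 60);   // allocate
      set_scratch(512, 60);    // smaller request: keep the existing, larger BO
      set_scratch(4096, 60);   // larger request: reallocate
      std::free(scratch_bo);
      return 0;
    }
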
 
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 5cd20c3..b205c67 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -107,6 +107,7 @@ set (utests_sources
   builtin_nextafter.cpp
   builtin_remquo.cpp
   builtin_shuffle.cpp
+  builtin_shuffle2.cpp
   builtin_sign.cpp
   buildin_work_dim.cpp
   builtin_global_size.cpp
@@ -119,6 +120,9 @@ set (utests_sources
   compiler_double.cpp
   compiler_double_2.cpp
   compiler_double_3.cpp
+  compiler_double_4.cpp
+  compiler_long.cpp
+  compiler_long_2.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/builtin_shuffle2.cpp b/utests/builtin_shuffle2.cpp
new file mode 100644
index 0000000..7a9ebd1
--- /dev/null
+++ b/utests/builtin_shuffle2.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void builtin_shuffle2(void)
+{
+  const int n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_shuffle2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; i ++) {
+    ((float *)(buf_data[0]))[i] = (rand() & 15) * 0.1f;
+    ((float *)(buf_data[1]))[i] = (rand() & 15) * 0.1f;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; i ++) {
+    OCL_ASSERT(2 * ((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+    OCL_ASSERT(2 * ((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle2);
diff --git a/utests/compiler_double_4.cpp b/utests/compiler_double_4.cpp
new file mode 100644
index 0000000..cb25bd4
--- /dev/null
+++ b/utests/compiler_double_4.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void compiler_double_4(void)
+{
+  const size_t n = 16;
+  double cpu_src1[n], cpu_src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_4");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu_src1[i] = ((double*)buf_data[0])[i] = rand() * 1e-2;
+    cpu_src2[i] = ((double*)buf_data[1])[i] = rand() * 1e-2;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src1[i] - cpu_src2[i]) < 1e-4);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_4);
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
new file mode 100644
index 0000000..fad2744
--- /dev/null
+++ b/utests/compiler_long.cpp
@@ -0,0 +1,58 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  src1[0] = -1L,                  src2[0] = -1L;
+  src1[1] = 0x8000000000000000UL, src2[1] = 0x8000000000000000UL;
+  src1[2] = 0x7FFFFFFFFFFFFFFFL,  src2[2] = 1L;
+  src1[3] = 0xFFFFFFFEL,          src2[3] = 1L;
+  src1[4] = 0x7FFFFFFFL,          src2[4] = 0x80000000L;
+  src1[5] = 0,                    src2[5] = 0;
+  src1[6] = 0,                    src2[6] = 1;
+  src1[7] = -2L,                  src2[7] = -1L;
+  src1[8] = 0,                    src2[8] = 0x8000000000000000UL;
+  for (int32_t i = 9; i < (int32_t) n; ++i) {
+    src1[i] = ((long)rand() << 32) + rand();
+    src2[i] = ((long)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 5)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    if (i > 5)
+      OCL_ASSERT(src1[i] - src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
new file mode 100644
index 0000000..e3c6640
--- /dev/null
+++ b/utests/compiler_long_2.cpp
@@ -0,0 +1,51 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_2(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = ((long)rand() << 32) + rand();
+    src2[i] = ((long)rand() << 32) + rand();
+  }
+  src1[4] = 1;
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  int64_t *dest = ((int64_t *)buf_data[2]);
+  //for (int32_t i = 0; i < (int32_t) n; ++i)
+  //  printf("%lx\n", dest[i]);
+  OCL_ASSERT(0xFEDCBA9876543210UL == (uint64_t)dest[0]);
+  OCL_ASSERT((src1[1] & src2[1]) == dest[1]);
+  OCL_ASSERT((src1[2] | src2[2]) == dest[2]);
+  OCL_ASSERT((src1[3] ^ src2[3]) == dest[3]);
+  OCL_ASSERT(0x1122334455667788L == dest[4]);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_2);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index 79f284f..7deb7cb 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -1,4 +1,5 @@
 #include "utest_helper.hpp"
+#include <string.h>
 template<typename T>
 static void compiler_vector_load_store(int elemNum, const char *kernelName)
 {
@@ -9,8 +10,8 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
   buf_data[0] = (T*) malloc(sizeof(T) * n);
   for (uint32_t i = 0; i < n; ++i)
     ((T*)buf_data[0])[i] = i;
-  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
@@ -27,7 +28,10 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
   for (uint32_t i = 0; i < n; ++i)
   {
     int shift = ((i % elemNum) + 1);
-    OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+    if (strstr(kernelName, "double") == NULL)
+      OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+    else
+      OCL_ASSERT((((T*)buf_data[1])[i] - ((T)((T*)buf_data[0])[i] + shift)) < 1e-5);
   }
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
@@ -54,6 +58,6 @@ test_all_vector(uint16_t, ushort)
 test_all_vector(int32_t, int)
 test_all_vector(uint32_t, uint)
 test_all_vector(float, float)
-//test_all_vector(double, double)
+test_all_vector(double, double)
 //test_all_vector(int64_t, long)
 //test_all_vector(uint64_t, ulong)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git


