[Pkg-opencl-devel] [beignet] 25/66: Imported Upstream version 0.1+git20130521+a7ea35c

Fri Oct 31 07:27:04 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit e85d9784fbd36caa1b1d5031cf56c0862db101e4
Author: Simon Richter <sjr at debian.org>
Date:   Tue May 21 09:17:45 2013 +0200

    Imported Upstream version 0.1+git20130521+a7ea35c
---
 .gitignore                                         |   5 +-
 backend/src/.gitignore                             |   3 +
 backend/src/CMakeLists.txt                         |   5 +
 backend/src/backend/context.cpp                    |  24 +-
 backend/src/backend/context.hpp                    |   3 +
 backend/src/backend/gen/gen_mesa_disasm.c          |   3 +-
 backend/src/backend/gen_context.cpp                |  78 +-
 backend/src/backend/gen_context.hpp                |   3 +-
 backend/src/backend/gen_defs.hpp                   |   2 -
 backend/src/backend/gen_encoder.cpp                |  68 +-
 backend/src/backend/gen_encoder.hpp                |  10 +-
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   3 +-
 backend/src/backend/gen_insn_scheduling.cpp        |   2 +-
 backend/src/backend/gen_insn_selection.cpp         |  98 ++-
 backend/src/backend/gen_insn_selection.hxx         |   3 +-
 backend/src/backend/program.cpp                    |  51 +-
 backend/src/backend/program.h                      |  37 +-
 backend/src/backend/program.hpp                    |  26 +-
 backend/src/ir/function.cpp                        |   2 +
 backend/src/ir/function.hpp                        |  26 +
 backend/src/ir/image.cpp                           | 108 +++
 backend/src/ir/image.hpp                           |  71 ++
 backend/src/ir/instruction.cpp                     |  45 +-
 backend/src/ir/instruction.hpp                     |  45 +-
 backend/src/ir/instruction.hxx                     |   2 +-
 backend/src/ir/sampler.cpp                         |  78 ++
 backend/src/ir/sampler.hpp                         |  78 ++
 backend/src/ir/unit.hpp                            |  22 +-
 backend/src/llvm/llvm_gen_backend.cpp              | 551 ++++++--------
 backend/src/llvm/llvm_gen_backend.hpp              |  30 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 backend/src/llvm/llvm_scalarize.cpp                | 836 +++++++++++++++++++++
 backend/src/llvm/llvm_to_gen.cpp                   |   1 +
 backend/src/ocl_common_defines.h                   |   9 +-
 backend/src/ocl_stdlib.h                           | 400 +++++++++-
 backend/src/sys/alloc.hpp                          |   1 +
 include/CL/cl.hpp                                  | 104 +--
 include/CMakeLists.txt                             |   2 +
 kernels/compiler_julia.cl                          |   2 -
 kernels/compiler_julia_no_break.cl                 |   2 -
 kernels/compiler_math.cl                           |  46 +-
 kernels/compiler_math_2op.cl                       |  19 +
 kernels/compiler_math_3op.cl                       |   9 +
 kernels/compiler_menger_sponge.cl                  |   2 -
 kernels/compiler_menger_sponge_no_shadow.cl        |   2 -
 kernels/compiler_nautilus.cl                       |   4 +-
 kernels/test_copy_image1.cl                        |  33 +
 kernels/test_fill_image0.cl                        |   2 +-
 kernels/test_get_image_size.cl                     |   9 +
 src/.gitignore                                     |   2 +
 src/CMakeLists.txt                                 |  10 +-
 src/cl_api.c                                       | 270 ++++++-
 src/cl_command_queue.c                             |  41 +-
 src/cl_command_queue.h                             |   3 +
 src/cl_command_queue_gen7.c                        |   4 +
 src/cl_device_id.c                                 |   1 +
 src/cl_device_id.h                                 |   6 +-
 src/cl_driver.h                                    |  29 +-
 src/cl_driver_defs.c                               |   3 +-
 src/cl_gt_device.h                                 |   1 +
 src/cl_kernel.c                                    |  52 +-
 src/cl_kernel.h                                    |   4 +
 src/cl_mem.c                                       |  25 +-
 src/cl_mem.h                                       |   9 +-
 src/cl_mem_gl.c                                    |   4 +-
 src/cl_sampler.c                                   |  47 ++
 src/cl_sampler.h                                   |   5 +
 src/cl_utils.h                                     |   9 +
 src/intel/intel_driver.c                           |  43 +-
 src/intel/intel_gpgpu.c                            |  73 +-
 utests/.gitignore                                  |  13 +
 utests/CMakeLists.txt                              |   2 +
 ...ovforphi_undef.cpp => compiler_copy_image1.cpp} |  50 +-
 utests/compiler_fill_image0.cpp                    |   7 +-
 ...fill_image0.cpp => compiler_get_image_size.cpp} |  24 +-
 utests/compiler_math.cpp                           |  88 ++-
 utests/compiler_math_2op.cpp                       |  80 ++
 utests/compiler_math_3op.cpp                       |  64 ++
 utests/compiler_movforphi_undef.cpp                |   4 +-
 utests/utest_helper.hpp                            |  11 +
 80 files changed, 3261 insertions(+), 692 deletions(-)

diff --git a/.gitignore b/.gitignore
index bae14b5..90fd161 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 *.o
-*.so*
+CMakeCache.txt
+CMakeFiles/
+Makefile
+cmake_install.cmake
diff --git a/backend/src/.gitignore b/backend/src/.gitignore
index afcb283..39239f3 100644
--- a/backend/src/.gitignore
+++ b/backend/src/.gitignore
@@ -1 +1,4 @@
+GBEConfig.h
+libgbe.so
+ocl_common_defines_str.cpp
 ocl_stdlib_str.cpp
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index ac7e1da..183517a 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -61,6 +61,10 @@ else (GBE_USE_BLOB)
     ir/unit.hpp
     ir/constant.cpp
     ir/constant.hpp
+    ir/sampler.cpp
+    ir/sampler.hpp
+    ir/image.cpp
+    ir/image.hpp
     ir/instruction.cpp
     ir/instruction.hpp
     ir/liveness.cpp
@@ -79,6 +83,7 @@ else (GBE_USE_BLOB)
     backend/program.h
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
+    llvm/llvm_scalarize.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index c636b48..474c36a 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -29,6 +29,8 @@
 #include "ir/profile.hpp"
 #include "ir/liveness.hpp"
 #include "ir/value.hpp"
+#include "ir/image.hpp"
+#include "ir/sampler.hpp"
 #include "sys/cvar.hpp"
 #include <algorithm>
 
@@ -120,7 +122,12 @@ namespace gbe
           continue;
         }
       } else {
-        aligned = ALIGN(list->offset+list->size-size-(alignment-1), alignment);   //alloc from block's tail
+        int16_t unaligned = list->offset + list->size - size - (alignment-1);
+        if(unaligned < 0) {
+          list = list->prev;
+          continue;
+        }
+        aligned = ALIGN(unaligned, alignment);   //alloc from block's tail
         spaceOnLeft = aligned - list->offset;
         spaceOnRight = list->size - size - spaceOnLeft;
 
@@ -196,7 +203,6 @@ namespace gbe
 
       // Track the allocation to retrieve the size later
       allocatedBlocks.insert(std::make_pair(aligned, size));
-
       // We have a valid offset now
       return aligned;
     }
@@ -352,6 +358,20 @@ namespace gbe
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
   }
 
+  uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
+  {
+    int32_t offset = fn.getImageSet()->getInfoOffset(key);
+    if (offset >= 0)
+      return offset;
+    newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
+    std::sort(kernel->patches.begin(), kernel->patches.end());
+
+    offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
+    GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
+    fn.getImageSet()->appendInfo(key, offset);
+    return offset;
+  }
+
   void Context::buildPatchList(void) {
     const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
     kernel->curbeSize = 0u;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 245ad01..c205388 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -88,6 +88,9 @@ namespace gbe
     void deallocate(int16_t offset);
     /* allocate curbe for constant ptr argument */
     int32_t allocConstBuf(uint32_t argID);
+    /* allocate a new entry for a specific image's information */
+    /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
+    uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index fa52f8c..420cd62 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -329,8 +329,7 @@ static const char *math_function[16] = {
   [GEN_MATH_FUNCTION_RSQ] = "rsq",
   [GEN_MATH_FUNCTION_SIN] = "sin",
   [GEN_MATH_FUNCTION_COS] = "cos",
-  [GEN_MATH_FUNCTION_SINCOS] = "sincos",
-  [GEN_MATH_FUNCTION_TAN] = "tan",
+  [GEN_MATH_FUNCTION_FDIV] = "fdiv",
   [GEN_MATH_FUNCTION_POW] = "pow",
   [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
   [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4a16cae..18f6c11 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -212,7 +212,7 @@ namespace gbe
     }
   }
 
-  void GenContext::emitCBMoveInstruction(const SelectionInstruction &insn) {
+  void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
     const GenRegister src = GenRegister::unpacked_uw(ra->genReg(insn.src(0)).nr, 0);
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister a0 = GenRegister::addr8(0);
@@ -287,47 +287,35 @@ namespace gbe
   void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
-    const GenRegister bti = ra->genReg(insn.src(4));
-    const GenRegister sampler = ra->genReg(insn.src(5));
-    const GenRegister ucoord = ra->genReg(insn.src(6));
-    const GenRegister vcoord = ra->genReg(insn.src(7));
-    const GenRegister wcoord = ra->genReg(insn.src(8));
-    const GenRegister temp = GenRegister::ud1grf(msgPayload.nr, msgPayload.subnr/sizeof(float) + 4);
-    const GenRegister a0_0 = GenRegister::ud1arf(GEN_ARF_ADDRESS, 0);
+    const unsigned char bti = insn.extra.function;
+    const unsigned char sampler = insn.extra.elem;
+    const GenRegister ucoord = ra->genReg(insn.src(4));
+    const GenRegister vcoord = ra->genReg(insn.src(5));
+    const GenRegister wcoord = ra->genReg(insn.src(6));
     uint32_t simdWidth = p->curr.execWidth;
     p->push();
     const uint32_t nr = msgPayload.nr;
     // prepare mesg desc and move to a0.0.
     // desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
-    p->curr.execWidth = 1;
-    p->MOV(a0_0, GenRegister::immud((GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE << 12) | (2 << 17)
-                                    | ((4 * (simdWidth/8)) << 20)
-                                    | ((2 * (simdWidth/8)) << 25)));
-    p->SHL(temp, GenRegister::ud1grf(sampler.nr, sampler.subnr/sizeof(float)), GenRegister::immud(8));
-    p->OR(a0_0, a0_0, temp);
-    p->OR(a0_0, a0_0, GenRegister::ud1grf(bti.nr, bti.subnr/sizeof(float)));
-    p->curr.execWidth = simdWidth;
     /* Prepare message payload. */
     p->MOV(GenRegister::f8grf(nr , 0), ucoord);
     p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
     if (insn.src(8).reg() != 0)
       p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
-    p->SAMPLE(dst, msgPayload, a0_0, -1, 0);
-
+    p->SAMPLE(dst, msgPayload, false, bti, sampler, simdWidth, -1, 0);
     p->pop();
   }
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister bti = ra->genReg(insn.src(0 + insn.extra.elem));
-    const GenRegister ucoord = ra->genReg(insn.src(1 + insn.extra.elem));
-    const GenRegister vcoord = ra->genReg(insn.src(2 + insn.extra.elem));
-    const GenRegister wcoord = ra->genReg(insn.src(3 + insn.extra.elem));
-    const GenRegister R = ra->genReg(insn.src(4 + insn.extra.elem));
-    const GenRegister G = ra->genReg(insn.src(5 + insn.extra.elem));
-    const GenRegister B = ra->genReg(insn.src(6 + insn.extra.elem));
-    const GenRegister A = ra->genReg(insn.src(7 + insn.extra.elem));
-    const GenRegister a0_0 = GenRegister::ud1arf(GEN_ARF_ADDRESS, 0);
+    const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
+    const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.elem));
+    const GenRegister wcoord = ra->genReg(insn.src(2 + insn.extra.elem));
+    const GenRegister R = ra->genReg(insn.src(3 + insn.extra.elem));
+    const GenRegister G = ra->genReg(insn.src(4 + insn.extra.elem));
+    const GenRegister B = ra->genReg(insn.src(5 + insn.extra.elem));
+    const GenRegister A = ra->genReg(insn.src(6 + insn.extra.elem));
+    const unsigned char bti = insn.extra.function;
 
     p->push();
     uint32_t simdWidth = p->curr.execWidth;
@@ -339,16 +327,18 @@ namespace gbe
 
     // prepare mesg desc and move to a0.0.
     // desc = bti | (msg_type << 14) | (header_present << 19))
-    p->MOV(a0_0, GenRegister::immud((GEN_TYPED_WRITE << 14) | (1 << 19) | (9 << 25)));
-    p->OR(a0_0, a0_0, GenRegister::ud1grf(bti.nr, bti.subnr/sizeof(float)));
     // prepare header, we need to enable all the 8 planes.
-    p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xff));
-    // Typed write only support SIMD8.
+    p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
     p->curr.execWidth = 8;
+    // Typed write only support SIMD8.
     // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
-    // XXX currently only support U32 surface type with RGBA.
+    // Currently, we don't support non-zero lod, so we clear all lod to
+    // zero for both quarters thus save one instruction here.
+    // Thus we must put this instruction in noMask and no predication state.
     p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
-
+    p->pop();
+    p->push();
+    p->curr.execWidth = 8;
     // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
     // need to call it twice.
     uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;
@@ -359,6 +349,8 @@ namespace gbe
                                         GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
 #define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
                                         GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
+      if (quarter == 1)
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
       QUARTER_MOV0(nr + 1, ucoord);
       QUARTER_MOV0(nr + 2, vcoord);
       if (insn.src(3 + insn.extra.elem).reg() != 0)
@@ -368,12 +360,28 @@ namespace gbe
       QUARTER_MOV1(nr + 7, B);
       QUARTER_MOV1(nr + 8, A);
 #undef QUARTER_MOV
-      p->TYPED_WRITE(header, a0_0);
+      p->TYPED_WRITE(header, true, bti);
     }
-
     p->pop();
   }
 
+  void GenContext::emitGetImageInfoInstruction(const SelectionInstruction &insn) {
+    const unsigned char bti = insn.extra.function;
+    const unsigned char type = insn.extra.elem;
+    const uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(type);
+    ir::ImageInfoKey key;
+    key.index = bti;
+    key.type = type;
+
+    uint32_t offset = this->getImageInfoCurbeOffset(key, dstNum * 4) + GEN_REG_SIZE;
+    for(uint32_t i = 0; i < dstNum; i++) {
+      const uint32_t nr = offset / GEN_REG_SIZE;
+      const uint32_t subnr = (offset % GEN_REG_SIZE) / sizeof(uint32_t);
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::ud1grf(nr, subnr));
+      offset += 32;
+    }
+  }
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
   bool GenContext::emitCode(void) {
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 33258f8..7c28bdf 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -80,7 +80,7 @@ namespace gbe
     void emitTernaryInstruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
-    void emitCBMoveInstruction(const SelectionInstruction &insn);
+    void emitIndirectMoveInstruction(const SelectionInstruction &insn);
     void emitEotInstruction(const SelectionInstruction &insn);
     void emitNoOpInstruction(const SelectionInstruction &insn);
     void emitWaitInstruction(const SelectionInstruction &insn);
@@ -92,6 +92,7 @@ namespace gbe
     void emitByteScatterInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
+    void emitGetImageInfoInstruction(const SelectionInstruction &insn);
 
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 187224a..c7a1581 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -350,8 +350,6 @@ enum GenMessageTarget {
 #define GEN_MATH_FUNCTION_RSQ                              5
 #define GEN_MATH_FUNCTION_SIN                              6 /* was 7 */
 #define GEN_MATH_FUNCTION_COS                              7 /* was 8 */
-#define GEN_MATH_FUNCTION_SINCOS                           8 /* was 6 */
-#define GEN_MATH_FUNCTION_TAN                              9 /* gen4 */
 #define GEN_MATH_FUNCTION_FDIV                             9 /* gen6+ */
 #define GEN_MATH_FUNCTION_POW                              10
 #define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index ed7c256..d6c34fb 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -166,6 +166,39 @@ namespace gbe
   }
 #endif
 
+  static void setSamplerMessage(GenEncoder *p,
+                                GenInstruction *insn,
+                                unsigned char bti,
+                                unsigned char sampler,
+                                uint32_t msg_type,
+                                uint32_t response_length,
+                                uint32_t msg_length,
+                                bool header_present,
+                                uint32_t simd_mode,
+                                uint32_t return_format)
+  {
+     const GenMessageTarget sfid = GEN_SFID_SAMPLER;
+     setMessageDescriptor(p, insn, sfid, msg_length, response_length);
+     insn->bits3.sampler_gen7.bti = bti;
+     insn->bits3.sampler_gen7.sampler = sampler;
+     insn->bits3.sampler_gen7.msg_type = msg_type;
+     insn->bits3.sampler_gen7.simd_mode = simd_mode;
+  }
+
+
+  static void setTypedWriteMessage(GenEncoder *p,
+                                   GenInstruction *insn,
+                                   unsigned char bti,
+                                   unsigned char msg_type,
+                                   uint32_t msg_length,
+                                   bool header_present)
+  {
+     const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+     setMessageDescriptor(p, insn, sfid, msg_length, 0, header_present);
+     insn->bits3.gen7_typed_rw.bti = bti;
+     insn->bits3.gen7_typed_rw.msg_type = msg_type;
+  }
+
   //////////////////////////////////////////////////////////////////////////
   // Gen Emitter encoding class
   //////////////////////////////////////////////////////////////////////////
@@ -800,31 +833,42 @@ namespace gbe
   }
 
   void GenEncoder::SAMPLE(GenRegister dest,
-                          GenRegister src0,
-                          GenRegister src1,
+                          GenRegister msg,
+                          bool header_present,
+                          unsigned char bti,
+                          unsigned char sampler,
+                          uint32_t simdWidth,
                           uint32_t writemask,
                           uint32_t return_format)
   {
      if (writemask == 0) return;
-
+     uint32_t msg_type = (simdWidth == 16) ?
+                            GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+     uint32_t response_length = (4 * (simdWidth / 8));
+     uint32_t msg_length = (2 * (simdWidth / 8));
+     if (header_present)
+       msg_length++;
+     uint32_t simd_mode = (simdWidth == 16) ?
+                            GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
-     insn->header.predicate_control = 0; /* XXX */
      this->setHeader(insn);
      this->setDst(insn, dest);
-     this->setSrc0(insn, src0);
-     this->setSrc1(insn, src1);
-     insn->header.destreg_or_condmod = GEN_SFID_SAMPLER;
+     this->setSrc0(insn, msg);
+     setSamplerMessage(this, insn, bti, sampler, msg_type,
+                       response_length, msg_length,
+                       header_present,
+                       simd_mode, return_format);
   }
 
-  void GenEncoder::TYPED_WRITE(GenRegister header, GenRegister desc)
+  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
   {
      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
-     insn->header.predicate_control = 0; /* XXX */
+     uint32_t msg_type = GEN_TYPED_WRITE;
+     uint32_t msg_length = header_present ? 9 : 8;
      this->setHeader(insn);
      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
-     this->setSrc0(insn, header);
-     this->setSrc1(insn, desc);
-     insn->header.destreg_or_condmod = GEN6_SFID_DATAPORT_RENDER_CACHE;
+     this->setSrc0(insn, msg);
+     setTypedWriteMessage(this, insn, bti, msg_type, msg_length, header_present);
   }
 
   void GenEncoder::EOT(uint32_t msg) {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 7e26f0a..83d83d2 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -140,14 +140,18 @@ namespace gbe
     void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Send instruction for the sampler */
     void SAMPLE(GenRegister dest,
-                GenRegister src0,
-                GenRegister src1,
+                GenRegister msg,
+                bool header_present,
+                unsigned char bti,
+                unsigned char sampler,
+                unsigned int simdWidth,
                 uint32_t writemask,
                 uint32_t return_format);
 
     /*! TypedWrite instruction for texture */
     void TYPED_WRITE(GenRegister header,
-                     GenRegister desc);
+                     bool header_present,
+                     unsigned char bti);
     /*! Extended math function (2 sources) */
     void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
     /*! Extended math function (1 source) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index ce8769f..a2c0fba 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -5,7 +5,7 @@ DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
-DECL_GEN7_SCHEDULE(CBMove,          20,        2,        2)
+DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
 DECL_GEN7_SCHEDULE(NoOp,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
@@ -17,4 +17,5 @@ DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
+DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 01c525e..95eedfe 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -109,7 +109,7 @@ namespace gbe
       insn(insn), refNum(0), retiredCycle(0) {}
     bool dependsOn(ScheduleDAGNode *node) const {
       GBE_ASSERT(node != NULL);
-      for (auto child : children)
+      for (auto child : node->children)
         if (child.node == this)
           return true;
       return false;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2bd9aca..08bc6af 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -440,7 +440,7 @@ namespace gbe
     /*! Select instruction with embedded comparison */
     void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
     /* Constant buffer move instruction */
-    void CB_MOVE(Reg dst, Reg src);
+    void INDIRECT_MOVE(Reg dst, Reg src);
     /*! EOT is used to finish GPGPU threads */
     void EOT(void);
     /*! No-op */
@@ -466,9 +466,11 @@ namespace gbe
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
-    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum);
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler);
     /*! Encode typed write instructions */
-    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum);
+    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
+    /*! Get image information */
+    void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
@@ -700,8 +702,8 @@ namespace gbe
     insn->src(1) = src1;
     insn->extra.function = conditional;
   }
-  void Selection::Opaque::CB_MOVE(Reg dst, Reg src) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_CB_MOVE, 1, 1);
+  void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
     insn->dst(0) = dst;
     insn->src(0) = src;
   }
@@ -964,8 +966,11 @@ namespace gbe
       this->matchBasicBlock(insnNum);
     });
    }
- /* XXX always 4 return values? */
-  void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum) {
+
+  void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
+                                 GenRegister *src, uint32_t srcNum,
+                                 GenRegister *msgPayloads, uint32_t msgNum,
+                                 uint32_t bti, uint32_t sampler) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
     SelectionVector *dstVector = this->appendVector();
     SelectionVector *msgVector = this->appendVector();
@@ -987,6 +992,9 @@ namespace gbe
     msgVector->regNum = msgNum;
     msgVector->isSrc = 1;
     msgVector->reg = &insn->src(0);
+
+    insn->extra.function = bti;
+    insn->extra.elem = sampler;
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -999,7 +1007,8 @@ namespace gbe
   }
 
   void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
-                              GenRegister *msgs, uint32_t msgNum) {
+                                      GenRegister *msgs, uint32_t msgNum,
+                                      uint32_t bti) {
     uint32_t elemID = 0;
     uint32_t i;
     SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
@@ -1010,6 +1019,7 @@ namespace gbe
     for (i = 0; i < srcNum; ++i, ++elemID)
       insn->src(elemID) = src[i];
 
+    insn->extra.function = bti;
     insn->extra.elem = msgNum;
     // Sends require contiguous allocation
     msgVector->regNum = msgNum;
@@ -1017,6 +1027,17 @@ namespace gbe
     msgVector->reg = &insn->src(0);
   }
 
+  void Selection::Opaque::GET_IMAGE_INFO(uint32_t infoType, GenRegister *dst,
+                                    uint32_t dstNum, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_GET_IMAGE_INFO, dstNum, 0);
+
+    for(uint32_t i = 0; i < dstNum; ++i)
+      insn->dst(i) = dst[i];
+
+    insn->extra.function = bti;
+    insn->extra.elem = infoType;
+  }
+
   Selection::~Selection(void) { GBE_DELETE(this->opaque); }
 
   void Selection::select(void) {
@@ -1546,6 +1567,13 @@ namespace gbe
       const Immediate imm = insn.getImmediate();
       const GenRegister dst = sel.selReg(insn.getDst(0), type);
 
+      sel.push();
+      if (sel.isScalarOrBool(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
       switch (type) {
         case TYPE_U32:
         case TYPE_S32:
@@ -1559,6 +1587,7 @@ namespace gbe
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
         default: NOT_SUPPORTED;
       }
+      sel.pop();
       return true;
     }
 
@@ -1661,7 +1690,7 @@ namespace gbe
         sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
     }
 
-    void emitCBMove(Selection::Opaque &sel,
+    void emitIndirectMove(Selection::Opaque &sel,
                          const ir::LoadInstruction &insn,
                          GenRegister address) const
     {
@@ -1670,7 +1699,7 @@ namespace gbe
 
       const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
       const GenRegister src = address;
-      sel.CB_MOVE(dst, src);
+      sel.INDIRECT_MOVE(dst, src);
     }
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
@@ -1683,7 +1712,7 @@ namespace gbe
                  insn.getAddressSpace() == MEM_LOCAL);
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
       if (insn.getAddressSpace() == MEM_CONSTANT)
-        this->emitCBMove(sel, insn, address);
+        this->emitIndirectMove(sel, insn, address);
       else if (insn.isAligned() == true)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
@@ -1957,7 +1986,7 @@ namespace gbe
     {
       using namespace ir;
       GenRegister msgPayloads[4];
-      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
+      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
 
       for( int i = 0; i < 4; ++i)
         msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -1965,10 +1994,15 @@ namespace gbe
       for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < insn.getSrcNum(); ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      for (uint32_t valueID = 0; valueID < insn.getSrcNum() - 2; ++valueID)
+        src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum(), msgPayloads, 4);
+      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+                       (insn.getSrc(SampleInstruction::SURFACE_BTI));
+      uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
+                           (insn.getSrc(SampleInstruction::SAMPLER_BTI));
+
+      sel.SAMPLE(dst, insn.getDstNum(), src, insn.getSrcNum() - 2, msgPayloads, 4, bti, sampler);
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -1990,22 +2024,40 @@ namespace gbe
       for(uint32_t i = 0; i < msgNum; i++)
         msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
 
-      // bti always uses TYPE_U32.
-      src[valueID] = sel.selReg(insn.getSrc(valueID), TYPE_U32);
-      valueID++;
       // u, v, w coords should use coord type.
       for (; valueID < 1 + coordNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
+        src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
 
-      for (; valueID < insn.getSrcNum(); ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
+        src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getSrcType());
 
-      sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum);
+      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+                       (insn.getSrc(TypedWriteInstruction::SURFACE_BTI));
+      sel.TYPED_WRITE(src, insn.getSrcNum() - 1, msgs, msgNum, bti);
       return true;
     }
     DECL_CTOR(TypedWriteInstruction, 1, 1);
   };
 
+  /*! get image info instruction pattern. */
+  DECL_PATTERN(GetImageInfoInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn) const
+    {
+      using namespace ir;
+      const uint32_t infoType = insn.getInfoType();
+      GenRegister dst[4];
+      uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(infoType);
+      for (uint32_t valueID = 0; valueID < dstNum; ++valueID)
+        dst[valueID] = sel.selReg(insn.getDst(valueID), TYPE_U32);
+      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
+                       (insn.getSrc(0));
+      sel.GET_IMAGE_INFO(infoType, dst, dstNum, bti);
+      return true;
+    }
+    DECL_CTOR(GetImageInfoInstruction, 1, 1);
+  };
+
   /*! Branch instruction pattern */
   DECL_PATTERN(BranchInstruction)
   {
@@ -2159,7 +2211,6 @@ namespace gbe
   SelectionLibrary::SelectionLibrary(void) {
     this->insert<UnaryInstructionPattern>();
     this->insert<BinaryInstructionPattern>();
-    this->insert<SampleInstructionPattern>();
     this->insert<TypedWriteInstructionPattern>();
     this->insert<SyncInstructionPattern>();
     this->insert<LoadImmInstructionPattern>();
@@ -2175,6 +2226,7 @@ namespace gbe
     this->insert<MulAddInstructionPattern>();
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
+    this->insert<GetImageInfoInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index f89ad4c..455bb92 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -24,7 +24,7 @@ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
 DECL_SELECTION_IR(MAD, TernaryInstruction)
 DECL_SELECTION_IR(JMPI, JumpInstruction)
 DECL_SELECTION_IR(EOT, EotInstruction)
-DECL_SELECTION_IR(CB_MOVE, CBMoveInstruction)
+DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
 DECL_SELECTION_IR(NOP, NoOpInstruction)
 DECL_SELECTION_IR(WAIT, WaitInstruction)
 DECL_SELECTION_IR(MATH, MathInstruction)
@@ -35,3 +35,4 @@ DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 38cc236..c46c681 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -49,10 +49,12 @@
 namespace gbe {
 
   Kernel::Kernel(const std::string &name) :
-    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL)
+    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL), samplerSet(NULL), imageSet(NULL)
   {}
   Kernel::~Kernel(void) {
     if(ctx) GBE_DELETE(ctx);
+    if(samplerSet) GBE_DELETE(samplerSet);
+    if(imageSet) GBE_DELETE(imageSet);
     GBE_SAFE_DELETE_ARRAY(args);
   }
   int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -90,6 +92,8 @@ namespace gbe {
     for (const auto &pair : set) {
       const std::string &name = pair.first;
       Kernel *kernel = this->compileKernel(unit, name);
+      kernel->setSamplerSet(pair.second->getSamplerSet());
+      kernel->setImageSet(pair.second->getImageSet());
       kernels.insert(std::make_pair(name, kernel));
     }
     return true;
@@ -250,6 +254,39 @@ namespace gbe {
     return kernel->setConstBufSize(argID, sz);
   }
 
+  static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->getSamplerSize();
+  }
+
+  static void kernelGetSamplerData(gbe_kernel gbeKernel, uint32_t *samplers) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getSamplerData(samplers);
+  }
+
+  static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->getImageSize();
+  }
+
+  static void kernelGetImageData(gbe_kernel gbeKernel, ImageInfo *images) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getImageData(images);
+  }
+
+  static uint32_t gbeImageBaseIndex = 0;
+  static void setImageBaseIndex(uint32_t baseIdx) {
+     gbeImageBaseIndex = baseIdx;
+  }
+
+  static uint32_t getImageBaseIndex() {
+    return gbeImageBaseIndex;
+  }
+
   static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
     return 0u;
   }
@@ -277,6 +314,12 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL
 GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
+GBE_EXPORT_SYMBOL gbe_get_image_base_index_cb *gbe_get_image_base_index = NULL;
 
 namespace gbe
 {
@@ -304,6 +347,12 @@ namespace gbe
       gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
+      gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+      gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+      gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+      gbe_kernel_get_image_data = gbe::kernelGetImageData;
+      gbe_get_image_base_index = gbe::getImageBaseIndex;
+      gbe_set_image_base_index = gbe::setImageBaseIndex;
       genSetupCallBacks();
     }
   };
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 575196a..f178f8b 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -71,9 +71,7 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
   GBE_CURBE_GLOBAL_CONSTANT_DATA,
-  GBE_CURBE_IMAGE_WIDTH,
-  GBE_CURBE_IMAGE_HEIGHT,
-  GBE_CURBE_IMAGE_DEPTH,
+  GBE_CURBE_IMAGE_INFO,
   GBE_CURBE_STACK_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
@@ -87,6 +85,31 @@ enum gbe_extra_argument {
   GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
 };
 
+typedef struct ImageInfo {
+    int32_t arg_idx;
+    int32_t idx;
+    int32_t wSlot;
+    int32_t hSlot;
+    int32_t depthSlot;
+    int32_t dataTypeSlot;
+    int32_t channelOrderSlot;
+    int32_t dimOrderSlot;
+} ImageInfo;
+
+typedef void (gbe_set_image_base_index_cb)(uint32_t base_idx);
+extern gbe_set_image_base_index_cb *gbe_set_image_base_index;
+
+typedef uint32_t (gbe_get_image_base_index_cb)();
+extern gbe_get_image_base_index_cb *gbe_get_image_base_index;
+
+/*! Get the size of defined images */
+typedef size_t (gbe_kernel_get_image_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
+
+/*! Get the content of defined images */
+typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
+extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
+
 /*! Create a new program from the given source code (zero terminated string) */
 typedef gbe_program (gbe_program_new_from_source_cb)(const char *source,
                                                      size_t stringSize,
@@ -114,6 +137,14 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_
 typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
 extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
 
+/*! Get the size of defined samplers */
+typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
+extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
+
 /*! Destroy and deallocate the given program */
 typedef void (gbe_program_delete_cb)(gbe_program);
 extern gbe_program_delete_cb *gbe_program_delete;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e754899..2d67310 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -29,6 +29,8 @@
 #include "backend/context.hpp"
 #include "ir/constant.hpp"
 #include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/sampler.hpp"
 #include "sys/hash_map.hpp"
 #include "sys/vector.hpp"
 #include <string>
@@ -53,9 +55,9 @@ namespace gbe {
     INLINE PatchInfo(gbe_curbe_type type, uint32_t subType = 0u, uint32_t offset = 0u) :
       type(uint32_t(type)), subType(subType), offset(offset) {}
     INLINE PatchInfo(void) {}
-    uint32_t type : 8;    //!< Type of the patch (see program.h for the list)
-    uint32_t subType : 8; //!< Optional sub-type of the patch (see program.h)
-    uint32_t offset : 16; //!< Optional offset to encode
+    uint64_t type : 16;    //!< Type of the patch (see program.h for the list)
+    uint64_t subType : 32; //!< Optional sub-type of the patch (see program.h)
+    uint64_t offset : 16; //!< Optional offset to encode
   };
 
   /*! We will sort PatchInfo to make binary search */
@@ -108,6 +110,22 @@ namespace gbe {
       }
       return -1;
     }
+    /*! Set sampler set. */
+    void setSamplerSet(ir::SamplerSet *from) {
+      samplerSet = from;
+    }
+    /*! Get defined sampler size */
+    size_t getSamplerSize(void) const { return samplerSet->getDataSize(); }
+    /*! Get defined sampler value array */
+    void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
+    /*! Set image set. */
+    void setImageSet(ir::ImageSet * from) {
+      imageSet = from;
+    }
+    /*! Get defined image size */
+    size_t getImageSize(void) const { return imageSet->getDataSize(); }
+    /*! Get defined image value array */
+    void getImageData(ImageInfo *images) const { imageSet->getData(images); }
   protected:
     friend class Context;      //!< Owns the kernels
     const std::string name;    //!< Kernel name
@@ -119,6 +137,8 @@ namespace gbe {
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
+    ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
+    ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 004afcd..88aae08 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -46,6 +46,8 @@ namespace ir {
     name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false)
   {
     initProfile(*this);
+    samplerSet = GBE_NEW(SamplerSet);
+    imageSet = GBE_NEW(ImageSet);
   }
 
   Function::~Function(void) {
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 43aa826..6e712cd 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -28,6 +28,8 @@
 #include "ir/register.hpp"
 #include "ir/instruction.hpp"
 #include "ir/profile.hpp"
+#include "ir/sampler.hpp"
+#include "ir/image.hpp"
 #include "sys/vector.hpp"
 #include "sys/set.hpp"
 #include "sys/map.hpp"
@@ -196,6 +198,18 @@ namespace ir {
       GBE_ASSERT(args[ID] != NULL);
       return *args[ID];
     }
+
+    /*! Get arg ID. */
+    INLINE int32_t getArgID(FunctionArgument *requestArg) {
+      for (uint32_t ID = 0; ID < args.size(); ID++)
+      {
+        if ( args[ID] == requestArg )
+          return ID;
+      }
+      GBE_ASSERTM(0, "Failed to get a valid argument ID.");
+      return -1;
+    }
+
     /*! Get the number of pushed registers */
     INLINE uint32_t pushedNum(void) const { return pushMap.size(); }
     /*! Get the pushed data location for the given register */
@@ -217,6 +231,12 @@ namespace ir {
       for (auto arg : args) if (arg->reg == reg) return arg;
       return NULL;
     }
+
+    INLINE FunctionArgument *getArg(const Register &reg) {
+      for (auto arg : args) if (arg->reg == reg) return arg;
+      return NULL;
+    }
+
     /*! Get output register */
     INLINE Register getOutput(uint32_t ID) const { return outputs[ID]; }
     /*! Get the argument location for the pushed register */
@@ -281,6 +301,10 @@ namespace ir {
     INLINE bool getUseSLM(void) const { return this->useSLM; }
     /*! Change the SLM config for the function */
     INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
+    /*! Get sampler set in this function */
+    SamplerSet* getSamplerSet(void) const {return samplerSet; }
+    /*! Get image set in this function */
+    ImageSet* getImageSet(void) const {return imageSet; }
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -296,6 +320,8 @@ namespace ir {
     LocationMap locationMap;        //!< Pushed function arguments (loc->reg)
     uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
     bool useSLM;                    //!< Is SLM required?
+    SamplerSet *samplerSet;          //!< samplers used in this function.
+    ImageSet* imageSet;              //!< Image set in this function's arguments..
     GBE_CLASS(Function);            //!< Use custom allocator
   };
 
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
new file mode 100644
index 0000000..9398e22
--- /dev/null
+++ b/backend/src/ir/image.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.cpp
+ *
+ */
+#include "image.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+#include "backend/program.h"
+
+namespace gbe {
+namespace ir {
+
+  static uint32_t getInfoOffset4Type(struct ImageInfo *imageInfo, int type)
+  {
+    switch (type) {
+      case GetImageInfoInstruction::WIDTH: return imageInfo->wSlot;
+      case GetImageInfoInstruction::HEIGHT: return imageInfo->hSlot;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    return 0;
+  }
+
+  static uint32_t setInfoOffset4Type(struct ImageInfo *imageInfo, int type, uint32_t offset)
+  {
+    switch (type) {
+      case GetImageInfoInstruction::WIDTH: imageInfo->wSlot = offset; break;
+      case GetImageInfoInstruction::HEIGHT: imageInfo->hSlot = offset; break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    return 0;
+  }
+
+  void ImageSet::appendInfo(ImageInfoKey key, uint32_t offset)
+  {
+    auto it = indexMap.find(key.index);
+    assert(it != indexMap.end());
+    struct ImageInfo *imageInfo = it->second;
+    setInfoOffset4Type(imageInfo, key.type, offset);
+  }
+
+  void ImageSet::append(Register imageReg, Context *ctx)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
+    GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
+    GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
+
+    int32_t id = ctx->getFunction().getArgID(arg);
+    struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
+    imageInfo->arg_idx = id;
+    imageInfo->idx = regMap.size() + gbe_get_image_base_index();
+    imageInfo->wSlot = -1;
+    imageInfo->hSlot = -1;
+    imageInfo->depthSlot = -1;
+    imageInfo->dataTypeSlot = -1;
+    imageInfo->channelOrderSlot = -1;
+    imageInfo->dimOrderSlot = -1;
+    regMap.insert(std::make_pair(imageReg, imageInfo));
+    indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
+  }
+
+  const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
+  {
+    auto it = indexMap.find(key.index);
+    if (it == indexMap.end())
+      return -1;
+    struct ImageInfo *imageInfo = it->second;
+    return getInfoOffset4Type(imageInfo, key.type);
+  }
+
+  const uint32_t ImageSet::getIdx(const Register imageReg) const
+  {
+    auto it = regMap.find(imageReg);
+    GBE_ASSERT(it != regMap.end());
+    return it->second->idx;
+  }
+
+  void ImageSet::getData(struct ImageInfo *imageInfos) const {
+      for(auto &it : regMap)
+        imageInfos[it.second->idx - gbe_get_image_base_index()] = *it.second;
+  }
+
+  ImageSet::~ImageSet() {
+    for(auto &it : regMap)
+      GBE_DELETE(it.second);
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
new file mode 100644
index 0000000..04e78e6
--- /dev/null
+++ b/backend/src/ir/image.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.hpp
+ *
+ */
+#ifndef __GBE_IR_IMAGE_HPP__
+#define __GBE_IR_IMAGE_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp" // for ImageInfoKey
+#include "sys/map.hpp"
+
+extern "C" {
+  struct ImageInfo;
+}
+
+namespace gbe {
+namespace ir {
+
+  class Context;
+  /*! An image set is a set of images which are defined in kernel args.
+   *  We use this set to gather the images here and allocate a unique index
+   *  for each individual image. And that individual image could be used
+   *  at backend to identify this image's location.
+   */
+  class ImageSet
+  {
+  public:
+    /*! Append an image argument. */
+    void append(Register imageReg, Context *ctx);
+    /*! Append an image info slot. */
+    void appendInfo(ImageInfoKey key, uint32_t offset);
+    /*! Get the image's index(actual location). */
+    const uint32_t getIdx(const Register imageReg) const;
+    size_t getDataSize(void) { return regMap.size(); }
+    size_t getDataSize(void) const { return regMap.size(); }
+
+    const int32_t getInfoOffset(ImageInfoKey key) const;
+    void getData(struct ImageInfo *imageInfos) const;
+    void operator = (const ImageSet& other) {
+      regMap.insert(other.regMap.begin(), other.regMap.end());
+    }
+    ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
+    ImageSet() {}
+    ~ImageSet();
+  private:
+    map<Register, struct ImageInfo *> regMap;
+    map<uint32_t, struct ImageInfo *> indexMap;
+    GBE_CLASS(ImageSet);
+  };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMAGE_HPP__ */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 8980abf..a57c204 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -470,6 +470,39 @@ namespace ir {
       Register dst[0];               //!< No dest register
     };
 
+    class ALIGNED_INSTRUCTION GetImageInfoInstruction :
+      public BasePolicy,
+      public NSrcPolicy<GetImageInfoInstruction, 1>,
+      public TupleDstPolicy<GetImageInfoInstruction>
+    {
+    public:
+      GetImageInfoInstruction( int type,
+                               Tuple dst,
+                               Register src)
+      {
+        this->opcode = OP_GET_IMAGE_INFO;
+        this->infoType = type;
+        this->dst = dst;
+        this->src[0] = src;
+      }
+
+      INLINE uint32_t getInfoType(void) const { return infoType; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << "." << this->getInfoType()
+            << " surface id %" << this->getSrc(fn, 0)
+            << " %" << this->getDst(fn, 0);
+      }
+
+      uint8_t infoType;                 //!< Type of the requested information.
+      Register src[1];                  //!< Surface to get info
+      Tuple dst;                        //!< dest register to put the information.
+      static const uint32_t dstNum = 4; //! The maximum dst number. Not the actual number
+                                        // of destination tuple. We use the infoType to determin
+                                        // the actual num.
+    };
+
     class ALIGNED_INSTRUCTION LoadImmInstruction :
       public BasePolicy,
       public NSrcPolicy<LoadImmInstruction, 0>,
@@ -758,6 +791,8 @@ namespace ir {
     { return true; }
     INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
+    INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
 
     // Ensure that types and register family match
     INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
@@ -915,7 +950,6 @@ namespace ir {
       case MEM_CONSTANT: return out << "constant";
       case MEM_PRIVATE: return out << "private";
       case IMAGE: return out << "image";
-      case SAMPLER: return out << "sampler";
       case MEM_INVALID: return out << "invalid";
     };
     return out;
@@ -991,6 +1025,10 @@ START_INTROSPECTION(TypedWriteInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(TypedWriteInstruction)
 
+START_INTROSPECTION(GetImageInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetImageInfoInstruction)
+
 START_INTROSPECTION(LoadImmInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LoadImmInstruction)
@@ -1176,6 +1214,7 @@ DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
+DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
 
 #undef DECL_MEM_FN
 
@@ -1317,6 +1356,10 @@ DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
     return internal::TypedWriteInstruction(src, srcType, coordType).convert();
   }
 
+  Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src) {
+    return internal::GetImageInfoInstruction(infoType, dst, src).convert();
+  }
+
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
     switch (insn.getOpcode()) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index ec5d250..c948d2c 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -50,7 +50,6 @@ namespace ir {
     MEM_CONSTANT,   //!< Immutable global memory
     MEM_PRIVATE,    //!< Per thread private memory
     IMAGE,          //!< For texture image.
-    SAMPLER,        //!< For sampler.
     MEM_INVALID
   };
 
@@ -298,6 +297,9 @@ namespace ir {
   /*! Store data in an texture */
   class TypedWriteInstruction : public Instruction {
   public:
+    enum {
+     SURFACE_BTI = 0
+    };
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
     Type getSrcType(void) const;
@@ -307,12 +309,51 @@ namespace ir {
   /*! Load texels from a texture */
   class SampleInstruction : public Instruction {
   public:
+    enum {
+     SURFACE_BTI = 0,
+     SAMPLER_BTI = 1
+    };
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
     Type getSrcType(void) const;
     Type getDstType(void) const;
   };
 
+  typedef union {
+    struct {
+     uint16_t index; /*! the allocated image index */
+     uint16_t type;  /*! the information type */
+    };
+    uint32_t data;
+  } ImageInfoKey;
+  /*! Get image information */
+  class GetImageInfoInstruction : public Instruction {
+  public:
+    enum {
+     SURFACE_BTI = 0
+    };
+    enum {
+     WIDTH = 0,
+     HEIGHT = 1,
+    };
+
+    static INLINE uint32_t getDstNum4Type(int infoType) {
+      switch (infoType) {
+        case GetImageInfoInstruction::WIDTH:
+        case GetImageInfoInstruction::HEIGHT:
+          return 1;
+        break;
+        default:
+          GBE_ASSERT(0);
+     }
+     return 0;
+   }
+
+    uint32_t getInfoType() const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Branch instruction is the unified way to branch (with or without
    *  predicate)
    */
@@ -522,6 +563,8 @@ namespace ir {
   Instruction TYPED_WRITE(Tuple src, Type srcType, Type coordType);
   /*! sample textures */
   Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
+  /*! get image information , such as width/height/depth/... */
+  Instruction GET_IMAGE_INFO(int infoType, Tuple dst, Register src);
   /*! label labelIndex */
   Instruction LABEL(LabelIndex labelIndex);
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b16c22f..5cf37d2 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -70,4 +70,4 @@ DECL_INSN(TYPED_WRITE, TypedWriteInstruction)
 DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
-
+DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
new file mode 100644
index 0000000..62bdc16
--- /dev/null
+++ b/backend/src/ir/sampler.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.cpp
+ *
+ */
+#include "sampler.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+
+namespace gbe {
+namespace ir {
+
+  const uint32_t SamplerSet::getIdx(const Register reg) const
+  {
+    auto it = regMap.find(reg);
+    GBE_ASSERT(it != regMap.end());
+    return it->second.slot;
+  }
+
+  void SamplerSet::appendReg(const Register reg, uint32_t key, Context *ctx) {
+    struct SamplerRegSlot samplerSlot;
+    samplerSlot.reg = reg;
+    samplerSlot.slot = samplerMap.size();
+    samplerMap.insert(std::make_pair(key, samplerSlot));
+    regMap.insert(std::make_pair(samplerSlot.reg, samplerSlot));
+  }
+
+  Register SamplerSet::append(uint32_t samplerValue, Context *ctx)
+  {
+    auto it = samplerMap.find(samplerValue);
+    if (it != samplerMap.end())
+        return it->second.reg;
+    // This register is just used as a key.
+    Register reg = ctx->reg(FAMILY_DWORD);
+    appendReg(reg, samplerValue, ctx);
+    return reg;
+  }
+
+#define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
+  void SamplerSet::append(Register samplerReg, Context *ctx)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(samplerReg);
+    GBE_ASSERT(arg != NULL);
+
+    // XXX As LLVM 3.2/3.1 doesn't have a new data type for the sampler_t, we have to fix up the argument
+    // type here. Once we switch to the LLVM and use the new data type sampler_t, we can remove this
+    // work around.
+    arg->type = ir::FunctionArgument::SAMPLER;
+    int32_t id = ctx->getFunction().getArgID(arg);
+    GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
+
+    auto it = samplerMap.find(SAMPLER_ID(id));
+    if (it != samplerMap.end()) {
+      GBE_ASSERT(it->second.reg == samplerReg);
+      return;
+    }
+    appendReg(samplerReg, SAMPLER_ID(id), ctx);
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
new file mode 100644
index 0000000..f968299
--- /dev/null
+++ b/backend/src/ir/sampler.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_SAMPLER_HPP__
+#define __GBE_IR_SAMPLER_HPP__
+
+#include "ir/register.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! A sampler set is a set of global samplers which are defined as constant global
+   * sampler or defined in the outermost kernel scope variables. According to the spec
+   * all the variable should have a initialized integer value and can't be modified.
+   */
+  class Context;
+
+  struct SamplerRegSlot {
+    Register reg;
+    uint32_t slot;
+  };
+
+  class SamplerSet
+  {
+  public:
+    /*! Append the specified sampler and return the allocated offset.
+     *  If the speficied sampler is exist, only return the previous offset and
+     *  don't append it again. Return -1, if failed.*/
+    Register append(uint32_t clkSamplerValue, Context *ctx);
+    /*! Append a sampler defined in kernel args. */
+    void append(Register samplerArg, Context *ctx);
+    /*! Get the sampler idx (actual location) */
+    const uint32_t getIdx(const Register reg) const;
+    size_t getDataSize(void) { return samplerMap.size(); }
+    size_t getDataSize(void) const { return samplerMap.size(); }
+    void getData(uint32_t *samplers) const {
+      for(auto &it : samplerMap)
+        samplers[it.second.slot] = it.first;
+    }
+
+    void operator = (const SamplerSet& other) {
+      regMap.insert(other.regMap.begin(), other.regMap.end());
+      samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
+    }
+
+    SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
+    SamplerSet() {}
+  private:
+    void appendReg(const Register reg, uint32_t key, Context *ctx);
+    map<uint32_t, SamplerRegSlot> samplerMap;
+    map<Register, SamplerRegSlot> regMap;
+    GBE_CLASS(SamplerSet);
+  };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_SAMPLER_HPP__ */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index ae78638..3b293f5 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -24,9 +24,12 @@
 #ifndef __GBE_IR_UNIT_HPP__
 #define __GBE_IR_UNIT_HPP__
 
+#include "llvm/Value.h"
+
 #include "ir/constant.hpp"
 #include "ir/register.hpp"
 #include "sys/hash_map.hpp"
+#include "sys/map.hpp"
 
 namespace gbe {
 namespace ir {
@@ -41,6 +44,7 @@ namespace ir {
   {
   public:
     typedef hash_map<std::string, Function*> FunctionSet;
+    typedef std::pair<llvm::Value*, uint32_t> ValueIndex;
     /*! Create an empty unit */
     Unit(PointerSize pointerSize = POINTER_32_BITS);
     /*! Release everything (*including* the function pointers) */
@@ -71,11 +75,27 @@ namespace ir {
     ConstantSet& getConstantSet(void) { return constantSet; }
     /*! Return the constant set */
     const ConstantSet& getConstantSet(void) const { return constantSet; }
+
+    /*! Some values will not be allocated. For example a vector extract and
+     * a vector insertion when scalarize the vector load/store
+     */
+    void newValueProxy(llvm::Value *real,
+                       llvm::Value *fake,
+                       uint32_t realIndex = 0u,
+                       uint32_t fakeIndex = 0u) {
+      const ValueIndex key(fake, fakeIndex);
+      const ValueIndex value(real, realIndex);
+      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+      valueMap[key] = value;
+    }
+    /*! Return the value map */
+    const map<ValueIndex, ValueIndex>& getValueMap(void) const { return valueMap; }
   private:
     friend class ContextInterface; //!< Can free modify the unit
     hash_map<std::string, Function*> functions; //!< All the defined functions
     ConstantSet constantSet; //!< All the constants defined in the unit
     PointerSize pointerSize; //!< Size shared by all pointers
+    map<ValueIndex, ValueIndex> valueMap; //!< fake to real value map for vector load/store
     GBE_CLASS(Unit);
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ad465e2..deda687 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -60,7 +60,7 @@
  * dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
  * 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
  * kind of similar, or at least they are similar enough to share the same front
- * end. 
+ * end.
  *
  * Problems
  * ========
@@ -126,10 +126,8 @@
 #include "ir/context.hpp"
 #include "ir/unit.hpp"
 #include "ir/liveness.hpp"
-#include "sys/map.hpp"
 #include "sys/set.hpp"
 #include "sys/cvar.hpp"
-#include <algorithm>
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -207,7 +205,7 @@ namespace gbe
   /*! Type to register family translation */
   static ir::RegisterFamily getFamily(const ir::Context &ctx, const Type *type)
   {
-    GBE_ASSERT(isScalarType(type) == true); 
+    GBE_ASSERT(isScalarType(type) == true);
     if (type == Type::getInt1Ty(type->getContext()))
       return ir::FAMILY_BOOL;
     if (type == Type::getInt8Ty(type->getContext()))
@@ -256,7 +254,6 @@ namespace gbe
       case 2: return ir::MEM_CONSTANT;
       case 3: return ir::MEM_LOCAL;
       case 4: return ir::IMAGE;
-      case 5: return ir::SAMPLER;
     }
     GBE_ASSERT(false);
     return ir::MEM_GLOBAL;
@@ -270,6 +267,8 @@ namespace gbe
   class RegisterTranslator
   {
   public:
+    /*! Indices will be zero for scalar values */
+    typedef std::pair<Value*, uint32_t> ValueIndex;
     RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
 
     /*! Empty the maps */
@@ -290,6 +289,11 @@ namespace gbe
       GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
       valueMap[key] = value;
     }
+    /*! After scalarize pass, there are some valueMap in unit,
+     *  use this function to copy from unit valueMap */
+    void initValueMap(const map<ValueIndex, ValueIndex>& vMap) {
+      valueMap.insert(vMap.begin(), vMap.end());
+    }
     /*! Mostly used for the preallocated registers (lids, gids) */
     void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
       const ValueIndex key(value, index);
@@ -326,10 +330,9 @@ namespace gbe
       };
       return ir::Register();
     }
-    /*! Get the register from the given value at given index possibly iterating
-     *  in the value map to get the final real register
-     */
-    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+
+    /*! iterating in the value map to get the final real register */
+    void getRealValue(Value* &value, uint32_t& index) {
       auto end = valueMap.end();
       for (;;) {
         auto it = valueMap.find(std::make_pair(value, index));
@@ -340,6 +343,14 @@ namespace gbe
           index = it->second.second;
         }
       }
+    }
+
+    /*! Get the register from the given value at given index possibly iterating
+     *  in the value map to get the final real register
+     */
+    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+      getRealValue(value, index);
+
       const auto key = std::make_pair(value, index);
       GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
       return scalarMap[key];
@@ -352,16 +363,8 @@ namespace gbe
     }
     /*! Says if the value exists. Otherwise, it is undefined */
     bool valueExists(Value *value, uint32_t index) {
-      auto end = valueMap.end();
-      for (;;) {
-        auto it = valueMap.find(std::make_pair(value, index));
-        if (it == end)
-          break;
-        else {
-          value = it->second.first;
-          index = it->second.second;
-        }
-      }
+      getRealValue(value, index);
+
       const auto key = std::make_pair(value, index);
       return scalarMap.find(key) != scalarMap.end();
     }
@@ -376,8 +379,6 @@ namespace gbe
       this->insertRegister(reg, key, index);
       return reg;
     }
-    /*! Indices will be zero for scalar values */
-    typedef std::pair<Value*, uint32_t> ValueIndex;
     /*! Map value to ir::Register */
     map<ValueIndex, ir::Register> scalarMap;
     /*! Map values to values when this is only a translation (eq bitcast) */
@@ -385,28 +386,6 @@ namespace gbe
     /*! Actually allocates the registers */
     ir::Context &ctx;
   };
-  /*! All intrinsic Gen functions */
-  enum OCLInstrinsic {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
-  };
-
-  /*! Build the hash map for OCL functions on Gen */
-  struct OCLIntrinsicMap {
-    /*! Build the intrinsic hash map */
-    OCLIntrinsicMap(void) {
-#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
-  map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
-#include "llvm_gen_ocl_function.hxx"
-#undef DECL_LLVM_GEN_FUNCTION
-    }
-    /*! Sort intrinsics with their names */
-    hash_map<std::string, OCLInstrinsic> map;
-  };
-
-  /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
-  static const OCLIntrinsicMap instrinsicMap;
 
   /*! Translate LLVM IR code to Gen IR code */
   class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
@@ -424,7 +403,7 @@ namespace gbe
      */
     set<const Value*> conditionSet;
     /*! We visit each function twice. Once to allocate the registers and once to
-     *  emit the Gen IR instructions 
+     *  emit the Gen IR instructions
      */
     enum Pass {
       PASS_EMIT_REGISTERS = 0,
@@ -664,7 +643,7 @@ namespace gbe
     if (dyn_cast<ConstantAggregateZero>(CPV)) {
       return doIt(uint32_t(0)); // XXX Handle type
     } else {
-      if (dyn_cast<ConstantVector>(CPV)) 
+      if (dyn_cast<ConstantVector>(CPV))
         CPV = extractConstantElem(CPV, index);
       GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
 
@@ -757,6 +736,9 @@ namespace gbe
   }
 
   ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+    //the real value may be constant, so get real value before constant check
+    regTranslator.getRealValue(value, elemID);
+
     if (dyn_cast<ConstantExpr>(value)) {
       ConstantExpr *ce = dyn_cast<ConstantExpr>(value);
       if(ce->isCast()) {
@@ -831,32 +813,29 @@ namespace gbe
       PHINode *PN = cast<PHINode>(I);
       Value *IV = PN->getIncomingValueForBlock(curr);
       if (!isa<UndefValue>(IV)) {
-        uint32_t elemNum;
         Type *llvmType = PN->getType();
         GBE_ASSERTM(llvmType != Type::getInt1Ty(llvmType->getContext()),
           "TODO Boolean values cannot escape their definition basic block");
-        const ir::Type type = getVectorInfo(ctx, llvmType, PN, elemNum);
+        const ir::Type type = getType(ctx, llvmType);
 
         // Emit the MOV required by the PHI function. We do it simple and do not
         // try to optimize them. A next data flow analysis pass on the Gen IR
         // will remove them
-        for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-          Value *PHICopy = this->getPHICopy(PN);
-          const ir::Register dst = this->getRegister(PHICopy, elemID);
-          Constant *CP = dyn_cast<Constant>(IV);
-          if (CP) {
-            GBE_ASSERT(isa<GlobalValue>(CP) == false);
-            ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
-            if (CPV && dyn_cast<ConstantVector>(CPV) &&
-                isa<UndefValue>(extractConstantElem(CPV, elemID)))
-              continue;
-            const ir::ImmediateIndex immIndex = this->newImmediate(CP, elemID);
-            const ir::Immediate imm = ctx.getImmediate(immIndex);
-            ctx.LOADI(imm.type, dst, immIndex);
-          } else if (regTranslator.valueExists(IV,elemID) || dyn_cast<Constant>(IV)) {
-            const ir::Register src = this->getRegister(IV, elemID);
-            ctx.MOV(type, dst, src);
-          }
+        Value *PHICopy = this->getPHICopy(PN);
+        const ir::Register dst = this->getRegister(PHICopy);
+        Constant *CP = dyn_cast<Constant>(IV);
+        if (CP) {
+          GBE_ASSERT(isa<GlobalValue>(CP) == false);
+          ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
+          if (CPV && dyn_cast<ConstantVector>(CPV) &&
+              isa<UndefValue>(extractConstantElem(CPV, 0)))
+            continue;
+          const ir::ImmediateIndex immIndex = this->newImmediate(CP);
+          const ir::Immediate imm = ctx.getImmediate(immIndex);
+          ctx.LOADI(imm.type, dst, immIndex);
+        } else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
+          const ir::Register src = this->getRegister(IV);
+          ctx.MOV(type, dst, src);
         }
       }
     }
@@ -868,6 +847,7 @@ namespace gbe
                 "Returned value for kernel functions is forbidden");
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
+      uint32_t argID = 0;
       Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
 
       // Insert a new register for each function argument
@@ -876,10 +856,33 @@ namespace gbe
       uint32_t argID = 1; // Start at one actually
       for (; I != E; ++I, ++argID) {
 #else
-      for (; I != E; ++I) {
+      for (; I != E; ++I, ++argID) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
         const std::string &argName = I->getName().str();
         Type *type = I->getType();
+
+        //add support for vector argument
+        if(type->isVectorTy()) {
+          VectorType *vectorType = cast<VectorType>(type);
+
+          this->newRegister(I);
+          ir::Register reg = getRegister(I, 0);
+
+          Type *elemType = vectorType->getElementType();
+          const uint32_t elemSize = getTypeByteSize(unit, elemType);
+          const uint32_t elemNum = vectorType->getNumElements();
+          //vector's elemType always scalar type
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize);
+
+          ir::Function& fn = ctx.getFunction();
+          for(uint32_t i=1; i < elemNum; i++) {
+            ir::PushLocation argLocation(fn, argID, elemSize*i);
+            reg = getRegister(I, i);
+            ctx.appendPushedConstant(reg, argLocation);  //add to push map for reg alloc
+          }
+          continue;
+        }
+
         GBE_ASSERTM(isScalarType(type) == true,
                     "vector type in the function argument is not supported yet");
         const ir::Register reg = regTranslator.newScalar(I);
@@ -915,9 +918,7 @@ namespace gbe
               break;
               case ir::IMAGE:
                 ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize);
-              break;
-              case ir::SAMPLER:
-                ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, ptrSize);
+                ctx.getFunction().getImageSet()->append(reg, &ctx);
               break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
             }
@@ -1143,6 +1144,7 @@ namespace gbe
 
     ctx.startFunction(F.getName());
     this->regTranslator.clear();
+    this->regTranslator.initValueMap(unit.getValueMap());
     this->labelMap.clear();
     this->emitFunctionPrototype(F);
 
@@ -1232,36 +1234,33 @@ namespace gbe
 #endif /* GBE_DEBUG */
 
     // Get the element type for a vector
-    uint32_t elemNum;
-    const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
+    const ir::Type type = getType(ctx, I.getType());
 
     // Emit the instructions in a row
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      const ir::Register dst = this->getRegister(&I, elemID);
-      const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
-      const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
-      switch (I.getOpcode()) {
-        case Instruction::Add:
-        case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
-        case Instruction::Sub:
-        case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
-        case Instruction::Mul:
-        case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
-        case Instruction::URem:
-        case Instruction::SRem:
-        case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
-        case Instruction::UDiv:
-        case Instruction::SDiv:
-        case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
-        case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
-        case Instruction::Or:   ctx.OR(type, dst, src0, src1); break;
-        case Instruction::Xor:  ctx.XOR(type, dst, src0, src1); break;
-        case Instruction::Shl:  ctx.SHL(type, dst, src0, src1); break;
-        case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
-        case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
-        default: NOT_SUPPORTED;
-      }
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+    switch (I.getOpcode()) {
+      case Instruction::Add:
+      case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
+      case Instruction::Sub:
+      case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
+      case Instruction::Mul:
+      case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
+      case Instruction::URem:
+      case Instruction::SRem:
+      case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
+      case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
+      case Instruction::Or:   ctx.OR(type, dst, src0, src1); break;
+      case Instruction::Xor:  ctx.XOR(type, dst, src0, src1); break;
+      case Instruction::Shl:  ctx.SHL(type, dst, src0, src1); break;
+      case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+      case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
+      default: NOT_SUPPORTED;
     }
   }
 
@@ -1289,49 +1288,46 @@ namespace gbe
     GBE_ASSERT(I.getOperand(0)->getType() != Type::getInt1Ty(I.getContext()));
 
     // Get the element type and the number of elements
-    uint32_t elemNum;
     Type *operandType = I.getOperand(0)->getType();
-    const ir::Type type = getVectorInfo(ctx, operandType, &I, elemNum);
+    const ir::Type type = getType(ctx, operandType);
     const ir::Type signedType = makeTypeSigned(type);
     const ir::Type unsignedType = makeTypeUnsigned(type);
 
     // Emit the instructions in a row
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      const ir::Register dst = this->getRegister(&I, elemID);
-      const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
-      const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
-      // We must invert the condition to simplify the branch code
-      if (conditionSet.find(&I) != conditionSet.end()) {
-        switch (I.getPredicate()) {
-          case ICmpInst::ICMP_EQ:  ctx.NE(type, dst, src0, src1); break;
-          case ICmpInst::ICMP_NE:  ctx.EQ(type, dst, src0, src1); break;
-          case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
-          case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
-          default: NOT_SUPPORTED;
-        }
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+    // We must invert the condition to simplify the branch code
+    if (conditionSet.find(&I) != conditionSet.end()) {
+      switch (I.getPredicate()) {
+        case ICmpInst::ICMP_EQ:  ctx.NE(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_NE:  ctx.EQ(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
+        case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
+        default: NOT_SUPPORTED;
       }
-      // Nothing special to do
-      else {
-        switch (I.getPredicate()) {
-          case ICmpInst::ICMP_EQ:  ctx.EQ(type, dst, src0, src1); break;
-          case ICmpInst::ICMP_NE:  ctx.NE(type, dst, src0, src1); break;
-          case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
-          case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
-          case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
-          default: NOT_SUPPORTED;
-        }
+    }
+    // Nothing special to do
+    else {
+      switch (I.getPredicate()) {
+        case ICmpInst::ICMP_EQ:  ctx.EQ(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_NE:  ctx.NE(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
+        case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
+        default: NOT_SUPPORTED;
       }
     }
   }
@@ -1343,31 +1339,28 @@ namespace gbe
   void GenWriter::emitFCmpInst(FCmpInst &I) {
 
     // Get the element type and the number of elements
-    uint32_t elemNum;
     Type *operandType = I.getOperand(0)->getType();
-    const ir::Type type = getVectorInfo(ctx, operandType, &I, elemNum);
+    const ir::Type type = getType(ctx, operandType);
 
     // Emit the instructions in a row
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      const ir::Register dst = this->getRegister(&I, elemID);
-      const ir::Register src0 = this->getRegister(I.getOperand(0), elemID);
-      const ir::Register src1 = this->getRegister(I.getOperand(1), elemID);
-
-      switch (I.getPredicate()) {
-        case ICmpInst::FCMP_OEQ:
-        case ICmpInst::FCMP_UEQ: ctx.EQ(type, dst, src0, src1); break;
-        case ICmpInst::FCMP_ONE:
-        case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
-        case ICmpInst::FCMP_OLE:
-        case ICmpInst::FCMP_ULE: ctx.LE(type, dst, src0, src1); break;
-        case ICmpInst::FCMP_OGE:
-        case ICmpInst::FCMP_UGE: ctx.GE(type, dst, src0, src1); break;
-        case ICmpInst::FCMP_OLT:
-        case ICmpInst::FCMP_ULT: ctx.LT(type, dst, src0, src1); break;
-        case ICmpInst::FCMP_OGT:
-        case ICmpInst::FCMP_UGT: ctx.GT(type, dst, src0, src1); break;
-        default: NOT_SUPPORTED;
-      }
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+    switch (I.getPredicate()) {
+      case ICmpInst::FCMP_OEQ:
+      case ICmpInst::FCMP_UEQ: ctx.EQ(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ONE:
+      case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLE:
+      case ICmpInst::FCMP_ULE: ctx.LE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGE:
+      case ICmpInst::FCMP_UGE: ctx.GE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLT:
+      case ICmpInst::FCMP_ULT: ctx.LT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGT:
+      case ICmpInst::FCMP_UGT: ctx.GT(type, dst, src0, src1); break;
+      default: NOT_SUPPORTED;
     }
   }
 
@@ -1397,10 +1390,7 @@ namespace gbe
       // Bitcast just forward registers
       case Instruction::BitCast:
       {
-        uint32_t elemNum;
-        getVectorInfo(ctx, I.getType(), &I, elemNum);
-        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-          regTranslator.newValueProxy(srcValue, dstValue, elemID, elemID);
+        regTranslator.newValueProxy(srcValue, dstValue);
       }
       break;
       // Various conversion operations -> just allocate registers for them
@@ -1448,15 +1438,14 @@ namespace gbe
       case Instruction::Trunc:
       {
         // Get the element type for a vector
-        uint32_t elemNum;
         Type *llvmDstType = I.getType();
         Type *llvmSrcType = I.getOperand(0)->getType();
-        const ir::Type dstType = getVectorInfo(ctx, llvmDstType, &I, elemNum);
+        const ir::Type dstType = getType(ctx, llvmDstType);
         ir::Type srcType;
         if (I.getOpcode() == Instruction::ZExt) {
-          srcType = getVectorInfo(ctx, llvmSrcType, &I, elemNum, true);
+          srcType = getUnsignedType(ctx, llvmSrcType);
         } else {
-          srcType = getVectorInfo(ctx, llvmSrcType, &I, elemNum);
+          srcType = getType(ctx, llvmSrcType);
         }
 
         // We use a select (0,1) not a convert when the destination is a boolean
@@ -1468,19 +1457,15 @@ namespace gbe
           const ir::Register oneReg = ctx.reg(family);
           ctx.LOADI(dstType, zeroReg, zero);
           ctx.LOADI(dstType, oneReg, one);
-          for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-            const ir::Register dst = this->getRegister(&I, elemID);
-            const ir::Register src = this->getRegister(I.getOperand(0), elemID);
-            ctx.SEL(dstType, dst, src, oneReg, zeroReg);
-          }
+          const ir::Register dst = this->getRegister(&I);
+          const ir::Register src = this->getRegister(I.getOperand(0));
+          ctx.SEL(dstType, dst, src, oneReg, zeroReg);
         }
         // Use a convert for the other cases
         else {
-          for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-            const ir::Register dst = this->getRegister(&I, elemID);
-            const ir::Register src = this->getRegister(I.getOperand(0), elemID);
-            ctx.CVT(dstType, srcType, dst, src);
-          }
+          const ir::Register dst = this->getRegister(&I);
+          const ir::Register src = this->getRegister(I.getOperand(0));
+          ctx.CVT(dstType, srcType, dst, src);
         }
       }
       break;
@@ -1497,141 +1482,15 @@ namespace gbe
     ir::Context &ctx;
   };
 
-  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {
-    Value *modified = I.getOperand(0);
-    Value *toInsert = I.getOperand(1);
-    Value *index = I.getOperand(2);
-
-    // Get the index for the insertion
-    Constant *CPV = dyn_cast<Constant>(index);
-    GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
-    auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-    GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
-                "Invalid index type for InsertElement");
-
-    // Crash on overrun
-    VectorType *vectorType = cast<VectorType>(modified->getType());
-    const uint32_t elemNum = vectorType->getNumElements();
-    const uint32_t modifiedID = x.data.u32;
-    GBE_ASSERTM(modifiedID < elemNum, "Out-of-bound index for InsertElement");
-
-    // The source vector is not constant
-    if (!isa<Constant>(modified) || isa<UndefValue>(modified)) {
-       // Non modified values are just proxies
-       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-         if (elemID != modifiedID)
-           regTranslator.newValueProxy(modified, &I, elemID, elemID);
-     }
-     // The source vector is constant
-     else {
-       // Non modified values will use LOADI
-       for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-         if (elemID != modifiedID) {
-           const ir::Type type = getType(ctx, toInsert->getType());
-           const ir::Register reg = ctx.reg(getFamily(type));
-           regTranslator.insertRegister(reg, &I, elemID);
-         }
-     }
-
-     // If the element to insert is an immediate we will generate a LOADI.
-     // Otherwise, the value is just a proxy of the inserted value
-     if (dyn_cast<Constant>(toInsert) != NULL) {
-       const ir::Type type = getType(ctx, toInsert->getType());
-       const ir::Register reg = ctx.reg(getFamily(type));
-       regTranslator.insertRegister(reg, &I, modifiedID);
-     } else
-       regTranslator.newValueProxy(toInsert, &I, 0, modifiedID);
-  }
-
-  void GenWriter::emitInsertElement(InsertElementInst &I) {
-    // Note that we check everything in regAllocateInsertElement
-    Value *modified = I.getOperand(0);
-    Value *toInsert = I.getOperand(1);
-    Value *index = I.getOperand(2);
-
-    // Get the index of the value to insert
-    Constant *indexCPV = dyn_cast<Constant>(index);
-    auto x = processConstant<ir::Immediate>(indexCPV, InsertExtractFunctor(ctx));
-    const uint32_t modifiedID = x.data.u32;
-
-    // The source vector is constant. We need to insert LOADI for the unmodified
-    // values
-    if (isa<Constant>(modified) && !isa<UndefValue>(modified)) {
-      VectorType *vectorType = cast<VectorType>(modified->getType());
-      const uint32_t elemNum = vectorType->getNumElements();
-      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
-        if (elemID != modifiedID) {
-          Constant *sourceCPV = dyn_cast<Constant>(modified);
-          if (isa<UndefValue>(extractConstantElem(sourceCPV, elemID)) == false) {
-            const ir::ImmediateIndex immIndex = this->newImmediate(sourceCPV, elemID);
-            const ir::Immediate imm = ctx.getImmediate(immIndex);
-            const ir::Register reg = regTranslator.getScalar(&I, elemID);
-            ctx.LOADI(imm.type, reg, immIndex);
-          }
-        }
-    }
-
-    // If the inserted value is not a constant, we just use a proxy
-    if (dyn_cast<Constant>(toInsert) == NULL)
-      return;
+  /*! Because there are still fake insert/extract instruction for
+   *  load/store, so keep empty function here */
+  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+  void GenWriter::emitInsertElement(InsertElementInst &I) {}
 
-    // We need a LOADI if we insert an immediate
-    Constant *toInsertCPV = dyn_cast<Constant>(toInsert);
-    const ir::ImmediateIndex immIndex = this->newImmediate(toInsertCPV);
-    const ir::Immediate imm = ctx.getImmediate(immIndex);
-    const ir::Register reg = regTranslator.getScalar(&I, modifiedID);
-    ctx.LOADI(imm.type, reg, immIndex);
-  }
-
-  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
-    Value *extracted = I.getOperand(0);
-    Value *index = I.getOperand(1);
-    GBE_ASSERTM(isa<Constant>(extracted) == false,
-                "TODO support constant vector for extract");
-    Constant *CPV = dyn_cast<Constant>(index);
-    GBE_ASSERTM(CPV != NULL, "only constant indices when inserting values");
-    auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-    GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32,
-                "Invalid index type for InsertElement");
-
-    // Crash on overrun
-    const uint32_t extractedID = x.data.u32;
-#if GBE_DEBUG
-    VectorType *vectorType = cast<VectorType>(extracted->getType());
-    const uint32_t elemNum = vectorType->getNumElements();
-    GBE_ASSERTM(extractedID < elemNum, "Out-of-bound index for InsertElement");
-#endif /* GBE_DEBUG */
-
-    // Easy when the vector is not immediate
-    regTranslator.newValueProxy(extracted, &I, extractedID, 0);
-  }
-
-  void GenWriter::emitExtractElement(ExtractElementInst &I) {
-    // TODO -> insert LOADI when the extracted vector is constant
-  }
-
-  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {
-    Value *first = I.getOperand(0);
-    Value *second = I.getOperand(1);
-    GBE_ASSERTM(!isa<Constant>(first) || isa<UndefValue>(first),
-                "TODO support constant vector for shuffle");
-    GBE_ASSERTM(!isa<Constant>(second) || isa<UndefValue>(second),
-                "TODO support constant vector for shuffle");
-    VectorType *dstType = cast<VectorType>(I.getType());
-    VectorType *srcType = cast<VectorType>(first->getType());
-    const uint32_t dstElemNum = dstType->getNumElements();
-    const uint32_t srcElemNum = srcType->getNumElements();
-    for (uint32_t elemID = 0; elemID < dstElemNum; ++elemID) {
-      uint32_t srcID = I.getMaskValue(elemID);
-      Value *src = first;
-      if (srcID >= srcElemNum) {
-        srcID -= srcElemNum;
-        src = second;
-      }
-      regTranslator.newValueProxy(src, &I, srcID, elemID);
-    }
-  }
+  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {}
 
+  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
 
   void GenWriter::regAllocateSelectInst(SelectInst &I) {
@@ -1640,22 +1499,14 @@ namespace gbe
 
   void GenWriter::emitSelectInst(SelectInst &I) {
     // Get the element type for a vector
-    uint32_t elemNum;
-    const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
-
-    // Condition can be either a vector or a scalar
-    Type *condType = I.getOperand(0)->getType();
-    const bool isVectorCond = isa<VectorType>(condType);
+    const ir::Type type = getType(ctx, I.getType());
 
     // Emit the instructions in a row
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      const ir::Register dst = this->getRegister(&I, elemID);
-      const uint32_t condID = isVectorCond ? elemID : 0;
-      const ir::Register cond = this->getRegister(I.getOperand(0), condID);
-      const ir::Register src0 = this->getRegister(I.getOperand(1), elemID);
-      const ir::Register src1 = this->getRegister(I.getOperand(2), elemID);
-      ctx.SEL(type, dst, cond, src0, src1);
-    }
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register cond = this->getRegister(I.getOperand(0));
+    const ir::Register src0 = this->getRegister(I.getOperand(1));
+    const ir::Register src1 = this->getRegister(I.getOperand(2));
+    ctx.SEL(type, dst, cond, src0, src1);
   }
 
   void GenWriter::regAllocatePHINode(PHINode &I) {
@@ -1668,15 +1519,11 @@ namespace gbe
 
   void GenWriter::emitPHINode(PHINode &I) {
     Value *copy = this->getPHICopy(&I);
-    uint32_t elemNum;
-    const ir::Type type = getVectorInfo(ctx, I.getType(), &I, elemNum);
-
-    // Emit the MOVs to avoid the lost copy issue
-    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
-      const ir::Register dst = this->getRegister(&I, elemID);
-      const ir::Register src = this->getRegister(copy, elemID);
-      ctx.MOV(type, dst, src);
-    }
+    const ir::Type type = getType(ctx, I.getType());
+
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src = this->getRegister(copy);
+    ctx.MOV(type, dst, src);
   }
 
   void GenWriter::regAllocateBranchInst(BranchInst &I) {}
@@ -1813,6 +1660,8 @@ namespace gbe
       case GEN_OCL_RNDE:
       case GEN_OCL_RNDU:
       case GEN_OCL_RNDD:
+      case GEN_OCL_GET_IMAGE_WIDTH:
+      case GEN_OCL_GET_IMAGE_HEIGHT:
         // No structure can be returned
         this->newRegister(&I);
         break;
@@ -1848,7 +1697,7 @@ namespace gbe
       case GEN_OCL_READ_IMAGE14:
       case GEN_OCL_READ_IMAGE15:
       {
-      // dst is a 4 elements vector. We allocate all 4 registers here.
+        // dst is a 4 elements vector. We allocate all 4 registers here.
         uint32_t elemNum;
         (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
         GBE_ASSERT(elemNum == 4);
@@ -1976,6 +1825,29 @@ namespace gbe
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
           case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
           case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+          case GEN_OCL_GET_IMAGE_WIDTH:
+          case GEN_OCL_GET_IMAGE_HEIGHT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
+            uint32_t elemNum;
+            (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+            vector<ir::Register> dstTupleData;
+            ir::Register lastReg;
+            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+              const ir::Register reg = this->getRegister(&I, elemID);
+              dstTupleData.push_back(reg);
+              lastReg = reg;
+            }
+            // A walk around for the gen IR limitation.
+            for (uint32_t elemID = elemNum; elemID < 4; ++ elemID) {
+              dstTupleData.push_back(lastReg);
+            }
+            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], 4);
+            int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
+
+            ctx.GET_IMAGE_INFO(infoType, dstTuple, surface_id);
+            break;
+          }
           case GEN_OCL_READ_IMAGE0:
           case GEN_OCL_READ_IMAGE1:
           case GEN_OCL_READ_IMAGE2:
@@ -1990,7 +1862,22 @@ namespace gbe
           case GEN_OCL_READ_IMAGE15:
           {
             GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register sampler = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE);
+            Constant *CPV = dyn_cast<Constant>(*AI);
+            ir::Register sampler;
+            if (CPV != NULL)
+            {
+              // This is not a kernel argument sampler, we need to append it to sampler set,
+              // and allocate a sampler slot for it.
+              auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
+              GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
+              sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+            } else {
+              sampler = this->getRegister(*AI);
+              ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+            }
+            ++AI;
+
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index c270924..2ad879e 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -28,6 +28,9 @@
 
 #include "llvm/Pass.h"
 #include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <algorithm>
 
 // LLVM Type
 namespace llvm { class Type; }
@@ -37,6 +40,29 @@ namespace gbe
   // Final target of the Gen backend
   namespace ir { class Unit; }
 
+  /*! All intrinsic Gen functions */
+  enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+  };
+
+  /*! Build the hash map for OCL functions on Gen */
+  struct OCLIntrinsicMap {
+    /*! Build the intrinsic hash map */
+    OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+  map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+    }
+    /*! Sort intrinsics with their names */
+    hash_map<std::string, OCLInstrinsic> map;
+  };
+
+  /*! Sort the OCL Gen instrinsic functions (built on pre-main) */
+  static const OCLIntrinsicMap instrinsicMap;
+
   /*! Pad the offset */
   uint32_t getPadding(uint32_t offset, uint32_t align);
 
@@ -55,6 +81,8 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  llvm::FunctionPass* createScalarizePass(ir::Unit &unit);
+
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 055afa6..2fb33c0 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -70,6 +70,10 @@ DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE13, _Z23__gen_ocl_write_imageuijfffDv4_j)
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE14, _Z22__gen_ocl_write_imagefjiiiDv4_f)
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE15, _Z22__gen_ocl_write_imagefjfffDv4_f)
 
+// To get image info function
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_HEIGHT, __gen_ocl_get_image_height)
+
 // saturation related functions.
 DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
 DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..f71401f
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,836 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_scalarize.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ *
+ * This file is derived from:
+ *  https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=605
+ */
+
+//===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===//
+//
+// LunarGLASS: An Open Modular Shader Compiler Architecture
+// Copyright (C) 2010-2011 LunarG, Inc.
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; version 2 of the
+// License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+//===----------------------------------------------------------------------===//
+//
+// Author: Michael Ilseman, LunarG
+//
+//===----------------------------------------------------------------------===//
+//
+// Scalarize the IR.
+//   * Loads of uniforms become multiple loadComponent calls
+//
+//   * Reads/writes become read/writeComponent calls
+//
+//   * Component-wise operations become multiple ops over each component
+//
+//   * Texture call become recomponsed texture calls
+//
+//   * Vector ops disappear, with their users referring to the scalarized
+//   * components
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+
+  struct VectorValues {
+    VectorValues() : vals()
+    { }
+
+    void setComponent(int c, llvm::Value* val)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      vals[c] = val;
+    }
+    llvm::Value* getComponent(int c)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      assert(vals[c] && "Requesting non-existing component");
+      return vals[c];
+    }
+
+    // {Value* x, Value* y, Value* z, Value* w}
+    llvm::Value* vals[16];
+  };
+
+  class Scalarize : public FunctionPass {
+
+  public:
+    // Standard pass stuff
+    static char ID;
+
+    Scalarize(ir::Unit& unit) : FunctionPass(ID), unit(unit)
+    {
+      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function&);
+    void print(raw_ostream&, const Module* = 0) const;
+    virtual void getAnalysisUsage(AnalysisUsage&) const;
+
+  protected:
+    // An instruction is valid post-scalarization iff it is fully scalar or it
+    // is a gla_loadn
+    bool isValid(const Instruction*);
+
+    // Take an instruction that produces a vector, and scalarize it
+    bool scalarize(Instruction*);
+    bool scalarizePerComponent(Instruction*);
+    bool scalarizeFuncCall(CallInst *);
+    bool scalarizeLoad(LoadInst*);
+    bool scalarizeStore(StoreInst*);
+    //bool scalarizeIntrinsic(IntrinsicInst*);
+    bool scalarizeExtract(ExtractElementInst*);
+    bool scalarizeInsert(InsertElementInst*);
+    bool scalarizeShuffleVector(ShuffleVectorInst*);
+    bool scalarizePHI(PHINode*);
+    void scalarizeArgs(Function& F);
+    // ...
+
+    // Helpers to make the actual multiple scalar calls, one per
+    // component. Updates the given VectorValues's components with the new
+    // Values.
+    void makeScalarizedCalls(Function*, ArrayRef<Value*>, int numComponents, VectorValues&);
+
+    void makePerComponentScalarizedCalls(Instruction*, ArrayRef<Value*>);
+
+    // Makes a scalar form of the given instruction: replaces the operands
+    // and chooses a correct return type
+    Instruction* createScalarInstruction(Instruction* inst, ArrayRef<Value*>);
+
+    // Gather the specified components in the given values. Returns the
+    // component if the given value is a vector, or the scalar itself.
+    void gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs);
+
+    // Get the assigned component for that value. If the value is a scalar,
+    // returns the scalar. If it's a constant, returns that component. If
+    // it's an instruction, returns the vectorValues of that instruction for
+    // that component
+    Value* getComponent(int component, Value*);
+
+    // Used for assertion purposes. Whether we can get the component out with
+    // a getComponent call
+    bool canGetComponent(Value*);
+
+    // Used for assertion purposes. Whether for every operand we can get
+    // components with a getComponent call
+    bool canGetComponentArgs(User*);
+
+    // Delete the instruction in the deadList
+    void dce();
+
+
+    int GetConstantInt(const Value* value);
+    bool IsPerComponentOp(const Instruction* inst);
+    bool IsPerComponentOp(const Value* value);
+
+    //these function used to add extract and insert instructions when load/store etc.
+    void extractFromeVector(Value* insn);
+    Value* InsertToVector(Value* insn, Value* vecValue);
+
+    Type* GetBasicType(Value* value) {
+      return GetBasicType(value->getType());
+    }
+
+    Type* GetBasicType(Type* type) {
+      switch(type->getTypeID()) {
+      case Type::VectorTyID:
+      case Type::ArrayTyID:
+        return GetBasicType(type->getContainedType(0));
+      default:
+        break;
+      }
+      return type;
+    }
+
+    int GetComponentCount(const Type* type)  {
+      if (type->getTypeID() == Type::VectorTyID)
+        return llvm::dyn_cast<VectorType>(type)->getNumElements();
+      else
+        return 1;
+    }
+
+    int GetComponentCount(const Value* value) {
+      return GetComponentCount(value->getType());
+    }
+
+    DenseMap<Value*, VectorValues> vectorVals;
+    Module* module;
+    IRBuilder<>* builder;
+
+    Type* intTy;
+    Type* floatTy;
+    ir::Unit &unit;
+
+    std::vector<Instruction*> deadList;
+
+    // List of vector phis that were not completely scalarized because some
+    // of their operands hadn't before been visited (i.e. loop variant
+    // variables)
+    SmallVector<PHINode*, 16> incompletePhis;
+  };
+
+  Value* Scalarize::getComponent(int component, Value* v)
+  {
+    assert(canGetComponent(v) && "getComponent called on unhandled vector");
+
+    if (v->getType()->isVectorTy()) {
+      if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
+        return c->getElementAsConstant(component);
+      } else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
+        return c->getOperand(component);
+      } else if (isa<ConstantAggregateZero>(v)) {
+        return Constant::getNullValue(GetBasicType(v));
+      } else if (isa<UndefValue>(v)) {
+        return UndefValue::get(GetBasicType(v));
+      } else {
+        return vectorVals[v].getComponent(component);
+      }
+    } else {
+      return v;
+    }
+  }
+
+  bool IsPerComponentOp(const llvm::Value* value)
+  {
+    const llvm::Instruction* inst = llvm::dyn_cast<const llvm::Instruction>(value);
+    return inst && IsPerComponentOp(inst);
+  }
+
+  bool Scalarize::IsPerComponentOp(const Instruction* inst)
+  {
+    //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+    //    return IsPerComponentOp(intr);
+
+    if (inst->isTerminator())
+        return false;
+
+    switch (inst->getOpcode()) {
+
+    // Cast ops are only per-component if they cast back to the same vector
+    // width
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst);
+
+    // Vector ops
+    case Instruction::InsertElement:
+    case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+
+    // Ways of accessing/loading/storing vectors
+    case Instruction::ExtractValue:
+    case Instruction::InsertValue:
+
+    // Memory ops
+    case Instruction::Alloca:
+    case Instruction::Load:
+    case Instruction::Store:
+    case Instruction::GetElementPtr:
+    // Phis are a little special. We consider them not to be per-component
+    // because the mechanism of choice is a single value (what path we took to
+    // get here), and doesn't choose per-component (as select would). The caller
+    // should know to handle phis specially
+    case Instruction::PHI:
+    // Call insts, conservatively are no per-component
+    case Instruction::Call:
+    // Misc
+    case Instruction::LandingPad:  //--- 3.0
+    case Instruction::VAArg:
+      return false;
+    } // end of switch (inst->getOpcode())
+
+    return true;
+  }
+  int Scalarize::GetConstantInt(const Value* value)
+  {
+    const ConstantInt *constantInt = dyn_cast<ConstantInt>(value);
+
+    // this might still be a constant expression, rather than a numeric constant,
+    // e.g., expression with undef's in it, so it was not folded
+    if (! constantInt)
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant");
+
+    return constantInt->getValue().getSExtValue();
+  }
+  bool Scalarize::canGetComponent(Value* v)
+  {
+    if (v->getType()->isVectorTy()) {
+      if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
+        return true;
+      } else {
+        assert((isa<Instruction>(v) || isa<Argument>(v)) && "Non-constant non-instuction?");
+        return vectorVals.count(v);
+      }
+    } else {
+      return true;
+    }
+  }
+
+  bool Scalarize::canGetComponentArgs(User* u)
+  {
+    if (PHINode* phi = dyn_cast<PHINode>(u)) {
+      for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i)
+        if (!canGetComponent(phi->getIncomingValue(i)))
+          return false;
+    } else {
+      for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i)
+        if (!canGetComponent(*i))
+          return false;
+    }
+    return true;
+  }
+
+  void Scalarize::gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs)
+  {
+    componentArgs.clear();
+    for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end(); i != e; ++i)
+      componentArgs.push_back(getComponent(component, *i));
+  }
+
+  Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef<Value*> args)
+  {
+    // TODO: Refine the below into one large switch
+
+    unsigned op = inst->getOpcode();
+    if (inst->isCast()) {
+      assert(args.size() == 1 && "incorrect number of arguments for cast op");
+      return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst));
+    }
+
+    if (inst->isBinaryOp()) {
+      assert(args.size() == 2 && "incorrect number of arguments for binary op");
+      return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]);
+    }
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst)) {
+      PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
+      assert(args.size() % 2 == 0 && "Odd number of arguments for a PHI");
+
+      // Loop over pairs of operands: [Value*, BasicBlock*]
+      for (unsigned int i = 0; i < args.size(); i++) {
+        BasicBlock* bb = phi->getIncomingBlock(i); //dyn_cast<BasicBlock>(args[i+1]);
+        //assert(bb && "Non-basic block incoming block?");
+        res->addIncoming(args[i], bb);
+      }
+
+      return res;
+    }
+
+    if (CmpInst* cmpInst = dyn_cast<CmpInst>(inst)) {
+      assert(args.size() == 2 && "incorrect number of arguments for comparison");
+      return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]);
+    }
+
+    if (isa<SelectInst>(inst)) {
+      assert(args.size() == 3 && "incorrect number of arguments for select");
+      return SelectInst::Create(args[0], args[1], args[2]);
+    }
+
+    if (IntrinsicInst* intr = dyn_cast<IntrinsicInst>(inst)) {
+      if (! IsPerComponentOp(inst))
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic");
+
+      // TODO: Assumption is that all per-component intrinsics have all their
+      // arguments be overloadable. Need to find some way to assert on this
+      // assumption. This is due to how getDeclaration operates; it only takes
+      // a list of types that fit overloadable slots.
+      SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+      // Call instructions have the decl as a last argument, so skip it
+      for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
+        tys.push_back(GetBasicType((*i)->getType()));
+      }
+
+      Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
+      return CallInst::Create(f, args);
+    }
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
+                     //             inst->getOpcodeName());
+    return 0;
+
+  }
+
+
+  void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
+  {
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    for (int i = 0; i < count; ++i) {
+      Value* res;
+      SmallVector<Value*, 8> callArgs(args.begin(), args.end());
+      callArgs.push_back(ConstantInt::get(intTy, i));
+
+      res = builder->CreateCall(f, callArgs);
+      vVals.setComponent(i, res);
+    }
+  }
+
+  void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
+  {
+    int count = GetComponentCount(inst);
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
+           && "not enough arguments passed for instruction");
+
+    VectorValues& vVals = vectorVals[inst];
+
+    for (int i = 0; i < count; ++i) {
+      // Set this component of each arg
+      SmallVector<Value*, 8> callArgs(args.size(), 0);
+      gatherComponents(i, args, callArgs);
+
+      Instruction* res = createScalarInstruction(inst, callArgs);
+
+      vVals.setComponent(i, res);
+      builder->Insert(res);
+    }
+  }
+
+  bool Scalarize::isValid(const Instruction* inst)
+  {
+    // The result
+    if (inst->getType()->isVectorTy())
+        return false;
+
+    // The arguments
+    for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) {
+      const Value* v = (*i);
+      assert(v);
+      if (v->getType()->isVectorTy())
+        return false;
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarize(Instruction* inst)
+  {
+    if (isValid(inst))
+        return false;
+
+    assert(! vectorVals.count(inst) && "We've already scalarized this somehow?");
+    assert((canGetComponentArgs(inst) || isa<PHINode>(inst)) &&
+           "Scalarizing an op whose arguments haven't been scalarized ");
+    builder->SetInsertPoint(inst);
+
+    if (IsPerComponentOp(inst))
+      return scalarizePerComponent(inst);
+
+    if (LoadInst* ld = dyn_cast<LoadInst>(inst))
+      return scalarizeLoad(ld);
+
+    if (CallInst* call = dyn_cast<CallInst>(inst))
+      return scalarizeFuncCall(call);
+
+    if (ExtractElementInst* extr = dyn_cast<ExtractElementInst>(inst))
+      return scalarizeExtract(extr);
+
+    if (InsertElementInst* ins = dyn_cast<InsertElementInst>(inst))
+      return scalarizeInsert(ins);
+
+    if (ShuffleVectorInst* sv = dyn_cast<ShuffleVectorInst>(inst))
+      return scalarizeShuffleVector(sv);
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst))
+      return scalarizePHI(phi);
+
+    if (isa<ExtractValueInst>(inst) || isa<InsertValueInst>(inst))
+      // TODO: need to come up with a struct/array model for scalarization
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops");
+
+    if (StoreInst* st = dyn_cast<StoreInst>(inst))
+      return scalarizeStore(st);
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName());
+    return false;
+  }
+
+  bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv)
+  {
+    //     %res = shuffleVector <n x ty> %foo, <n x ty> bar, <n x i32> <...>
+    // ==> nothing (just make a new VectorValues with the new components)
+    VectorValues& vVals = vectorVals[sv];
+
+    int size = GetComponentCount(sv);
+    int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+    for (int i = 0; i < size; ++i) {
+      int select = sv->getMaskValue(i);
+
+      if (select < 0) {
+        vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+        continue;
+      }
+
+      // Otherwise look up the corresponding component from the correct
+      // source.
+      Value* selectee;
+      if (select < srcSize) {
+        selectee = sv->getOperand(0);
+      } else {
+        // Choose from the second operand
+        select -= srcSize;
+        selectee = sv->getOperand(1);
+      }
+
+      vVals.setComponent(i, getComponent(select, selectee));
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePerComponent(Instruction* inst)
+  {
+    //     dst  = op <n x ty> %foo, <n x ty> %bar
+    // ==> dstx = op ty %foox, ty %barx
+    //     dsty = op ty %fooy, ty %bary
+    //     ...
+
+    SmallVector<Value*, 16> args(inst->op_begin(), inst->op_end());
+
+    makePerComponentScalarizedCalls(inst, args);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePHI(PHINode* phi)
+  {
+    //     dst = phi <n x ty> [ %foo, %bb1 ], [ %bar, %bb2], ...
+    // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ...
+    //     dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ...
+
+    // If the scalar values are all known up-front, then just make the full
+    // phinode now. If they are not yet known (phinode for a loop variant
+    // variable), then deferr the arguments until later
+
+    if (canGetComponentArgs(phi)) {
+      SmallVector<Value*, 8> args(phi->op_begin(), phi->op_end());
+      makePerComponentScalarizedCalls(phi, args);
+    } else {
+      makePerComponentScalarizedCalls(phi, ArrayRef<Value*>());
+      incompletePhis.push_back(phi);
+    }
+
+    return true;
+  }
+
+  void Scalarize::extractFromeVector(Value* insn) {
+    VectorValues& vVals = vectorVals[insn];
+
+    for (int i = 0; i < GetComponentCount(insn); ++i) {
+      Value *cv = ConstantInt::get(intTy, i);
+      Value *EI = builder->CreateExtractElement(insn, cv);
+      vVals.setComponent(i, EI);
+      //unit.fakeInsnMap[EI] = insn;
+      unit.newValueProxy(insn, EI, i, 0);
+    }
+  }
+
+  Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
+    //VectorValues& vVals = vectorVals[writeValue];
+    //unit.vecValuesMap[call] = vectorVals[writeValue];
+
+    //add fake insert instructions to avoid removed
+    Value *II = NULL;
+    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+      Value *vec = II ? II : UndefValue::get(vecValue->getType());
+      Value *cv = ConstantInt::get(intTy, i);
+      II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
+      //unit.vecValuesMap[insn].setComponent(i, getComponent(i, writeValue));
+      //unit.newValueProxy(getComponent(i, vecValue), vecValue, 0, i);
+      //unit.fakeInsnMap[II] = insn;
+    }
+
+    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+      unit.newValueProxy(getComponent(i, vecValue), II, 0, i);
+    }
+    return II;
+  }
+
+  bool Scalarize::scalarizeFuncCall(CallInst* call) {
+    if (Function *F = call->getCalledFunction()) {
+      if (F->getIntrinsicID() != 0) {   //Intrinsic functions
+        NOT_IMPLEMENTED;
+      } else {
+        Value *Callee = call->getCalledValue();
+        const std::string fnName = Callee->getName();
+        auto it = instrinsicMap.map.find(fnName);
+        GBE_ASSERT(it != instrinsicMap.map.end());
+
+        // Get the function arguments
+        CallSite CS(call);
+        CallSite::arg_iterator CI = CS.arg_begin() + 3;
+
+        switch (it->second) {
+          default: break;
+          case GEN_OCL_READ_IMAGE0:
+          case GEN_OCL_READ_IMAGE1:
+          case GEN_OCL_READ_IMAGE2:
+          case GEN_OCL_READ_IMAGE3:
+          case GEN_OCL_READ_IMAGE4:
+          case GEN_OCL_READ_IMAGE5:
+          case GEN_OCL_READ_IMAGE10:
+          case GEN_OCL_READ_IMAGE11:
+          case GEN_OCL_READ_IMAGE12:
+          case GEN_OCL_READ_IMAGE13:
+          case GEN_OCL_READ_IMAGE14:
+          case GEN_OCL_READ_IMAGE15:
+          case GEN_OCL_GET_IMAGE_WIDTH:
+          case GEN_OCL_GET_IMAGE_HEIGHT:
+          {
+            extractFromeVector(call);
+            break;
+          }
+          case GEN_OCL_WRITE_IMAGE10:
+          case GEN_OCL_WRITE_IMAGE11:
+          case GEN_OCL_WRITE_IMAGE12:
+          case GEN_OCL_WRITE_IMAGE13:
+          case GEN_OCL_WRITE_IMAGE14:
+          case GEN_OCL_WRITE_IMAGE15:
+            CI++;
+          case GEN_OCL_WRITE_IMAGE0:
+          case GEN_OCL_WRITE_IMAGE1:
+          case GEN_OCL_WRITE_IMAGE2:
+          case GEN_OCL_WRITE_IMAGE3:
+          case GEN_OCL_WRITE_IMAGE4:
+          case GEN_OCL_WRITE_IMAGE5:
+          {
+            *CI = InsertToVector(call, *CI);
+            break;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  bool Scalarize::scalarizeLoad(LoadInst* ld)
+  {
+    extractFromeVector(ld);
+    return false;
+  }
+
+  bool Scalarize::scalarizeStore(StoreInst* st) {
+    st->setOperand(0, InsertToVector(st, st->getValueOperand()));
+    return false;
+  }
+
+  bool Scalarize::scalarizeExtract(ExtractElementInst* extr)
+  {
+    //     %res = extractelement <n X ty> %foo, %i
+    // ==> nothing (just use %foo's %ith component instead of %res)
+
+    if (! isa<Constant>(extr->getOperand(1))) {
+        // TODO: Variably referenced components. Probably handle/emulate through
+        // a series of selects.
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+    //if (isa<Argument>(extr->getOperand(0)))
+    //  return false;
+    int component = GetConstantInt(extr->getOperand(1));
+    Value* v = getComponent(component, extr->getOperand(0));
+    if(extr == v)
+      return false;
+    extr->replaceAllUsesWith(v);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizeInsert(InsertElementInst* ins)
+  {
+    //     %res = insertValue <n x ty> %foo, %i
+    // ==> nothing (just make a new VectorValues with the new component)
+
+    if (! isa<Constant>(ins->getOperand(2))) {
+      // TODO: Variably referenced components. Probably handle/emulate through
+      // a series of selects.
+      NOT_IMPLEMENTED;   //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+
+    int component = GetConstantInt(ins->getOperand(2));
+
+    VectorValues& vVals = vectorVals[ins];
+    for (int i = 0; i < GetComponentCount(ins); ++i) {
+      vVals.setComponent(i, i == component ? ins->getOperand(1)
+                                           : getComponent(i, ins->getOperand(0)));
+    }
+
+    return true;
+  }
+
+  void Scalarize::scalarizeArgs(Function& F)  {
+    if (F.arg_empty())
+      return;
+    ReversePostOrderTraversal<Function*> rpot(&F);
+    BasicBlock::iterator instI = (*rpot.begin())->begin();
+    builder->SetInsertPoint(instI);
+
+    Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+#if LLVM_VERSION_MINOR <= 1
+    const AttrListPtr &PAL = F.getAttributes();
+    uint32_t argID = 1; // Start at one actually
+    for (; I != E; ++I, ++argID) {
+#else
+    for (; I != E; ++I) {
+#endif /* LLVM_VERSION_MINOR <= 1 */
+      Type *type = I->getType();
+
+      if(type->isVectorTy())
+        extractFromeVector(I);
+    }
+    return;
+  }
+
+  bool Scalarize::runOnFunction(Function& F)
+  {
+    switch (F.getCallingConv()) {
+    case CallingConv::PTX_Device:
+      return false;
+    case CallingConv::PTX_Kernel:
+      break;
+    default: GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    bool changed = false;
+    module = F.getParent();
+    intTy = IntegerType::get(module->getContext(), 32);
+    floatTy = Type::getFloatTy(module->getContext());
+    builder = new IRBuilder<>(module->getContext());
+
+    scalarizeArgs(F);
+
+    typedef ReversePostOrderTraversal<Function*> RPOTType;
+    RPOTType rpot(&F);
+    for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
+      for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
+        bool scalarized = scalarize(instI);
+        if (scalarized) {
+          changed = true;
+          // TODO: uncomment when done
+          deadList.push_back(instI);
+        }
+      }
+    }
+
+    // Fill in the incomplete phis
+    for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
+       phiI != phiE; ++phiI) {
+      assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
+
+      // Fill in each component of this phi
+      VectorValues& vVals = vectorVals[*phiI];
+      for (int c = 0; c < GetComponentCount(*phiI); ++c) {
+        PHINode* compPhi = dyn_cast<PHINode>(vVals.getComponent(c));
+        assert(compPhi && "Vector phi got scalarized to non-phis?");
+
+        // Loop over pairs of operands: [Value*, BasicBlock*]
+        for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) {
+          BasicBlock* bb = (*phiI)->getIncomingBlock(i);
+          assert(bb && "Non-basic block incoming block?");
+          compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb);
+        }
+      }
+    }
+
+    dce();
+
+    delete builder;
+    builder = 0;
+
+    return changed;
+  }
+
+  void Scalarize::dce()
+  {
+    //two passes delete for some phinode
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      (*i)->dropAllReferences();
+      if((*i)->use_empty())
+        (*i)->eraseFromParent();
+    }
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      if((*i)->getParent())
+        (*i)->eraseFromParent();
+    }
+    deadList.clear();
+  }
+
+  void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+  {
+  }
+
+  void Scalarize::print(raw_ostream&, const Module*) const
+  {
+      return;
+  }
+  FunctionPass* createScalarizePass(ir::Unit &unit)
+  {
+      return new Scalarize(unit);
+  }
+  char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index ea3d9eb..559cde0 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -69,6 +69,7 @@ namespace gbe
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
       passes.add(createPrintModulePass(&*o));
+    passes.add(createScalarizePass(unit));        // Expand all vector ops
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
index d4b1b6c..1ea150b 100644
--- a/backend/src/ocl_common_defines.h
+++ b/backend/src/ocl_common_defines.h
@@ -111,9 +111,12 @@ typedef enum clk_sampler_type {
     __CLK_SAMPLER_MASK             = __CLK_MIP_MASK | __CLK_FILTER_MASK |
                                      __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
 
-    __CLK_ANISOTROPIC_RATIO_BITS   = 5,
-    __CLK_ANISOTROPIC_RATIO_MASK   = (int) 0x80000000 >>
-                                      (__CLK_ANISOTROPIC_RATIO_BITS-1)
+    __CLK_SAMPLER_ARG_BASE         = __CLK_MIP_BASE + __CLK_SAMPLER_BITS,
+    __CLK_SAMPLER_ARG_BITS         = 8,
+    __CLK_SAMPLER_ARG_MASK         = ((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE,
+    __CLK_SAMPLER_ARG_KEY_BIT      = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
+    __CLK_SAMPLER_ARG_KEY_BITS     = 1,
+
 } clk_sampler_type;
 
 // Memory synchronization
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 4c0d39c..92f9ba9 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -46,7 +46,6 @@ typedef unsigned int uintptr_t;
 #define __constant __attribute__((address_space(2)))
 #define __local __attribute__((address_space(3)))
 #define __texture __attribute__((address_space(4)))
-#define __sampler __attribute__((address_space(5)))
 #define global __global
 //#define local __local
 #define constant __constant
@@ -77,7 +76,7 @@ struct _image2d_t;
 typedef __texture struct _image2d_t* image2d_t;
 struct _image3d_t;
 typedef __texture struct _image3d_t* image3d_t;
-typedef __sampler uint* sampler_t;
+typedef uint sampler_t;
 typedef size_t event_t;
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL conversions & type casting
@@ -394,8 +393,15 @@ PURE CONST float __gen_ocl_rndz(float x);
 PURE CONST float __gen_ocl_rnde(float x);
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
+INLINE OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
 INLINE OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  return __gen_ocl_cos(x * M_PI_F);
+}
 INLINE OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  return __gen_ocl_sin(x * M_PI_F);
+}
 INLINE OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
 INLINE OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
 INLINE OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
@@ -405,14 +411,150 @@ INLINE OVERLOADABLE float native_log(float x) {
 INLINE OVERLOADABLE float native_log10(float x) {
   return native_log2(x) * 0.3010299956f;
 }
+INLINE OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
+INLINE OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE OVERLOADABLE int2 ilogb(float2 x) {
+  return (int2)(ilogb(x.s0), ilogb(x.s1));
+}
+INLINE OVERLOADABLE int4 ilogb(float4 x) {
+  return (int4)(ilogb(x.s01), ilogb(x.s23));
+}
+INLINE OVERLOADABLE int8 ilogb(float8 x) {
+  return (int8)(ilogb(x.s0123), ilogb(x.s4567));
+}
+INLINE OVERLOADABLE int16 ilogb(float16 x) {
+  return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
+}
+INLINE OVERLOADABLE float nan(uint code) {
+  return NAN;
+}
+INLINE OVERLOADABLE float2 nan(uint2 code) {
+  return (float2)(nan(code.s0), nan(code.s1));
+}
+INLINE OVERLOADABLE float4 nan(uint4 code) {
+  return (float4)(nan(code.s01), nan(code.s23));
+}
+INLINE OVERLOADABLE float8 nan(uint8 code) {
+  return (float8)(nan(code.s0123), nan(code.s4567));
+}
+INLINE OVERLOADABLE float16 nan(uint16 code) {
+  return (float16)(nan(code.s01234567), nan(code.s89abcdef));
+}
 INLINE OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
 INLINE OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
 INLINE OVERLOADABLE float native_tan(float x) {
   return native_sin(x) / native_cos(x);
 }
-#define E 2.71828182845904523536f
-INLINE OVERLOADABLE float native_exp(float x) { return native_powr(E, x); }
-#undef E
+INLINE OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+  return native_tan(x * M_PI_F);
+}
+INLINE OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
+INLINE OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  return __gen_ocl_pow(x, 0.3333333333f);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
+  *cosval = native_cos(x);
+  return native_sin(x);
+}
+INLINE OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
+  return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
+}
+INLINE OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
+  return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
+}
+INLINE OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
+  return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
+                  __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
+}
+INLINE OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
+  return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s8, 8 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.s9, 9 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.sa, 10 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.sb, 11 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.sc, 12 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.sd, 13 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
+                   __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
+  return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  float y = native_exp(-2 * x);
+  return (1 - y) / (1 + y);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+  return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+  return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+  return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  float a = 0, c = 1;
+  if (x <= -1) {
+    a = - M_PI_2_F;
+    x = 1 / x;
+    c = -1;
+  }
+  if (x >= 1) {
+    a = M_PI_2_F;
+    x = 1 / x;
+    c = -1;
+  }
+  return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+  return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
+  return native_log(x + native_sqrt(x * x + 1));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  return 0.5f * native_sqrt((1 + x) / (1 - x));
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  return x * y < 0 ? -x : x;
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+  return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+  return 1 - __gen_ocl_internal_erf(x);
+}
 
 // XXX work-around PTX profile
 #define sqrt native_sqrt
@@ -428,11 +570,36 @@ INLINE OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log1
 INLINE OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
 INLINE OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
 INLINE OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-
+INLINE OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
+INLINE OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  return 2 * __gen_ocl_internal_round(x / 2);
+}
 // TODO use llvm intrinsics definitions
 #define cos native_cos
+#define cospi __gen_ocl_internal_cospi
+#define cosh __gen_ocl_internal_cosh
+#define acos __gen_ocl_internal_acos
+#define acospi __gen_ocl_internal_acospi
+#define acosh __gen_ocl_internal_acosh
 #define sin native_sin
+#define sinpi __gen_ocl_internal_sinpi
+#define sinh __gen_ocl_internal_sinh
+#define sincos __gen_ocl_internal_sincos
+#define asin __gen_ocl_internal_asin
+#define asinpi __gen_ocl_internal_asinpi
+#define asinh __gen_ocl_internal_asinh
+#define tan native_tan
+#define tanpi __gen_ocl_internal_tanpi
+#define tanh __gen_ocl_internal_tanh
+#define atan __gen_ocl_internal_atan
+#define atanpi __gen_ocl_internal_atanpi
+#define atanh __gen_ocl_internal_atanh
 #define pow powr
+#define cbrt __gen_ocl_internal_cbrt
+#define rint __gen_ocl_internal_rint
+#define copysign __gen_ocl_internal_copysign
+#define erf __gen_ocl_internal_erf
+#define erfc __gen_ocl_internal_erfc
 
 INLINE OVERLOADABLE float mad(float a, float b, float c) {
   return a*b+c;
@@ -484,25 +651,93 @@ DECL_SELECT4(float4, float, uint4, 0x80000000)
 /////////////////////////////////////////////////////////////////////////////
 // Common Functions (see 6.11.4 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-#define DECL_MIN_MAX(TYPE) \
+#define DECL_MIN_MAX_CLAMP(TYPE) \
 INLINE OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
   return a > b ? a : b; \
 } \
 INLINE OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
   return a < b ? a : b; \
+} \
+INLINE OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+  return max(min(v, u), l); \
 }
-DECL_MIN_MAX(float)
-DECL_MIN_MAX(int)
-DECL_MIN_MAX(short)
-DECL_MIN_MAX(char)
-DECL_MIN_MAX(uint)
-DECL_MIN_MAX(unsigned short)
-DECL_MIN_MAX(unsigned char)
-#undef DECL_MIN_MAX
+DECL_MIN_MAX_CLAMP(float)
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+#undef DECL_MIN_MAX_CLAMP
 
 INLINE OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
 INLINE OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a > b ? x : b > a ? y : max(x, y);
+}
+INLINE OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a < b ? x : b < a ? y : min(x, y);
+}
 INLINE OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  return __gen_ocl_internal_fmax(x, y) - y;
+}
+INLINE OVERLOADABLE float fract(float x, float *p) {
+  *p = __gen_ocl_internal_floor(x);
+  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+}
+INLINE OVERLOADABLE float2 fract(float2 x, float2 *p) {
+  return (float2)(fract(x.s0, (float *)p),
+                  fract(x.s1, 1 + (float *)p));
+}
+INLINE OVERLOADABLE float4 fract(float4 x, float4 *p) {
+  return (float4)(fract(x.s0, (float *)p),
+                  fract(x.s1, 1 + (float *)p),
+                  fract(x.s2, 2 + (float *)p),
+                  fract(x.s3, 3 + (float *)p));
+}
+INLINE OVERLOADABLE float8 fract(float8 x, float8 *p) {
+  return (float8)(fract(x.s0, (float *)p),
+                  fract(x.s1, 1 + (float *)p),
+                  fract(x.s2, 2 + (float *)p),
+                  fract(x.s3, 3 + (float *)p),
+                  fract(x.s4, 4 + (float *)p),
+                  fract(x.s5, 5 + (float *)p),
+                  fract(x.s6, 6 + (float *)p),
+                  fract(x.s7, 7 + (float *)p));
+}
+INLINE OVERLOADABLE float16 fract(float16 x, float16 *p) {
+  return (float16)(fract(x.s0, (float *)p),
+                   fract(x.s1, 1 + (float *)p),
+                   fract(x.s2, 2 + (float *)p),
+                   fract(x.s3, 3 + (float *)p),
+                   fract(x.s4, 4 + (float *)p),
+                   fract(x.s5, 5 + (float *)p),
+                   fract(x.s6, 6 + (float *)p),
+                   fract(x.s7, 7 + (float *)p),
+                   fract(x.s8, 8 + (float *)p),
+                   fract(x.s9, 9 + (float *)p),
+                   fract(x.sa, 10 + (float *)p),
+                   fract(x.sb, 11 + (float *)p),
+                   fract(x.sc, 12 + (float *)p),
+                   fract(x.sd, 13 + (float *)p),
+                   fract(x.se, 14 + (float *)p),
+                   fract(x.sf, 15 + (float *)p));
+}
+INLINE OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE OVERLOADABLE float ldexp(float x, int n) {
+  return __gen_ocl_pow(2, n) * x;
+}
+INLINE OVERLOADABLE float pown(float x, int n) {
+  if (x == 0 && n == 0)
+    return 1;
+  return powr(x, n);
+}
+INLINE OVERLOADABLE float rootn(float x, int n) {
+  return powr(x, 1.f / n);
+}
 
 /////////////////////////////////////////////////////////////////////////////
 // Geometric functions (see 6.11.5 of OCL 1.1 spec)
@@ -641,12 +876,33 @@ DECL_UNTYPED_RW_ALL(float)
     return dst;\
   }
 DECL_VECTOR_1OP(native_cos, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cospi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cosh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acos, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acospi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_acosh, float);
 DECL_VECTOR_1OP(native_sin, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_sinpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_sinh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asin, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asinpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_asinh, float);
 DECL_VECTOR_1OP(native_tan, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_tanpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_tanh, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atan, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atanpi, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_atanh, float);
 DECL_VECTOR_1OP(native_sqrt, float);
 DECL_VECTOR_1OP(native_rsqrt, float);
 DECL_VECTOR_1OP(native_log2, float);
+DECL_VECTOR_1OP(log1p, float);
+DECL_VECTOR_1OP(logb, float);
 DECL_VECTOR_1OP(native_recip, float);
+DECL_VECTOR_1OP(native_exp2, float);
+DECL_VECTOR_1OP(native_exp10, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_expm1, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_cbrt, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_fabs, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_trunc, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_round, float);
@@ -655,6 +911,9 @@ DECL_VECTOR_1OP(__gen_ocl_internal_ceil, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_log, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_log2, float);
 DECL_VECTOR_1OP(__gen_ocl_internal_log10, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_rint, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_erf, float);
+DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
 #undef DECL_VECTOR_1OP
 /////////////////////////////////////////////////////////////////////////////
 // Arithmetic functions
@@ -682,12 +941,46 @@ DECL_VECTOR_1OP(__gen_ocl_internal_log10, float);
     dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
     return dst;\
   }
+DECL_VECTOR_2OP(hypot, float);
 DECL_VECTOR_2OP(min, float);
 DECL_VECTOR_2OP(max, float);
 DECL_VECTOR_2OP(__gen_ocl_internal_fmin, float);
 DECL_VECTOR_2OP(__gen_ocl_internal_fmax, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_fdim, float);
 DECL_VECTOR_2OP(fmod, float);
+DECL_VECTOR_2OP(remainder, float);
 DECL_VECTOR_2OP(powr, float);
+DECL_VECTOR_2OP(native_divide, float);
+DECL_VECTOR_2OP(copysign, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_maxmag, float);
+DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
+#undef DECL_VECTOR_2OP
+
+#define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
+  INLINE OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
+    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v1.y, v1.y)); \
+  }\
+  INLINE OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
+    return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
+  }\
+  INLINE OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
+    return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
+  }\
+  INLINE OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
+    TYPE##8 dst;\
+    dst.s0123 = NAME(v0.s0123, v1.s0123);\
+    dst.s4567 = NAME(v0.s4567, v1.s4567);\
+    return dst;\
+  }\
+  INLINE OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
+    TYPE##16 dst;\
+    dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
+    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
+    return dst;\
+  }
+DECL_VECTOR_2OP(ldexp, float, int);
+DECL_VECTOR_2OP(pown, float, int);
+DECL_VECTOR_2OP(rootn, float, int);
 #undef DECL_VECTOR_2OP
 
 #define DECL_VECTOR_3OP(NAME, TYPE) \
@@ -733,8 +1026,15 @@ INLINE OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,
 #define log2 __gen_ocl_internal_log2
 #define log10 __gen_ocl_internal_log10
 #define exp __gen_ocl_internal_exp
+#define exp2 native_exp2
+#define exp10 native_exp10
+#define expm1 __gen_ocl_internal_expm1
 #define fmin __gen_ocl_internal_fmin
 #define fmax __gen_ocl_internal_fmax
+#define fma mad
+#define fdim __gen_ocl_internal_fdim
+#define maxmag __gen_ocl_internal_maxmag
+#define minmag __gen_ocl_internal_minmag
 
 /////////////////////////////////////////////////////////////////////////////
 // Synchronization functions
@@ -796,6 +1096,9 @@ OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w,
 OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
 OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+//OVERLOADABLE int __gen_ocl_get_image_depth(image3d_t image);
 
 #define GET_IMAGE(cl_image, surface_id) \
     uint surface_id = (uint)cl_image
@@ -804,7 +1107,14 @@ OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, floa
   INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
   {\
     GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(surface_id, (uint)sampler, coord.s0, coord.s1);\
+    return __gen_ocl_read_image ##suffix(surface_id, sampler, coord.s0, coord.s1);\
+  }
+
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
   }
 
 #define DECL_WRITE_IMAGE(type, suffix, coord_type) \
@@ -817,6 +1127,7 @@ OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, floa
 #define DECL_IMAGE(type, suffix)        \
   DECL_READ_IMAGE(type, suffix, int2)   \
   DECL_READ_IMAGE(type, suffix, float2) \
+  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
   DECL_WRITE_IMAGE(type, suffix, int2)   \
   DECL_WRITE_IMAGE(type, suffix, float2)
 
@@ -826,8 +1137,56 @@ DECL_IMAGE(float4, f)
 
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
 #undef DECL_WRITE_IMAGE
 
+#define DECL_IMAGE_INFO(image_type)    \
+  INLINE_OVERLOADABLE  int get_image_width(image_type image) \
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_width(surface_id);\
+  } \
+  INLINE_OVERLOADABLE  int get_image_height(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_height(surface_id); \
+  }
+#if 0
+  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
+  { NOT_IMPLEMENTED; }\
+  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
+  { NOT_IMPLEMENTED; }
+#endif
+
+
+DECL_IMAGE_INFO(image2d_t)
+DECL_IMAGE_INFO(image3d_t)
+#if 0
+/* The following functions are not implemented yet. */
+DECL_IMAGE_INFO(image1d_t)
+DECL_IMAGE_INFO(image1d_buffer_t)
+DECL_IMAGE_INFO(image1d_array_t)
+DECL_IMAGE_INFO(image2d_array_t)
+
+INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
+  { return __gen_ocl_get_image_depth(image); }
+
+INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
+  { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE  int2 get_image_dim(image2d_array_t image)
+  { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE  int4 get_image_dim(image2d_array_t image)
+  { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE  size_t get_image_array_size(image2d_array_t image)
+  { return __gen_ocl_get_image_array_size(image); }
+
+INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
+  { return __gen_ocl_get_image_array_size(image); }
+#endif
+
 #define DECL_READ_IMAGE(type, suffix, coord_type) \
   INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
   {\
@@ -835,6 +1194,13 @@ DECL_IMAGE(float4, f)
     return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
   }
 
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
+  }
+
 #define DECL_WRITE_IMAGE(type, suffix, coord_type) \
   INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
   {\
@@ -845,6 +1211,7 @@ DECL_IMAGE(float4, f)
 #define DECL_IMAGE(type, suffix)        \
   DECL_READ_IMAGE(type, suffix, int4)   \
   DECL_READ_IMAGE(type, suffix, float4) \
+  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
   DECL_WRITE_IMAGE(type, suffix, int4)   \
   DECL_WRITE_IMAGE(type, suffix, float4)
 
@@ -854,6 +1221,7 @@ DECL_IMAGE(float4, f)
 
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
 #undef DECL_WRITE_IMAGE
 
 #undef GET_IMAGE
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
index 52a37a2..8fcb3a7 100644
--- a/backend/src/sys/alloc.hpp
+++ b/backend/src/sys/alloc.hpp
@@ -27,6 +27,7 @@
 #include "sys/platform.hpp"
 #include "sys/assert.hpp"
 #include <algorithm>
+#include <limits>
 
 namespace gbe
 {
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
index 806e736..06448e2 100644
--- a/include/CL/cl.hpp
+++ b/include/CL/cl.hpp
@@ -29,11 +29,10 @@
  *   
  *   Additions and fixes from:
  *       Brian Cole, March 3rd 2010 and April 2012 
- *       Lee Howes, October 2011, March 2012
  *       Matt Gruenke, April 2012.
  *   
- *   \version 1.2.1
- *   \date September 2012
+ *   \version 1.2.4
+ *   \date January 2013
  *
  *   Optional extension support
  *
@@ -168,12 +167,6 @@
 #include <CL/cl_ext.h>
 #endif
 
-// TODO: Remove once declaration is moved elsewhere
-//#if !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-//#define CL_USE_DEPRECATED_OPENCL_1_1_APIS 
-//#endif // #if !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenGL/OpenGL.h>
 #include <OpenCL/opencl.h>
@@ -393,7 +386,7 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
 #if defined(CL_VERSION_1_2)
 #define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
-#define __SET_PRINTF_CALLBACK_ERR           __ERR_STR(clSetPrintfCallback)
+
 #endif // #if defined(CL_VERSION_1_2)
 #define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
 
@@ -433,7 +426,7 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
 
 /**
- * CL 1.1 version that uses device fission.
+ * CL 1.2 version that uses device fission.
  */
 #if defined(CL_VERSION_1_2)
 #define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
@@ -458,6 +451,13 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #endif // __CL_USER_OVERRIDE_ERROR_STRINGS
 //! \endcond
 
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
 
 #if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
 typedef std::string STRING_CLASS;
@@ -623,10 +623,8 @@ public:
     //! \brief Destructor - frees memory used to hold the current value.
     ~string()
     {
-        if (str_ != NULL) {
-            delete[] str_;
-            str_ = NULL;
-        }
+        delete[] str_;
+        str_ = NULL;
     }
     
     //! \brief Queries the length of the string, excluding any added '\0's.
@@ -2327,9 +2325,16 @@ public:
         cl_int* err = NULL)
     {
         cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
         object_ = ::clCreateContext(
-            properties, (cl_uint) devices.size(),
-            (cl_device_id*) &devices.front(),
+            properties, (cl_uint) numDevices,
+            deviceIDs,
             notifyFptr, data, &error);
 
         detail::errHandler(error, __CREATE_CONTEXT_ERR);
@@ -2350,9 +2355,12 @@ public:
         cl_int* err = NULL)
     {
         cl_int error;
+
+        cl_device_id deviceID = device();
+
         object_ = ::clCreateContext(
             properties, 1,
-            (cl_device_id*) &device,
+            &deviceID,
             notifyFptr, data, &error);
 
         detail::errHandler(error, __CREATE_CONTEXT_ERR);
@@ -2554,25 +2562,6 @@ public:
         formats->assign(&value[0], &value[numEntries]);
         return CL_SUCCESS;
     }
-
-
-#if defined(CL_VERSION_1_2)
-    cl_int setPrintfCallback(
-        void (CL_CALLBACK * pfn_notify)(
-            cl_context /* program */, 
-            cl_uint /*printf_data_len */, 
-            char * /* printf_data_ptr */, 
-            void * /* user_data */),
-        void * user_data )
-    {
-        return detail::errHandler(
-            ::clSetPrintfCallback(
-                object_,
-                pfn_notify,
-                user_data), 
-            __SET_PRINTF_CALLBACK_ERR);
-    }
-#endif // #if defined(CL_VERSION_1_2)
 };
 
 inline Device Device::getDefault(cl_int * err)
@@ -3016,7 +3005,7 @@ public:
     Buffer(
         IteratorType startIterator,
         IteratorType endIterator,
-        bool readOnly = false,
+        bool readOnly,
         bool useHostPtr = false,
         cl_int* err = NULL)
     {
@@ -4121,14 +4110,14 @@ public:
     ImageGL& operator = (const ImageGL& rhs)
     {
         if (this != &rhs) {
-            ImageGL::operator=(rhs);
+            Image::operator=(rhs);
         }
         return *this;
     }
 
     ImageGL& operator = (const cl_mem& rhs)
     {
-        ImageGL::operator=(rhs);
+        Image::operator=(rhs);
         return *this;
     }
 };
@@ -4443,7 +4432,7 @@ public:
     {
         typename detail::param_traits<
             detail::cl_kernel_arg_info, name>::param_type param;
-        cl_int result = getArgInfo(name, argIndex, &param);
+        cl_int result = getArgInfo(argIndex, name, &param);
         if (err != NULL) {
             *err = result;
         }
@@ -4650,9 +4639,15 @@ public:
             lengths[i] = binaries[(int)i].second;
         }
 
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
         object_ = ::clCreateProgramWithBinary(
             context(), (cl_uint) devices.size(),
-            (cl_device_id*)&devices.front(),
+            deviceIDs,
             lengths, images, binaryStatus != NULL
                ? (cl_int*) &binaryStatus->front()
                : NULL, &error);
@@ -4676,11 +4671,18 @@ public:
         cl_int* err = NULL)
     {
         cl_int error;
+
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
         
         object_ = ::clCreateProgramWithBuiltInKernels(
             context(), 
             (cl_uint) devices.size(),
-            (cl_device_id*)&devices.front(),
+            deviceIDs,
             kernelNames.c_str(), 
             &error);
 
@@ -4717,12 +4719,18 @@ public:
         void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
         void* data = NULL) const
     {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
         return detail::errHandler(
             ::clBuildProgram(
                 object_,
                 (cl_uint)
                 devices.size(),
-                (cl_device_id*)&devices.front(),
+                deviceIDs,
                 options,
                 notifyFptr,
                 data),
@@ -5668,7 +5676,7 @@ public:
                 (events != NULL) ? (cl_uint) events->size() : 0,
                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
                 (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+            __ENQUEUE_MARKER_WAIT_LIST_ERR);
 
         if (event != NULL && err == CL_SUCCESS)
             *event = tmp;
@@ -5687,18 +5695,18 @@ public:
      * all events either in the event_wait_list or all previously enqueued commands, queued 
      * before this command to command_queue, have completed.
      */
-    cl_int clEnqueueBarrierWithWaitList(
+    cl_int enqueueBarrierWithWaitList(
         const VECTOR_CLASS<Event> *events = 0,
         Event *event = 0)
     {
         cl_event tmp;
         cl_int err = detail::errHandler(
-            ::clEnqueueMarkerWithWaitList(
+            ::clEnqueueBarrierWithWaitList(
                 object_,
                 (events != NULL) ? (cl_uint) events->size() : 0,
                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
                 (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+            __ENQUEUE_BARRIER_WAIT_LIST_ERR);
 
         if (event != NULL && err == CL_SUCCESS)
             *event = tmp;
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 1f0437f..4d0bed7 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -1,3 +1,5 @@
 FILE(GLOB HEADER_FILES "CL/*.h")
+FILE(GLOB HPP_FILES "CL/*.hpp")
 
 install (FILES ${HEADER_FILES} DESTINATION include/CL)
+install (FILES ${HPP_FILES} DESTINATION include/CL)
diff --git a/kernels/compiler_julia.cl b/kernels/compiler_julia.cl
index 996c0b7..98c5799 100644
--- a/kernels/compiler_julia.cl
+++ b/kernels/compiler_julia.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
   return I - 2.0f * dot(N, I) * N;
 }
 
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
 inline uint pack_fp4(float4 u4) {
   uint u;
   u = (((uint) u4.x)) |
diff --git a/kernels/compiler_julia_no_break.cl b/kernels/compiler_julia_no_break.cl
index c0bd3b1..1a9be64 100644
--- a/kernels/compiler_julia_no_break.cl
+++ b/kernels/compiler_julia_no_break.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
   return I - 2.0f * dot(N, I) * N;
 }
 
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
 inline uint pack_fp4(float4 u4) {
   uint u;
   u = (((uint) u4.x)) |
diff --git a/kernels/compiler_math.cl b/kernels/compiler_math.cl
index 0659840..695fc2c 100644
--- a/kernels/compiler_math.cl
+++ b/kernels/compiler_math.cl
@@ -1,14 +1,40 @@
 __kernel void compiler_math(__global float *dst, __global float *src) {
-  const float x = src[get_global_id(0)];
-  switch (get_global_id(0)) {
-    case 0: dst[get_global_id(0)] = native_cos(x); break;
-    case 1: dst[get_global_id(0)] = native_sin(x); break;
-    case 2: dst[get_global_id(0)] = native_log2(x); break;
-    case 3: dst[get_global_id(0)] = native_sqrt(x); break;
-    case 4: dst[get_global_id(0)] = native_rsqrt(x); break;
-    case 5: dst[get_global_id(0)] = native_recip(x); break;
-    case 6: dst[get_global_id(0)] = native_tan(x); break;
-    default: dst[get_global_id(0)] = 1.f; break;
+  int i = get_global_id(0);
+  const float x = src[i];
+  switch (i) {
+    case 0: dst[i] = cos(x); break;
+    case 1: dst[i] = sin(x); break;
+    case 2: dst[i] = log2(x); break;
+    case 3: dst[i] = sqrt(x); break;
+    case 4: dst[i] = rsqrt(x); break;
+    case 5: dst[i] = native_recip(x); break;
+    case 6: dst[i] = tan(x); break;
+    case 7: dst[i] = cbrt(x); break;
+    case 8: dst[i] = ceil(x); break;
+    case 9: dst[i] = cospi(x); break;
+    case 10: dst[i] = exp2(x); break;
+    case 11: dst[i] = exp10(x); break;
+    case 12: dst[i] = expm1(x); break;
+    case 13: dst[i] = log1p(x); break;
+    case 14: dst[i] = logb(x); break;
+    case 15: dst[i] = sinpi(x); break;
+    case 16: dst[i] = tanpi(x); break;
+    case 17: dst[i] = rint(x); break;
+    case 18: dst[i] = sinh(x); break;
+    case 19: dst[i] = cosh(x); break;
+    case 20: dst[i] = tanh(x); break;
+    case 21: dst[i] = asinh(x); break;
+    case 22: dst[i] = acosh(x); break;
+    case 23: dst[i] = atanh(x); break;
+    case 24: dst[i] = asin(x); break;
+    case 25: dst[i] = acos(x); break;
+    case 26: dst[i] = atan(x); break;
+    case 27: dst[i] = asinpi(x); break;
+    case 28: dst[i] = acospi(x); break;
+    case 29: dst[i] = atanpi(x); break;
+    case 30: dst[i] = erf(x); break;
+    case 31: dst[i] = nan((uint)x); break;
+    default: dst[i] = 1.f; break;
   };
 }
 
diff --git a/kernels/compiler_math_2op.cl b/kernels/compiler_math_2op.cl
new file mode 100644
index 0000000..6e970b8
--- /dev/null
+++ b/kernels/compiler_math_2op.cl
@@ -0,0 +1,19 @@
+kernel void compiler_math_2op(global float *dst, global float *src1, global float *src2) {
+  int i = get_global_id(0);
+  const float x = src1[i], y = src2[i];
+  float z;
+  switch (i) {
+    case 0: dst[i] = native_divide(x, y); break;
+    case 1: dst[i] = fdim(x, y); break;
+    case 2: dst[i] = fract(x, &z); break;
+    case 3: dst[i] = hypot(x, y); break;
+    case 4: dst[i] = ldexp(x, y); break;
+    case 5: dst[i] = pown(x, (int)y); break;
+    case 6: dst[i] = remainder(x, y); break;
+    case 7: dst[i] = rootn(x, (int)(y+1)); break;
+    case 8: dst[i] = copysign(x, y); break;
+    case 9: dst[i] = maxmag(x, y); break;
+    case 10: dst[i] = minmag(x, y); break;
+    default: dst[i] = 1.f; break;
+  };
+}
diff --git a/kernels/compiler_math_3op.cl b/kernels/compiler_math_3op.cl
new file mode 100644
index 0000000..95b0398
--- /dev/null
+++ b/kernels/compiler_math_3op.cl
@@ -0,0 +1,9 @@
+kernel void compiler_math_3op(global float *dst, global float *src1, global float *src2, global float *src3) {
+  int i = get_global_id(0);
+  const float x = src1[i], y = src2[i], z = src3[i];
+  switch (i) {
+    case 0: dst[i] = mad(x, y, z); break;
+    case 1: dst[i] = fma(x, y, z); break;
+    default: dst[i] = 1.f; break;
+  };
+}
diff --git a/kernels/compiler_menger_sponge.cl b/kernels/compiler_menger_sponge.cl
index b59c5e3..58af12a 100644
--- a/kernels/compiler_menger_sponge.cl
+++ b/kernels/compiler_menger_sponge.cl
@@ -25,8 +25,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
   return I - 2.0f * dot(N, I) * N;
 }
 
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
 inline uint pack_fp4(float4 u4) {
   uint u;
   u = (((uint) u4.x)) |
diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
index 4f1093f..95469c5 100644
--- a/kernels/compiler_menger_sponge_no_shadow.cl
+++ b/kernels/compiler_menger_sponge_no_shadow.cl
@@ -25,8 +25,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
   return I - 2.0f * dot(N, I) * N;
 }
 
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
 inline uint pack_fp4(float4 u4) {
   uint u;
   u = (((uint) u4.x)) |
diff --git a/kernels/compiler_nautilus.cl b/kernels/compiler_nautilus.cl
index b53771c..aa7251a 100644
--- a/kernels/compiler_nautilus.cl
+++ b/kernels/compiler_nautilus.cl
@@ -14,8 +14,6 @@ inline vec3 reflect(vec3 I, vec3 N) {
   return I - 2.0f * dot(N, I) * N;
 }
 
-inline float clamp(x,m,M) { return max(min(x,M),m); }
-
 inline uint pack_fp4(float4 u4) {
   uint u;
   u = (((uint) u4.x)) |
@@ -59,7 +57,7 @@ __kernel void compiler_nautilus(__global uint *dst, float resx, float resy, int
   for(int q=0;q<100;q++)
   {
      float l = e(o+0.5f*(vec3)(cos(1.1f*(float)(q)),cos(1.6f*(float)(q)),cos(1.4f*(float)(q))))-m;
-     a+=clamp(4.0f*l,0.0f,1.0f);
+     a+=floor(clamp(4.0f*l,0.0f,1.0f));
   }
   v*=a/100.0f;
   vec4 gl_FragColor=(vec4)(v,1.0f);
diff --git a/kernels/test_copy_image1.cl b/kernels/test_copy_image1.cl
new file mode 100644
index 0000000..28e7a7d
--- /dev/null
+++ b/kernels/test_copy_image1.cl
@@ -0,0 +1,33 @@
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+
+#define COPY_IMAGE(_dst, _sampler, scoord, dcoord) \
+  color = read_imagei(src, _sampler, scoord);\
+  write_imagei(_dst, dcoord, color)
+
+__kernel void
+test_copy_image1(__read_only image2d_t src,
+                 __write_only image2d_t dst0,
+                 sampler_t sampler0,
+                 __write_only image2d_t dst1,
+                 __write_only image2d_t dst2,
+                 __write_only image2d_t dst3,
+                 __write_only image2d_t dst4,
+                 float w_inv, float h_inv)
+{
+  const sampler_t sampler1 = S(FALSE, REPEAT, NEAREST);
+  const sampler_t sampler2 = S(FALSE, CLAMP, NEAREST);
+  const sampler_t sampler3 = S(FALSE, MIRRORED_REPEAT, NEAREST);
+  const sampler_t sampler4 = S(TRUE, REPEAT, NEAREST);
+  int2 coord;
+  float2 fcoord;
+  int4 color;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  fcoord.x = coord.x * w_inv;
+  fcoord.y = coord.y * h_inv;
+  COPY_IMAGE(dst0, sampler0, coord, coord);
+  COPY_IMAGE(dst1, sampler1, coord, coord);
+  COPY_IMAGE(dst2, sampler2, coord, coord);
+  COPY_IMAGE(dst3, sampler3, coord, coord);
+  COPY_IMAGE(dst4, sampler4, fcoord, coord);
+}
diff --git a/kernels/test_fill_image0.cl b/kernels/test_fill_image0.cl
index ad1339f..9428092 100644
--- a/kernels/test_fill_image0.cl
+++ b/kernels/test_fill_image0.cl
@@ -2,8 +2,8 @@ __kernel void
 test_fill_image0(__write_only image2d_t dst)
 {
   int2 coord;
-  int4 color4 = {0x12, 0x34, 0x56, 0x78};
   coord.x = (int)get_global_id(0);
   coord.y = (int)get_global_id(1);
+  int4 color4 = {coord.y & 0xFF, (coord.y & 0xFF00) >> 8, coord.x & 0xFF, (coord.x & 0xFF00) >> 8};
   write_imagei(dst, coord, color4);
 }
diff --git a/kernels/test_get_image_size.cl b/kernels/test_get_image_size.cl
new file mode 100644
index 0000000..aeb7d66
--- /dev/null
+++ b/kernels/test_get_image_size.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_get_image_size(__write_only image2d_t src, __global int *info)
+{
+  int id = (int)get_global_id(0);
+  int w, h;
+  w = get_image_width(src);
+  h = get_image_height(src);
+  info[id] = (w << 16 | h);
+}
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..fc1479e
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,2 @@
+OCLConfig.h
+libcl.so
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2f590c6..cea78c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,14 +28,16 @@ set(OPENCL_SRC
     x11/dricommon.c 
     x11/va_dri2.c)
 
-if (EGL_FOUND)
+if (EGL_FOUND AND GBM_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/gbm_dri2_x11_platform.c)
 SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
 SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
-else(EGL_FOUND)
+SET(OPTIONAL_GBM_LIBRARY "${GBM_LIBRARY}")
+else(EGL_FOUND AND GBM_FOUND)
 SET(OPTIONAL_EGL_LIBRARY "")
-endif (EGL_FOUND)
+SET(OPTIONAL_GBM_LIBRARY "")
+endif (EGL_FOUND AND GBM_FOUND)
 
 if (OCLIcd_FOUND)
 set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
@@ -57,5 +59,5 @@ target_link_libraries(
                       ${DRM_LIBRARY}
                       ${OPENGL_LIBRARIES}
                       ${OPTIONAL_EGL_LIBRARY}
-                      ${GBM_LIBRARY})
+                      ${OPTIONAL_GBM_LIBRARY})
 install (TARGETS cl LIBRARY DESTINATION lib)
diff --git a/src/cl_api.c b/src/cl_api.c
index cfbb44f..a4e534a 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -87,6 +87,31 @@ clGetDeviceInfo(cl_device_id   device,
                             param_value_size_ret);
 }
 
+cl_int
+clCreateSubDevices(cl_device_id                         in_device,
+                   const cl_device_partition_property * properties,
+                   cl_uint                              num_devices,
+                   cl_device_id *                       out_devices,
+                   cl_uint *                            num_devices_ret)
+{
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+}
+
 cl_context
 clCreateContext(const cl_context_properties *  properties,
                 cl_uint                        num_devices,
@@ -292,7 +317,7 @@ clCreateImage(cl_context context,
                          image_format,
                          image_desc,
                          host_ptr,
-                         errcode_ret);
+                         &err);
 error:
   if (errcode_ret)
     *errcode_ret = err;
@@ -945,8 +970,86 @@ clEnqueueReadImage(cl_command_queue      command_queue,
                    const cl_event *      event_wait_list,
                    cl_event *            event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(image);
+  if (command_queue->ctx != image->ctx) {
+     err = CL_INVALID_CONTEXT;
+     goto error;
+  }
+
+  if (blocking_read != CL_TRUE)
+     NOT_IMPLEMENTED;
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (!row_pitch)
+    row_pitch = image->bpp*region[0];
+  else if (row_pitch < image->bpp*region[0]) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (image->slice_pitch) {
+    if (!slice_pitch)
+      slice_pitch = row_pitch*region[1];
+    else if (slice_pitch < row_pitch*region[1]) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
+  else if (slice_pitch) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (!ptr) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
+  if (!(src_ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  src_ptr = (char*)src_ptr + offset;
+
+  if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
+  {
+    memcpy(ptr, src_ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = src_ptr;
+      char* dst = ptr;
+      for (y = 0; y < region[1]; y++) {
+	memcpy(dst, src, image->bpp*region[0]);
+	src += image->row_pitch;
+	dst += row_pitch;
+      }
+      src_ptr = (char*)src_ptr + image->slice_pitch;
+      ptr = (char*)ptr + slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(image);
+
+error:
+  return err;
 }
 
 cl_int
@@ -955,15 +1058,93 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
                     cl_bool              blocking_write,
                     const size_t *       origin,
                     const size_t *       region,
-                    size_t               input_row_pitch,
-                    size_t               input_slice_pitch,
+                    size_t               row_pitch,
+                    size_t               slice_pitch,
                     const void *         ptr,
                     cl_uint              num_events_in_wait_list,
                     const cl_event *     event_wait_list,
                     cl_event *           event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(image);
+  if (command_queue->ctx != image->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (blocking_write != CL_TRUE)
+    NOT_IMPLEMENTED;
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!row_pitch)
+    row_pitch = image->bpp*region[0];
+  else if (row_pitch < image->bpp*region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (image->slice_pitch) {
+    if (!slice_pitch)
+      slice_pitch = row_pitch*region[1];
+    else if (slice_pitch < row_pitch*region[1]) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
+  else if (slice_pitch) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!ptr) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!(dst_ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  dst_ptr = (char*)dst_ptr + offset;
+
+  if (!origin[0] && region[0] == image->w && row_pitch == image->row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && slice_pitch == image->slice_pitch)))
+  {
+    memcpy(dst_ptr, ptr, region[2] == 1 ? row_pitch*region[1] : slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = ptr;
+      char* dst = dst_ptr;
+      for (y = 0; y < region[1]; y++) {
+	memcpy(dst, src, image->bpp*region[0]);
+	src += row_pitch;
+	dst += image->row_pitch;
+      }
+      ptr = (char*)ptr + slice_pitch;
+      dst_ptr = (char*)dst_ptr + image->slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(image);
+
+error:
+  return err;
 }
 
 cl_int
@@ -1023,13 +1204,30 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
                    cl_event *        event,
                    cl_int *          errcode_ret)
 {
-  void *p;
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
   if (blocking_map != CL_TRUE)
      NOT_IMPLEMENTED;
   if (offset != 0)
      NOT_IMPLEMENTED;
-  p = clMapBufferIntel(buffer, errcode_ret);
-  return p;
+
+  if (!(ptr = cl_mem_map_auto(buffer))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ptr;
 }
 
 void *
@@ -1046,8 +1244,54 @@ clEnqueueMapImage(cl_command_queue   command_queue,
                   cl_event *         event,
                   cl_int *           errcode_ret)
 {
-  NOT_IMPLEMENTED;
-  return NULL;
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(image);
+  if (command_queue->ctx != image->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (blocking_map != CL_TRUE)
+    NOT_IMPLEMENTED;
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  *image_row_pitch = image->row_pitch;
+  if (image_slice_pitch)
+    *image_slice_pitch = image->slice_pitch;
+
+  if ((map_flags & CL_MAP_READ &&
+       image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+       image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+  {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!(ptr = cl_mem_map_auto(image))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  ptr = (char*)ptr + offset;
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ptr;
 }
 
 cl_int
@@ -1058,7 +1302,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
                         const cl_event *  event_wait_list,
                         cl_event *        event)
 {
-  return clUnmapBufferIntel(memobj);
+  return cl_mem_unmap_auto(memobj);
 }
 
 cl_int
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 7d604c3..6ce9016 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -98,6 +98,31 @@ cl_command_queue_add_ref(cl_command_queue queue)
   atomic_inc(&queue->ref_n);
 }
 
+static void
+set_image_info(char *curbe, struct ImageInfo * image_info, cl_mem image)
+{
+  if (image_info->wSlot >= 0)
+    *(uint32_t*)(curbe + image_info->wSlot) = image->w;
+  if (image_info->hSlot >= 0)
+    *(uint32_t*)(curbe + image_info->hSlot) = image->h;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+{
+  uint32_t i;
+  for (i = 0; i < k->image_sz; i++) {
+    int id = k->images[i].arg_idx;
+    assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
+    set_image_info(k->curbe, &k->images[i], k->args[id].mem);
+    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, k->args[id].mem->bo,
+                        k->args[id].mem->intel_fmt, k->args[id].mem->type,
+                        k->args[id].mem->w, k->args[id].mem->h,
+                        k->args[id].mem->row_pitch, k->args[id].mem->tiling);
+  }
+  return CL_SUCCESS;
+}
+
 LOCAL cl_int
 cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
 {
@@ -107,22 +132,10 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   for (i = 0; i < k->arg_n; ++i) {
     uint32_t offset; // location of the address in the curbe
     arg_type = gbe_kernel_get_arg_type(k->opaque, i);
-    if (arg_type != GBE_ARG_GLOBAL_PTR &&
-        arg_type != GBE_ARG_IMAGE &&
-        arg_type != GBE_ARG_SAMPLER)
+    if (arg_type != GBE_ARG_GLOBAL_PTR)
       continue;
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
-    if (arg_type == GBE_ARG_IMAGE) {
-      uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
-      cl_gpgpu_bind_image(queue->gpgpu, curbe_index, k->args[i].mem->bo,
-                          k->args[i].mem->intel_fmt, k->args[i].mem->type,
-                          k->args[i].mem->w, k->args[i].mem->h,
-                          k->args[i].mem->pitch, k->args[i].mem->tiling);
-    } else if (arg_type == GBE_ARG_SAMPLER) {
-      uint32_t *curbe_index = (uint32_t*)(k->curbe + offset);
-      cl_gpgpu_insert_sampler(queue->gpgpu, curbe_index, k->args[i].sampler);
-    } else
-      cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
+    cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
   }
 
   return CL_SUCCESS;
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index dcfc8c4..f0c00f4 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -70,6 +70,9 @@ extern cl_int cl_command_queue_finish(cl_command_queue);
 /* Bind all the surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
 
+/* Bind all the image surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+
 /*update constant buffer to final curbe */
 extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
 #endif /* __CL_COMMAND_QUEUE_H__ */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 108684f..770af4a 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -224,6 +224,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Bind user buffers */
   cl_command_queue_bind_surface(queue, ker);
+  /* Bind user images */
+  cl_command_queue_bind_image(queue, ker);
+  /* Bind all samplers */
+  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
 
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 6300b41..136f3b1 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -230,6 +230,7 @@ cl_get_device_info(cl_device_id     device,
     DECL_STRING_FIELD(PROFILE, profile)
     DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
     DECL_STRING_FIELD(EXTENSIONS, extensions);
+    DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
 
     case CL_DRIVER_VERSION:
       if (param_value_size_ret) {
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index d199ecb..1beff92 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -29,7 +29,7 @@ struct _cl_device_id {
   cl_uint  max_thread_per_unit;
   cl_uint  max_work_item_dimensions;
   size_t   max_work_item_sizes[3];
-  cl_uint  max_work_group_size;
+  size_t   max_work_group_size;
   cl_uint  preferred_vector_width_char;
   cl_uint  preferred_vector_width_short;
   cl_uint  preferred_vector_width_int;
@@ -56,7 +56,7 @@ struct _cl_device_id {
   size_t   image3d_max_height;
   size_t   image3d_max_depth;
   cl_uint  max_samplers;
-  cl_uint  max_parameter_size;
+  size_t   max_parameter_size;
   cl_uint  mem_base_addr_align;
   cl_uint  min_data_type_align_size;
   cl_device_fp_config single_fp_config;
@@ -84,6 +84,7 @@ struct _cl_device_id {
   const char *opencl_c_version;
   const char *extensions;
   const char *driver_version;
+  const char *built_in_kernels;
   size_t name_sz;
   size_t vendor_sz;
   size_t version_sz;
@@ -91,6 +92,7 @@ struct _cl_device_id {
   size_t opencl_c_version_sz;
   size_t extensions_sz;
   size_t driver_version_sz;
+  size_t built_in_kernels_sz;
   /* Kernel specific info that we're assigning statically */
   size_t wg_sz;
   size_t compile_wg_sz[3];
diff --git a/src/cl_driver.h b/src/cl_driver.h
index f1e1454..e8ebad1 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -110,20 +110,21 @@ extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
-/* Insert a sampler */
-typedef void (cl_gpgpu_insert_sampler_cb)(cl_gpgpu, uint32_t *curbe_index, cl_sampler sampler);
-extern cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler;
+/* bind samplers defined in both kernel and kernel args. */
+typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
+extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
 /* Set a 2d texture */
 typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
-                                        uint32_t *curbe_index,
-                                        cl_buffer obj_bo,
-                                        uint32_t format,
-                                        uint32_t type,
-                                        int32_t w,
-                                        int32_t h,
-                                        int pitch,
-                                        cl_gpgpu_tiling tiling);
+                                      uint32_t id,
+                                      cl_buffer obj_bo,
+                                      uint32_t format,
+                                      uint32_t type,
+                                      int32_t w,
+                                      int32_t h,
+                                      int pitch,
+                                      cl_gpgpu_tiling tiling);
+
 extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 
 /* Setup a stack */
@@ -183,9 +184,13 @@ extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
  * Buffer
  **************************************************************************/
 /* Allocate a buffer */
-typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, unsigned long, unsigned long);
+typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
 extern cl_buffer_alloc_cb *cl_buffer_alloc;
 
+/* Set a buffer's tiling mode */
+typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
+
 #include "cl_context.h"
 typedef struct _cl_context *cl_context;
 
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 5acc6a5..2c77a22 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -29,6 +29,7 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
 
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_eglimage_cb *cl_buffer_alloc_from_eglimage = NULL;
 LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
 LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
@@ -59,5 +60,5 @@ LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
 LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
-LOCAL cl_gpgpu_insert_sampler_cb *cl_gpgpu_insert_sampler = NULL;
+LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f26cd8a..a535452 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -72,6 +72,7 @@ DECL_INFO_STRING(version, OCL_VERSION_STRING)
 DECL_INFO_STRING(profile, "FULL_PROFILE")
 DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
 DECL_INFO_STRING(extensions, "")
+DECL_INFO_STRING(built_in_kernels, "")
 DECL_INFO_STRING(driver_version, LIBCL_VERSION_STRING)
 #undef DECL_INFO_STRING
 
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index ec0e2e8..d8671c6 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -56,6 +56,8 @@ cl_kernel_delete(cl_kernel k)
         cl_mem_delete(k->args[i].mem);
     cl_free(k->args);
   }
+  if (k->image_sz)
+    cl_free(k->images);
   k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(k);
 }
@@ -110,6 +112,7 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (arg_type == GBE_ARG_VALUE) {
     if (UNLIKELY(value == NULL))
       return CL_INVALID_KERNEL_ARGS;
+
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
     assert(offset + sz <= k->curbe_sz);
     memcpy(k->curbe + offset, value, sz);
@@ -129,20 +132,18 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     return CL_SUCCESS;
   }
 
-  /* For a sampler*/
+  /* Is it a sampler*/
   if (arg_type == GBE_ARG_SAMPLER) {
-     cl_sampler sampler;
-     if (UNLIKELY(value == NULL))
+    cl_sampler sampler;
+    memcpy(&sampler, value, sz);
+    if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
       return CL_INVALID_KERNEL_ARGS;
-     sampler = *(cl_sampler*)value;
-
-     if (UNLIKELY(sampler->magic != CL_MAGIC_SAMPLER_HEADER))
-       return CL_INVALID_ARG_VALUE;
-     k->args[index].local_sz = 0;
-     k->args[index].is_set = 1;
-     k->args[index].mem = NULL;
-     k->args[index].sampler = sampler;
-     return CL_SUCCESS;
+    k->args[index].local_sz = 0;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    k->args[index].sampler = sampler;
+    cl_set_sampler_arg_slot(k, index, sampler);
+    return CL_SUCCESS;
   }
 
   /* Otherwise, we just need to check that this is a buffer */
@@ -203,6 +204,24 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
 
   /* Create the curbe */
   k->curbe_sz = gbe_kernel_get_curbe_size(k->opaque);
+
+  /* Get sampler data & size */
+  k->sampler_sz = gbe_kernel_get_sampler_size(k->opaque);
+  assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
+  if (k->sampler_sz > 0)
+    gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+  /* Get image data & size */
+  k->image_sz = gbe_kernel_get_image_size(k->opaque);
+  assert(k->sampler_sz <= GEN_MAX_SURFACES);
+  if (k->image_sz > 0) {
+    TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
+    gbe_kernel_get_image_data(k->opaque, k->images);
+  } else
+    k->images = NULL;
+  return;
+error:
+  cl_buffer_unreference(k->bo);
+  k->bo = NULL;
 }
 
 LOCAL cl_kernel
@@ -221,6 +240,15 @@ cl_kernel_dup(cl_kernel from)
   to->program = from->program;
   to->arg_n = from->arg_n;
   to->curbe_sz = from->curbe_sz;
+  to->sampler_sz = from->sampler_sz;
+  to->image_sz = from->image_sz;
+  if (to->sampler_sz)
+    memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
+  if (to->image_sz) {
+    TRY_ALLOC_NO_ERR(to->images, cl_calloc(to->image_sz, sizeof(to->images[0])));
+    memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
+  } else
+    to->images = NULL;
   TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
   if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
 
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index dd98fb3..e191058 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -52,6 +52,10 @@ struct _cl_kernel {
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
   char *curbe;                /* One curbe per kernel */
   size_t curbe_sz;            /* Size of it */
+  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
+  size_t sampler_sz;          /* sampler size defined in kernel & kernel args. */
+  struct ImageInfo *images;   /* images defined in kernel args */
+  size_t image_sz;            /* image count in kernel args */
   cl_argument *args;          /* To track argument setting */
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 6950590..354fe34 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -410,14 +410,18 @@ _cl_mem_new_image(cl_context ctx,
 
   mem->w = w;
   mem->h = h;
+  mem->depth = depth;
   mem->fmt = *fmt;
   mem->intel_fmt = intel_fmt;
   mem->bpp = bpp;
   mem->is_image = 1;
-  mem->pitch = aligned_pitch;
+  mem->row_pitch = aligned_pitch;
+  mem->slice_pitch = image_type == CL_MEM_OBJECT_IMAGE1D || image_type == CL_MEM_OBJECT_IMAGE2D ? 0 : aligned_pitch*aligned_h;
   mem->tiling = tiling;
   mem->type = image_type;
 
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
@@ -518,6 +522,25 @@ cl_mem_unmap_gtt(cl_mem mem)
   return CL_SUCCESS;
 }
 
+LOCAL void*
+cl_mem_map_auto(cl_mem mem)
+{
+  if (mem->is_image && mem->tiling != CL_NO_TILE)
+    return cl_mem_map_gtt(mem);
+  else
+    return cl_mem_map(mem);
+}
+
+LOCAL cl_int
+cl_mem_unmap_auto(cl_mem mem)
+{
+  if (mem->is_image && mem->tiling != CL_NO_TILE)
+    cl_buffer_unmap_gtt(mem->bo);
+  else
+    cl_buffer_unmap(mem->bo);
+  return CL_SUCCESS;
+}
+
 LOCAL cl_int
 cl_mem_pin(cl_mem mem)
 {
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 99f3835..3ca6381 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -43,7 +43,8 @@ struct _cl_mem {
   uint32_t is_image;        /* Indicate if this is an image or not */
   cl_image_format fmt;      /* only for images */
   cl_mem_object_type type;  /* only for images 1D/2D...*/
-  size_t w,h,depth,pitch;   /* only for images (depth is only for 3d images) */
+  size_t w,h,depth;         /* only for images (depth is only for 3D images) */
+  size_t row_pitch,slice_pitch;
   uint32_t intel_fmt;       /* format to provide in the surface state */
   uint32_t bpp;             /* number of bytes per pixel */
   cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
@@ -82,6 +83,12 @@ extern void *cl_mem_map_gtt(cl_mem);
 /* Unmap a memory object in GTT mode */
 extern cl_int cl_mem_unmap_gtt(cl_mem);
 
+/* Directly map a memory object - tiled images are mapped in GTT mode */
+extern void *cl_mem_map_auto(cl_mem);
+
+/* Unmap a memory object - tiled images are unmapped in GTT mode */
+extern cl_int cl_mem_unmap_auto(cl_mem);
+
 /* Pin/unpin the buffer in memory (you must be root) */
 extern cl_int cl_mem_pin(cl_mem);
 extern cl_int cl_mem_unpin(cl_mem);
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index 3dfac00..930107f 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -198,11 +198,13 @@ LOCAL cl_mem cl_mem_new_gl_texture(cl_context ctx,
   mem->type = get_mem_type_from_target(texture_target);
   mem->w = w;
   mem->h = h;
+  mem->depth = 1;
   mem->fmt = cl_format;
   mem->intel_fmt = intel_fmt;
   mem->bpp = bpp;
   mem->is_image = 1;
-  mem->pitch = pitch;
+  mem->row_pitch = pitch;
+  mem->slice_pitch = 0;
   mem->tiling = tiling;
   mem->ref_n = 1;
   mem->magic = CL_MAGIC_MEM_HEADER;
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index d3e61da..7e0b7b0 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -22,9 +22,54 @@
 #include "cl_utils.h"
 #include "cl_alloc.h"
 #include "cl_khr_icd.h"
+#include "cl_kernel.h"
 
 #include <assert.h>
 
+uint32_t cl_to_clk(cl_bool normalized_coords,
+                   cl_addressing_mode address,
+                   cl_filter_mode filter)
+{
+  int clk_address;
+  int clk_filter;
+  switch (address) {
+  case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
+  case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
+  case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
+  case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
+  case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+  default:
+    assert(0);
+  }
+  switch(filter) {
+  case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
+  case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+  default:
+    assert(0);
+  }
+  return (clk_address << __CLK_ADDRESS_BASE)
+         | (normalized_coords << __CLK_NORMALIZED_BASE)
+         | (clk_filter << __CLK_FILTER_BASE);
+}
+
+#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
+#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
+{
+  int slot_id;
+  for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
+  {
+    if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
+     if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
+       k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
+                              | sampler->clkSamplerValue;
+       return slot_id;
+     }
+    }
+  }
+  assert(0);
+}
+
 LOCAL cl_sampler
 cl_sampler_new(cl_context ctx,
                cl_bool normalized_coords,
@@ -54,6 +99,8 @@ cl_sampler_new(cl_context ctx,
   sampler->ctx = ctx;
   cl_context_add_ref(ctx);
 
+  sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index da9a488..4785928 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -21,6 +21,7 @@
 #define __CL_SAMPLER_H__
 
 #include "CL/cl.h"
+#include "../backend/src/ocl_common_defines.h"
 #include <stdint.h>
 
 /* How to access images */
@@ -33,6 +34,7 @@ struct _cl_sampler {
   cl_bool normalized_coords; /* Are coordinates normalized? */
   cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
   cl_filter_mode filter;     /* LINEAR / NEAREST mostly */
+  uint32_t clkSamplerValue;
 };
 
 /* Create a new sampler object */
@@ -48,5 +50,8 @@ extern void cl_sampler_delete(cl_sampler);
 /* Add one more reference to this object */
 extern void cl_sampler_add_ref(cl_sampler);
 
+/* set a sampler kernel argument */
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
+
 #endif /* __CL_SAMPLER_H__ */
 
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 4493858..dfb1369 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -130,6 +130,15 @@ do {                                                        \
   }                                                         \
 } while (0)
 
+#define CHECK_IMAGE(IMAGE)                                  \
+CHECK_MEM(image);                                           \
+do {                                                        \
+  if (UNLIKELY(!IMAGE->is_image)) {                         \
+    err = CL_INVALID_MEM_OBJECT;                            \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
 #define CHECK_SAMPLER(SAMPLER)                              \
 do {                                                        \
   if (UNLIKELY(SAMPLER == NULL)) {                          \
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 842163b..ebc4961 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -45,12 +45,12 @@
  *    Zou Nan hai <nanhai.zou at intel.com>
  *
  */
-#define GL_GLEXT_PROTOTYPES
 #include "intel_driver.h"
 #include "intel_gpgpu.h"
 #include "intel_batchbuffer.h"
 #include "intel_bufmgr.h"
 #include "x11/dricommon.h"
+#include "cl_mem.h"
 
 #include <assert.h>
 #include <unistd.h>
@@ -369,14 +369,17 @@ cl_intel_driver_delete(intel_driver_t *driver)
   intel_driver_terminate(driver);
   intel_driver_delete(driver);
 }
-
+#include "program.h"
 static intel_driver_t*
 cl_intel_driver_new(cl_context_prop props)
 {
   intel_driver_t *driver = NULL;
   TRY_ALLOC_NO_ERR (driver, intel_driver_new());
   intel_driver_open(driver, props);
-
+  /* We use the first 2 slots(0,1) for all the bufs.
+   * Notify the gbe this base index, thus gbe can avoid conflicts
+   * when it allocates slots for images*/
+  gbe_set_image_base_index(2);
 exit:
   return driver;
 error:
@@ -405,7 +408,6 @@ static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 #include "GL/gl.h"
 #include "EGL/egl.h"
 #include "EGL/eglext.h"
-#include "cl_mem.h"
 static int get_cl_tiling(uint32_t drm_tiling)
 {
   switch(drm_tiling) {
@@ -465,6 +467,38 @@ cl_buffer intel_alloc_buffer_from_eglimage(cl_context ctx,
 }
 #endif
 
+static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
+{
+  switch (tiling) {
+    case CL_NO_TILE:
+      *intel_tiling = I915_TILING_NONE;
+      break;
+    case CL_TILE_X:
+      *intel_tiling = I915_TILING_X;
+      break;
+    case CL_TILE_Y:
+      *intel_tiling = I915_TILING_Y;
+      break;
+    default:
+      assert(0);
+      return -1;
+  }
+  return 0;
+}
+
+static int intel_buffer_set_tiling(cl_buffer bo,
+                                   cl_image_tiling_t tiling, size_t stride)
+{
+  uint32_t intel_tiling, required_tiling;
+  int ret;
+  if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+    return -1;
+  required_tiling = intel_tiling;
+  ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+  assert(intel_tiling == required_tiling);
+  return ret;
+}
+
 LOCAL void
 intel_setup_callbacks(void)
 {
@@ -474,6 +508,7 @@ intel_setup_callbacks(void)
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+  cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
 #ifdef HAS_EGL
   cl_buffer_alloc_from_eglimage = (cl_buffer_alloc_from_eglimage_cb *) intel_alloc_buffer_from_eglimage;
 #endif
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index d90368c..b0f556d 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -451,35 +451,6 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
   heap->binding_table[1] = sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface);
 }
 
-static inline unsigned long
-__fls(unsigned long x)
-{
-        asm("bsf %1,%0"
-            : "=r" (x)
-            : "rm" (x));
-        return x;
-}
-
-static int
-intel_gpgpu_get_free_img_index(intel_gpgpu_t *gpgpu)
-{
-  int slot;
-  assert(~gpgpu->img_bitmap != 0);
-  slot = __fls(~gpgpu->img_bitmap);
-  gpgpu->img_bitmap |= (1 << slot);
-  return slot + gpgpu->img_index_base;
-}
-
-static int
-intel_gpgpu_get_free_sampler_index(intel_gpgpu_t *gpgpu)
-{
-  int slot;
-  assert(~gpgpu->sampler_bitmap != 0);
-  slot = __fls(~gpgpu->sampler_bitmap);
-  gpgpu->sampler_bitmap |= (1 << slot);
-  return slot;
-}
-
 static int
 intel_get_surface_type(cl_mem_object_type type)
 {
@@ -500,7 +471,7 @@ intel_get_surface_type(cl_mem_object_type type)
 
 static void
 intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
-                              uint32_t *curbe_index,
+                              uint32_t index,
                               dri_bo* obj_bo,
                               uint32_t format,
                               cl_mem_object_type type,
@@ -509,7 +480,6 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               int32_t pitch,
                               int32_t tiling)
 {
-  int32_t index = intel_gpgpu_get_free_img_index(gpgpu);
   surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
   gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
 
@@ -531,7 +501,6 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
   }
   ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
   intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo);
-  *curbe_index = index;
   gpgpu->binded_img[index - gpgpu->img_index_base] = obj_bo;
 }
 
@@ -554,7 +523,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint
 
 static void
 intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
-                       uint32_t *index,
+                       uint32_t index,
                        cl_buffer *obj_bo,
                        uint32_t format,
                        cl_mem_object_type type,
@@ -564,7 +533,7 @@ intel_gpgpu_bind_image(intel_gpgpu_t *gpgpu,
                        cl_gpgpu_tiling tiling)
 {
   intel_gpgpu_bind_image_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, type, w, h, pitch, tiling);
-  assert(*index < GEN_MAX_SURFACES);
+  assert(index < GEN_MAX_SURFACES);
 }
 
 static void
@@ -662,10 +631,10 @@ intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
 int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
 {
    switch( cl_address_mode ) {
-   case CL_ADDRESS_NONE:
-   case CL_ADDRESS_REPEAT:
+   case CLK_ADDRESS_NONE:
+   case CLK_ADDRESS_REPEAT:
       return GEN_TEXCOORDMODE_WRAP;
-   case CL_ADDRESS_CLAMP:
+   case CLK_ADDRESS_CLAMP:
       /* GL_CLAMP is the weird mode where coordinates are clamped to
        * [0.0, 1.0], so linear filtering of coordinates outside of
        * [0.0, 1.0] give you half edge texel value and half border
@@ -679,9 +648,9 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
          return GEN_TEXCOORDMODE_CLAMP;
       else
          return GEN_TEXCOORDMODE_CLAMP_BORDER;
-   case CL_ADDRESS_CLAMP_TO_EDGE:
+   case CLK_ADDRESS_CLAMP_TO_EDGE:
       return GEN_TEXCOORDMODE_CLAMP;
-   case CL_ADDRESS_MIRRORED_REPEAT:
+   case CLK_ADDRESS_MIRRORED_REPEAT:
       return GEN_TEXCOORDMODE_MIRROR;
    default:
       return GEN_TEXCOORDMODE_WRAP;
@@ -689,35 +658,33 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
 }
 
 static void
-intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampler cl_sampler)
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
-  int index;
   int using_nearest = 0;
   uint32_t wrap_mode;
   gen7_sampler_state_t *sampler;
 
-  index = intel_gpgpu_get_free_sampler_index(gpgpu);
   sampler = (gen7_sampler_state_t *)gpgpu->sampler_state_b.bo->virtual + index;
-  if (!cl_sampler->normalized_coords)
+  if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
     sampler->ss3.non_normalized_coord = 1;
   else
     sampler->ss3.non_normalized_coord = 0;
 
-  switch (cl_sampler->filter) {
-  case CL_FILTER_NEAREST:
+  switch (clk_sampler & __CLK_FILTER_MASK) {
+  case CLK_FILTER_NEAREST:
     sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
     sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
     sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
     using_nearest = 1;
     break;
-  case CL_FILTER_LINEAR:
+  case CLK_FILTER_LINEAR:
     sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
     sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
     sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
     break;
   }
 
-  wrap_mode = translate_wrap_mode(cl_sampler->address, using_nearest);
+  wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
   sampler->ss3.r_wrap_mode = wrap_mode;
   sampler->ss3.s_wrap_mode = wrap_mode;
   sampler->ss3.t_wrap_mode = wrap_mode;
@@ -738,7 +705,15 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t *curbe_index, cl_sampl
      sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
                                    GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
-  *curbe_index = index;
+}
+
+static void
+intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+  int index;
+  assert(sampler_sz <= GEN_MAX_SAMPLERS);
+  for(index = 0; index < sampler_sz; index++)
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index] & __CLK_SAMPLER_MASK);
 }
 
 static void
@@ -815,6 +790,6 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
-  cl_gpgpu_insert_sampler = (cl_gpgpu_insert_sampler_cb *) intel_gpgpu_insert_sampler;
+  cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
 }
 
diff --git a/utests/.gitignore b/utests/.gitignore
new file mode 100644
index 0000000..9a374dc
--- /dev/null
+++ b/utests/.gitignore
@@ -0,0 +1,13 @@
+compiler_box_blur.bmp
+compiler_box_blur_float.bmp
+compiler_clod.bmp
+compiler_julia.bmp
+compiler_julia_no_break.bmp
+compiler_mandelbrot.bmp
+compiler_mandelbrot_alternate.bmp
+compiler_menger_sponge_no_shadow.bmp
+compiler_nautilus.bmp
+compiler_ribbon.bmp
+flat_address_space
+libutests.so
+utest_run
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 20d5456..2ba01c4 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -74,6 +74,8 @@ set (utests_sources
   compiler_local_memory_barrier_wg64.cpp
   compiler_movforphi_undef.cpp
   compiler_volatile.cpp
+  compiler_copy_image1.cpp
+  compiler_get_image_size.cpp
   runtime_createcontext.cpp
   utest_assert.cpp
   utest.cpp
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_copy_image1.cpp
similarity index 52%
copy from utests/compiler_movforphi_undef.cpp
copy to utests/compiler_copy_image1.cpp
index 30e53bb..a9ef3f4 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -1,15 +1,15 @@
 #include "utest_helper.hpp"
 
-static void compiler_movforphi_undef(void)
+static void compiler_copy_image1(void)
 {
-  const size_t w = 16;
-  const size_t h = 16;
-  cl_sampler sampler;
+  const size_t w = 512;
+  const size_t h = 512;
   cl_image_format format;
   cl_image_desc desc;
+  cl_sampler sampler;
 
   // Setup kernel and images
-  OCL_CREATE_KERNEL("test_movforphi_undef");
+  OCL_CREATE_KERNEL("test_copy_image1");
   buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
   for (uint32_t j = 0; j < h; ++j)
     for (uint32_t i = 0; i < w; i++)
@@ -22,10 +22,14 @@ static void compiler_movforphi_undef(void)
   desc.image_height = h;
   desc.image_row_pitch = w * sizeof(uint32_t);
   OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
 
   desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  OCL_CREATE_IMAGE(buf[2], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[3], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[4], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[5], 0, &format, &desc, NULL);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
@@ -33,6 +37,15 @@ static void compiler_movforphi_undef(void)
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[3]);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+  OCL_SET_ARG(6, sizeof(cl_mem), &buf[5]);
+  float w_inv = 1.0/w;
+  float h_inv = 1.0/h;
+  OCL_SET_ARG(7, sizeof(float), &w_inv);
+  OCL_SET_ARG(8, sizeof(float), &h_inv);
+
   globals[0] = w;
   globals[1] = h;
   locals[0] = 16;
@@ -42,16 +55,23 @@ static void compiler_movforphi_undef(void)
   // Check result
   OCL_MAP_BUFFER(0);
   OCL_MAP_BUFFER(1);
-  // Just compare the initial 2 data is enough for this case, as the initial 2 data must in the first
-  // tile box and we can just get the correct coords.
-  for (uint32_t j = 0; j < 1; ++j)
-    for (uint32_t i = 0; i < 3; i++)
-    {
-      if (i < w - 1)
-       OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
-    }
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  OCL_MAP_BUFFER(4);
+  OCL_MAP_BUFFER(5);
+
+  for(uint32_t k = 0; k < 5; k++)
+  {
+    for (uint32_t j = 0; j < h; ++j)
+      for (uint32_t i = 0; i < w; i++)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1 + k])[j * w + i]);
+  }
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+  OCL_UNMAP_BUFFER(4);
+  OCL_UNMAP_BUFFER(5);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_movforphi_undef);
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
index cf76be3..7c8f40e 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_fill_image0.cpp
@@ -14,7 +14,6 @@ static void compiler_fill_image0(void)
   desc.image_height = h;
   desc.image_row_pitch = 0;
 
-
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image0");
 
@@ -29,11 +28,11 @@ static void compiler_fill_image0(void)
   OCL_NDRANGE(2);
 
   // Check result
-  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER_GTT(0);
   for (uint32_t j = 0; j < h; ++j)
     for (uint32_t i = 0; i < w; i++)
-      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
-  OCL_UNMAP_BUFFER(0);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == (i << 16 | j));
+  OCL_UNMAP_BUFFER_GTT(0);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_get_image_size.cpp
similarity index 55%
copy from utests/compiler_fill_image0.cpp
copy to utests/compiler_get_image_size.cpp
index cf76be3..49c08ad 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_get_image_size.cpp
@@ -1,8 +1,8 @@
 #include "utest_helper.hpp"
 
-static void compiler_fill_image0(void)
+static void compiler_get_image_size(void)
 {
-  const size_t w = 512;
+  const size_t w = 256;
   const size_t h = 512;
   cl_image_format format;
   cl_image_desc desc;
@@ -14,26 +14,24 @@ static void compiler_fill_image0(void)
   desc.image_height = h;
   desc.image_row_pitch = 0;
 
-
   // Setup kernel and images
-  OCL_CREATE_KERNEL("test_fill_image0");
+  OCL_CREATE_KERNEL("test_get_image_size");
 
   OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
 
   // Run the kernel
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  globals[0] = w;
-  globals[1] = h;
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 32;
   locals[0] = 16;
-  locals[1] = 16;
-  OCL_NDRANGE(2);
+  OCL_NDRANGE(1);
 
   // Check result
-  OCL_MAP_BUFFER(0);
-  for (uint32_t j = 0; j < h; ++j)
-    for (uint32_t i = 0; i < w; i++)
-      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 32; i++)
+    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 16) | (h)));
   OCL_UNMAP_BUFFER(0);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_size);
diff --git a/utests/compiler_math.cpp b/utests/compiler_math.cpp
index 7303dd5..e0c4487 100644
--- a/utests/compiler_math.cpp
+++ b/utests/compiler_math.cpp
@@ -2,18 +2,44 @@
 #include <cmath>
 #include <algorithm>
 
-static void cpu_compiler_math(float *dst, float *src, int get_global_id0)
+static void cpu_compiler_math(float *dst, float *src, int i)
 {
-  const float x = src[get_global_id0];
-  switch (get_global_id0) {
-    case 0: dst[get_global_id0] = cosf(x); break;
-    case 1: dst[get_global_id0] = sinf(x); break;
-    case 2: dst[get_global_id0] = log2f(x); break;
-    case 3: dst[get_global_id0] = sqrtf(x); break;
-    case 4: dst[get_global_id0] = 1.f/ sqrtf(x); break;
-    case 5: dst[get_global_id0] = 1.f / x; break;
-    case 6: dst[get_global_id0] = tanf(x); break;
-    default: dst[get_global_id0] = 1.f; break;
+  const float x = src[i];
+  const float PI = 3.141592653589793f;
+  switch (i) {
+    case 0: dst[i] = cosf(x); break;
+    case 1: dst[i] = sinf(x); break;
+    case 2: dst[i] = log2f(x); break;
+    case 3: dst[i] = sqrtf(x); break;
+    case 4: dst[i] = 1.f/ sqrtf(x); break;
+    case 5: dst[i] = 1.f / x; break;
+    case 6: dst[i] = tanf(x); break;
+    case 7: dst[i] = powf(x, 0.3333333333333333333f); break;
+    case 8: dst[i] = ceilf(x); break;
+    case 9: dst[i] = cosf(PI * x); break;
+    case 10: dst[i] = powf(2, x); break;
+    case 11: dst[i] = powf(10, x); break;
+    case 12: dst[i] = expf(x) - 1; break;
+    case 13: dst[i] = logf(x + 1); break;
+    case 14: dst[i] = floorf(log2f(x)); break;
+    case 15: dst[i] = sinf(PI * x); break;
+    case 16: dst[i] = tanf(PI * x); break;
+    case 17: dst[i] = 2 * roundf(x / 2); break;
+    case 18: dst[i] = sinhf(x); break;
+    case 19: dst[i] = coshf(x); break;
+    case 20: dst[i] = tanhf(x); break;
+    case 21: dst[i] = asinhf(x); break;
+    case 22: dst[i] = acoshf(x); break;
+    case 23: dst[i] = atanhf(x); break;
+    case 24: dst[i] = asinf(x); break;
+    case 25: dst[i] = acosf(x); break;
+    case 26: dst[i] = atanf(x); break;
+    case 27: dst[i] = asinf(x) / PI; break;
+    case 28: dst[i] = acosf(x) / PI; break;
+    case 29: dst[i] = atanf(x) / PI; break;
+    case 30: dst[i] = erff(x); break;
+    case 31: dst[i] = nanf(""); break;
+    default: dst[i] = 1.f; break;
   };
 }
 
@@ -31,23 +57,31 @@ static void compiler_math(void)
   globals[0] = 16;
   locals[0] = 16;
 
-  OCL_MAP_BUFFER(1);
-  for (uint32_t i = 0; i < 32; ++i)
-    cpu_src[i] = ((float*)buf_data[1])[i] = float(i);
-  OCL_UNMAP_BUFFER(1);
-  OCL_NDRANGE(1);
-
-  OCL_MAP_BUFFER(0);
-  OCL_MAP_BUFFER(1);
-  for (int i = 0; i < 16; ++i)
-    cpu_compiler_math(cpu_dst, cpu_src, i);
-  for (int i = 0; i < 16; ++i) {
-    const float cpu = cpu_dst[i];
-    const float gpu = ((float*)buf_data[0])[i];
-    OCL_ASSERT(fabs(gpu-cpu)/std::max(fabs(cpu), fabs(gpu)) < 1e-4f);
+  int j;
+  for(j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < 32; ++i)
+      cpu_src[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+    OCL_UNMAP_BUFFER(1);
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src, i);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
   }
-  OCL_UNMAP_BUFFER(0);
-  OCL_UNMAP_BUFFER(1);
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_math)
diff --git a/utests/compiler_math_2op.cpp b/utests/compiler_math_2op.cpp
new file mode 100644
index 0000000..454967d
--- /dev/null
+++ b/utests/compiler_math_2op.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static float rnde(float v) {
+  if(v - floorf(v) > 0.5f)
+    return floorf(v) + 1;
+  if(v - floorf(v) < 0.5f)
+    return floorf(v);
+  if((int)(floorf(v)) & 1)
+    return floorf(v) + 1;
+  return floorf(v);
+}
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, int i)
+{
+  const float x = src1[i], y = src2[i];
+  switch (i) {
+    case 0: dst[i] = x / y; break;
+    case 1: dst[i] = x > y ? x - y : 0; break;
+    case 2: dst[i] = fminf(x - floorf(x), 0x1.FFFFFep-1F); break;
+    case 3: dst[i] = sqrtf(x*x + y*y); break;
+    case 4: dst[i] = x * powf(2, (int)y); break;
+    case 5: dst[i] = powf(x, (int)y); break;
+    case 6: dst[i] = x - rnde(x/y)*y; break;
+    case 7: dst[i] = powf(x, 1.f/(int)(y+1)); break;
+    case 8: dst[i] = x * y < 0 ? -x : x; break;
+    case 9: dst[i] = fabsf(x) > fabsf(y) ? x : fabsf(y) > fabsf(x) ? y : fmaxf(x, y); break;
+    case 10: dst[i] = fabsf(x) < fabsf(y) ? x : fabsf(y) < fabsf(x) ? y : fminf(x, y); break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
+static void compiler_math_2op(void)
+{
+  const size_t n = 32;
+  float cpu_dst[32], cpu_src1[32], cpu_src2[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_math_2op");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  int j;
+  for(j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (uint32_t i = 0; i < 32; ++i) {
+      cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+      cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+    OCL_NDRANGE(1);
+
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, i);
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else {
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+      }
+    }
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_2op)
diff --git a/utests/compiler_math_3op.cpp b/utests/compiler_math_3op.cpp
new file mode 100644
index 0000000..a382b0a
--- /dev/null
+++ b/utests/compiler_math_3op.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, float *src3, int i)
+{
+  const float x = src1[i], y = src2[i], z = src3[i];
+  switch (i) {
+    case 0: dst[i] = x * y + z; break;
+    case 1: dst[i] = x * y + z; break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
+static void compiler_math_3op(void)
+{
+  const size_t n = 32;
+  float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_math_3op");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    OCL_MAP_BUFFER(3);
+    for (uint32_t i = 0; i < 32; ++i) {
+      cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+      cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+      cpu_src3[i] = ((float*)buf_data[3])[i] = .1f * (rand() & 15);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+    OCL_UNMAP_BUFFER(3);
+    OCL_NDRANGE(1);
+
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, cpu_src3, i);
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+    }
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op)
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 30e53bb..17bde8c 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -47,8 +47,8 @@ static void compiler_movforphi_undef(void)
   for (uint32_t j = 0; j < 1; ++j)
     for (uint32_t i = 0; i < 3; i++)
     {
-      if (i < w - 1)
-       OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
+      if (i == 0)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
     }
   OCL_UNMAP_BUFFER(0);
   OCL_UNMAP_BUFFER(1);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 5258416..d882fc7 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -111,6 +111,17 @@ extern EGLSurface  eglSurface;
     } \
   } while (0)
 
+#define OCL_MAP_BUFFER_GTT(ID) \
+    OCL_CALL2(clMapBufferGTTIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER_GTT(ID) \
+  do { \
+    if (buf[ID] != NULL) { \
+      OCL_CALL (clUnmapBufferGTTIntel, buf[ID]); \
+      buf_data[ID] = NULL; \
+    } \
+  } while (0)
+
 #define OCL_NDRANGE(DIM_N) \
     OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, DIM_N, NULL, globals, locals, 0, NULL, NULL)
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git