[Pkg-opencl-devel] [beignet] 16/66: Imported Upstream version 0.1+git20130422+003fac5

Fri Oct 31 07:27:03 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit bbfdc9a35c42bb2cf517139ca440930b3c195971
Author: Simon Richter <sjr at debian.org>
Date:   Mon Apr 22 15:10:54 2013 +0200

    Imported Upstream version 0.1+git20130422+003fac5
---
 backend/src/backend/context.cpp                    | 211 +++++++++++++--------
 backend/src/backend/context.hpp                    |   2 +
 backend/src/backend/gen_context.cpp                |  28 ++-
 backend/src/backend/gen_context.hpp                |   1 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   1 +
 backend/src/backend/gen_insn_selection.cpp         |  34 +++-
 backend/src/backend/gen_insn_selection.hxx         |   1 +
 backend/src/backend/gen_program.cpp                |   5 +-
 backend/src/backend/gen_register.hpp               |  10 +
 backend/src/backend/program.cpp                    |  13 +-
 backend/src/backend/program.h                      |   7 +-
 backend/src/backend/program.hpp                    |  15 +-
 include/CL/cl_intel.h                              |  54 ++++--
 kernels/compiler_function_constant.cl              |   6 +
 kernels/compiler_function_constant0.cl             |   6 +
 src/cl_api.c                                       |  73 +++++--
 src/cl_command_queue.c                             |  22 ++-
 src/cl_command_queue.h                             |   2 +
 src/cl_command_queue_gen7.c                        |   7 +-
 src/cl_kernel.c                                    |  16 +-
 src/cl_mem.c                                       |  16 ++
 src/cl_mem.h                                       |   7 +
 utests/CMakeLists.txt                              |   3 +
 utests/compiler_copy_image.cpp                     |   3 +-
 utests/compiler_fill_image.cpp                     |   2 +-
 utests/compiler_fill_image0.cpp                    |   2 +-
 utests/compiler_function_constant.cpp              |  34 ++++
 utests/compiler_function_constant0.cpp             |  42 ++++
 utests/compiler_function_constant1.cpp             |  47 +++++
 utests/compiler_movforphi_undef.cpp                |  15 +-
 utests/runtime_flat_address_space.cpp              |   8 +-
 utests/utest_helper.cpp                            |  14 +-
 utests/utest_helper.hpp                            |   4 +-
 33 files changed, 560 insertions(+), 151 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 180e8bb..91d8d8c 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -53,7 +53,7 @@ namespace gbe
      *  the hardware. Note that we always use the left most block when
      *  allocating, so it makes sense for constant pushing
      */
-    int16_t allocate(int16_t size, int16_t alignment);
+    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
 
     /*! Free the given register file piece */
     void deallocate(int16_t offset);
@@ -75,8 +75,9 @@ namespace gbe
      *  If the colascing was done, the left block is deleted
      */
     void coalesce(Block *left, Block *right);
-    /*! Head of the free list */
+    /*! Head and tail of the free list */
     Block *head;
+    Block *tail;
     /*! Handle free list element allocation */
     DECL_POOL(Block, blockPool);
     /*! Track allocated memory blocks <offset, size> */
@@ -89,10 +90,10 @@ namespace gbe
     // r0 is always set by the HW and used at the end by EOT
     const int16_t offset = GEN_REG_SIZE;
     const int16_t size = RegisterFileSize  - offset;
-    head = this->newBlock(offset, size);
+    tail = head = this->newBlock(offset, size);
   }
 
-  RegisterFilePartitioner::~RegisterFilePartitioner(void) { 
+  RegisterFilePartitioner::~RegisterFilePartitioner(void) {
     while (this->head) {
       Block *next = this->head->next;
       this->deleteBlock(this->head);
@@ -100,80 +101,104 @@ namespace gbe
     }
   }
 
-  int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment)
+  int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
   {
     // Make it simple and just use the first block we find
-    Block *list = head;
+    Block *list = bFwd ? head : tail;
     while (list) {
-      const int16_t aligned = ALIGN(list->offset, alignment);
-      const int16_t spaceOnLeft = aligned - list->offset;
-      const int16_t spaceOnRight = list->size - size - spaceOnLeft;
+      int16_t aligned;
+      int16_t spaceOnLeft;
+      int16_t spaceOnRight;
+      if(bFwd) {
+        aligned = ALIGN(list->offset, alignment);
+        spaceOnLeft = aligned - list->offset;
+        spaceOnRight = list->size - size - spaceOnLeft;
 
       // Not enough space in this block
-      if (spaceOnRight < 0) {
-        list = list->next;
-        continue;
+        if (spaceOnRight < 0) {
+          list = list->next;
+          continue;
+        }
+      } else {
+        aligned = ALIGN(list->offset+list->size-size-(alignment-1), alignment);   //alloc from block's tail
+        spaceOnLeft = aligned - list->offset;
+        spaceOnRight = list->size - size - spaceOnLeft;
+
+        // Not enough space in this block
+        if (spaceOnLeft < 0) {
+          list = list->prev;
+          continue;
+        }
       }
+
       // Cool we can use this block
-      else {
-        Block *left = list->prev;
-        Block *right = list->next;
-
-        // If we left a hole on the left, create a new block
-        if (spaceOnLeft) {
-          Block *newBlock = this->newBlock(list->offset, spaceOnLeft);
-          if (left) {
-            left->next = newBlock;
-            newBlock->prev = left;
-          }
-          if (right) {
-            newBlock->next = right;
-            right->prev = newBlock;
-          }
-          left = newBlock;
+      Block *left = list->prev;
+      Block *right = list->next;
+
+      // If we left a hole on the left, create a new block
+      if (spaceOnLeft) {
+        Block *newBlock = this->newBlock(list->offset, spaceOnLeft);
+        if (left) {
+          left->next = newBlock;
+          newBlock->prev = left;
         }
-
-        // If we left a hole on the right, create a new block as well
-        if (spaceOnRight) {
-          Block *newBlock = this->newBlock(aligned + size, spaceOnRight);
-          if (left) {
-            left->next = newBlock;
-            newBlock->prev = left;
-          }
-          if (right) {
-            right->prev = newBlock;
-            newBlock->next = right;
-          }
-          right = newBlock;
+        if (right) {
+          newBlock->next = right;
+          right->prev = newBlock;
         }
+        left = newBlock;
+      }
 
-        // Chain both successors and predecessors when the entire block was
-        // allocated
-        if (spaceOnLeft == 0 && spaceOnRight == 0) {
-          if (left) left->next = right;
-          if (right) right->prev = left;
+      // If we left a hole on the right, create a new block as well
+      if (spaceOnRight) {
+        Block *newBlock = this->newBlock(aligned + size, spaceOnRight);
+        if (left) {
+          left->next = newBlock;
+          newBlock->prev = left;
         }
-
-        // Update the head of the free blocks
-        if (list == head) {
-          if (left)
-            head = left;
-          else if (right)
-            head = right;
-          else
-            head = NULL;
+        if (right) {
+          right->prev = newBlock;
+          newBlock->next = right;
         }
+        right = newBlock;
+      }
 
-        // Free the block and check the consistency
-        this->deleteBlock(list);
-        if (head && head->next) GBE_ASSERT(head->next->prev == head);
+      // Chain both successors and predecessors when the entire block was
+      // allocated
+      if (spaceOnLeft == 0 && spaceOnRight == 0) {
+        if (left) left->next = right;
+        if (right) right->prev = left;
+      }
 
-        // Track the allocation to retrieve the size later
-        allocatedBlocks.insert(std::make_pair(aligned, size));
+      // Update the head of the free blocks
+      if (list == head) {
+        if (left)
+          head = left;
+        else if (right)
+          head = right;
+        else
+          head = NULL;
+      }
 
-        // We have a valid offset now
-        return aligned;
+      // Update the tail of the free blocks
+      if (list == tail) {
+        if (right)
+          tail = right;
+        else if (left)
+          tail = left;
+        else
+          tail = NULL;
       }
+      // Free the block and check the consistency
+      this->deleteBlock(list);
+      if (head && head->next) GBE_ASSERT(head->next->prev == head);
+      if (tail && tail->prev) GBE_ASSERT(tail->prev->next == tail);
+
+      // Track the allocation to retrieve the size later
+      allocatedBlocks.insert(std::make_pair(aligned, size));
+
+      // We have a valid offset now
+      return aligned;
     }
     return 0;
   }
@@ -186,34 +211,35 @@ namespace gbe
     const int16_t size = it->second;
 
     // Find the two blocks where to insert the new block
-    Block *list = head, *prev = NULL;
+    Block *list = tail, *next = NULL;
     while (list != NULL) {
-      if (list->offset > offset)
+      if (list->offset < offset)
         break;
-      prev = list;
-      list = list->next;
+      next = list;
+      list = list->prev;
     }
 
     // Create the block and insert it
     Block *newBlock = this->newBlock(offset, size);
-    if (prev) {
-      GBE_ASSERT(prev->offset + prev->size <= offset);
-      prev->next = newBlock;
-      newBlock->prev = prev;
+    if (list) {
+      GBE_ASSERT(list->offset + list->size <= offset);
+      list->next = newBlock;
+      newBlock->prev = list;
     } else
-      this->head = newBlock;  // prev is NULL means newBlock should be the head.
+      this->head = newBlock;  // list is NULL means newBlock should be the head.
 
-    if (list) {
-      GBE_ASSERT(offset + size <= list->offset);
-      list->prev = newBlock;
-      newBlock->next = list;
-    }
+    if (next) {
+      GBE_ASSERT(offset + size <= next->offset);
+      next->prev = newBlock;
+      newBlock->next = next;
+    } else
+      this->tail = newBlock;  // next is NULL means newBlock should be the tail.
 
-    if (prev != NULL || list != NULL)
+    if (list != NULL || next != NULL)
     {
       // Coalesce the blocks if possible
-      this->coalesce(prev, newBlock);
-      this->coalesce(newBlock, list);
+      this->coalesce(list, newBlock);
+      this->coalesce(newBlock, next);
     }
 
     // Do not track this allocation anymore
@@ -273,6 +299,8 @@ namespace gbe
       GBE_DELETE(this->kernel);
       this->kernel = NULL;
     }
+    if(this->kernel != NULL)
+      this->kernel->ctx = this;
     return this->kernel;
   }
 
@@ -282,6 +310,27 @@ namespace gbe
 
   void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
 
+  int32_t Context::allocConstBuf(uint32_t argID) {
+     GBE_ASSERT(kernel->args[argID].type == GBE_ARG_CONSTANT_PTR);
+
+    //free previous
+    int32_t offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+    if(offset >= 0)
+        deallocate(offset+GEN_REG_SIZE);
+
+    if(kernel->args[argID].bufSize > 0) {
+      //use 32 alignment here as GEN_REG_SIZE, need dynamic by type?
+      newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_CONSTANT_BUFFER+argID, kernel->args[argID].bufSize, 32);
+    }
+
+    std::sort(kernel->patches.begin(), kernel->patches.end());
+    offset = kernel->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, argID+GBE_CONSTANT_BUFFER);
+    GBE_ASSERT(offset>=0);
+
+    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+    return offset + GEN_REG_SIZE;
+  }
+
   void Context::buildStack(void) {
     const auto &stackUse = dag->getUse(ir::ocl::stackptr);
     if (stackUse.size() == 0)  // no stack is used if stackptr is unused
@@ -297,7 +346,7 @@ namespace gbe
                               uint32_t alignment)
   {
     alignment = alignment == 0 ? size : alignment;
-    const uint32_t offset = partitioner->allocate(size, alignment);
+    const uint32_t offset = partitioner->allocate(size, alignment, 1);
     GBE_ASSERT(offset >= GEN_REG_SIZE);
     kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 55a63a7..245ad01 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -86,6 +86,8 @@ namespace gbe
     int16_t allocate(int16_t size, int16_t alignment);
     /*! Deallocate previously allocated memory */
     void deallocate(int16_t offset);
+    /* allocate curbe for constant ptr argument */
+    int32_t allocConstBuf(uint32_t argID);
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index b3d385b..1f867b8 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -144,7 +144,7 @@ namespace gbe
     }
   }
 
-  void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) { 
+  void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
@@ -212,6 +212,32 @@ namespace gbe
     }
   }
 
+  void GenContext::emitCBMoveInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = GenRegister::unpacked_uw(ra->genReg(insn.src(0)).nr, 0);
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister a0 = GenRegister::addr8(0);
+    uint32_t simdWidth = p->curr.execWidth;
+
+    p->push();
+      p->curr.execWidth = 8;
+      p->curr.quarterControl = GEN_COMPRESSION_Q1;
+      p->MOV(a0, src);
+      p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+    p->pop();
+
+    if (simdWidth == 16) {
+      p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+        const GenRegister nextDst = GenRegister::Qn(dst, 1);
+        const GenRegister nextSrc = GenRegister::Qn(src, 1);
+        p->MOV(a0, nextSrc);
+        p->MOV(nextDst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+      p->pop();
+    }
+  }
+
   void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
     const ir::LabelIndex label(insn.index);
     const GenRegister src = ra->genReg(insn.src(0));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6af174f..33258f8 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -80,6 +80,7 @@ namespace gbe
     void emitTernaryInstruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
+    void emitCBMoveInstruction(const SelectionInstruction &insn);
     void emitEotInstruction(const SelectionInstruction &insn);
     void emitNoOpInstruction(const SelectionInstruction &insn);
     void emitWaitInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 969ec82..ce8769f 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -5,6 +5,7 @@ DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
+DECL_GEN7_SCHEDULE(CBMove,          20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
 DECL_GEN7_SCHEDULE(NoOp,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index e0e8920..ee8a0ab 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -25,7 +25,7 @@
 /* This is the instruction selection code. First of all, this is a bunch of c++
  * crap. Sorry if this is not that readable. Anyway, the goal here is to take
  * GenIR code (i.e. the very regular, very RISC IR) and to produce GenISA with
- * virtual registers (i.e. regular GenIR registers). 
+ * virtual registers (i.e. regular GenIR registers).
  *
  * Overall idea:
  * =============
@@ -72,7 +72,7 @@
  * *same* flag register for the predicates (used for masking) and the
  * conditional modifier (used as a destination for CMP). This leads to extra
  * complications with compare instructions and select instructions. Basically,
- * we need to insert extra MOVs. 
+ * we need to insert extra MOVs.
  *
  * Also, there is some extra kludge to handle the predicates for JMPI.
  *
@@ -439,6 +439,8 @@ namespace gbe
     void CMP(uint32_t conditional, Reg src0, Reg src1);
     /*! Select instruction with embedded comparison */
     void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
+    /* Constant buffer move instruction */
+    void CB_MOVE(Reg dst, Reg src);
     /*! EOT is used to finish GPGPU threads */
     void EOT(void);
     /*! No-op */
@@ -481,7 +483,7 @@ namespace gbe
   static void markAllChildren(SelectionDAG &dag) {
     // Do not merge anything, so all sources become roots
     for (uint32_t childID = 0; childID < dag.childNum; ++childID)
-      if (dag.child[childID]) 
+      if (dag.child[childID])
         dag.child[childID]->isRoot = 1;
   }
 
@@ -698,6 +700,11 @@ namespace gbe
     insn->src(1) = src1;
     insn->extra.function = conditional;
   }
+  void Selection::Opaque::CB_MOVE(Reg dst, Reg src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CB_MOVE, 1, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+  }
 
   void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
@@ -1057,7 +1064,7 @@ namespace gbe
   // Implementation of all patterns
   ///////////////////////////////////////////////////////////////////////////
 
-  GenRegister getRegisterFromImmediate(ir::Immediate imm) 
+  GenRegister getRegisterFromImmediate(ir::Immediate imm)
   {
     using namespace ir;
     switch (imm.type) {
@@ -1654,15 +1661,30 @@ namespace gbe
         sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
     }
 
+    void emitCBMove(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister address) const
+    {
+      using namespace ir;
+      GBE_ASSERT(insn.getValueNum() == 1);   //todo: handle vec later
+
+      const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
+      const GenRegister src = address;
+      sel.CB_MOVE(dst, src);
+    }
+
     INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
       using namespace ir;
       const GenRegister address = sel.selReg(insn.getAddress());
       const AddressSpace space = insn.getAddressSpace();
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+                 insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL);
       GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false);
-      if (insn.isAligned() == true)
+      if (insn.getAddressSpace() == MEM_CONSTANT)
+        this->emitCBMove(sel, insn, address);
+      else if (insn.isAligned() == true)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
         const GenRegister value = sel.selReg(insn.getValue(0));
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 691100b..f89ad4c 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -24,6 +24,7 @@ DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
 DECL_SELECTION_IR(MAD, TernaryInstruction)
 DECL_SELECTION_IR(JMPI, JumpInstruction)
 DECL_SELECTION_IR(EOT, EotInstruction)
+DECL_SELECTION_IR(CB_MOVE, CBMoveInstruction)
 DECL_SELECTION_IR(NOP, NoOpInstruction)
 DECL_SELECTION_IR(WAIT, WaitInstruction)
 DECL_SELECTION_IR(MATH, MathInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 8a7efdb..3d7bedd 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -76,9 +76,10 @@ namespace gbe {
       unit.getFunction(name)->setSimdWidth(simdWidth);
       Context *ctx = GBE_NEW(GenContext, unit, name, limitRegisterPressure);
       kernel = ctx->compileKernel();
-      GBE_DELETE(ctx);
-      if (kernel != NULL)
+      if (kernel != NULL) {
         break;
+      }
+      GBE_DELETE(ctx);
     }
 
     // XXX spill must be implemented
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 92122a6..d772b0d 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -725,6 +725,16 @@ namespace gbe
       return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UW,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
     static INLINE GenRegister mask(uint32_t subnr) {
       return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
     }
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index d33c533..0882e5a 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -49,9 +49,10 @@
 namespace gbe {
 
   Kernel::Kernel(const std::string &name) :
-    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false)
+    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false), ctx(NULL)
   {}
   Kernel::~Kernel(void) {
+    if(ctx) GBE_DELETE(ctx);
     GBE_SAFE_DELETE_ARRAY(args);
   }
   int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
@@ -229,6 +230,12 @@ namespace gbe {
     return kernel->getUseSLM() ? 1 : 0;
   }
 
+  static int32_t kernelSetConstBufSize(gbe_kernel genKernel, uint32_t argID, size_t sz) {
+    if (genKernel == NULL) return -1;
+    gbe::Kernel *kernel = (gbe::Kernel*) genKernel;
+    return kernel->setConstBufSize(argID, sz);
+  }
+
   static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
     return 0u;
   }
@@ -251,6 +258,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
 
@@ -275,6 +283,7 @@ namespace gbe
       gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
       gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+      gbe_kernel_set_const_buffer_size = gbe::kernelSetConstBufSize;
       gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
       gbe_kernel_use_slm = gbe::kernelUseSLM;
       genSetupCallBacks();
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index b90c1df..4273a77 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -81,7 +81,8 @@ enum gbe_curbe_type {
 
 /*! Extra arguments use the negative range of sub-values */
 enum gbe_extra_argument {
-  GBE_STACK_BUFFER = 0 /* Give stack location in curbe */
+  GBE_STACK_BUFFER = 0,   /* Give stack location in curbe */
+  GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
 };
 
 /*! Create a new program from the given source code (zero terminated string) */
@@ -159,6 +160,10 @@ extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
 typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
 extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
 
+/*! Set the constant pointer arg size and return the cb offset in curbe */
+typedef int32_t (gbe_kernel_set_const_buffer_size_cb)(gbe_kernel, uint32_t argID, size_t sz);
+extern gbe_kernel_set_const_buffer_size_cb *gbe_kernel_set_const_buffer_size;
+
 /*! Indicates if a work group size is required. Return the required width or 0
  *  if none
  */
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e0f7dba..e7584d9 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -26,6 +26,7 @@
 #define __GBE_PROGRAM_HPP__
 
 #include "backend/program.h"
+#include "backend/context.hpp"
 #include "sys/hash_map.hpp"
 #include "sys/vector.hpp"
 #include <string>
@@ -42,6 +43,7 @@ namespace gbe {
   struct KernelArgument {
     gbe_arg_type type; //!< Pointer, structure, image, regular value?
     uint32_t size;     //!< Size of the argument
+    uint32_t bufSize;  //!< Contant buffer size
   };
 
   /*! Stores the offset where to patch where to patch */
@@ -94,6 +96,16 @@ namespace gbe {
     INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
     /*! Says if SLM is needed for it */
     INLINE bool getUseSLM(void) const { return this->useSLM; }
+    /*! set constant buffer size and return the cb curbe offset */
+    int32_t setConstBufSize(uint32_t argID, size_t sz) {
+      if(argID >= argNum) return -1;
+      if(args[argID].type != GBE_ARG_CONSTANT_PTR) return -1;
+      if(args[argID].bufSize != sz) {
+        args[argID].bufSize = sz;
+        return ctx->allocConstBuf(argID);
+      }
+      return -1;
+    }
   protected:
     friend class Context;      //!< Owns the kernels
     const std::string name;    //!< Kernel name
@@ -104,6 +116,7 @@ namespace gbe {
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
     uint32_t stackSize;        //!< Stack size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
+    Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 680f948..135e340 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -30,33 +30,65 @@ extern "C" {
 
 /* Track allocations and report current number of unfreed allocations */
 extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelReportUnfreed(void);
+clReportUnfreedIntel(void);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReportUnfreedIntel_fn)(void);
 
 /* 1 to 1 mapping of drm_intel_bo_map */
 extern CL_API_ENTRY void* CL_API_CALL
-clIntelMapBuffer(cl_mem, cl_int*);
+clMapBufferIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferIntel_fn)(cl_mem, cl_int*);
 
 /* 1 to 1 mapping of drm_intel_bo_unmap */
 extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelUnmapBuffer(cl_mem);
+clUnmapBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferIntel_fn)(cl_mem);
+
+/* 1 to 1 mapping of drm_intel_gem_bo_map_gtt */
+extern CL_API_ENTRY void* CL_API_CALL
+clMapBufferGTTIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferGTTIntel_fn)(cl_mem, cl_int*);
+
+/* 1 to 1 mapping of drm_intel_gem_bo_unmap_gtt */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnmapBufferGTTIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferGTTIntel_fn)(cl_mem);
 
 /* Pin /Unpin the buffer in GPU memory (must be root) */
 extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelPinBuffer(cl_mem);
+clPinBufferIntel(cl_mem);
 extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelUnpinBuffer(cl_mem);
+clUnpinBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clPinBufferIntel_fn)(cl_mem);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnpinBufferIntel_fn)(cl_mem);
 
 /* Get the generation of the Gen device (used to load the proper binary) */
 extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelGetGenVersion(cl_device_id device, cl_int *ver);
+clGetGenVersionIntel(cl_device_id device, cl_int *ver);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGenVersionIntel_fn)(
+                             cl_device_id device,
+                             cl_int *ver);
 
 /* Create a program from a LLVM source file */
 extern CL_API_ENTRY cl_program CL_API_CALL
-clCreateProgramWithLLVM(cl_context              /* context */,
-                        cl_uint                 /* num_devices */,
-                        const cl_device_id *    /* device_list */,
-                        const char *            /* file */,
-                        cl_int *                /* errcode_ret */);
+clCreateProgramWithLLVMIntel(cl_context              /* context */,
+                             cl_uint                 /* num_devices */,
+                             const cl_device_id *    /* device_list */,
+                             const char *            /* file */,
+                             cl_int *                /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithLLVMIntel_fn)(
+                                 cl_context              /* context */,
+                                 cl_uint                 /* num_devices */,
+                                 const cl_device_id *    /* device_list */,
+                                 const char *            /* file */,
+                                 cl_int *                /* errcode_ret */);
 
 #ifdef __cplusplus
 }
diff --git a/kernels/compiler_function_constant.cl b/kernels/compiler_function_constant.cl
new file mode 100644
index 0000000..ca7e874
--- /dev/null
+++ b/kernels/compiler_function_constant.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant(__constant short *c, __global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value + c[id%69];
+}
diff --git a/kernels/compiler_function_constant0.cl b/kernels/compiler_function_constant0.cl
new file mode 100644
index 0000000..f6efcef
--- /dev/null
+++ b/kernels/compiler_function_constant0.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant0(__constant short *c0, __constant char *c1, __global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value + c0[id%69] + c1[15];
+}
diff --git a/src/cl_api.c b/src/cl_api.c
index 2d84ace..03cc0e6 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -793,7 +793,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
 	cl_int err = CL_SUCCESS;
 	assert(ptr != NULL);
 	void* temp_ptr = NULL;
-	temp_ptr = clIntelMapBuffer(buffer, &err);
+	temp_ptr = clMapBufferIntel(buffer, &err);
 	assert(err == CL_SUCCESS);
 	memcpy(ptr, temp_ptr, cb);
 	return err;
@@ -833,11 +833,11 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
   if (blocking_write != CL_TRUE)
     NOT_IMPLEMENTED;
   cl_int err;
-  void *p = clIntelMapBuffer(buffer, &err);
+  void *p = clMapBufferIntel(buffer, &err);
   if (err != CL_SUCCESS)
     return err;
   memcpy(p + offset, ptr, cb);
-  err = clIntelUnmapBuffer(buffer);
+  err = clUnmapBufferIntel(buffer);
   return err;
 }
 
@@ -991,7 +991,7 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
      NOT_IMPLEMENTED;
   if (offset != 0)
      NOT_IMPLEMENTED;
-  p = clIntelMapBuffer(buffer, errcode_ret);
+  p = clMapBufferIntel(buffer, errcode_ret);
   return p;
 }
 
@@ -1021,7 +1021,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
                         const cl_event *  event_wait_list,
                         cl_event *        event)
 {
-  return clIntelUnmapBuffer(memobj);
+  return clUnmapBufferIntel(memobj);
 }
 
 cl_int
@@ -1170,6 +1170,10 @@ clEnqueueBarrier(cl_command_queue  command_queue)
   return 0;
 }
 
+#define EXTFUNC(x)                      \
+  if (strcmp(#x, func_name) == 0)       \
+    return (void *)x;
+
 void*
 clGetExtensionFunctionAddress(const char *func_name)
 {
@@ -1177,20 +1181,30 @@ clGetExtensionFunctionAddress(const char *func_name)
     return NULL;
 #ifdef HAS_OCLIcd
   /* cl_khr_icd */
-  if (strcmp("clIcdGetPlatformIDsKHR", func_name) == 0)
-    return (void *)clIcdGetPlatformIDsKHR;
+  EXTFUNC(clIcdGetPlatformIDsKHR)
 #endif
+  EXTFUNC(clCreateProgramWithLLVMIntel)
+  EXTFUNC(clGetGenVersionIntel)
+  EXTFUNC(clMapBufferIntel)
+  EXTFUNC(clUnmapBufferIntel)
+  EXTFUNC(clMapBufferGTTIntel)
+  EXTFUNC(clUnmapBufferGTTIntel)
+  EXTFUNC(clPinBufferIntel)
+  EXTFUNC(clUnpinBufferIntel)
+  EXTFUNC(clReportUnfreedIntel)
   return NULL;
 }
 
+#undef EXTFUNC
+
 cl_int
-clIntelReportUnfreed(void)
+clReportUnfreedIntel(void)
 {
   return cl_report_unfreed();
 }
 
 void*
-clIntelMapBuffer(cl_mem mem, cl_int *errcode_ret)
+clMapBufferIntel(cl_mem mem, cl_int *errcode_ret)
 {
   void *ptr = NULL;
   cl_int err = CL_SUCCESS;
@@ -1203,7 +1217,7 @@ error:
 }
 
 cl_int
-clIntelUnmapBuffer(cl_mem mem)
+clUnmapBufferIntel(cl_mem mem)
 {
   cl_int err = CL_SUCCESS;
   CHECK_MEM (mem);
@@ -1212,8 +1226,31 @@ error:
   return err;
 }
 
+void*
+clMapBufferGTTIntel(cl_mem mem, cl_int *errcode_ret)
+{
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  ptr = cl_mem_map_gtt(mem);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ptr;
+}
+
+cl_int
+clUnmapBufferGTTIntel(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  err = cl_mem_unmap_gtt(mem);
+error:
+  return err;
+}
+
 cl_int
-clIntelPinBuffer(cl_mem mem)
+clPinBufferIntel(cl_mem mem)
 {
   cl_int err = CL_SUCCESS;
   CHECK_MEM (mem);
@@ -1223,7 +1260,7 @@ error:
 }
 
 cl_int
-clIntelUnpinBuffer(cl_mem mem)
+clUnpinBufferIntel(cl_mem mem)
 {
   cl_int err = CL_SUCCESS;
   CHECK_MEM (mem);
@@ -1233,17 +1270,17 @@ error:
 }
 
 cl_int
-clIntelGetGenVersion(cl_device_id device, cl_int *ver)
+clGetGenVersionIntel(cl_device_id device, cl_int *ver)
 {
   return cl_device_get_version(device, ver);
 }
 
 cl_program
-clCreateProgramWithLLVM(cl_context              context,
-                        cl_uint                 num_devices,
-                        const cl_device_id *    devices,
-                        const char *            filename,
-                        cl_int *                errcode_ret)
+clCreateProgramWithLLVMIntel(cl_context              context,
+                             cl_uint                 num_devices,
+                             const cl_device_id *    devices,
+                             const char *            filename,
+                             cl_int *                errcode_ret)
 {
   return cl_program_create_from_llvm(context,
                                      num_devices,
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index a22884f..7d604c3 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -108,7 +108,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     uint32_t offset; // location of the address in the curbe
     arg_type = gbe_kernel_get_arg_type(k->opaque, i);
     if (arg_type != GBE_ARG_GLOBAL_PTR &&
-        arg_type != GBE_ARG_CONSTANT_PTR &&
         arg_type != GBE_ARG_IMAGE &&
         arg_type != GBE_ARG_SAMPLER)
       continue;
@@ -129,6 +128,25 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
 
+LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k,
+                                                       char * dst)
+{
+  int i;
+  for(i = 0; i < k->arg_n; i++) {
+    enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i);
+
+    if(arg_type == GBE_ARG_CONSTANT_PTR) {
+      uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER);
+      cl_mem mem = k->args[i].mem;
+      cl_buffer_map(mem->bo, 1);
+      void * addr = cl_buffer_get_virtual(mem->bo);
+      memcpy(dst + offset, addr, mem->size);
+      cl_buffer_unmap(mem->bo);
+    }
+  }
+  return CL_SUCCESS;
+}
+
 #if USE_FULSIM
 extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
 extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 6387ae1..dcfc8c4 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -70,5 +70,7 @@ extern cl_int cl_command_queue_finish(cl_command_queue);
 /* Bind all the surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
 
+/*update constant buffer to final curbe */
+extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst);
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 3a590bc..9402549 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -186,7 +186,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
-  size_t i, batch_sz = 0u, local_sz = 0u, cst_sz = ker->curbe_sz;
+  size_t i, batch_sz = 0u, local_sz = 0u;
+  size_t cst_sz = ker->curbe_sz= gbe_kernel_get_curbe_size(ker->opaque);
   size_t thread_n = 0u;
   cl_int err = CL_SUCCESS;
 
@@ -224,8 +225,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   if (ker->curbe) {
     assert(cst_sz > 0);
     TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
-      for (i = 0; i < thread_n; ++i)
+    for (i = 0; i < thread_n; ++i) {
         memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
+        cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * i);
+    }
     TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
     cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz);
   }
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index bbd4438..ec0e2e8 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -154,6 +154,17 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !mem->is_image)
      || (arg_type != GBE_ARG_IMAGE && mem->is_image)))
       return CL_INVALID_ARG_VALUE;
+
+  if(arg_type == GBE_ARG_CONSTANT_PTR) {
+    int32_t cbOffset;
+    cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size);
+    //constant ptr's curbe offset changed, update it
+    if(cbOffset >= 0) {
+      offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      *((uint32_t *)(k->curbe + offset)) = cbOffset;  //cb offset in curbe
+    }
+  }
+
   cl_mem_add_ref(mem);
   if (k->args[index].mem)
     cl_mem_delete(k->args[index].mem);
@@ -177,6 +188,9 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   cl_context ctx = k->program->ctx;
   cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
 
+  if(k->bo != NULL)
+    cl_buffer_unreference(k->bo);
+
   /* Allocate the gen code here */
   const uint32_t code_sz = gbe_kernel_get_code_size(opaque);
   const char *code = gbe_kernel_get_code(opaque);
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4bbaee4..690e5cf 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -141,6 +141,7 @@ cl_mem_allocate(cl_context ctx,
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
     goto error;
   }
+  mem->size = sz;
 
   /* Append the buffer in the context buffer list */
   pthread_mutex_lock(&ctx->buffer_lock);
@@ -480,6 +481,21 @@ cl_mem_unmap(cl_mem mem)
   return CL_SUCCESS;
 }
 
+LOCAL void*
+cl_mem_map_gtt(cl_mem mem)
+{
+  cl_buffer_map_gtt(mem->bo);
+  assert(cl_buffer_get_virtual(mem->bo));
+  return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL cl_int
+cl_mem_unmap_gtt(cl_mem mem)
+{
+  cl_buffer_unmap_gtt(mem->bo);
+  return CL_SUCCESS;
+}
+
 LOCAL cl_int
 cl_mem_pin(cl_mem mem)
 {
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 836deb4..99f3835 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -36,6 +36,7 @@ struct _cl_mem {
   uint64_t magic;           /* To identify it as a memory object */
   volatile int ref_n;       /* This object is reference counted */
   cl_buffer bo;             /* Data in GPU memory */
+  size_t size;              /* original request size, not alignment size, used in constant buffer */
   cl_mem prev, next;        /* We chain the memory buffers together */
   cl_context ctx;           /* Context it belongs to */
   cl_mem_flags flags;       /* Flags specified at the creation time */
@@ -75,6 +76,12 @@ extern void *cl_mem_map(cl_mem);
 /* Unmap a memory object */
 extern cl_int cl_mem_unmap(cl_mem);
 
+/* Directly map a memory object in GTT mode */
+extern void *cl_mem_map_gtt(cl_mem);
+
+/* Unmap a memory object in GTT mode */
+extern cl_int cl_mem_unmap_gtt(cl_mem);
+
 /* Pin/unpin the buffer in memory (you must be root) */
 extern cl_int cl_mem_pin(cl_mem);
 extern cl_int cl_mem_unpin(cl_mem);
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index bed0159..b2e3c97 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -29,6 +29,9 @@ set (utests_sources
   compiler_function_argument0.cpp
   compiler_function_argument1.cpp
   compiler_function_argument.cpp
+  compiler_function_constant0.cpp
+  compiler_function_constant1.cpp
+  compiler_function_constant.cpp
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
index 685a189..04c9544 100644
--- a/utests/compiler_copy_image.cpp
+++ b/utests/compiler_copy_image.cpp
@@ -21,8 +21,9 @@ static void compiler_copy_image(void)
   desc.image_width = w;
   desc.image_height = h;
   desc.image_row_pitch = w * sizeof(uint32_t);
-
   OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
diff --git a/utests/compiler_fill_image.cpp b/utests/compiler_fill_image.cpp
index c9744cc..c9242b2 100644
--- a/utests/compiler_fill_image.cpp
+++ b/utests/compiler_fill_image.cpp
@@ -13,7 +13,7 @@ static void compiler_fill_image(void)
   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
   desc.image_width = w;
   desc.image_height = h;
-  desc.image_row_pitch = w * sizeof(uint32_t);
+  desc.image_row_pitch = 0;
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_fill_image");
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
index 2fef90c..cf76be3 100644
--- a/utests/compiler_fill_image0.cpp
+++ b/utests/compiler_fill_image0.cpp
@@ -12,7 +12,7 @@ static void compiler_fill_image0(void)
   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
   desc.image_width = w;
   desc.image_height = h;
-  desc.image_row_pitch = w * sizeof(uint32_t);
+  desc.image_row_pitch = 0;
 
 
   // Setup kernel and images
diff --git a/utests/compiler_function_constant.cpp b/utests/compiler_function_constant.cpp
new file mode 100644
index 0000000..20f0ece
--- /dev/null
+++ b/utests/compiler_function_constant.cpp
@@ -0,0 +1,34 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + i%69));
+
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant);
diff --git a/utests/compiler_function_constant0.cpp b/utests/compiler_function_constant0.cpp
new file mode 100644
index 0000000..de564f3
--- /dev/null
+++ b/utests/compiler_function_constant0.cpp
@@ -0,0 +1,42 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant0(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant0");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, 256 * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_MAP_BUFFER(1);
+  for(uint32_t i = 0; i < 256; ++i)
+    ((char *)buf_data[1])[i] = 10;
+  ((char *)buf_data[1])[15] = 15;
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[2])[i] == (value + 15 + i%69));
+
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant0);
diff --git a/utests/compiler_function_constant1.cpp b/utests/compiler_function_constant1.cpp
new file mode 100644
index 0000000..b92e6ca
--- /dev/null
+++ b/utests/compiler_function_constant1.cpp
@@ -0,0 +1,47 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant1(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_CREATE_BUFFER(buf[2], 0, 101 * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[2]);
+  OCL_MAP_BUFFER(2);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[2])[i] = 2*i;
+  OCL_UNMAP_BUFFER(2);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + (i%69)*2));
+
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant1);
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 19e395f..30e53bb 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -8,13 +8,6 @@ static void compiler_movforphi_undef(void)
   cl_image_format format;
   cl_image_desc desc;
 
-  format.image_channel_order = CL_RGBA;
-  format.image_channel_data_type = CL_UNSIGNED_INT8;
-  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
-  desc.image_width = w;
-  desc.image_height = h;
-  desc.image_row_pitch = w * sizeof(uint32_t);
-
   // Setup kernel and images
   OCL_CREATE_KERNEL("test_movforphi_undef");
   buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
@@ -22,7 +15,15 @@ static void compiler_movforphi_undef(void)
     for (uint32_t i = 0; i < w; i++)
       ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
 
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
   OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
   OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
   free(buf_data[0]);
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
index 0357cbd..08167c4 100644
--- a/utests/runtime_flat_address_space.cpp
+++ b/utests/runtime_flat_address_space.cpp
@@ -53,7 +53,7 @@ main(int argc, char *argv[])
                                   NULL);
 
     // Be sure that everything run fine
-    dst_buffer = (int *) clIntelMapBuffer(dst[j], &status);
+    dst_buffer = (int *) clMapBufferIntel(dst[j], &status);
     if (status != CL_SUCCESS)
       goto error;
     for (uint32_t i = 0; i < n; ++i)
@@ -61,13 +61,13 @@ main(int argc, char *argv[])
         fprintf(stderr, "run-time flat address space failed\n");
         exit(-1);
       }
-    OCL_CALL (clIntelUnmapBuffer, dst[j]);
+    OCL_CALL (clUnmapBufferIntel, dst[j]);
   }
 
   for (uint32_t j = 0; j < 24; ++j) OCL_CALL (clReleaseMemObject, dst[j]);
   cl_test_destroy();
-  printf("%i memory leaks\n", clIntelReportUnfreed());
-  assert(clIntelReportUnfreed() == 0);
+  printf("%i memory leaks\n", clReportUnfreedIntel());
+  assert(clReportUnfreedIntel() == 0);
 
 error:
   return status;
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index b536598..3e73db3 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -216,7 +216,7 @@ do_kiss_path(const char *file, cl_device_id device)
   if (device == NULL)
     sub_path = "";
   else {
-    if (clIntelGetGenVersion(device, &ver) != CL_SUCCESS)
+    if (clGetGenVersionIntel(device, &ver) != CL_SUCCESS)
       clpanic("Unable to get Gen version", -1);
     sub_path = "";
   }
@@ -240,7 +240,7 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format)
   /* Load the program and build it */
   ker_path = do_kiss_path(file_name, device);
   if (format == LLVM)
-    program = clCreateProgramWithLLVM(ctx, 1, &device, ker_path, &status);
+    program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
   else if (format == SOURCE) {
     cl_file_map_t *fm = cl_file_map_new();
     FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
@@ -428,8 +428,8 @@ cl_test_destroy(void)
 {
   cl_kernel_destroy();
   cl_ocl_destroy();
-  printf("%i memory leaks\n", clIntelReportUnfreed());
-  assert(clIntelReportUnfreed() == 0);
+  printf("%i memory leaks\n", clReportUnfreedIntel());
+  assert(clReportUnfreedIntel() == 0);
 }
 
 void
@@ -438,7 +438,7 @@ cl_buffer_destroy(void)
   int i;
   for (i = 0; i < MAX_BUFFER_N; ++i) {
     if (buf_data[i] != NULL) {
-      clIntelUnmapBuffer(buf[i]);
+      clUnmapBufferIntel(buf[i]);
       buf_data[i] = NULL;
     }
     if (buf[i] != NULL) {
@@ -456,7 +456,7 @@ cl_report_perf_counters(cl_mem perf)
   uint32_t i;
   if (perf == NULL)
     return;
-  start = (uint32_t*) clIntelMapBuffer(perf, &status);
+  start = (uint32_t*) clMapBufferIntel(perf, &status);
   assert(status == CL_SUCCESS && start != NULL);
   end = start + 128;
 
@@ -481,7 +481,7 @@ cl_report_perf_counters(cl_mem perf)
   }
   printf("\n\n");
 
-  clIntelUnmapBuffer(perf);
+  clUnmapBufferIntel(perf);
 }
 
 struct bmphdr {
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 02249e1..5258416 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -101,12 +101,12 @@ extern EGLSurface  eglSurface;
     OCL_CALL2(clCreateSampler, SAMPLER, ctx, 0, ADDRESS_MODE, FILTER_MODE)
 
 #define OCL_MAP_BUFFER(ID) \
-    OCL_CALL2(clIntelMapBuffer, buf_data[ID], buf[ID])
+    OCL_CALL2(clMapBufferIntel, buf_data[ID], buf[ID])
 
 #define OCL_UNMAP_BUFFER(ID) \
   do { \
     if (buf[ID] != NULL) { \
-      OCL_CALL (clIntelUnmapBuffer, buf[ID]); \
+      OCL_CALL (clUnmapBufferIntel, buf[ID]); \
       buf_data[ID] = NULL; \
     } \
   } while (0)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git