[Pkg-opencl-devel] [beignet] 34/66: Imported Upstream version 0.1+git20130621+30586bf

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:05 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit b49cd1f2a6a9cab7685bc2d270b876478274ddb9
Author: Simon Richter <sjr at debian.org>
Date:   Fri Jun 21 12:08:49 2013 +0200

    Imported Upstream version 0.1+git20130621+30586bf
---
 backend/src/backend/gen_context.cpp                |   17 +
 backend/src/backend/gen_context.hpp                |    2 +
 backend/src/backend/gen_defs.hpp                   |   10 +-
 backend/src/backend/gen_encoder.cpp                |  203 +-
 backend/src/backend/gen_encoder.hpp                |    6 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    2 +
 backend/src/backend/gen_insn_selection.cpp         |  148 +-
 backend/src/backend/gen_insn_selection.hxx         |    4 +
 backend/src/backend/gen_reg_allocation.cpp         |    1 -
 backend/src/backend/gen_register.hpp               |  102 +-
 backend/src/gen_as.sh                              |   83 +
 backend/src/gen_convert.sh                         |   52 +
 backend/src/genconfig.sh                           |   10 +
 backend/src/ir/profile.cpp                         |    1 +
 backend/src/llvm/llvm_gen_backend.cpp              |    3 +
 backend/src/llvm/llvm_scalarize.cpp                |    6 +-
 backend/src/ocl_stdlib.h                           | 4002 +++++++++++++++++++-
 backend/src/update.sh                              |    3 +
 backend/src/update_as.sh                           |   11 +
 backend/src/update_convert.sh                      |   11 +
 kernels/compiler_double.cl                         |    9 +
 kernels/compiler_double_2.cl                       |    9 +
 kernels/compiler_double_3.cl                       |    6 +
 kernels/get_program_info.cl                        |   10 +
 utests/CMakeLists.txt                              |    4 +
 utests/compiler_double.cpp                         |   46 +
 utests/compiler_double_2.cpp                       |   47 +
 utests/compiler_double_3.cpp                       |   46 +
 utests/compiler_sub_bytes.cpp                      |    4 +-
 utests/compiler_sub_shorts.cpp                     |    4 +-
 utests/get_program_info.cpp                        |  247 ++
 31 files changed, 5050 insertions(+), 59 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 70c5bcf..53ba73c 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -153,6 +153,8 @@ namespace gbe
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
+      case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
+      case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_AND:  p->AND(dst, src0, src1); break;
       case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
@@ -269,6 +271,14 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->READ_FLOAT64(dst, src, bti, elemNum);
+  }
+
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
@@ -277,6 +287,13 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
+  void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->WRITE_FLOAT64(src, bti, elemNum);
+  }
+
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 1566cbb..804384d 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -87,6 +87,8 @@ namespace gbe
     void emitBarrierInstruction(const SelectionInstruction &insn);
     void emitFenceInstruction(const SelectionInstruction &insn);
     void emitMathInstruction(const SelectionInstruction &insn);
+    void emitReadFloat64Instruction(const SelectionInstruction &insn);
+    void emitWriteFloat64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f4e4938..9d8db5b 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {
 #define GEN_TYPE_VF  5 /* packed float vector, immediates only? */
 #define GEN_TYPE_HF  6
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
 
 #define GEN_ARF_NULL                  0x00
@@ -303,6 +304,7 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER_BYTE   0
 #define GEN_BYTE_SCATTER_WORD   1
 #define GEN_BYTE_SCATTER_DWORD  2
+#define GEN_BYTE_SCATTER_QWORD  3
 
 #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
@@ -418,7 +420,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;
       uint32_t src1_reg_type:3;
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       uint32_t dest_subreg_nr:5;
       uint32_t dest_reg_nr:8;
       uint32_t dest_horiz_stride:2;
@@ -432,7 +434,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;        /* 0x00000c00 */
       uint32_t src1_reg_type:3;        /* 0x00007000 */
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       int dest_indirect_offset:10;        /* offset against the deref'd address reg */
       uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
       uint32_t dest_horiz_stride:2;
@@ -446,7 +448,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;
       uint32_t src1_reg_type:3;
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       uint32_t dest_writemask:4;
       uint32_t dest_subreg_nr:1;
       uint32_t dest_reg_nr:8;
@@ -459,7 +461,7 @@ struct GenInstruction
       uint32_t dest_reg_type:3;
       uint32_t src0_reg_file:2;
       uint32_t src0_reg_type:3;
-      uint32_t pad0:6;
+      uint32_t nib_ctrl:1;
       uint32_t dest_writemask:4;
       int dest_indirect_offset:6;
       uint32_t dest_subreg_nr:3;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 859a1b9..3d8afe8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -235,6 +235,7 @@ namespace gbe
       NOT_IMPLEMENTED;
     insn->header.acc_wr_control = this->curr.accWrEnable;
     insn->header.quarter_control = this->curr.quarterControl;
+    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
     insn->header.mask_control = this->curr.noMask;
     insn->bits2.ia1.flag_reg_nr = this->curr.flag;
     insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
@@ -355,6 +356,105 @@ namespace gbe
     0
   };
 
+  static int dst_type(int exec_width) {
+    if (exec_width == 8)
+      return GEN_TYPE_UD;
+    if (exec_width == 16)
+      return GEN_TYPE_UW;
+    NOT_IMPLEMENTED;
+    return 0;
+  }
+
+  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    dst = GenRegister::h2(dst);
+    dst.type = GEN_TYPE_UD;
+    src.type = GEN_TYPE_UD;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
+    GenRegister imm4 = GenRegister::immud(4);
+    GenInstruction *insn;
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::uw16grf(r.nr, 0));
+    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dst, r);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
+    pop();
+    ADD(src, src, imm4);
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::uw16grf(r.nr, 0));
+    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(GenRegister::suboffset(dst, 1), r);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+    pop();
+  }
+
+  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
+    r.type = GEN_TYPE_UD;
+    GenRegister hdr = GenRegister::h2(r);
+    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
+    src.hstride = GEN_HORIZONTAL_STRIDE_2;
+    GenRegister data = GenRegister::offset(r, w / 8);
+    GenRegister imm4 = GenRegister::immud(4);
+    MOV(r, GenRegister::ud8grf(msg.nr, 0));
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(data, src);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
+    pop();
+    GenInstruction *insn;
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
+
+    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(data, GenRegister::suboffset(src, 1));
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+    pop();
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
+  }
+
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
@@ -467,7 +567,25 @@ namespace gbe
   }
 
   INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (needToSplitAlu1(p, dst, src) == false) {
+     if (dst.isdf() && src.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       p->curr.quarterControl = 0;
+       p->curr.nibControl = 0;
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src);
+       if (w == 8)
+         p->curr.nibControl = 1; // second 1/8 mask
+       else // w == 16
+         p->curr.quarterControl = 1; // second 1/4 mask
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+       p->pop();
+     } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -499,7 +617,27 @@ namespace gbe
                    GenRegister src0,
                    GenRegister src1)
   {
-    if (needToSplitAlu2(p, dst, src0, src1) == false) {
+    if (dst.isdf() && src0.isdf() && src1.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       p->curr.quarterControl = 0;
+       p->curr.nibControl = 0;
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       p->setSrc1(insn, src1);
+       if (w == 8)
+         p->curr.nibControl = 1; // second 1/8 mask
+       else // w == 16
+         p->curr.quarterControl = 1; // second 1/4 mask
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+       p->pop();
+    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -620,6 +758,67 @@ namespace gbe
     alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
   }
 
+  void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+    union { double d; unsigned u[2]; } u;
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push();
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[1]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    MOV(dest, r);
+    pop();
+  }
+
+  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    int w = curr.execWidth;
+    if (src0.isdf()) {
+      push();
+      curr.execWidth = 16;
+      MOV(dest, src0);
+      if (w == 16) {
+        curr.quarterControl = 1;
+        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
+      }
+      pop();
+    } else {
+      GenRegister r0 = GenRegister::h2(r);
+      push();
+      curr.execWidth = 8;
+      curr.predicate = GEN_PREDICATE_NONE;
+      MOV(r0, src0);
+      MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+      curr.predicate = GEN_PREDICATE_NORMAL;
+      curr.quarterControl = 0;
+      curr.nibControl = 0;
+      MOV(dest, r);
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
+      pop();
+      if (w == 16) {
+        push();
+        curr.execWidth = 8;
+        curr.predicate = GEN_PREDICATE_NONE;
+        MOV(r0, GenRegister::suboffset(src0, 8));
+        MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+        curr.predicate = GEN_PREDICATE_NORMAL;
+        curr.quarterControl = 1;
+        curr.nibControl = 0;
+        MOV(GenRegister::suboffset(dest, 8), r);
+        curr.nibControl = 1;
+        MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8));
+        pop();
+      }
+    }
+  }
+
   ALU1(MOV)
   ALU1(RNDZ)
   ALU1(RNDE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index c98774f..1a5dcf9 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -113,9 +113,11 @@ namespace gbe
     ALU2(LINE)
     ALU2(PLN)
     ALU3(MAD)
+    ALU2(MOV_DF);
 #undef ALU1
 #undef ALU2
 #undef ALU3
+    void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
     /*! Memory fence message (to order loads and stores between threads) */
@@ -132,6 +134,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64-bits float arrays */
+    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Write 64-bits float arrays */
+    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 098d9ec..a3b4621 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
 DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
+DECL_GEN7_SCHEDULE(ReadFloat64,     80,        1,        1)
+DECL_GEN7_SCHEDULE(WriteFloat64,    80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 4e7cebd..5901419 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
       case TYPE_FLOAT: return GEN_TYPE_F;
+      case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
     }
   }
@@ -166,11 +167,13 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
+           this->opcode == SEL_OP_READ_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
+           this->opcode == SEL_OP_WRITE_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_SCATTER;
   }
 
@@ -406,6 +409,8 @@ namespace gbe
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
     ALU1(MOV)
+    ALU2(MOV_DF)
+    ALU2(LOAD_DF_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU2(SEL)
@@ -449,6 +454,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64 bits float array */
+    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    /*! Write 64 bits float array */
+    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -610,20 +619,23 @@ namespace gbe
 
   ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
     SelectionBlock *block = insn->parent;
-    const uint32_t simdWidth = ctx.getSimdWidth();
+    uint32_t simdWidth = ctx.getSimdWidth();
     ir::Register tmp;
+    ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
+    int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
+    GenRegister gr;
 
     // This will append the temporary register in the instruction block
     this->block = block;
-    tmp = this->reg(ir::FAMILY_DWORD);
+    tmp = this->reg(f);
 
     // Generate the MOV instruction and replace the register in the instruction
     SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
-    mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
+    mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
     mov->state = GenInstructionState(simdWidth);
-    insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
+    gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
+    insn->dst(regID) = mov->src(0) = gr;
     insn->append(*mov);
-
     return tmp;
   }
 
@@ -657,6 +669,7 @@ namespace gbe
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
       default: NOT_SUPPORTED;
     }
     GBE_ASSERT(false);
@@ -719,6 +732,33 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
+  void Selection::Opaque::READ_FLOAT64(Reg addr,
+                                       const GenRegister *dst,
+                                       uint32_t elemNum,
+                                       uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    insn->src(0) = addr;
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    // Source cannot be scalar (yet)
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
@@ -746,6 +786,27 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
+  void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+                                        const GenRegister *src,
+                                        uint32_t elemNum,
+                                        uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+    SelectionVector *vector = this->appendVector();
+
+    // Regular instruction to encode
+    insn->src(0) = addr;
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->src(elemID+1) = src[elemID];
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation for the sources
+    vector->regNum = elemNum+1;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
@@ -1092,6 +1153,15 @@ namespace gbe
   // Implementation of all patterns
   ///////////////////////////////////////////////////////////////////////////
 
+  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+    using namespace ir;
+    const auto &childInsn = cast<LoadImmInstruction>(insn);
+    const auto &imm = childInsn.getImmediate();
+    if(imm.type != TYPE_DOUBLE)
+      return true;
+    return false;
+  }
+
   GenRegister getRegisterFromImmediate(ir::Immediate imm)
   {
     using namespace ir;
@@ -1103,6 +1173,7 @@ namespace gbe
       case TYPE_S16: return  GenRegister::immw(imm.data.s16);
       case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
       case TYPE_S8:  return GenRegister::immw(imm.data.s8);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -1146,7 +1217,13 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0));
       switch (opcode) {
         case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
-        case ir::OP_MOV: sel.MOV(dst, src); break;
+        case ir::OP_MOV:
+          if (dst.isdf()) {
+            ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+            sel.MOV_DF(dst, src, sel.selReg(r));
+          } else
+            sel.MOV(dst, src);
+          break;
         case ir::OP_RNDD: sel.RNDD(dst, src); break;
         case ir::OP_RNDE: sel.RNDE(dst, src); break;
         case ir::OP_RNDU: sel.RNDU(dst, src); break;
@@ -1225,14 +1302,14 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
         if (dag0) dag0->isRoot = 1;
       }
       // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
         src0 = sel.selReg(insn.getSrc(1), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1268,7 +1345,7 @@ namespace gbe
         case OP_SHR: sel.SHR(dst, src0, src1); break;
         case OP_ASR: sel.ASR(dst, src0, src1); break;
         case OP_MUL:
-          if (type == TYPE_FLOAT)
+          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
             sel.MUL(dst, src0, src1);
           else if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
@@ -1599,6 +1676,7 @@ namespace gbe
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -1650,6 +1728,8 @@ namespace gbe
   INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
     using namespace ir;
     switch (type) {
+      case TYPE_DOUBLE:
+        return GEN_BYTE_SCATTER_QWORD;
       case TYPE_FLOAT:
       case TYPE_U32:
       case TYPE_S32:
@@ -1681,6 +1761,22 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
+    void emitReadFloat64(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> dst(valueNum);
+      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
+      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+    }
+
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
@@ -1732,6 +1828,8 @@ namespace gbe
       const uint32_t elemSize = getByteScatterGatherSize(type);
       if (insn.getAddressSpace() == MEM_CONSTANT)
         this->emitIndirectMove(sel, insn, address);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
@@ -1762,6 +1860,25 @@ namespace gbe
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
+    void emitWriteFloat64(Selection::Opaque &sel,
+                          const ir::StoreInstruction &insn,
+                          uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t addrID = ir::StoreInstruction::addressIndex;
+      GenRegister addr;
+      vector<GenRegister> value(valueNum);
+
+      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+    }
+
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
@@ -1791,7 +1908,9 @@ namespace gbe
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitWriteFloat64(sel, insn, bti);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, bti);
       else {
         const GenRegister address = sel.selReg(insn.getAddress());
@@ -1839,7 +1958,7 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1873,7 +1992,7 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -1886,6 +2005,9 @@ namespace gbe
         }
         sel.MOV(unpacked, src);
         sel.MOV(dst, unpacked);
+      } else if (dst.isdf()) {
+        ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+        sel.MOV_DF(dst, src, sel.selReg(r));
       } else
         sel.MOV(dst, src);
       return true;
@@ -1919,7 +2041,7 @@ namespace gbe
       SelectionDAG *dag2 = dag.child[2];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
         src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 789c81c..4b5525b 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,7 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
 DECL_SELECTION_IR(NOT, UnaryInstruction)
 DECL_SELECTION_IR(LZD, UnaryInstruction)
 DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
 DECL_SELECTION_IR(FENCE, FenceInstruction)
 DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
 DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
+DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 9765b02..e7c96ac 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -458,7 +458,6 @@ namespace gbe
   }
 
   bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
     // Perform the linear scan allocator
     const uint32_t regNum = ctx.sel->getRegNum();
     for (uint32_t startID = 0; startID < regNum; ++startID) {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d..fedb743 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -69,11 +69,12 @@ namespace gbe
   /*! Type size in bytes for each Gen type */
   INLINE int typeSize(uint32_t type) {
     switch(type) {
+      case GEN_TYPE_DF:
+        return 8;
       case GEN_TYPE_UD:
       case GEN_TYPE_D:
       case GEN_TYPE_F:
         return 4;
-      case GEN_TYPE_HF:
       case GEN_TYPE_UW:
       case GEN_TYPE_W:
         return 2;
@@ -110,6 +111,7 @@ namespace gbe
     INLINE GenInstructionState(uint32_t simdWidth = 8) {
       this->execWidth = simdWidth;
       this->quarterControl = GEN_COMPRESSION_Q1;
+      this->nibControl = 0;
       this->accWrEnable = 0;
       this->noMask = 0;
       this->flag = 0;
@@ -126,6 +128,7 @@ namespace gbe
     uint32_t flagIndex:16;   //!< Only if virtual flag (index of the register)
     uint32_t execWidth:5;
     uint32_t quarterControl:1;
+    uint32_t nibControl:1;
     uint32_t accWrEnable:1;
     uint32_t noMask:1;
     uint32_t predicate:4;
@@ -192,6 +195,7 @@ namespace gbe
 
     /*! For immediates or virtual register */
     union {
+      double df;
       float f;
       int32_t d;
       uint32_t ud;
@@ -211,6 +215,31 @@ namespace gbe
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
 
+    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+      GenRegister r = reg;
+      r.nr += nr;
+      r.subnr += subnr;
+      return r;
+    }
+
+    INLINE bool isimmdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    static INLINE GenRegister h2(GenRegister reg) {
+      GenRegister r = reg;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
       GBE_ASSERT(reg.physical == 0);
       if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
@@ -293,6 +322,18 @@ namespace gbe
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
       return retype(vec16(file, reg), GEN_TYPE_UD);
     }
@@ -371,6 +412,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immdf(double df) {
+      GenRegister immediate = imm(GEN_TYPE_DF);
+      immediate.value.df = df;
+      return immediate;
+    }
+
     static INLINE GenRegister immf(float f) {
       GenRegister immediate = imm(GEN_TYPE_F);
       immediate.value.f = f;
@@ -448,6 +495,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, reg);
     }
 
+    static INLINE GenRegister df1grf(ir::Register reg) {
+      return df1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df8grf(ir::Register reg) {
+      return df8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df16grf(ir::Register reg) {
+      return df16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
     static INLINE GenRegister ud16grf(ir::Register reg) {
       return ud16(GEN_GENERAL_REGISTER_FILE, reg);
     }
@@ -608,11 +667,37 @@ namespace gbe
                     GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE int hstride_size(GenRegister reg) {
+      switch (reg.hstride) {
+        case GEN_HORIZONTAL_STRIDE_0: return 0;
+        case GEN_HORIZONTAL_STRIDE_1: return 1;
+        case GEN_HORIZONTAL_STRIDE_2: return 2;
+        case GEN_HORIZONTAL_STRIDE_4: return 4;
+        default: NOT_IMPLEMENTED; return 0;
+      }
+    }
+
     static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
-      reg.subnr += delta * typeSize(reg.type);
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += delta * typeSize(reg.type);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      }
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
     }
@@ -685,6 +770,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
     static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
       return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
@@ -790,6 +887,7 @@ namespace gbe
         return SIMD1(values...); \
       } \
     }
+    DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
     DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
     DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
     DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
diff --git a/backend/src/gen_as.sh b/backend/src/gen_as.sh
new file mode 100755
index 0000000..626e6ec
--- /dev/null
+++ b/backend/src/gen_as.sh
@@ -0,0 +1,83 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# Generate list of union sizes
+for type in $TYPES; do
+        size=`IFS=:; set -- dummy $type; echo $3`
+        for vector_length in $VECTOR_LENGTHS; do
+                union_sizes="$union_sizes `expr $vector_length \* $size`"
+        done
+done
+union_sizes="`echo $union_sizes | tr ' ' '\n' | sort -n | uniq`"
+
+# For each union size
+for union_size in $union_sizes; do
+
+        # Define an union that contains all vector types that have the same size as the union
+        unionname="union _type_cast_${union_size}_b"
+        echo "$unionname {"
+        for type in $TYPES; do
+                basetype=`IFS=:; set -- dummy $type; echo $2`
+                basesize=`IFS=:; set -- dummy $type; echo $3`
+                for vector_length in $VECTOR_LENGTHS; do
+                        vector_size_in_union="`expr $vector_length \* $basesize`"
+                        if test $union_size -ne $vector_size_in_union; then
+                                continue
+                        fi
+                        if test $vector_length -eq 1; then
+                                vectortype=$basetype
+                        else
+                                vectortype=$basetype$vector_length
+                        fi
+                        echo "  $vectortype _$vectortype;"
+                done
+
+        done
+        echo "};"
+        echo
+
+        # For each tuple of vector types that has the same size as the current union size,
+        # define an as_* function that converts types without changing binary representation.
+        for ftype in $TYPES; do
+                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+                fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
+                for fvector_length in $VECTOR_LENGTHS; do
+                        fvector_size_in_union="`expr $fvector_length \* $fbasesize`"
+                        if test $union_size -ne $fvector_size_in_union; then
+                                continue
+                        fi
+                        if test $fvector_length -eq 1; then
+                                fvectortype=$fbasetype
+                        else
+                                fvectortype=$fbasetype$fvector_length
+                        fi
+                        for ttype in $TYPES; do
+                                tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+                                tbasesize=`IFS=:; set -- dummy $ttype; echo $3`
+                                if test $fbasetype = $tbasetype; then
+                                        continue
+                                fi
+                                for tvector_length in $VECTOR_LENGTHS; do
+                                        tvector_size_in_union="`expr $tvector_length \* $tbasesize`"
+                                        if test $union_size -ne $tvector_size_in_union; then
+                                                continue
+                                        fi
+                                        if test $tvector_length -eq 1; then
+                                                tvectortype=$tbasetype
+                                        else
+                                                tvectortype=$tbasetype$tvector_length
+                                        fi
+                                        echo "INLINE OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {"
+                                        echo "  $unionname u;"
+                                        echo "  u._$fvectortype = v;"
+                                        echo "  return u._$tvectortype;"
+                                        echo "}"
+                                        echo
+                                done
+                        done
+                done
+
+        done
+
+done
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
new file mode 100755
index 0000000..056b529
--- /dev/null
+++ b/backend/src/gen_convert.sh
@@ -0,0 +1,52 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# For all vector lengths and types, generate conversion functions
+for vector_length in $VECTOR_LENGTHS; do
+        if test $vector_length -eq 1; then
+                continue;
+        fi
+        for ftype in $TYPES; do
+                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+                for ttype in $TYPES; do
+                        tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+                        if test $fbasetype = $tbasetype; then
+                                continue
+                        fi
+                        fvectortype=$fbasetype$vector_length
+                        tvectortype=$tbasetype$vector_length
+                        construct="($tbasetype)(v.s0)"
+                        if test $vector_length -gt 1; then
+                                construct="$construct, ($tbasetype)(v.s1)"
+                        fi
+                        if test $vector_length -gt 2; then
+                                construct="$construct, ($tbasetype)(v.s2)"
+                        fi
+                        if test $vector_length -gt 3; then
+                                construct="$construct, ($tbasetype)(v.s3)"
+                        fi
+                        if test $vector_length -gt 4; then
+                                construct="$construct, ($tbasetype)(v.s4)"
+                                construct="$construct, ($tbasetype)(v.s5)"
+                                construct="$construct, ($tbasetype)(v.s6)"
+                                construct="$construct, ($tbasetype)(v.s7)"
+                        fi
+                        if test $vector_length -gt 8; then
+                                construct="$construct, ($tbasetype)(v.s8)"
+                                construct="$construct, ($tbasetype)(v.s9)"
+                                construct="$construct, ($tbasetype)(v.sA)"
+                                construct="$construct, ($tbasetype)(v.sB)"
+                                construct="$construct, ($tbasetype)(v.sC)"
+                                construct="$construct, ($tbasetype)(v.sD)"
+                                construct="$construct, ($tbasetype)(v.sE)"
+                                construct="$construct, ($tbasetype)(v.sF)"
+                        fi
+
+                        echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
+                        echo "  return ($tvectortype)($construct);"
+                        echo "}"
+                        echo
+                done
+        done
+done
diff --git a/backend/src/genconfig.sh b/backend/src/genconfig.sh
new file mode 100644
index 0000000..f55b670
--- /dev/null
+++ b/backend/src/genconfig.sh
@@ -0,0 +1,10 @@
+#! /bin/false
+# This is to be sourced by the generation scripts
+
+# Supported base types and their lengths
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+
+# Supported vector lengths
+VECTOR_LENGTHS="1 2 3 4 8 16"
+
+## No user serviceable parts below here
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 99cd06c..675018a 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -41,6 +41,7 @@ namespace ir {
         "block_ip",
         "barrier_id", "thread_number",
         "const_curbe_offset",
+        "work_dimension",
     };
 
 #if GBE_DEBUG
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3a59da3..5b7754c 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2164,6 +2164,7 @@ namespace gbe
   }
   void GenWriter::regAllocateStoreInst(StoreInst &I) {}
 
+  extern int OCL_SIMD_WIDTH;
   template <bool isLoad, typename T>
   INLINE void GenWriter::emitLoadOrStore(T &I)
   {
@@ -2178,6 +2179,8 @@ namespace gbe
     // Scalar is easy. We neednot build register tuples
     if (isScalarType(llvmType) == true) {
       const ir::Type type = getType(ctx, llvmType);
+      if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
+        OCL_SIMD_WIDTH = 8;
       const ir::Register values = this->getRegister(llvmValues);
       if (isLoad)
         ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 3c0d6a4..bab2236 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -825,11 +825,13 @@ namespace gbe {
     //two passes delete for some phinode
     for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
       (*i)->dropAllReferences();
-      if((*i)->use_empty())
+      if((*i)->use_empty()) {
         (*i)->eraseFromParent();
+        (*i) = NULL;
+      }
     }
     for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
-      if((*i)->getParent())
+      if((*i) && (*i)->getParent())
         (*i)->eraseFromParent();
     }
     deadList.clear();
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index b95b7ed..81a0193 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -53,6 +53,8 @@ typedef unsigned int uintptr_t;
 #define private __private
 #endif
 
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in vector data types
 /////////////////////////////////////////////////////////////////////////////
@@ -70,6 +72,7 @@ DEF(uint);
 DEF(long);
 DEF(ulong);
 DEF(float);
+DEF(double);
 #undef DEF
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL other built-in data types
@@ -91,38 +94,3973 @@ typedef size_t __event_t;
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL conversions & type casting
 /////////////////////////////////////////////////////////////////////////////
-union type_cast_4_b {
-  float f;
-  uchar4 u4;
+
+// ##BEGIN_AS##
+union _type_cast_1_b {
+  char _char;
+  uchar _uchar;
 };
-uchar4 INLINE_OVERLOADABLE as_uchar4(float f) {
-    union type_cast_4_b u;
-    u.f = f;
-    return u.u4;
-}
-#define DEF(type, n, type2) type##n INLINE_OVERLOADABLE convert_##type##n(type2##n d) { \
-    return (type##n)((type)(d.s0), (type)(d.s1), (type)(d.s2), (type)(d.s3)); \
- }
-#define DEF2(type) DEF(type, 4, char); \
-                   DEF(type, 4, uchar); \
-                   DEF(type, 4, short); \
-                   DEF(type, 4, ushort); \
-                   DEF(type, 4, int); \
-                   DEF(type, 4, uint); \
-                   DEF(type, 4, long); \
-                   DEF(type, 4, ulong); \
-                   DEF(type, 4, float);
-DEF2(char);
-DEF2(uchar);
-DEF2(short);
-DEF2(ushort);
-DEF2(int);
-DEF2(uint);
-DEF2(long);
-DEF2(ulong);
-DEF2(float);
-#undef DEF2
-#undef DEF
+
+INLINE OVERLOADABLE uchar as_uchar(char v) {
+  union _type_cast_1_b u;
+  u._char = v;
+  return u._uchar;
+}
+
+INLINE OVERLOADABLE char as_char(uchar v) {
+  union _type_cast_1_b u;
+  u._uchar = v;
+  return u._char;
+}
+
+union _type_cast_2_b {
+  short _short;
+  ushort _ushort;
+  char2 _char2;
+  uchar2 _uchar2;
+};
+
+INLINE OVERLOADABLE ushort as_ushort(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE char2 as_char2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._char2;
+}
+
+union _type_cast_3_b {
+  char3 _char3;
+  uchar3 _uchar3;
+};
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
+  union _type_cast_3_b u;
+  u._char3 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
+  union _type_cast_3_b u;
+  u._uchar3 = v;
+  return u._char3;
+}
+
+union _type_cast_4_b {
+  int _int;
+  uint _uint;
+  short2 _short2;
+  ushort2 _ushort2;
+  char4 _char4;
+  uchar4 _uchar4;
+  float _float;
+};
+
+INLINE OVERLOADABLE uint as_uint(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar4;
+}
+
+union _type_cast_6_b {
+  short3 _short3;
+  ushort3 _ushort3;
+};
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
+  union _type_cast_6_b u;
+  u._short3 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
+  union _type_cast_6_b u;
+  u._ushort3 = v;
+  return u._short3;
+}
+
+union _type_cast_8_b {
+  long _long;
+  ulong _ulong;
+  int2 _int2;
+  uint2 _uint2;
+  short4 _short4;
+  ushort4 _ushort4;
+  char8 _char8;
+  uchar8 _uchar8;
+  double _double;
+  float2 _float2;
+};
+
+INLINE OVERLOADABLE ulong as_ulong(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE double as_double(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE float2 as_float2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._double;
+}
+
+union _type_cast_12_b {
+  int3 _int3;
+  uint3 _uint3;
+  float3 _float3;
+};
+
+INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
+  union _type_cast_12_b u;
+  u._int3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int3 v) {
+  union _type_cast_12_b u;
+  u._int3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint3 v) {
+  union _type_cast_12_b u;
+  u._uint3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint3 v) {
+  union _type_cast_12_b u;
+  u._uint3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float3 v) {
+  union _type_cast_12_b u;
+  u._float3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
+  union _type_cast_12_b u;
+  u._float3 = v;
+  return u._uint3;
+}
+
+union _type_cast_16_b {
+  long2 _long2;
+  ulong2 _ulong2;
+  int4 _int4;
+  uint4 _uint4;
+  short8 _short8;
+  ushort8 _ushort8;
+  char16 _char16;
+  uchar16 _uchar16;
+  double2 _double2;
+  float4 _float4;
+};
+
+INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE float4 as_float4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._double2;
+}
+
+union _type_cast_24_b {
+  long3 _long3;
+  ulong3 _ulong3;
+  double3 _double3;
+};
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
+  union _type_cast_24_b u;
+  u._long3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long3 v) {
+  union _type_cast_24_b u;
+  u._long3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
+  union _type_cast_24_b u;
+  u._ulong3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
+  union _type_cast_24_b u;
+  u._ulong3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double3 v) {
+  union _type_cast_24_b u;
+  u._double3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
+  union _type_cast_24_b u;
+  u._double3 = v;
+  return u._ulong3;
+}
+
+union _type_cast_32_b {
+  long4 _long4;
+  ulong4 _ulong4;
+  int8 _int8;
+  uint8 _uint8;
+  short16 _short16;
+  ushort16 _ushort16;
+  double4 _double4;
+  float8 _float8;
+};
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._double4;
+}
+
+union _type_cast_64_b {
+  long8 _long8;
+  ulong8 _ulong8;
+  int16 _int16;
+  uint16 _uint16;
+  double8 _double8;
+  float16 _float16;
+};
+
+INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE float16 as_float16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._double8;
+}
+
+union _type_cast_128_b {
+  long16 _long16;
+  ulong16 _ulong16;
+  double16 _double16;
+};
+
+INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._ulong16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._ulong16;
+}
+
+// ##END_AS##
+
+// ##BEGIN_CONVERT##
+INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(long2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(long2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(long2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(long2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(long2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(int2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(int2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(int2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(int2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(int2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(short2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(short2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(short2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(short2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(short2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(char2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(char2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(char2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(char2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(char2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(double2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(double2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(double2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(double2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(double2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(float2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(float2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(float2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(float2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(float2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(long3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(long3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(long3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(long3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(long3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(int3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(int3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(int3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(int3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(int3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(short3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(short3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(short3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(short3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(short3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(char3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(char3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(char3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(char3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(char3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(char3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uchar3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uchar3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uchar3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uchar3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(double3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(double3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(double3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(double3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(double3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(double3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(float3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(float3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(float3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(float3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(float3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(float3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(long4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(long4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(long4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(long4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(long4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(long4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ulong4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ulong4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ulong4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ulong4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(int4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(int4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(int4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(int4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(int4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uint4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uint4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uint4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uint4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(short4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(short4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(short4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(short4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(short4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ushort4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ushort4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ushort4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ushort4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(char4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(char4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(char4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(char4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(char4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(char4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uchar4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uchar4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uchar4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uchar4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(double4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(double4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(double4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(double4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(double4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(double4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(float4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(float4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(float4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(float4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(float4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(float4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(long8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(long8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(long8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(long8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(long8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(long8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ulong8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ulong8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ulong8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ulong8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(int8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(int8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(int8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(int8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(int8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uint8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uint8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uint8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uint8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(short8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(short8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(short8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(short8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(short8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ushort8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ushort8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ushort8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ushort8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(char8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(char8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(char8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(char8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(char8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(char8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uchar8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uchar8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uchar8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uchar8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(double8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(double8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(double8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(double8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(double8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(double8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(float8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(float8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(float8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(float8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(float8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(long16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(long16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(long16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(long16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(long16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(int16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(int16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(int16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(int16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(int16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(short16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(short16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(short16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(short16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(short16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(char16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(char16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(char16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(char16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(char16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(double16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(double16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(double16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(double16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(double16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(float16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(float16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(float16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(float16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(float16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+// ##END_CONVERT##
+
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL preprocessor directives & macros
 /////////////////////////////////////////////////////////////////////////////
@@ -1269,6 +5207,8 @@ DECL_IMAGE(int4, i)
 DECL_IMAGE(uint4, ui)
 DECL_IMAGE(float4, f)
 
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+
 #undef DECL_IMAGE
 #undef DECL_READ_IMAGE
 #undef DECL_READ_IMAGE_NOSAMPLER
diff --git a/backend/src/update.sh b/backend/src/update.sh
new file mode 100755
index 0000000..0e5f8c0
--- /dev/null
+++ b/backend/src/update.sh
@@ -0,0 +1,3 @@
+#! /bin/sh -e
+./update_as.sh
+./update_convert.sh
diff --git a/backend/src/update_as.sh b/backend/src/update_as.sh
new file mode 100755
index 0000000..54b4191
--- /dev/null
+++ b/backend/src/update_as.sh
@@ -0,0 +1,11 @@
+#! /bin/sh -e
+
+STDLIB_HEADER=ocl_stdlib.h
+
+exec >$STDLIB_HEADER.tmp
+sed -n -e '1,/##BEGIN_AS##/p' $STDLIB_HEADER
+./gen_as.sh
+sed -n -e '/##END_AS##/,$p' $STDLIB_HEADER
+exec >&2
+
+mv $STDLIB_HEADER.tmp $STDLIB_HEADER
diff --git a/backend/src/update_convert.sh b/backend/src/update_convert.sh
new file mode 100755
index 0000000..f1fcd36
--- /dev/null
+++ b/backend/src/update_convert.sh
@@ -0,0 +1,11 @@
+#! /bin/sh -e
+
+STDLIB_HEADER=ocl_stdlib.h
+
+exec >$STDLIB_HEADER.tmp
+sed -n -e '1,/##BEGIN_CONVERT##/p' $STDLIB_HEADER
+./gen_convert.sh
+sed -n -e '/##END_CONVERT##/,$p' $STDLIB_HEADER
+exec >&2
+
+mv $STDLIB_HEADER.tmp $STDLIB_HEADER
diff --git a/kernels/compiler_double.cl b/kernels/compiler_double.cl
new file mode 100644
index 0000000..a84f142
--- /dev/null
+++ b/kernels/compiler_double.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double(global double *src, global double *dst) {
+  int i = get_global_id(0);
+  double d = 1.234567890123456789;
+  if (i < 14)
+    dst[i] = d * (src[i] + d);
+  else
+    dst[i] = 14;
+}
diff --git a/kernels/compiler_double_2.cl b/kernels/compiler_double_2.cl
new file mode 100644
index 0000000..20ee614
--- /dev/null
+++ b/kernels/compiler_double_2.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_2(global float *src, global double *dst) {
+  int i = get_global_id(0);
+  float d = 1.234567890123456789f;
+  if (i < 14)
+    dst[i] = d * (d + src[i]);
+  else
+    dst[i] = 14;
+}
diff --git a/kernels/compiler_double_3.cl b/kernels/compiler_double_3.cl
new file mode 100644
index 0000000..8b32404
--- /dev/null
+++ b/kernels/compiler_double_3.cl
@@ -0,0 +1,6 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_3(global float *src, global double *dst) {
+  int i = get_global_id(0);
+  float d = 1.234567890123456789f;
+  dst[i] = i < 14 ? d : 14;
+}
diff --git a/kernels/get_program_info.cl b/kernels/get_program_info.cl
new file mode 100644
index 0000000..8e0dd94
--- /dev/null
+++ b/kernels/get_program_info.cl
@@ -0,0 +1,10 @@
+kernel void get_program_info( __global int *ret ) {
+    uint x = get_work_dim();
+    size_t y = get_global_size(0);
+    y = get_global_id(0);
+    y = get_local_size(0);
+    y = get_local_id(0);
+    y = get_num_groups(0);
+    y = get_group_id(0);
+    y = get_global_offset(0);
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 108fa06..31152b0 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -27,6 +27,9 @@ set (utests_sources
   compiler_copy_image.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
+  compiler_double.cpp
+  compiler_double_2.cpp
+  compiler_double_3.cpp
   compiler_fabs.cpp
   compiler_fill_image.cpp
   compiler_fill_image0.cpp
@@ -82,6 +85,7 @@ set (utests_sources
   compiler_get_image_info.cpp
   compiler_vector_load_store.cpp
   compiler_cl_finish.cpp
+  get_program_info.cpp
   buildin_work_dim.cpp
   builtin_global_size.cpp
   runtime_createcontext.cpp
diff --git a/utests/compiler_double.cpp b/utests/compiler_double.cpp
new file mode 100644
index 0000000..7c54ddf
--- /dev/null
+++ b/utests/compiler_double.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, double *src, double *dst) {
+  double f = src[global_id];
+  double d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? (d * (f + d)) : 14;
+}
+
+void compiler_double(void)
+{
+  const size_t n = 16;
+  double cpu_dst[n], cpu_src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((double*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double);
diff --git a/utests/compiler_double_2.cpp b/utests/compiler_double_2.cpp
new file mode 100644
index 0000000..7e3ae4b
--- /dev/null
+++ b/utests/compiler_double_2.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+  float f = src[global_id];
+  float d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? d * (d + f) : 14;
+}
+
+void compiler_double_2(void)
+{
+  const size_t n = 16;
+  float cpu_src[n];
+  double cpu_dst[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_2);
diff --git a/utests/compiler_double_3.cpp b/utests/compiler_double_3.cpp
new file mode 100644
index 0000000..294950d
--- /dev/null
+++ b/utests/compiler_double_3.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+  float d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? d : 14;
+}
+
+void compiler_double_3(void)
+{
+  const size_t n = 16;
+  float cpu_src[n];
+  double cpu_dst[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_3");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_3);
diff --git a/utests/compiler_sub_bytes.cpp b/utests/compiler_sub_bytes.cpp
index 49a5261..740a8fd 100644
--- a/utests/compiler_sub_bytes.cpp
+++ b/utests/compiler_sub_bytes.cpp
@@ -11,7 +11,7 @@ static void compiler_sub_bytes(void)
   for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[0])[i] = (int8_t) rand();
   for (uint32_t i = 0; i < n; ++i) ((int8_t*)buf_data[1])[i] = (int8_t) rand();
   OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[0]);
-  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int8_t), buf_data[1]);
   OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int8_t), NULL);
 
   // Run the kernel
@@ -25,7 +25,7 @@ static void compiler_sub_bytes(void)
   // Check result
   OCL_MAP_BUFFER(2);
   for (uint32_t i = 0; i < n; ++i)
-    OCL_ASSERT(((int8_t*)buf_data[2])[i] = ((int8_t*)buf_data[0])[i] - ((int8_t*)buf_data[1])[i]);
+    OCL_ASSERT(((int8_t*)buf_data[2])[i] == (int8_t)(((int8_t*)buf_data[0])[i] - ((int8_t*)buf_data[1])[i]));
   free(buf_data[0]);
   free(buf_data[1]);
   buf_data[0] = buf_data[1] = NULL;
diff --git a/utests/compiler_sub_shorts.cpp b/utests/compiler_sub_shorts.cpp
index 4aeeca3..7c24a56 100644
--- a/utests/compiler_sub_shorts.cpp
+++ b/utests/compiler_sub_shorts.cpp
@@ -11,7 +11,7 @@ static void compiler_sub_shorts(void)
   for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[0])[i] = (int16_t) rand();
   for (uint32_t i = 0; i < n; ++i) ((int16_t*)buf_data[1])[i] = (int16_t) rand();
   OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[0]);
-  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(int16_t), buf_data[1]);
   OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int16_t), NULL);
 
   // Run the kernel
@@ -25,7 +25,7 @@ static void compiler_sub_shorts(void)
   // Check result
   OCL_MAP_BUFFER(2);
   for (uint32_t i = 0; i < n; ++i)
-    OCL_ASSERT(((int16_t*)buf_data[2])[i] = ((int16_t*)buf_data[0])[i] - ((int16_t*)buf_data[1])[i]);
+    OCL_ASSERT(((int16_t*)buf_data[2])[i] == (int16_t)(((int16_t*)buf_data[0])[i] - ((int16_t*)buf_data[1])[i]));
   free(buf_data[0]);
   free(buf_data[1]);
   buf_data[0] = buf_data[1] = NULL;
diff --git a/utests/get_program_info.cpp b/utests/get_program_info.cpp
new file mode 100644
index 0000000..20248e8
--- /dev/null
+++ b/utests/get_program_info.cpp
@@ -0,0 +1,247 @@
+#include <string.h>
+#include <string>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* ********************************************** *
+ * This file to test the API of:                  *
+ * clGetProgramInfo                               *
+ * ********************************************** */
+#define NO_STANDARD_REF 0xFFFFF
+
+template <typename T = cl_uint>
+struct Info_Result {
+    T ret;
+    T refer;
+    int size;
+    typedef T type_value;
+
+    void * get_ret(void) {
+        return (void *)&ret;
+    }
+
+    Info_Result(T other) {
+        refer = other;
+        size = sizeof(T);
+    }
+
+    bool check_result (void) {
+        if (ret != refer && refer != (T)NO_STANDARD_REF)
+            return false;
+
+        return true;
+    }
+};
+
+template <>
+struct Info_Result<char *> {
+    char * ret;
+    char * refer;
+    int size;
+    typedef char* type_value;
+
+    Info_Result(char *other, int sz) {
+        size = sz;
+        ret = (char *)malloc(sizeof(char) * sz);
+        if (other) {
+            refer = (char *)malloc(sizeof(char) * sz);
+            memcpy(refer, other, sz);
+        }
+    }
+
+    ~Info_Result(void) {
+        free(refer);
+        free(ret);
+    }
+
+    void * get_ret(void) {
+        return (void *)ret;
+    }
+
+    bool check_result (void) {
+        if (refer && ::memcmp(ret, refer, size))
+            return false;
+
+        return true;
+    }
+};
+
+template <> //Used for such as CL_PROGRAM_BINARIES
+struct Info_Result<char **> {
+    char ** ret;
+    char ** refer;
+    int *elt_size;
+    int size;
+    typedef char** type_value;
+
+    Info_Result(char **other, int *sz, int elt_num) {
+        size = elt_num;
+
+        ret = (char **)malloc(elt_num * sizeof(char *));
+        memset(ret, 0, (elt_num * sizeof(char *)));
+        refer = (char **)malloc(elt_num * sizeof(char *));
+        memset(refer, 0, (elt_num * sizeof(char *)));
+        elt_size = (int *)malloc(elt_num * sizeof(int));
+        memset(elt_size, 0, (elt_num * sizeof(int)));
+        if (sz) {
+            int i = 0;
+            for (; i < elt_num; i++) {
+                elt_size[i] = sz[i];
+                ret[i] = (char *)malloc(sz[i] * sizeof(char));
+
+                if (other[i] && elt_size[i] > 0) {
+                    refer[i] = (char *)malloc(sz[i] * sizeof(char));
+                    memcpy(&refer[i], &other[i], sz[i]);
+                }
+                else
+                    refer[i] = NULL;
+            }
+        }
+    }
+
+    ~Info_Result(void) {
+        int i = 0;
+        for (; i < size; i++) {
+            if (refer[i])
+                free(refer[i]);
+            free(ret[i]);
+        }
+        free(ret);
+        free(refer);
+        free(elt_size);
+    }
+
+    void * get_ret(void) {
+        return (void *)ret;
+    }
+
+    bool check_result (void) {
+        int i = 0;
+        for (; i < size; i++) {
+            if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
+                return false;
+        }
+
+        return true;
+    }
+};
+
+template <typename T1, typename T2>
+struct Traits {
+    static bool Is_Same(void) {
+        return false;
+    };
+};
+
+template <typename T1>
+struct Traits<T1, T1> {
+    static bool Is_Same(void) {
+        return true;
+    };
+};
+
+template <typename T>
+Info_Result<T>* cast_as(void *info)
+{
+    Info_Result<T>* ret;
+    ret = reinterpret_cast<Info_Result<T>*>(info);
+    OCL_ASSERT((Traits<T, typename Info_Result<T>::type_value>::Is_Same()));
+    return ret;
+}
+
+
+#define CALL_PROGINFO_AND_RET(TYPE) \
+    do { \
+	cl_int ret; \
+	size_t ret_size; \
+	\
+	Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
+	ret = clGetProgramInfo(program, x.first, \
+		info->size, info->get_ret(), &ret_size); \
+	OCL_ASSERT((!ret)); \
+	OCL_ASSERT((info->check_result())); \
+	delete info; \
+    } while(0)
+
+void get_program_info(void)
+{
+    map<cl_program_info, void *> maps;
+    int expect_value;
+    char * expect_source;
+    int sz;
+    char *ker_path = (char *)malloc(4096 * sizeof(char));
+    const char *kiss_path = getenv("OCL_KERNEL_PATH");
+    string line;
+    string source_code;
+
+    sprintf(ker_path, "%s/%s", kiss_path, "get_program_info.cl");
+
+    ifstream in(ker_path);
+    while (getline(in,line)) {
+        source_code = (source_code == "") ?
+                      source_code + line : source_code + "\n" + line;
+    }
+    free(ker_path);
+    //cout<< source_code;
+    source_code = source_code + "\n";
+
+    expect_source = (char *)source_code.c_str();
+
+    OCL_CREATE_KERNEL("get_program_info");
+
+    /* First test for clGetProgramInfo. We just have 1 devices now */
+    expect_value = 2;//One program, one kernel.
+    maps.insert(make_pair(CL_PROGRAM_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+    maps.insert(make_pair(CL_PROGRAM_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(ctx))));
+    expect_value = 1;
+    maps.insert(make_pair(CL_PROGRAM_NUM_DEVICES,
+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+    maps.insert(make_pair(CL_PROGRAM_DEVICES,
+                          (void *)(new Info_Result<cl_device_id>(device))));
+    sz = (strlen(expect_source) + 1);
+    maps.insert(make_pair(CL_PROGRAM_SOURCE,
+                          (void *)(new Info_Result<char *>(expect_source, sz))));
+    expect_value = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
+                          (void *)(new Info_Result<size_t>((size_t)expect_value))));
+    sz = 4096; //big enough?
+    expect_source = NULL;
+    maps.insert(make_pair(CL_PROGRAM_BINARIES,
+                          (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+        switch (x.first) {
+        case CL_PROGRAM_REFERENCE_COUNT:
+        case CL_PROGRAM_NUM_DEVICES:
+            CALL_PROGINFO_AND_RET(cl_uint);
+            break;
+        case CL_PROGRAM_CONTEXT:
+            CALL_PROGINFO_AND_RET(cl_context);
+            break;
+        case CL_PROGRAM_DEVICES:
+            CALL_PROGINFO_AND_RET(cl_device_id);
+            break;
+        case CL_PROGRAM_SOURCE:
+            CALL_PROGINFO_AND_RET(char *);
+            break;
+        case CL_PROGRAM_BINARY_SIZES:
+            CALL_PROGINFO_AND_RET(size_t);
+            break;
+        case CL_PROGRAM_BINARIES:
+            CALL_PROGINFO_AND_RET(char **);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_program_info);
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list