[Pkg-opencl-devel] [beignet] 47/66: Imported Upstream version 0.2+git20130730+da26376

Fri Oct 31 07:27:07 UTC 2014

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 9905eb364d7fdbbd632215384b6380dadd0d103f
Author: Simon Richter <sjr at debian.org>
Date:   Tue Jul 30 13:43:36 2013 +0200

    Imported Upstream version 0.2+git20130730+da26376
---
 CMakeLists.txt                        |    8 +-
 backend/CMakeLists.txt                |   10 +-
 backend/src/.gitignore                |    3 +
 backend/src/CMakeLists.txt            |   87 +-
 backend/src/GBEConfig.h.in            |    1 +
 backend/src/backend/program.cpp       |   45 +-
 backend/src/builtin_vector_proto.def  |  261 ++
 backend/src/gen_builtin_vector.py     |  381 +++
 backend/src/ocl_as.h                  | 2161 ++++++++++++
 backend/src/ocl_convert.h             | 1801 ++++++++++
 backend/src/ocl_stdlib.h              | 5769 ---------------------------------
 backend/src/ocl_stdlib.tmpl.h         | 1378 ++++++++
 backend/src/update_as.sh              |   10 +-
 backend/src/update_blob_ocl_header.py |   65 +
 backend/src/update_convert.sh         |   11 +-
 kernels/builtin_bitselect.cl          |    4 +
 kernels/builtin_frexp.cl              |    4 +
 kernels/builtin_global_id.cl          |    4 +
 kernels/builtin_local_id.cl           |    6 +
 kernels/builtin_local_size.cl         |    3 +
 kernels/builtin_mad_sat.cl            |    4 +
 kernels/builtin_modf.cl               |    6 +
 kernels/builtin_nextafter.cl          |    4 +
 kernels/builtin_num_groups.cl         |    3 +
 kernels/builtin_remquo.cl             |    6 +
 kernels/builtin_shuffle.cl            |    8 +
 kernels/builtin_sign.cl               |    4 +
 kernels/compiler_smoothstep.cl        |    4 +
 src/OCLConfig.h.in                    |    6 +-
 src/cl_api.c                          |  167 +-
 src/cl_context.c                      |   18 +
 src/cl_gt_device.h                    |    6 +-
 src/cl_mem.c                          |  123 +-
 src/cl_mem.h                          |   20 +
 src/cl_mem_gl.c                       |    2 +-
 src/cl_platform_id.c                  |    2 +-
 src/cl_platform_id.h                  |    8 +-
 utests/CMakeLists.txt                 |   13 +
 utests/builtin_bitselect.cpp          |   50 +
 utests/builtin_frexp.cpp              |   50 +
 utests/builtin_global_id.cpp          |   77 +
 utests/builtin_local_id.cpp           |   81 +
 utests/builtin_local_size.cpp         |   88 +
 utests/builtin_mad_sat.cpp            |   44 +
 utests/builtin_modf.cpp               |   56 +
 utests/builtin_nextafter.cpp          |   60 +
 utests/builtin_num_groups.cpp         |   85 +
 utests/builtin_remquo.cpp             |   65 +
 utests/builtin_shuffle.cpp            |   45 +
 utests/builtin_sign.cpp               |   47 +
 utests/compiler_smoothstep.cpp        |   58 +
 utests/get_cl_info.cpp                |  114 +
 utests/utest_helper.cpp               |   17 +-
 53 files changed, 7493 insertions(+), 5860 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41ac43b..eb56567 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,8 +8,10 @@
 
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
-set (LIBCL_VERSION_MAJOR 0)
-set (LIBCL_VERSION_MINOR 2)
+set (LIBCL_DRIVER_VERSION_MAJOR 0)
+set (LIBCL_DRIVER_VERSION_MINOR 2)
+set (LIBCL_C_VERSION_MAJOR 1)
+set (LIBCL_C_VERSION_MINOR 1)
 
 configure_file (
   "src/OCLConfig.h.in"
@@ -137,6 +139,8 @@ ELSE(OCLIcd_FOUND)
   MESSAGE(STATUS "Looking for OCL ICD header file - not found")
 ENDIF(OCLIcd_FOUND)
 
+Find_Package(PythonInterp)
+
 ADD_SUBDIRECTORY(include)
 ADD_SUBDIRECTORY(backend)
 ADD_SUBDIRECTORY(src)
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 5498ff0..8622f3e 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -1,12 +1,6 @@
 project (GBE)
 set (LIBGBE_VERSION_MAJOR 0)
-set (LIBGBE_VERSION_MINOR 1)
-
-configure_file (
-  "src/GBEConfig.h.in"
-  "src/GBEConfig.h"
-)
-
+set (LIBGBE_VERSION_MINOR 2)
 cmake_minimum_required (VERSION 2.6.0)
 
 set (GBE_CMAKE_DIR "${GBE_SOURCE_DIR}/cmake")
@@ -97,7 +91,7 @@ elseif (COMPILER STREQUAL "ICC")
   set (CCMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
   set (CMAKE_EXE_LINKER_FLAGS "")
 endif ()
-
+include_directories (${CMAKE_CURRENT_BINARY_DIR})
 ##############################################################
 # Project source code
 ##############################################################
diff --git a/backend/src/.gitignore b/backend/src/.gitignore
index 39239f3..d0ee832 100644
--- a/backend/src/.gitignore
+++ b/backend/src/.gitignore
@@ -1,4 +1,7 @@
 GBEConfig.h
 libgbe.so
 ocl_common_defines_str.cpp
+ocl_stdlib.h
+ocl_stdlib.h.pch
 ocl_stdlib_str.cpp
+ocl_vector.h
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index a0fe198..b7b47ae 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,27 +1,62 @@
-#add_subdirectory(llvm)
+set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
+set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
+set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
+set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
+set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
+set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
+set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
+set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
 
-macro (stringify TO_STRINGIFY_PATH TO_STRINGIFY_FILES)
-foreach (to_stringify_file ${TO_STRINGIFY_FILES})
-  set (input_file ${TO_STRINGIFY_PATH}/${to_stringify_file}.h)
-  set (output_file ${TO_STRINGIFY_PATH}/${to_stringify_file}_str.cpp)
-  set (string_header "\\\"string\\\"")
-  add_custom_command(
-    OUTPUT ${output_file}
-    COMMAND rm -rf ${output_file}
-    COMMAND echo "\\\#include ${string_header}" >> ${output_file}
-    COMMAND echo "namespace gbe {" >> ${output_file}
-    COMMAND echo "std::string ${to_stringify_file}_str = " >> ${output_file}
+set (string_header "\\\"string\\\"")
+add_custom_command(
+    OUTPUT ${ocl_blob_cpp_file}
+    COMMAND rm -rf ${ocl_blob_cpp_file}
+    COMMAND echo "\\\#include ${string_header}" >> ${ocl_blob_cpp_file}
+    COMMAND echo "namespace gbe {" >> ${ocl_blob_cpp_file}
+    COMMAND echo "std::string ocl_stdlib_str = " >> ${ocl_blob_cpp_file}
     # Yeah!!! welcome to back slash hell
-    COMMAND cat ${input_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${output_file}
-    COMMAND echo "\;" >> ${output_file}
-    COMMAND echo "}" >> ${output_file}
-    COMMAND echo "" >> ${output_file}
-    MAIN_DEPENDENCY ${input_file})
-endforeach (to_stringify_file)
-endmacro (stringify)
+    COMMAND cat ${ocl_blob_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${ocl_blob_cpp_file}
+    COMMAND echo "\;" >> ${ocl_blob_cpp_file}
+    COMMAND echo "}" >> ${ocl_blob_cpp_file}
+    COMMAND echo "" >> ${ocl_blob_cpp_file}
+    DEPENDS ${ocl_blob_file})
 
-set (TO_STRINGIFY_FILES ocl_stdlib ocl_common_defines)
-stringify ("${GBE_SOURCE_DIR}/src/" "${TO_STRINGIFY_FILES}")
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;ocl_stdlib.h")
+
+add_custom_command(
+  OUTPUT ${ocl_vector_file}
+  COMMAND ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
+  DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file})
+
+add_custom_command(
+  OUTPUT ${ocl_blob_file}
+  COMMAND ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
+  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
+
+
+set (pch_object ${ocl_blob_file}.pch)
+# generate pch object
+if (LLVM_VERSION_NODOT VERSION_GREATER 32)
+    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -emit-pch)
+else (LLVM_VERSION_NODOT VERSION_GREATER 32)
+    if (LLVM_VERSION_NODOT VERSION_GREATER 31)
+        set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off -emit-pch)
+    else (LLVM_VERSION_NODOT VERSION_GREATER 31)
+        set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
+    endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
+endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
+set (clang_cmd ${clang_cmd} -fno-builtin)
+
+add_custom_command(
+     OUTPUT ${pch_object}
+     COMMAND rm -f ${pch_object}
+     COMMAND clang ${clang_cmd} ${ocl_blob_file} -o ${pch_object}
+     DEPENDS ${ocl_blob_file}
+     )
+
+add_custom_target(pch_object
+                  DEPENDS ${pch_object})
 
 if (GBE_USE_BLOB)
   set (GBE_SRC
@@ -31,9 +66,6 @@ else (GBE_USE_BLOB)
   set (GBE_SRC
     ocl_stdlib.h
     ocl_stdlib_str.cpp  # this file is auto-generated.
-    ocl_stdlib_str.cpp
-    ocl_common_defines.h
-    ocl_common_defines_str.cpp # this file is auto-generated.
     sys/vector.hpp
     sys/hash_map.hpp
     sys/map.hpp
@@ -111,6 +143,7 @@ link_directories (${LLVM_LIBRARY_DIRS})
 include_directories(${LLVM_INCLUDE_DIRS})
 add_library (gbe SHARED ${GBE_SRC})
 
+ADD_DEPENDENCIES (gbe pch_object)
 target_link_libraries(
                       gbe
                       ${DRM_INTEL_LIBRARY}
@@ -122,5 +155,11 @@ target_link_libraries(
                       ${CMAKE_DL_LIBS})
 
 install (TARGETS gbe LIBRARY DESTINATION lib)
+install (FILES ${pch_object} DESTINATION lib)
 install (FILES backend/program.h DESTINATION include/gen)
 
+set (PCH_OBJECT_DIR "${pch_object};${CMAKE_INSTALL_PREFIX}/lib/ocl_stdlib.h.pch")
+configure_file (
+  "GBEConfig.h.in"
+  "GBEConfig.h"
+)
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index 369b184..74bef3f 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -1,3 +1,4 @@
 // the configured options and settings for LIBGBE
 #define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
 #define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
+#define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 2a4feb9..26c22f3 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -35,6 +35,9 @@
 #include <cstring>
 #include <algorithm>
 #include <fstream>
+#include <dlfcn.h>
+#include <sstream>
+#include <unistd.h>
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -66,6 +69,7 @@
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/raw_ostream.h>
+#include "src/GBEConfig.h"
 
 namespace gbe {
 
@@ -150,6 +154,9 @@ namespace gbe {
     }
 
     args.push_back("-emit-llvm");
+    // XXX we haven't implemented those builtin functions yet,
+    // so disable them for now.
+    args.push_back("-fno-builtin");
     if(bOpt)  args.push_back("-O3");
 #if LLVM_VERSION_MINOR <= 2
     args.push_back("-triple");
@@ -221,7 +228,8 @@ namespace gbe {
   }
 
   extern std::string ocl_stdlib_str;
-  extern std::string ocl_common_defines_str;
+
+  BVAR(OCL_USE_PCH, true);
   static gbe_program programNewFromSource(const char *source,
                                           size_t stringSize,
                                           const char *options,
@@ -231,16 +239,43 @@ namespace gbe {
     char clStr[L_tmpnam+1], llStr[L_tmpnam+1];
     const std::string clName = std::string(tmpnam_r(clStr)) + ".cl"; /* unsafe! */
     const std::string llName = std::string(tmpnam_r(llStr)) + ".ll"; /* unsafe! */
+    std::string pchHeaderName;
+    std::string clOpt;
 
-    // Write the source to the cl file
     FILE *clFile = fopen(clName.c_str(), "w");
     FATAL_IF(clFile == NULL, "Failed to open temporary file");
-    fwrite(ocl_common_defines_str.c_str(), strlen(ocl_common_defines_str.c_str()), 1, clFile);
-    fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+
+    bool usePCH = false;
+
+    if(options)
+      clOpt += options;
+
+    if (options || !OCL_USE_PCH) {
+      /* Some build options may make the prebuilt pch header file
+         incompatible with the XXX.cl source, so we rebuild everything here. */
+      usePCH = false;
+    } else {
+      std::string dirs = PCH_OBJECT_DIR;
+      std::istringstream idirs(dirs);
+
+      while (getline(idirs, pchHeaderName, ';')) {
+        if(access(pchHeaderName.c_str(), R_OK) == 0) {
+          usePCH = true;
+          break;
+        }
+      }
+    }
+    if (usePCH) {
+      clOpt += " -include-pch ";
+      clOpt += pchHeaderName;
+      clOpt += " ";
+    } else
+      fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+    // Write the source to the cl file
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
 
-    buildModuleFromSource(clName.c_str(), llName.c_str(), options ? options : "");
+    buildModuleFromSource(clName.c_str(), llName.c_str(), clOpt.c_str());
     remove(clName.c_str());
 
     // Now build the program from llvm
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
new file mode 100644
index 0000000..440b455
--- /dev/null
+++ b/backend/src/builtin_vector_proto.def
@@ -0,0 +1,261 @@
+##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+# XXX atan2 is a builtin function
+#gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+#gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+#gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __global gentype *iptr)
+gentype fract (gentype x, __local gentype *iptr)
+gentype fract (gentype x, __private gentype *iptr)
+floatn frexp (floatn x, __global intn *exp)
+floatn frexp (floatn x, __local intn *exp)
+floatn frexp (floatn x, __private intn *exp)
+float frexp (float x, __global int *exp)
+float frexp (float x, __local int *exp)
+float frexp (float x, __private int *exp)
+doublen frexp (doublen x, __global intn *exp)
+doublen frexp (doublen x, __local intn *exp)
+doublen frexp (doublen x, __private intn *exp)
+double frexp (double x, __global int *exp)
+double frexp (double x, __local int *exp)
+double frexp (double x, __private int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+#gentype lgamma (gentype x)
+#floatn lgamma_r (floatn x, __global intn *signp)
+#floatn lgamma_r (floatn x, __local intn *signp)
+#floatn lgamma_r (floatn x, __private intn *signp)
+#float lgamma_r (float x, __global int *signp)
+#float lgamma_r (float x, __local int *signp)
+#float lgamma_r (float x,   __private int *signp)
+#doublen lgamma_r (doublen x, __global intn *signp)
+#doublen lgamma_r (doublen x, __local intn *signp)
+#doublen lgamma_r (doublen x, __private intn *signp)
+#double lgamma_r (double x, __global int *signp)
+#double lgamma_r (double x, __local int *signp)
+#double lgamma_r (double x, __private int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __global gentype *iptr)
+gentype modf (gentype x, __local gentype *iptr)
+gentype modf (gentype x, __private gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+#XXX we define powr as pow
+#gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __global intn *quo)
+floatn remquo (floatn x, floatn y, __local intn *quo)
+floatn remquo (floatn x, floatn y, __private intn *quo)
+float remquo (float x, float y, __global int *quo)
+float remquo (float x, float y, __local int *quo)
+float remquo (float x, float y, __private int *quo)
+doublen remquo (doublen x, doublen y, __global intn *quo)
+doublen remquo (doublen x, doublen y, __local intn *quo)
+doublen remquo (doublen x, doublen y, __private intn *quo)
+double remquo (double x, double y, __global int *quo)
+double remquo (double x, double y, __local int *quo)
+double remquo (double x, double y, __private int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+
+doublen rootn (doublen x, intn y)
+doublen rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __global gentype *cosval)
+gentype sincos (gentype x, __local gentype *cosval)
+gentype sincos (gentype x, __private gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+#gentype tgamma (gentype)
+gentype trunc (gentype)
+
+##half_native_math
+#gentype half_cos (gentype x)
+#gentype half_divide (gentype x, gentype y)
+#gentype half_exp (gentype x)
+#gentype half_exp2 (gentype x)
+#gentype half_exp10 (gentype x)
+#gentype half_log (gentype x)
+#gentype half_log2 (gentype x)
+#gentype half_log10 (gentype x)
+#gentype half_powr (gentype x, gentype y)
+#gentype half_recip (gentype x)
+#gentype half_rsqrt (gentype x)
+#gentype half_sin (gentype x)
+#gentype half_sqrt (gentype x)
+#gentype half_tan (gentype x)
+
+# XXX we already defined all native and non-native
+# functions to the same one.
+#gentype native_cos (gentype x)
+#gentype native_divide (gentype x, gentype y)
+#gentype native_exp (gentype x)
+#gentype native_exp2 (gentype x)
+#gentype native_exp10 (gentype x)
+#gentype native_log (gentype x)
+#gentype native_log2 (gentype x)
+#gentype native_log10 (gentype x)
+#gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+#gentype native_rsqrt (gentype x)
+#gentype native_sin (gentype x)
+#gentype native_sqrt (gentype x)
+#gentype native_tan (gentype x)
+
+##integer
+ugentype abs (gentype x)
+ugentype abs_diff (gentype x, gentype y)
+gentype add_sat (gentype x,  gentype y)
+gentype hadd (gentype x,  gentype y)
+gentype rhadd (gentype x, gentype y)
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentype clamp (gentype x, sgentype minval, sgentype maxval)
+gentype clz (gentype x)
+gentype mad_hi (gentype a, gentype b, gentype c)
+gentype mad_sat (gentype a, gentype b, gentype c)
+gentype max (gentype x,  gentype y)
+gentype max (gentype x,  sgentype y)
+gentype min (gentype x,  gentype y)
+gentype min (gentype x,  sgentype y)
+gentype mul_hi (gentype x,  gentype y)
+gentype rotate (gentype v,  gentype i)
+gentype sub_sat (gentype x,  gentype y)
+shortn upsample (charn hi, ucharn lo)
+ushortn upsample (ucharn hi, ucharn lo)
+intn upsample (shortn hi, ushortn lo)
+uintn upsample (ushortn hi, ushortn lo)
+longn upsample (intn hi, uintn lo)
+ulongn upsample (uintn hi, uintn lo)
+# XXX not implemented
+#gentype popcount (gentype x)
+
+##fast_integer
+gentype mad24 (gentype x, gentype y, gentype z)
+gentype mul24 (gentype x, gentype y)
+
+##common
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentypef clamp (gentypef x, float minval, float maxval)
+gentyped clamp (gentyped x, double minval, double maxval)
+gentype degrees (gentype radians)
+gentype max (gentype x,  gentype y)
+gentypef max (gentypef x, float y)
+gentyped max (gentyped x, double y)
+gentype min (gentype x,  gentype y)
+gentypef min (gentypef x,  float y)
+gentyped min (gentyped x,  double y)
+gentype mix (gentype x, gentype y, gentype a)
+gentypef mix (gentypef x, gentypef y, float a)
+gentyped mix (gentyped x, gentyped y, double a)
+gentype radians (gentype degrees)
+gentype step (gentype edge, gentype x)
+gentypef step (float edge, gentypef x)
+gentyped step (double edge, gentyped x)
+gentype smoothstep (gentype edge0, gentype edge1, gentype x)
+gentypef smoothstep (float edge0, float edge1, gentypef x)
+gentyped smoothstep (double edge0, double edge1, gentyped x)
+gentype sign (gentype x)
+
+##relational
+intn isequal (floatn x, floatn y)
+longn isequal (doublen x, doublen y)
+intn isnotequal (floatn x, floatn y)
+longn isnotequal (doublen x, doublen y)
+intn isgreater (floatn x, floatn y)
+longn isgreater (doublen x, doublen y)
+intn isgreaterequal (floatn x, floatn y)
+longn isgreaterequal (doublen x, doublen y)
+intn isless (floatn x, floatn y)
+longn isless (doublen x, doublen y)
+intn islessequal (floatn x, floatn y)
+longn islessequal (doublen x, doublen y)
+# XXX not implemented
+intn islessgreater (floatn x, floatn y)
+longn islessgreater (doublen x, doublen y)
+intn isfinite (floatn)
+longn isfinite (doublen)
+intn isinf (floatn)
+longn isinf (doublen)
+intn isnan (floatn)
+longn isnan (doublen)
+intn isnormal (floatn)
+longn isnormal (doublen)
+# XXX not implemented
+intn isordered (floatn x, floatn y)
+longn isordered (doublen x, doublen y)
+# XXX not implemented
+intn isunordered (floatn x, floatn y)
+longn isunordered (doublen x, doublen y)
+intn signbit (floatn)
+longn signbit (doublen)
+int any (igentype x)
+int all (igentype x)
+# XXX need to revisit select later
+#gentype bitselect (gentype a, gentype b, gentype c)
+#gentype select (gentype a, gentype b, igentype c)
+#gentype select (gentype a, gentype b, ugentype c)
+
+##misc
+#gentypen shuffle (gentypem x, ugentypen mask)
+#gentypen shuffle2 (gentypem x, gentypem y, ugentypen mask)
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
new file mode 100755
index 0000000..b073682
--- /dev/null
+++ b/backend/src/gen_builtin_vector.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+
+# This file is to generate inline code to lower down those builtin
+# vector functions to scalar functions.
+import re
+import sys
+import os
+
+if len(sys.argv) != 3:
+    print "Invalid argument {}".format(sys.argv)
+    print "use {} spec_file_name output_file_name".format(sys.argv[0])
+    raise
+
+all_vector = 1,2,3,4,8,16
+
+# generate generic type sets
+def gen_vector_type(type_set, vector_set = all_vector):
+    ret = []
+    for t in type_set:
+        for i in vector_set:
+            ret.append((t, i))
+    return ret
+
+def set_vector_memspace(vector_type_set, memspace):
+    ret = []
+    if memspace == '':
+        return vector_type_set
+    for t in vector_type_set:
+        ret.append((t[0], t[1], memspace))
+    return ret
+
+# if we have 3 elements in the type tuple, we are a pointer with a memory space type
+# at the third element.
+def isPointer(t):
+    return len(t) == 3
+
+all_itype = "char","short","int","long"
+all_utype = "uchar","ushort","uint","ulong"
+all_int_type = all_itype + all_utype
+
+all_float_type = "float","double"
+all_type = all_int_type + all_float_type
+
+# all vector/scalar types
+for t in all_type:
+    exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
+    exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
+
+# Predefined type sets according to the Open CL spec.
+math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
+math_gentypef = ["math_gentypef", gen_vector_type(["float"])]
+math_gentyped = ["math_gentyped", gen_vector_type(["double"])]
+
+half_native_math_gentype = ["half_native_math_gentype", gen_vector_type(["float"])]
+
+integer_gentype = ["integer_gentype", gen_vector_type(all_int_type)]
+integer_ugentype = ["integer_ugentype", gen_vector_type(all_utype)]
+integer_sgentype = ["integer_sgentype", gen_vector_type(all_int_type, [1])]
+
+fast_integer_gentype = ["fast_integer_gentype", gen_vector_type(["uint", "int"])]
+
+common_gentype = ["common_gentype", gen_vector_type(all_float_type)]
+common_gentypef = ["common_gentypef", gen_vector_type(["float"])]
+common_gentyped = ["common_gentyped", gen_vector_type(["double"])]
+
+relational_gentype = ["relational_gentype", gen_vector_type(all_type)]
+relational_igentype = ["relational_igentype", gen_vector_type(all_itype)]
+relational_ugentype = ["relational_ugentype", gen_vector_type(all_utype)]
+
+misc_gentypem = ["misc_gentypem", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_gentypen = ["misc_gentypen", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_ugentypem = ["misc_ugentypem", gen_vector_type(all_utype, [2, 4, 8, 16])]
+misc_ugentypen = ["misc_ugentypen", gen_vector_type(all_utype, [2, 4, 8, 16])]
+
+all_predefined_type = math_gentype, math_gentypef, math_gentyped,                \
+                      half_native_math_gentype, integer_gentype,integer_sgentype,\
+                      integer_ugentype, charn, ucharn, shortn, ushortn, intn,    \
+                      uintn, longn, ulongn, floatn, doublen,                     \
+                      fast_integer_gentype, common_gentype, common_gentypef,     \
+                      common_gentyped, relational_gentype, relational_igentype,  \
+                      relational_ugentype, schar, suchar, sshort, sint, suint,   \
+                      slong, sulong, sfloat, sdouble, misc_gentypem,              \
+                      misc_ugentypem, misc_gentypen, misc_ugentypen
+
+# type dictionary contains all the predefined type sets.
+type_dict = {}
+
+for t in all_predefined_type:
+    type_dict.update({t[0]:t[1]})
+
+def _prefix(prefix, dtype):
+    if dtype.count("gentype") != 0:
+        return prefix + '_' + dtype
+    return dtype
+
+memspaces = ["__local ", "__private ", "__global "]
+
+def stripMemSpace(t):
+    if t[0:2] == '__':
+        for memspace in memspaces :
+            if t[0:len(memspace)] == memspace:
+                return memspace, t[len(memspace):]
+    return '', t
+
+def check_type(types):
+    for t in types:
+        memspace, t = stripMemSpace(t)
+        if not t in type_dict:
+            print t
+            raise "found invalid type."
+
+def match_unsigned(dtype):
+    if dtype[0] == 'float':
+        return ["uint", dtype[1]]
+    if dtype[0] == 'double':
+        return ["ulong", dtype[1]]
+    if dtype[0][0] == 'u':
+        return dtype
+    return ['u' + dtype[0], dtype[1]]
+
+def match_signed(dtype):
+    if dtype[0] == 'float':
+        return ["int", dtype[1]]
+    if dtype[0] == 'double':
+        return ["long", dtype[1]]
+    if dtype[0][0] != 'u':
+        return dtype
+    return [dtype[0][1:], dtype[1]]
+
+def match_scalar(dtype):
+    return [dtype[0], 1]
+
+# The dstType is the expected type, srcType is
+# the reference type. Sometimes, the dstType and
+# srcType are different. We need to fix this issue
+# and return correct dst type.
+def fixup_type(dstType, srcType, n):
+    if dstType == srcType:
+       return dstType[n]
+
+    if dstType != srcType:
+        # scalar dst type
+        if len(dstType) == 1:
+            return dstType[0]
+        # dst is not scalar but src is scalar
+        if len(srcType) == 1:
+            return dstType[n]
+        if dstType == integer_sgentype[1] and srcType == integer_gentype[1]:
+            return match_scalar(srcType[n])
+
+        if dstType == integer_gentype[1] and  \
+           (srcType == integer_sgentype[1] or \
+            srcType == integer_ugentype[1]):
+            return dstType[n]
+
+        if dstType == integer_ugentype[1] and srcType == integer_gentype[1]:
+            return match_unsigned(srcType[n])
+
+        if dstType == relational_igentype[1] and srcType == relational_gentype[1]:
+            return match_signed(srcType[n])
+        if dstType == relational_ugentype[1] and srcType == relational_gentype[1]:
+            return match_unsigned(srcType[n])
+
+        if dstType == relational_gentype[1] and    \
+           (srcType == relational_igentype[1] or   \
+            srcType == relational_ugentype[1]):
+            return dstType[n]
+
+        if (len(dstType) == len(srcType)):
+            return dstType[n]
+
+    print dstType, srcType
+    raise "type mispatch"
+
+class builtinProto():
+    valueTypeStr = ""
+    functionName = ""
+    paramTypeStrs = []
+    paramCount = 0
+    outputStr = []
+    prefix = ""
+
+    def init(self, sectionHeader, sectionPrefix):
+        self.valueTypeStr = ""
+        self.functionName = ""
+        self.paramTypeStrs = []
+        self.paramCount = 0
+        if sectionHeader != "":
+            self.outputStr = [sectionHeader]
+        else:
+            self.outputStr = []
+        if sectionPrefix != "":
+            self.prefix = sectionPrefix
+        self.indent = 0
+
+    def append(self, line, nextInit = ""):
+        self.outputStr.append(line);
+        return nextInit;
+
+    def indentSpace(self):
+        ret = ""
+        for i in range(self.indent):
+            ret += ' '
+
+        return ret
+
+    def init_from_line(self, t):
+        self.append('//{}'.format(t))
+        line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
+        self.paramCount = 0
+        stripped = 0
+        memSpace = ''
+        for i, text in enumerate(line):
+            idx = i - stripped
+            if idx == 0:
+                self.valueTypeStr = _prefix(self.prefix, line[i])
+                continue
+
+            if idx == 1:
+                self.functionName = line[i];
+                continue
+
+            if idx % 2 == 0:
+                if line[i][0] == '(':
+                    tmpType = line[i][1:]
+                else:
+                    tmpType = line[i]
+                if tmpType == '__local' or   \
+                   tmpType == '__private' or \
+                   tmpType == '__global':
+                   memSpace = tmpType + ' '
+                   stripped += 1
+                   continue
+                self.paramTypeStrs.append(memSpace + _prefix(self.prefix, tmpType))
+                memSpace = ''
+                self.paramCount += 1
+
+    def gen_proto_str_1(self, vtypeSeq, ptypeSeqs, i):
+        for n in range(0, self.paramCount):
+            ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i);
+            vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i);
+            # XXX FIXME now skip all double vector, as we don't
+            # defined those scalar version's prototype.
+            if ptype[0].find('double') != -1 or \
+               vtype[0].find('double') != -1 or \
+               ptype[0].find('long') != -1 or \
+               vtype[0].find('long') != -1 :
+                return
+
+            if (n == 0):
+                formatStr = 'INLINE_OVERLOADABLE {}{} {} ('.format(vtype[0], vtype[1], self.functionName)
+            else:
+                formatStr += ', '
+
+            if vtype[1] == 1:
+                return
+
+            if isPointer(ptype):
+                formatStr += ptype[2]
+                pointerStr = '*'
+            else:
+                pointerStr = ''
+
+            if ptype[1] != 1:
+                formatStr += '{}{} {}param{}'.format(ptype[0], ptype[1], pointerStr, n)
+            else:
+                formatStr += '{} {}param{}'.format(ptype[0], pointerStr, n)
+
+        formatStr += ')'
+        formatStr = self.append(formatStr, '{{return ({}{})('.format(vtype[0], vtype[1]))
+        self.indent = len(formatStr)
+        for j in range(0, vtype[1]):
+            if (j != 0):
+                formatStr += ','
+                if (j + 1) % 2 == 0:
+                    formatStr += ' '
+                if j % 2 == 0:
+                    formatStr = self.append(formatStr, self.indentSpace())
+
+            formatStr += '{}('.format(self.functionName)
+            for n in range(0, self.paramCount):
+                if n != 0:
+                    formatStr += ', '
+
+                ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i)
+                vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
+                if vtype[1] != ptype[1]:
+                    if ptype[1] != 1:
+                        raise "parameter is not a scalar but has different width with result value."
+                    if isPointer(ptype):
+                        formatStr += '&'
+                    formatStr += 'param{}'.format(n)
+                    continue
+
+                if (isPointer(ptype)):
+                    formatStr += '({} {} *)param{} + {:2d}'.format(ptype[2], ptype[0], n, j)
+                else:
+                    formatStr += 'param{}.s{:x}'.format(n, j)
+
+            formatStr += ')'
+
+        formatStr += '); }\n'
+        self.append(formatStr)
+
+        return formatStr
+
+    def output(self):
+        for line in self.outputStr:
+            print line
+
+    def output(self, outFile):
+        for line in self.outputStr:
+            outFile.write('{}\n'.format(line))
+
+    def gen_proto_str(self):
+        check_type([self.valueTypeStr] + self.paramTypeStrs)
+        vtypeSeq = type_dict[self.valueTypeStr]
+        ptypeSeqs = []
+        count = len(vtypeSeq);
+        for t in self.paramTypeStrs:
+            memspace,t = stripMemSpace(t)
+            ptypeSeqs.append(set_vector_memspace(type_dict[t], memspace))
+            count = max(count, len(type_dict[t]))
+
+        for i in range(count):
+            formatStr = self.gen_proto_str_1(vtypeSeq, ptypeSeqs, i)
+
+        self.append("")
+
+def safeUnlink(filename):
+    """Remove `filename`, ignoring any OSError raised by os.remove()."""
+    try:
+        os.remove(filename)
+    except OSError:
+        # Best effort: a failed removal is deliberately not an error.
+        pass
+
+# save the prototypes into ocl_vector.h
+specFile = open(sys.argv[1], 'r')
+headerFileName = sys.argv[2]
+tempHeaderFileName = sys.argv[2] + '.tmp'
+safeUnlink(headerFileName)
+tempHeader = open(tempHeaderFileName, 'w')
+
+tempHeader.write("//This file is autogenerated by {}.\n".format(sys.argv[0]))
+tempHeader.write("//Don't modify it manually.\n")
+
+functionProto = builtinProto()
+for line in specFile:
+    if line.isspace():
+        continue
+    if line[0] == '#':
+        if line[1] == '#':
+            sectionHeader = "//{} builtin functions".format(line[2:].rstrip())
+            sectionPrefix=(line[2:].split())[0]
+        continue
+    functionProto.init(sectionHeader, sectionPrefix)
+    sectionHeader = ""
+    setionPrefix = ""
+    functionProto.init_from_line(line)
+    functionProto.gen_proto_str()
+    functionProto.output(tempHeader)
+
+tempHeader.close()
+os.rename(tempHeaderFileName, headerFileName)
diff --git a/backend/src/ocl_as.h b/backend/src/ocl_as.h
new file mode 100644
index 0000000..af98d53
--- /dev/null
+++ b/backend/src/ocl_as.h
@@ -0,0 +1,2161 @@
+// This file is autogenerated by gen_as.sh.
+// Don't modify it manually.
+// Overlay of char and uchar. The as_char()/as_uchar() helpers that follow
+// store a value through one member and read it back through the other.
+union _type_cast_1_b {
+  char _char;
+  uchar _uchar;
+};
+
+INLINE OVERLOADABLE uchar as_uchar(char v) {
+  union _type_cast_1_b u;
+  u._char = v;
+  return u._uchar;
+}
+
+INLINE OVERLOADABLE char as_char(uchar v) {
+  union _type_cast_1_b u;
+  u._uchar = v;
+  return u._char;
+}
+
+// Overlay of the types the generator groups under the "_2_b" bucket. The
+// as_<type>() helpers that follow store a value through one member and read
+// it back through another.
+union _type_cast_2_b {
+  short _short;
+  ushort _ushort;
+  char2 _char2;
+  uchar2 _uchar2;
+};
+
+INLINE OVERLOADABLE ushort as_ushort(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE char2 as_char2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._char2;
+}
+
+// Overlay of char3 and uchar3, used by as_char3()/as_uchar3() below to read
+// a value back through the other member.
+union _type_cast_3_b {
+  char3 _char3;
+  uchar3 _uchar3;
+};
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
+  union _type_cast_3_b u;
+  u._char3 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
+  union _type_cast_3_b u;
+  u._uchar3 = v;
+  return u._char3;
+}
+
+// Overlay of the types the generator groups under the "_4_b" bucket. The
+// as_<type>() helpers that follow store a value through one member and read
+// it back through another.
+union _type_cast_4_b {
+  int _int;
+  uint _uint;
+  short2 _short2;
+  ushort2 _ushort2;
+  char4 _char4;
+  uchar4 _uchar4;
+  float _float;
+};
+
+INLINE OVERLOADABLE uint as_uint(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char4 as_char4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar4;
+}
+
+// Overlay of short3 and ushort3, used by as_short3()/as_ushort3() below to
+// read a value back through the other member.
+union _type_cast_6_b {
+  short3 _short3;
+  ushort3 _ushort3;
+};
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
+  union _type_cast_6_b u;
+  u._short3 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
+  union _type_cast_6_b u;
+  u._ushort3 = v;
+  return u._short3;
+}
+
+// Overlay of the types the generator groups under the "_8_b" bucket. The
+// as_<type>() helpers that follow store a value through one member and read
+// it back through another.
+union _type_cast_8_b {
+  long _long;
+  ulong _ulong;
+  int2 _int2;
+  uint2 _uint2;
+  short4 _short4;
+  ushort4 _ushort4;
+  char8 _char8;
+  uchar8 _uchar8;
+  double _double;
+  float2 _float2;
+};
+
+INLINE OVERLOADABLE ulong as_ulong(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE double as_double(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE float2 as_float2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short4 as_short4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._double;
+}
+
+// Overlay of int3, uint3 and float3, used by the as_int3()/as_uint3()/
+// as_float3() helpers below to read a value back through another member.
+union _type_cast_12_b {
+  int3 _int3;
+  uint3 _uint3;
+  float3 _float3;
+};
+
+INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
+  union _type_cast_12_b u;
+  u._int3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int3 v) {
+  union _type_cast_12_b u;
+  u._int3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint3 v) {
+  union _type_cast_12_b u;
+  u._uint3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint3 v) {
+  union _type_cast_12_b u;
+  u._uint3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float3 v) {
+  union _type_cast_12_b u;
+  u._float3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
+  union _type_cast_12_b u;
+  u._float3 = v;
+  return u._uint3;
+}
+
+// Overlay of the types the generator groups under the "_16_b" bucket. The
+// as_<type>() helpers that follow store a value through one member and read
+// it back through another.
+union _type_cast_16_b {
+  long2 _long2;
+  ulong2 _ulong2;
+  int4 _int4;
+  uint4 _uint4;
+  short8 _short8;
+  ushort8 _ushort8;
+  char16 _char16;
+  uchar16 _uchar16;
+  double2 _double2;
+  float4 _float4;
+};
+
+INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE float4 as_float4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._double2;
+}
+
+union _type_cast_24_b {
+  long3 _long3;
+  ulong3 _ulong3;
+  double3 _double3;
+};
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
+  union _type_cast_24_b u;
+  u._long3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long3 v) {
+  union _type_cast_24_b u;
+  u._long3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
+  union _type_cast_24_b u;
+  u._ulong3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
+  union _type_cast_24_b u;
+  u._ulong3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double3 v) {
+  union _type_cast_24_b u;
+  u._double3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
+  union _type_cast_24_b u;
+  u._double3 = v;
+  return u._ulong3;
+}
+
+union _type_cast_32_b {
+  long4 _long4;
+  ulong4 _ulong4;
+  int8 _int8;
+  uint8 _uint8;
+  short16 _short16;
+  ushort16 _ushort16;
+  double4 _double4;
+  float8 _float8;
+};
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long4 as_long4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double4 as_double4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._double4;
+}
+
+union _type_cast_64_b {
+  long8 _long8;
+  ulong8 _ulong8;
+  int16 _int16;
+  uint16 _uint16;
+  double8 _double8;
+  float16 _float16;
+};
+
+INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE float16 as_float16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._double8;
+}
+
+union _type_cast_128_b {
+  long16 _long16;
+  ulong16 _ulong16;
+  double16 _double16;
+};
+
+INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._ulong16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._ulong16;
+}
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
new file mode 100644
index 0000000..4063788
--- /dev/null
+++ b/backend/src/ocl_convert.h
@@ -0,0 +1,1801 @@
+// This file is autogenerated by gen_convert.sh.
+// Don't modify it manually.
+INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(long2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(long2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(long2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(long2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(long2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(int2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(int2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(int2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(int2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(int2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(short2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(short2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(short2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(short2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(short2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(char2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(char2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(char2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(char2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(char2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(double2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(double2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(double2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(double2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(double2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(float2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(float2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(float2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(float2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(float2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(long3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(long3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(long3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(long3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(long3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(int3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(int3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(int3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(int3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(int3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(short3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(short3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(short3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(short3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(short3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(char3 v) { /* char -> long per lane */
+  return (long3)((long)v.s0, (long)v.s1, (long)v.s2);
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) { /* char -> ulong per lane */
+  return (ulong3)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2);
+}
+
+INLINE OVERLOADABLE int3 convert_int3(char3 v) { /* char -> int per lane */
+  return (int3)((int)v.s0, (int)v.s1, (int)v.s2);
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(char3 v) { /* char -> uint per lane */
+  return (uint3)((uint)v.s0, (uint)v.s1, (uint)v.s2);
+}
+
+INLINE OVERLOADABLE short3 convert_short3(char3 v) { /* char -> short per lane */
+  return (short3)((short)v.s0, (short)v.s1, (short)v.s2);
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) { /* char -> ushort per lane */
+  return (ushort3)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2);
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) { /* char -> uchar per lane */
+  return (uchar3)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2);
+}
+
+INLINE OVERLOADABLE double3 convert_double3(char3 v) { /* char -> double per lane */
+  return (double3)((double)v.s0, (double)v.s1, (double)v.s2);
+}
+
+INLINE OVERLOADABLE float3 convert_float3(char3 v) { /* char -> float per lane */
+  return (float3)((float)v.s0, (float)v.s1, (float)v.s2);
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uchar3 v) { /* uchar -> long per lane */
+  return (long3)((long)v.s0, (long)v.s1, (long)v.s2);
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) { /* uchar -> ulong per lane */
+  return (ulong3)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2);
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uchar3 v) { /* uchar -> int per lane */
+  return (int3)((int)v.s0, (int)v.s1, (int)v.s2);
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) { /* uchar -> uint per lane */
+  return (uint3)((uint)v.s0, (uint)v.s1, (uint)v.s2);
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uchar3 v) { /* uchar -> short per lane */
+  return (short3)((short)v.s0, (short)v.s1, (short)v.s2);
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) { /* uchar -> ushort per lane */
+  return (ushort3)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2);
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uchar3 v) { /* uchar -> char per lane */
+  return (char3)((char)v.s0, (char)v.s1, (char)v.s2);
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uchar3 v) { /* uchar -> double per lane */
+  return (double3)((double)v.s0, (double)v.s1, (double)v.s2);
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uchar3 v) { /* uchar -> float per lane */
+  return (float3)((float)v.s0, (float)v.s1, (float)v.s2);
+}
+
+INLINE OVERLOADABLE long3 convert_long3(double3 v) { /* double -> long per lane */
+  return (long3)((long)v.s0, (long)v.s1, (long)v.s2);
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) { /* double -> ulong per lane */
+  return (ulong3)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2);
+}
+
+INLINE OVERLOADABLE int3 convert_int3(double3 v) { /* double -> int per lane */
+  return (int3)((int)v.s0, (int)v.s1, (int)v.s2);
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(double3 v) { /* double -> uint per lane */
+  return (uint3)((uint)v.s0, (uint)v.s1, (uint)v.s2);
+}
+
+INLINE OVERLOADABLE short3 convert_short3(double3 v) { /* double -> short per lane */
+  return (short3)((short)v.s0, (short)v.s1, (short)v.s2);
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) { /* double -> ushort per lane */
+  return (ushort3)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2);
+}
+
+INLINE OVERLOADABLE char3 convert_char3(double3 v) { /* double -> char per lane */
+  return (char3)((char)v.s0, (char)v.s1, (char)v.s2);
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) { /* double -> uchar per lane */
+  return (uchar3)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2);
+}
+
+INLINE OVERLOADABLE float3 convert_float3(double3 v) { /* double -> float per lane */
+  return (float3)((float)v.s0, (float)v.s1, (float)v.s2);
+}
+
+INLINE OVERLOADABLE long3 convert_long3(float3 v) { /* float -> long per lane */
+  return (long3)((long)v.s0, (long)v.s1, (long)v.s2);
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) { /* float -> ulong per lane */
+  return (ulong3)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2);
+}
+
+INLINE OVERLOADABLE int3 convert_int3(float3 v) { /* float -> int per lane */
+  return (int3)((int)v.s0, (int)v.s1, (int)v.s2);
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(float3 v) { /* float -> uint per lane */
+  return (uint3)((uint)v.s0, (uint)v.s1, (uint)v.s2);
+}
+
+INLINE OVERLOADABLE short3 convert_short3(float3 v) { /* float -> short per lane */
+  return (short3)((short)v.s0, (short)v.s1, (short)v.s2);
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) { /* float -> ushort per lane */
+  return (ushort3)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2);
+}
+
+INLINE OVERLOADABLE char3 convert_char3(float3 v) { /* float -> char per lane */
+  return (char3)((char)v.s0, (char)v.s1, (char)v.s2);
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) { /* float -> uchar per lane */
+  return (uchar3)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2);
+}
+
+INLINE OVERLOADABLE double3 convert_double3(float3 v) { /* float -> double per lane */
+  return (double3)((double)v.s0, (double)v.s1, (double)v.s2);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) { /* long -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(long4 v) { /* long -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(long4 v) { /* long -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(long4 v) { /* long -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) { /* long -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(long4 v) { /* long -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) { /* long -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(long4 v) { /* long -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(long4 v) { /* long -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ulong4 v) { /* ulong -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ulong4 v) { /* ulong -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) { /* ulong -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ulong4 v) { /* ulong -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) { /* ulong -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ulong4 v) { /* ulong -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) { /* ulong -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ulong4 v) { /* ulong -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ulong4 v) { /* ulong -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(int4 v) { /* int -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) { /* int -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(int4 v) { /* int -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(int4 v) { /* int -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) { /* int -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(int4 v) { /* int -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) { /* int -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(int4 v) { /* int -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(int4 v) { /* int -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uint4 v) { /* uint -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) { /* uint -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uint4 v) { /* uint -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uint4 v) { /* uint -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) { /* uint -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uint4 v) { /* uint -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) { /* uint -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uint4 v) { /* uint -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uint4 v) { /* uint -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(short4 v) { /* short -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) { /* short -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(short4 v) { /* short -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(short4 v) { /* short -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) { /* short -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(short4 v) { /* short -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) { /* short -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(short4 v) { /* short -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(short4 v) { /* short -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ushort4 v) { /* ushort -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) { /* ushort -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ushort4 v) { /* ushort -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) { /* ushort -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ushort4 v) { /* ushort -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ushort4 v) { /* ushort -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) { /* ushort -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ushort4 v) { /* ushort -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ushort4 v) { /* ushort -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(char4 v) { /* char -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) { /* char -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(char4 v) { /* char -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(char4 v) { /* char -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(char4 v) { /* char -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) { /* char -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) { /* char -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(char4 v) { /* char -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(char4 v) { /* char -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uchar4 v) { /* uchar -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) { /* uchar -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uchar4 v) { /* uchar -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) { /* uchar -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uchar4 v) { /* uchar -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) { /* uchar -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uchar4 v) { /* uchar -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uchar4 v) { /* uchar -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uchar4 v) { /* uchar -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(double4 v) { /* double -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) { /* double -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(double4 v) { /* double -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(double4 v) { /* double -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(double4 v) { /* double -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) { /* double -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(double4 v) { /* double -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) { /* double -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE float4 convert_float4(double4 v) { /* double -> float per lane */
+  return (float4)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3);
+}
+
+INLINE OVERLOADABLE long4 convert_long4(float4 v) { /* float -> long per lane */
+  return (long4)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3);
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) { /* float -> ulong per lane */
+  return (ulong4)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3);
+}
+
+INLINE OVERLOADABLE int4 convert_int4(float4 v) { /* float -> int per lane */
+  return (int4)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3);
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(float4 v) { /* float -> uint per lane */
+  return (uint4)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3);
+}
+
+INLINE OVERLOADABLE short4 convert_short4(float4 v) { /* float -> short per lane */
+  return (short4)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3);
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) { /* float -> ushort per lane */
+  return (ushort4)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3);
+}
+
+INLINE OVERLOADABLE char4 convert_char4(float4 v) { /* float -> char per lane */
+  return (char4)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3);
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) { /* float -> uchar per lane */
+  return (uchar4)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3);
+}
+
+INLINE OVERLOADABLE double4 convert_double4(float4 v) { /* float -> double per lane */
+  return (double4)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) { /* long -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(long8 v) { /* long -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(long8 v) { /* long -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(long8 v) { /* long -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) { /* long -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(long8 v) { /* long -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) { /* long -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(long8 v) { /* long -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(long8 v) { /* long -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ulong8 v) { /* ulong -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ulong8 v) { /* ulong -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) { /* ulong -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ulong8 v) { /* ulong -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) { /* ulong -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ulong8 v) { /* ulong -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) { /* ulong -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ulong8 v) { /* ulong -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ulong8 v) { /* ulong -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(int8 v) { /* int -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) { /* int -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(int8 v) { /* int -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(int8 v) { /* int -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) { /* int -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(int8 v) { /* int -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) { /* int -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(int8 v) { /* int -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(int8 v) { /* int -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uint8 v) { /* uint -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) { /* uint -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uint8 v) { /* uint -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uint8 v) { /* uint -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) { /* uint -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uint8 v) { /* uint -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) { /* uint -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uint8 v) { /* uint -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uint8 v) { /* uint -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(short8 v) { /* short -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) { /* short -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(short8 v) { /* short -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(short8 v) { /* short -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) { /* short -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(short8 v) { /* short -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) { /* short -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(short8 v) { /* short -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(short8 v) { /* short -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ushort8 v) { /* ushort -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) { /* ushort -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ushort8 v) { /* ushort -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) { /* ushort -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ushort8 v) { /* ushort -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ushort8 v) { /* ushort -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) { /* ushort -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ushort8 v) { /* ushort -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ushort8 v) { /* ushort -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(char8 v) { /* char -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) { /* char -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(char8 v) { /* char -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(char8 v) { /* char -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(char8 v) { /* char -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) { /* char -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) { /* char -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(char8 v) { /* char -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(char8 v) { /* char -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uchar8 v) { /* uchar -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) { /* uchar -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uchar8 v) { /* uchar -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) { /* uchar -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uchar8 v) { /* uchar -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) { /* uchar -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uchar8 v) { /* uchar -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uchar8 v) { /* uchar -> double per lane */
+  return (double8)((double)v.s0, (double)v.s1, (double)v.s2, (double)v.s3, (double)v.s4, (double)v.s5, (double)v.s6, (double)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uchar8 v) { /* uchar -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(double8 v) { /* double -> long per lane */
+  return (long8)((long)v.s0, (long)v.s1, (long)v.s2, (long)v.s3, (long)v.s4, (long)v.s5, (long)v.s6, (long)v.s7);
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) { /* double -> ulong per lane */
+  return (ulong8)((ulong)v.s0, (ulong)v.s1, (ulong)v.s2, (ulong)v.s3, (ulong)v.s4, (ulong)v.s5, (ulong)v.s6, (ulong)v.s7);
+}
+
+INLINE OVERLOADABLE int8 convert_int8(double8 v) { /* double -> int per lane */
+  return (int8)((int)v.s0, (int)v.s1, (int)v.s2, (int)v.s3, (int)v.s4, (int)v.s5, (int)v.s6, (int)v.s7);
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(double8 v) { /* double -> uint per lane */
+  return (uint8)((uint)v.s0, (uint)v.s1, (uint)v.s2, (uint)v.s3, (uint)v.s4, (uint)v.s5, (uint)v.s6, (uint)v.s7);
+}
+
+INLINE OVERLOADABLE short8 convert_short8(double8 v) { /* double -> short per lane */
+  return (short8)((short)v.s0, (short)v.s1, (short)v.s2, (short)v.s3, (short)v.s4, (short)v.s5, (short)v.s6, (short)v.s7);
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) { /* double -> ushort per lane */
+  return (ushort8)((ushort)v.s0, (ushort)v.s1, (ushort)v.s2, (ushort)v.s3, (ushort)v.s4, (ushort)v.s5, (ushort)v.s6, (ushort)v.s7);
+}
+
+INLINE OVERLOADABLE char8 convert_char8(double8 v) { /* double -> char per lane */
+  return (char8)((char)v.s0, (char)v.s1, (char)v.s2, (char)v.s3, (char)v.s4, (char)v.s5, (char)v.s6, (char)v.s7);
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) { /* double -> uchar per lane */
+  return (uchar8)((uchar)v.s0, (uchar)v.s1, (uchar)v.s2, (uchar)v.s3, (uchar)v.s4, (uchar)v.s5, (uchar)v.s6, (uchar)v.s7);
+}
+
+INLINE OVERLOADABLE float8 convert_float8(double8 v) { /* double -> float per lane */
+  return (float8)((float)v.s0, (float)v.s1, (float)v.s2, (float)v.s3, (float)v.s4, (float)v.s5, (float)v.s6, (float)v.s7);
+}
+
+INLINE OVERLOADABLE long8 convert_long8(float8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(float8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(float8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(float8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(float8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(long16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(long16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(long16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(long16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(long16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(int16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(int16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(int16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(int16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(int16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(short16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(short16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(short16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(short16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(short16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(char16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(char16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(char16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(char16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(char16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(double16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(double16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(double16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(double16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(double16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(float16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(float16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(float16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(float16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(float16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
deleted file mode 100644
index 0c78c8e..0000000
--- a/backend/src/ocl_stdlib.h
+++ /dev/null
@@ -1,5769 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-#ifndef __GEN_OCL_STDLIB_H__
-#define __GEN_OCL_STDLIB_H__
-
-#define INLINE inline __attribute__((always_inline))
-#define OVERLOADABLE __attribute__((overloadable))
-#define PURE __attribute__((pure))
-#define CONST __attribute__((const))
-#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in scalar data types
-/////////////////////////////////////////////////////////////////////////////
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-typedef unsigned int uint;
-typedef unsigned long ulong;
-typedef __typeof__(sizeof(int)) size_t;
-typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
-typedef signed int intptr_t;
-typedef unsigned int uintptr_t;
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL address space
-/////////////////////////////////////////////////////////////////////////////
-// These are built-ins in LLVM 3.3.
-#if 100*__clang_major__ + __clang_minor__ <= 302
-#define __private __attribute__((address_space(0)))
-#define __global __attribute__((address_space(1)))
-#define __constant __attribute__((address_space(2)))
-#define __local __attribute__((address_space(3)))
-#define global __global
-//#define local __local
-#define constant __constant
-#define private __private
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in vector data types
-/////////////////////////////////////////////////////////////////////////////
-#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
-                  typedef type type##3 __attribute__((ext_vector_type(3)));\
-                  typedef type type##4 __attribute__((ext_vector_type(4)));\
-                  typedef type type##8 __attribute__((ext_vector_type(8)));\
-                  typedef type type##16 __attribute__((ext_vector_type(16)));
-DEF(char);
-DEF(uchar);
-DEF(short);
-DEF(ushort);
-DEF(int);
-DEF(uint);
-DEF(long);
-DEF(ulong);
-DEF(float);
-DEF(double);
-#undef DEF
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL other built-in data types
-/////////////////////////////////////////////////////////////////////////////
-// FIXME:
-// This is a transitional hack to bypass the LLVM 3.3 built-in types.
-// See the Khronos SPIR specification for handling of these types.
-#define __texture __attribute__((address_space(4)))
-struct _image2d_t;
-typedef __texture struct _image2d_t* __image2d_t;
-struct _image3d_t;
-typedef __texture struct _image3d_t* __image3d_t;
-typedef uint __sampler_t;
-typedef size_t __event_t;
-#define image2d_t __image2d_t
-#define image3d_t __image3d_t
-#define sampler_t __sampler_t
-#define event_t __event_t
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL conversions & type casting
-/////////////////////////////////////////////////////////////////////////////
-
-// ##BEGIN_AS##
-union _type_cast_1_b {
-  char _char;
-  uchar _uchar;
-};
-
-INLINE OVERLOADABLE uchar as_uchar(char v) {
-  union _type_cast_1_b u;
-  u._char = v;
-  return u._uchar;
-}
-
-INLINE OVERLOADABLE char as_char(uchar v) {
-  union _type_cast_1_b u;
-  u._uchar = v;
-  return u._char;
-}
-
-union _type_cast_2_b {
-  short _short;
-  ushort _ushort;
-  char2 _char2;
-  uchar2 _uchar2;
-};
-
-INLINE OVERLOADABLE ushort as_ushort(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE char2 as_char2(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._char2;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE char2 as_char2(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._char2;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE ushort as_ushort(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._char2;
-}
-
-union _type_cast_3_b {
-  char3 _char3;
-  uchar3 _uchar3;
-};
-
-INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
-  union _type_cast_3_b u;
-  u._char3 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
-  union _type_cast_3_b u;
-  u._uchar3 = v;
-  return u._char3;
-}
-
-union _type_cast_4_b {
-  int _int;
-  uint _uint;
-  short2 _short2;
-  ushort2 _ushort2;
-  char4 _char4;
-  uchar4 _uchar4;
-  float _float;
-};
-
-INLINE OVERLOADABLE uint as_uint(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE short2 as_short2(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE float as_float(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char4 as_char4(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._uchar4;
-}
-
-union _type_cast_6_b {
-  short3 _short3;
-  ushort3 _ushort3;
-};
-
-INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
-  union _type_cast_6_b u;
-  u._short3 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
-  union _type_cast_6_b u;
-  u._ushort3 = v;
-  return u._short3;
-}
-
-union _type_cast_8_b {
-  long _long;
-  ulong _ulong;
-  int2 _int2;
-  uint2 _uint2;
-  short4 _short4;
-  ushort4 _ushort4;
-  char8 _char8;
-  uchar8 _uchar8;
-  double _double;
-  float2 _float2;
-};
-
-INLINE OVERLOADABLE ulong as_ulong(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE int2 as_int2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE double as_double(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE float2 as_float2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short4 as_short4(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._double;
-}
-
-union _type_cast_12_b {
-  int3 _int3;
-  uint3 _uint3;
-  float3 _float3;
-};
-
-INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
-  union _type_cast_12_b u;
-  u._int3 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE float3 as_float3(int3 v) {
-  union _type_cast_12_b u;
-  u._int3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE int3 as_int3(uint3 v) {
-  union _type_cast_12_b u;
-  u._uint3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE float3 as_float3(uint3 v) {
-  union _type_cast_12_b u;
-  u._uint3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE int3 as_int3(float3 v) {
-  union _type_cast_12_b u;
-  u._float3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
-  union _type_cast_12_b u;
-  u._float3 = v;
-  return u._uint3;
-}
-
-union _type_cast_16_b {
-  long2 _long2;
-  ulong2 _ulong2;
-  int4 _int4;
-  uint4 _uint4;
-  short8 _short8;
-  ushort8 _ushort8;
-  char16 _char16;
-  uchar16 _uchar16;
-  double2 _double2;
-  float4 _float4;
-};
-
-INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE float4 as_float4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int4 as_int4(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._double2;
-}
-
-union _type_cast_24_b {
-  long3 _long3;
-  ulong3 _ulong3;
-  double3 _double3;
-};
-
-INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
-  union _type_cast_24_b u;
-  u._long3 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE double3 as_double3(long3 v) {
-  union _type_cast_24_b u;
-  u._long3 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
-  union _type_cast_24_b u;
-  u._ulong3 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
-  union _type_cast_24_b u;
-  u._ulong3 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE long3 as_long3(double3 v) {
-  union _type_cast_24_b u;
-  u._double3 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
-  union _type_cast_24_b u;
-  u._double3 = v;
-  return u._ulong3;
-}
-
-union _type_cast_32_b {
-  long4 _long4;
-  ulong4 _ulong4;
-  int8 _int8;
-  uint8 _uint8;
-  short16 _short16;
-  ushort16 _ushort16;
-  double4 _double4;
-  float8 _float8;
-};
-
-INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE float8 as_float8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long4 as_long4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double4 as_double4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._double4;
-}
-
-union _type_cast_64_b {
-  long8 _long8;
-  ulong8 _ulong8;
-  int16 _int16;
-  uint16 _uint16;
-  double8 _double8;
-  float16 _float16;
-};
-
-INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE float16 as_float16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._double8;
-}
-
-union _type_cast_128_b {
-  long16 _long16;
-  ulong16 _ulong16;
-  double16 _double16;
-};
-
-INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
-  union _type_cast_128_b u;
-  u._long16 = v;
-  return u._ulong16;
-}
-
-INLINE OVERLOADABLE double16 as_double16(long16 v) {
-  union _type_cast_128_b u;
-  u._long16 = v;
-  return u._double16;
-}
-
-INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
-  union _type_cast_128_b u;
-  u._ulong16 = v;
-  return u._long16;
-}
-
-INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
-  union _type_cast_128_b u;
-  u._ulong16 = v;
-  return u._double16;
-}
-
-INLINE OVERLOADABLE long16 as_long16(double16 v) {
-  union _type_cast_128_b u;
-  u._double16 = v;
-  return u._long16;
-}
-
-INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
-  union _type_cast_128_b u;
-  u._double16 = v;
-  return u._ulong16;
-}
-
-// ##END_AS##
-
-// ##BEGIN_CONVERT##
-INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(long2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(long2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(long2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(long2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(long2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(int2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(int2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(int2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(int2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(int2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(short2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(short2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(short2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(short2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(short2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(char2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(char2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(char2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(char2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(char2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(double2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(double2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(double2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(double2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(double2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(float2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(float2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(float2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(float2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(float2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(long3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(long3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(long3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(long3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(long3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(int3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(int3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(int3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(int3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(int3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(short3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(short3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(short3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(short3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(short3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(char3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(char3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(char3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(char3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(char3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(char3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(uchar3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(uchar3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(uchar3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(uchar3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(double3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(double3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(double3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(double3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(double3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(double3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(float3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(float3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(float3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(float3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(float3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(float3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(long4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(long4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(long4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(long4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(long4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(long4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(ulong4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(ulong4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(ulong4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(ulong4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(int4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(int4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(int4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(int4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(int4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(uint4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(uint4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(uint4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(uint4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(short4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(short4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(short4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(short4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(short4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(ushort4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(ushort4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(ushort4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(ushort4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(char4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(char4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(char4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(char4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(char4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(char4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(uchar4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(uchar4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(uchar4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(uchar4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(double4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(double4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(double4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(double4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(double4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(double4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(float4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(float4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(float4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(float4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(float4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(float4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(long8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(long8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(long8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(long8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(long8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(long8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(ulong8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(ulong8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(ulong8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(ulong8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(int8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(int8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(int8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(int8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(int8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(uint8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(uint8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(uint8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(uint8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(short8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(short8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(short8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(short8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(short8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(ushort8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(ushort8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(ushort8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(ushort8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(char8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(char8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(char8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(char8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(char8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(char8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(uchar8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(uchar8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(uchar8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(uchar8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(double8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(double8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(double8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(double8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(double8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(double8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(float8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(float8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(float8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(float8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(float8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(long16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(long16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(long16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(long16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(long16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(int16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(int16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(int16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(int16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(int16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(short16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(short16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(short16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(short16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(short16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(char16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(char16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(char16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(char16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(char16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(double16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(double16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(double16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(double16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(double16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(float16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(float16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(float16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(float16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(float16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-// ##END_CONVERT##
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL preprocessor directives & macros
-/////////////////////////////////////////////////////////////////////////////
-#define __OPENCL_VERSION__ 110
-#define __CL_VERSION_1_0__ 100
-#define __CL_VERSION_1_1__ 110
-#define __ENDIAN_LITTLE__ 1
-#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
-                                        __attribute__((vec_type_hint(TYPE)))
-#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL floating-point macros and pragmas
-/////////////////////////////////////////////////////////////////////////////
-#define FLT_DIG 6
-#define FLT_MANT_DIG 24
-#define FLT_MAX_10_EXP +38
-#define FLT_MAX_EXP +128
-#define FLT_MIN_10_EXP -37
-#define FLT_MIN_EXP -125
-#define FLT_RADIX 2
-#define FLT_MAX 0x1.fffffep127f
-#define FLT_MIN 0x1.0p-126f
-#define FLT_EPSILON 0x1.0p-23f
-
-#define MAXFLOAT     3.40282347e38F
-#define HUGE_VALF    (__builtin_huge_valf())
-#define INFINITY     (__builtin_inff())
-#define NAN          (__builtin_nanf(""))
-#define M_E_F        2.718281828459045F
-#define M_LOG2E_F    1.4426950408889634F
-#define M_LOG10E_F   0.43429448190325176F
-#define M_LN2_F      0.6931471805599453F
-#define M_LN10_F     2.302585092994046F
-#define M_PI_F       3.141592653589793F
-#define M_PI_2_F     1.5707963267948966F
-#define M_PI_4_F     0.7853981633974483F
-#define M_1_PI_F     0.3183098861837907F
-#define M_2_PI_F     0.6366197723675814F
-#define M_2_SQRTPI_F 1.1283791670955126F
-#define M_SQRT2_F    1.4142135623730951F
-#define M_SQRT1_2_F  0.7071067811865476F
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL integer built-in macros
-/////////////////////////////////////////////////////////////////////////////
-#define CHAR_BIT    8
-#define CHAR_MAX    SCHAR_MAX
-#define CHAR_MIN    SCHAR_MIN
-#define INT_MAX     2147483647
-#define INT_MIN     (-2147483647 - 1)
-#define LONG_MAX    0x7fffffffffffffffL
-#define LONG_MIN    (-0x7fffffffffffffffL - 1)
-#define SCHAR_MAX   127
-#define SCHAR_MIN   (-127 - 1)
-#define SHRT_MAX    32767
-#define SHRT_MIN    (-32767 - 1)
-#define UCHAR_MAX   255
-#define USHRT_MAX   65535
-#define UINT_MAX    0xffffffff
-#define ULONG_MAX   0xffffffffffffffffUL
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL relational built-in functions
-/////////////////////////////////////////////////////////////////////////////
-#define DEF DECL(int, float); \
-            DECL(int2, float2); \
-            DECL(int3, float3); \
-            DECL(int4, float4); \
-            DECL(int8, float8); \
-            DECL(int16, float16);
-#define DECL(ret, type) ret INLINE_OVERLOADABLE isequal(type x, type y) { return x == y; }
-DEF;
-#undef DECL
-#define DECL(ret, type) ret INLINE_OVERLOADABLE isnotequal(type x, type y) { return x != y; }
-DEF;
-#undef DECL
-#define DECL(ret, type) ret INLINE_OVERLOADABLE isgreater(type x, type y) { return x > y; }
-DEF;
-#undef DECL
-#define DECL(ret, type) ret INLINE_OVERLOADABLE isgreaterequal(type x, type y) { return x >= y; }
-DEF;
-#undef DECL
-#define DECL(ret, type) ret INLINE_OVERLOADABLE isless(type x, type y) { return x < y; }
-DEF;
-#undef DECL
-#define DECL(ret, type) ret INLINE_OVERLOADABLE islessequal(type x, type y) { return x <= y; }
-DEF;
-#undef DECL
-#undef DEF
-
-#define SDEF(TYPE)                                                              \
-OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
-OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
-INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
-SDEF(char);
-SDEF(short);
-SDEF(int);
-SDEF(long);
-#undef SDEF
-#define UDEF(TYPE)                                                              \
-OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
-OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
-INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
-UDEF(uchar);
-UDEF(ushort);
-UDEF(uint);
-UDEF(ulong);
-#undef UDEF
-
-
-uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
-    return add_sat((uchar)x, (uchar)0);
-}
-
-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x) { return (int2)(name(x.s0), name(x.s1)); }
-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x) { return (int3)(name(x.s0), name(x.s1), name(x.s2)); }
-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x) { return (int4)(name(x.s0), name(x.s1), name(x.s2), name(x.s3)); }
-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x) { return (int8)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7)); }
-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x) { return (int16)(name(x.s0), name(x.s1), name(x.s2), name(x.s3), name(x.s4), name(x.s5), name(x.s6), name(x.s7), name(x.s8), name(x.s9), name(x.sA), name(x.sB), name(x.sC), name(x.sD), name(x.sE), name(x.sF)); }
-INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
-DEC2(isfinite);
-DEC3(isfinite);
-DEC4(isfinite);
-DEC8(isfinite);
-DEC16(isfinite);
-INLINE_OVERLOADABLE int isinf(float x) { return __builtin_isinf(x); }
-DEC2(isinf);
-DEC3(isinf);
-DEC4(isinf);
-DEC8(isinf);
-DEC16(isinf);
-INLINE_OVERLOADABLE int isnan(float x) { return __builtin_isnan(x); }
-DEC2(isnan);
-DEC3(isnan);
-DEC4(isnan);
-DEC8(isnan);
-DEC16(isnan);
-INLINE_OVERLOADABLE int isnormal(float x) { return __builtin_isnormal(x); }
-DEC2(isnormal);
-DEC3(isnormal);
-DEC4(isnormal);
-DEC8(isnormal);
-DEC16(isnormal);
-INLINE_OVERLOADABLE int signbit(float x) { return __builtin_signbit(x); }
-DEC2(signbit);
-DEC3(signbit);
-DEC4(signbit);
-DEC8(signbit);
-DEC16(signbit);
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-#define DEC2(name) INLINE_OVERLOADABLE int2 name(float2 x, float2 y) { return (int2)(name(x.s0, y.s0), name(x.s1, y.s1)); }
-#define DEC3(name) INLINE_OVERLOADABLE int3 name(float3 x, float3 y) { return (int3)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2)); }
-#define DEC4(name) INLINE_OVERLOADABLE int4 name(float4 x, float4 y) { return (int4)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3)); }
-#define DEC8(name) INLINE_OVERLOADABLE int8 name(float8 x, float8 y) { return (int8)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7)); }
-#define DEC16(name) INLINE_OVERLOADABLE int16 name(float16 x, float16 y) { return (int16)(name(x.s0, y.s0), name(x.s1, y.s1), name(x.s2, y.s2), name(x.s3, y.s3), name(x.s4, y.s4), name(x.s5, y.s5), name(x.s6, y.s6), name(x.s7, y.s7), name(x.s8, y.s8), name(x.s9, y.s9), name(x.sA, y.sA), name(x.sB, y.sB), name(x.sC, y.sC), name(x.sD, y.sD), name(x.sE, y.sE), name(x.sF, y.sF)); }
-INLINE_OVERLOADABLE int islessgreater(float x, float y) { return (x<y)||(x>y); }
-DEC2(islessgreater);
-DEC3(islessgreater);
-DEC4(islessgreater);
-DEC8(islessgreater);
-DEC16(islessgreater);
-INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x,x) && isequal(y,y); }
-DEC2(isordered);
-DEC3(isordered);
-DEC4(isordered);
-DEC8(isordered);
-DEC16(isordered);
-INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
-DEC2(isunordered);
-DEC3(isunordered);
-DEC4(isunordered);
-DEC8(isunordered);
-DEC16(isunordered);
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-#define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
-#define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
-#define DEC3(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
-#define DEC4(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
-#define DEC8(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
-#define DEC16(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
-DEC1(char);
-DEC1(short);
-DEC1(int);
-DEC1(long);
-#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
-DEC(2);
-DEC(3);
-DEC(4);
-DEC(8);
-DEC(16);
-#undef DEC
-#undef DEC1
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-#define DEC1(type) INLINE_OVERLOADABLE int all(type a) { return a<0; }
-#define DEC2(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
-#define DEC3(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
-#define DEC4(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
-#define DEC8(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
-#define DEC16(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
-DEC1(char);
-DEC1(short);
-DEC1(int);
-DEC1(long);
-#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
-DEC(2);
-DEC(3);
-DEC(4);
-DEC(8);
-DEC(16);
-#undef DEC
-#undef DEC1
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-/////////////////////////////////////////////////////////////////////////////
-// Integer built-in functions
-/////////////////////////////////////////////////////////////////////////////
-PURE CONST uint __gen_ocl_fbh(uint);
-PURE CONST uint __gen_ocl_fbl(uint);
-
-INLINE_OVERLOADABLE char clz(char x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbl(x) - 24;
-}
-
-INLINE_OVERLOADABLE uchar clz(uchar x) {
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbl(x) - 24;
-}
-
-INLINE_OVERLOADABLE short clz(short x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-INLINE_OVERLOADABLE ushort clz(ushort x) {
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-INLINE_OVERLOADABLE int clz(int x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-INLINE_OVERLOADABLE uint clz(uint x) {
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-#define DEC2(type) INLINE_OVERLOADABLE type##2 clz(type##2 a) { return (type##2)(clz(a.s0), clz(a.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 clz(type##3 a) { return (type##3)(clz(a.s0), clz(a.s1), clz(a.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 clz(type##4 a) { return (type##4)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 clz(type##8 a) { return (type##8)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 clz(type##16 a) { return (type##16)(clz(a.s0), clz(a.s1), clz(a.s2), clz(a.s3), clz(a.s4), clz(a.s5), clz(a.s6), clz(a.s7), clz(a.s8), clz(a.s9), clz(a.sa), clz(a.sb), clz(a.sc), clz(a.sd), clz(a.se), clz(a.sf)); }
-#define DEC(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint) 
-DEC(2)
-DEC(3)
-DEC(4)
-DEC(8)
-DEC(16)
-#undef DEC
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
-OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
-INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
-INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
-INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
-INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
-INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
-INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mul_hi(type##2 a, type##2 b) { return (type##2)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mul_hi(type##3 a, type##3 b) { return (type##3)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mul_hi(type##4 a, type##4 b) { return (type##4)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mul_hi(type##8 a, type##8 b) { return (type##8)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mul_hi(type##16 a, type##16 b) { return (type##16)(mul_hi(a.s0, b.s0), mul_hi(a.s1, b.s1), mul_hi(a.s2, b.s2), mul_hi(a.s3, b.s3), mul_hi(a.s4, b.s4), mul_hi(a.s5, b.s5), mul_hi(a.s6, b.s6), mul_hi(a.s7, b.s7), mul_hi(a.s8, b.s8), mul_hi(a.s9, b.s9), mul_hi(a.sa, b.sa), mul_hi(a.sb, b.sb), mul_hi(a.sc, b.sc), mul_hi(a.sd, b.sd), mul_hi(a.se, b.se), mul_hi(a.sf, b.sf)); }
-#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-#undef DEF
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mad_hi(type##2 a, type##2 b, type##2 c) { return (type##2)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mad_hi(type##3 a, type##3 b, type##3 c) { return (type##3)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mad_hi(type##4 a, type##4 b, type##4 c) { return (type##4)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mad_hi(type##8 a, type##8 b, type##8 c) { return (type##8)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mad_hi(type##16 a, type##16 b, type##16 c) { return (type##16)(mad_hi(a.s0, b.s0, c.s0), mad_hi(a.s1, b.s1, c.s1), mad_hi(a.s2, b.s2, c.s2), mad_hi(a.s3, b.s3, c.s3), mad_hi(a.s4, b.s4, c.s4), mad_hi(a.s5, b.s5, c.s5), mad_hi(a.s6, b.s6, c.s6), mad_hi(a.s7, b.s7, c.s7), mad_hi(a.s8, b.s8, c.s8), mad_hi(a.s9, b.s9, c.s9), mad_hi(a.sa, b.sa, c.sa), mad_hi(a.sb, b.sb, c.sb), mad_hi(a.sc, b.sc, c.sc), mad_hi(a.sd, b.sd, c.sd), mad_hi(a.se, b.s [...]
-#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
-INLINE_OVERLOADABLE uint mul24(uint a, uint b) { return (a & 0xFFFFFF) * (b & 0xFFFFFF); }
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mul24(type##2 a, type##2 b) { return (type##2)(mul24(a.s0, b.s0), mul24(a.s1, b.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mul24(type##3 a, type##3 b) { return (type##3)(mul24(a.s0, b.s0), mul24(a.s1, b.s1), mul24(a.s2, b.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mul24(type##4 a, type##4 b) { return (type##4)(mul24(a.s0, b.s0), mul24(a.s1, b.s1), mul24(a.s2, b.s2), mul24(a.s3, b.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mul24(type##8 a, type##8 b) { return (type##8)(mul24(a.s0, b.s0), mul24(a.s1, b.s1), mul24(a.s2, b.s2), mul24(a.s3, b.s3), mul24(a.s4, b.s4), mul24(a.s5, b.s5), mul24(a.s6, b.s6), mul24(a.s7, b.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mul24(type##16 a, type##16 b) { return (type##16)(mul24(a.s0, b.s0), mul24(a.s1, b.s1), mul24(a.s2, b.s2), mul24(a.s3, b.s3), mul24(a.s4, b.s4), mul24(a.s5, b.s5), mul24(a.s6, b.s6), mul24(a.s7, b.s7), mul24(a.s8, b.s8), mul24(a.s9, b.s9), mul24(a.sa, b.sa), mul24(a.sb, b.sb), mul24(a.sc, b.sc), mul24(a.sd, b.sd), mul24(a.se, b.se), mul24(a.sf, b.sf)); }
-#define DEF(n) DEC##n(int); DEC##n(uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-INLINE_OVERLOADABLE int mad24(int a, int b, int c) { return mul24(a, b) + c; }
-INLINE_OVERLOADABLE uint mad24(uint a, uint b, uint c) { return mul24(a, b) + c; }
-#define DEC2(type) INLINE_OVERLOADABLE type##2 mad24(type##2 a, type##2 b, type##2 c) { return (type##2)(mad24(a.s0, b.s0, c.s0), mad24(a.s1, b.s1, c.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 mad24(type##3 a, type##3 b, type##3 c) { return (type##3)(mad24(a.s0, b.s0, c.s0), mad24(a.s1, b.s1, c.s1), mad24(a.s2, b.s2, c.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 mad24(type##4 a, type##4 b, type##4 c) { return (type##4)(mad24(a.s0, b.s0, c.s0), mad24(a.s1, b.s1, c.s1), mad24(a.s2, b.s2, c.s2), mad24(a.s3, b.s3, c.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 mad24(type##8 a, type##8 b, type##8 c) { return (type##8)(mad24(a.s0, b.s0, c.s0), mad24(a.s1, b.s1, c.s1), mad24(a.s2, b.s2, c.s2), mad24(a.s3, b.s3, c.s3), mad24(a.s4, b.s4, c.s4), mad24(a.s5, b.s5, c.s5), mad24(a.s6, b.s6, c.s6), mad24(a.s7, b.s7, c.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 mad24(type##16 a, type##16 b, type##16 c) { return (type##16)(mad24(a.s0, b.s0, c.s0), mad24(a.s1, b.s1, c.s1), mad24(a.s2, b.s2, c.s2), mad24(a.s3, b.s3, c.s3), mad24(a.s4, b.s4, c.s4), mad24(a.s5, b.s5, c.s5), mad24(a.s6, b.s6, c.s6), mad24(a.s7, b.s7, c.s7), mad24(a.s8, b.s8, c.s8), mad24(a.s9, b.s9, c.s9), mad24(a.sa, b.sa, c.sa), mad24(a.sb, b.sb, c.sb), mad24(a.sc, b.sc, c.sc), mad24(a.sd, b.sd, c.sd), mad24(a.se, b.se, c.se), mad24( [...]
-#define DEF(n) DEC##n(int); DEC##n(uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
-INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
-INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
-INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
-INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
-INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
-#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
-DEF(char, 7)
-DEF(uchar, 7)
-DEF(short, 15)
-DEF(ushort, 15)
-DEF(int, 31)
-DEF(uint, 31)
-#undef DEF
-#define DEC2(type) INLINE_OVERLOADABLE type##2 rotate(type##2 a, type##2 b) { return (type##2)(rotate(a.s0, b.s0), rotate(a.s1, b.s1)); }
-#define DEC3(type) INLINE_OVERLOADABLE type##3 rotate(type##3 a, type##3 b) { return (type##3)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2)); }
-#define DEC4(type) INLINE_OVERLOADABLE type##4 rotate(type##4 a, type##4 b) { return (type##4)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3)); }
-#define DEC8(type) INLINE_OVERLOADABLE type##8 rotate(type##8 a, type##8 b) { return (type##8)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7)); }
-#define DEC16(type) INLINE_OVERLOADABLE type##16 rotate(type##16 a, type##16 b) { return (type##16)(rotate(a.s0, b.s0), rotate(a.s1, b.s1), rotate(a.s2, b.s2), rotate(a.s3, b.s3), rotate(a.s4, b.s4), rotate(a.s5, b.s5), rotate(a.s6, b.s6), rotate(a.s7, b.s7), rotate(a.s8, b.s8), rotate(a.s9, b.s9), rotate(a.sa, b.sa), rotate(a.sb, b.sb), rotate(a.sc, b.sc), rotate(a.sd, b.sd), rotate(a.se, b.se), rotate(a.sf, b.sf)); }
-#define DEF(n) DEC##n(char); DEC##n(uchar); DEC##n(short); DEC##n(ushort); DEC##n(int); DEC##n(uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
-OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
-INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
-INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
-INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
-INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
-#define DEC2(type, type2) INLINE_OVERLOADABLE type2##2 upsample(type##2 a, type##2 b) { return (type2##2)(upsample(a.s0, b.s0), upsample(a.s1, b.s1)); }
-#define DEC3(type, type2) INLINE_OVERLOADABLE type2##3 upsample(type##3 a, type##3 b) { return (type2##3)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2)); }
-#define DEC4(type, type2) INLINE_OVERLOADABLE type2##4 upsample(type##4 a, type##4 b) { return (type2##4)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3)); }
-#define DEC8(type, type2) INLINE_OVERLOADABLE type2##8 upsample(type##8 a, type##8 b) { return (type2##8)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7)); }
-#define DEC16(type, type2) INLINE_OVERLOADABLE type2##16 upsample(type##16 a, type##16 b) { return (type2##16)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7), upsample(a.s8, b.s8), upsample(a.s9, b.s9), upsample(a.sa, b.sa), upsample(a.sb, b.sb), upsample(a.sc, b.sc), upsample(a.sd, b.sd), upsample(a.se, b.se), upsample(a.sf, b.sf)); }
-#define DEF(n) DEC##n(uchar, ushort); DEC##n(ushort, uint)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-#define DEC2(type, type2) INLINE_OVERLOADABLE type2##2 upsample(type##2 a, u##type##2 b) { return (type2##2)(upsample(a.s0, b.s0), upsample(a.s1, b.s1)); }
-#define DEC3(type, type2) INLINE_OVERLOADABLE type2##3 upsample(type##3 a, u##type##3 b) { return (type2##3)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2)); }
-#define DEC4(type, type2) INLINE_OVERLOADABLE type2##4 upsample(type##4 a, u##type##4 b) { return (type2##4)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3)); }
-#define DEC8(type, type2) INLINE_OVERLOADABLE type2##8 upsample(type##8 a, u##type##8 b) { return (type2##8)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7)); }
-#define DEC16(type, type2) INLINE_OVERLOADABLE type2##16 upsample(type##16 a, u##type##16 b) { return (type2##16)(upsample(a.s0, b.s0), upsample(a.s1, b.s1), upsample(a.s2, b.s2), upsample(a.s3, b.s3), upsample(a.s4, b.s4), upsample(a.s5, b.s5), upsample(a.s6, b.s6), upsample(a.s7, b.s7), upsample(a.s8, b.s8), upsample(a.s9, b.s9), upsample(a.sa, b.sa), upsample(a.sb, b.sb), upsample(a.sc, b.sc), upsample(a.sd, b.sd), upsample(a.se, b.se), upsample(a.sf, b.sf)); }
-#define DEF(n) DEC##n(char, short); DEC##n(short, int)
-DEF(2)
-DEF(3)
-DEF(4)
-DEF(8)
-DEF(16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-PURE CONST uint __gen_ocl_hadd(uint x, uint y);
-PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
-#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
-#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
-DEC
-#undef DEF
-#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
-DEC
-#undef DEF
-#undef DEC
-INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y) >> 1) : __gen_ocl_hadd(x, y); }
-INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
-INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0 && y > 0) || (x > 0 && y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); }
-INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
-#define DEC2(func, type) INLINE_OVERLOADABLE type##2 func(type##2 a, type##2 b) { return (type##2)(func(a.s0, b.s0), func(a.s1, b.s1)); }
-#define DEC3(func, type) INLINE_OVERLOADABLE type##3 func(type##3 a, type##3 b) { return (type##3)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2)); }
-#define DEC4(func, type) INLINE_OVERLOADABLE type##4 func(type##4 a, type##4 b) { return (type##4)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3)); }
-#define DEC8(func, type) INLINE_OVERLOADABLE type##8 func(type##8 a, type##8 b) { return (type##8)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7)); }
-#define DEC16(func, type) INLINE_OVERLOADABLE type##16 func(type##16 a, type##16 b) { return (type##16)(func(a.s0, b.s0), func(a.s1, b.s1), func(a.s2, b.s2), func(a.s3, b.s3), func(a.s4, b.s4), func(a.s5, b.s5), func(a.s6, b.s6), func(a.s7, b.s7), func(a.s8, b.s8), func(a.s9, b.s9), func(a.sa, b.sa), func(a.sb, b.sb), func(a.sc, b.sc), func(a.sd, b.sd), func(a.se, b.se), func(a.sf, b.sf)); }
-#define DEF(func, n) DEC##n(func, char); DEC##n(func, uchar); DEC##n(func, short); DEC##n(func, ushort); DEC##n(func, int); DEC##n(func, uint)
-DEF(hadd, 2)
-DEF(hadd, 3)
-DEF(hadd, 4)
-DEF(hadd, 8)
-DEF(hadd, 16)
-DEF(rhadd, 2)
-DEF(rhadd, 3)
-DEF(rhadd, 4)
-DEF(rhadd, 8)
-DEF(rhadd, 16)
-#undef DEF
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-int __gen_ocl_abs(int x);
-#define ABS_I(I, CVT)  (CVT)__gen_ocl_abs(x.s##I)
-#define ABS_VEC1(CVT)  (CVT)__gen_ocl_abs(x)
-#define ABS_VEC2(CVT)  ABS_I(0, CVT), ABS_I(1, CVT)
-#define ABS_VEC3(CVT)  ABS_I(0, CVT), ABS_I(1, CVT), ABS_I(2, CVT)
-#define ABS_VEC4(CVT)  ABS_VEC2(CVT), ABS_I(2, CVT), ABS_I(3, CVT)
-#define ABS_VEC8(CVT)  ABS_VEC4(CVT), ABS_I(4, CVT), ABS_I(5, CVT),\
-	               ABS_I(6, CVT), ABS_I(7, CVT)
-#define ABS_VEC16(CVT) ABS_VEC8(CVT), ABS_I(8, CVT), ABS_I(9, CVT), \
-	               ABS_I(A, CVT), ABS_I(B, CVT), ABS_I(C, CVT), \
-	               ABS_I(D, CVT), ABS_I(E, CVT), ABS_I(F, CVT)
-
-#define DEC_1(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return ABS_VEC1(u##TYPE); }
-#define DEC_N(TYPE, N) INLINE_OVERLOADABLE u##TYPE##N abs(TYPE##N x) { return (u##TYPE##N)(ABS_VEC##N(u##TYPE)); };
-#define DEC(TYPE) DEC_1(TYPE) DEC_N(TYPE, 2) DEC_N(TYPE, 3) DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
-
-DEC(int)
-DEC(short)
-DEC(char)
-#undef DEC_1
-#undef DEC_N
-/* For unsigned types, do nothing. */
-#define DEC_1(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
-#define DEC_N(TYPE, N) INLINE_OVERLOADABLE TYPE##N abs(TYPE##N x) { return x; }
-DEC(uint)
-DEC(ushort)
-DEC(uchar)
-#undef DEC
-#undef DEC_1
-#undef DEC_N
-#undef ABS_I
-#undef ABS_VEC1
-#undef ABS_VEC2
-#undef ABS_VEC3
-#undef ABS_VEC4
-#undef ABS_VEC8
-#undef ABS_VEC16
-
-
-/* Char and short type abs diff */
-/* promote char and short to int and will be no module overflow */
-#define ABS_DIFF(CVT) (CVT)(abs((int)x - (int)y))
-#define ABS_DIFF_I(CVT, I)  (CVT)(abs((int)x.s##I - (int)y.s##I))
-
-#define ABS_DIFF_VEC1(CVT)  ABS_DIFF(CVT)
-#define ABS_DIFF_VEC2(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1)
-#define ABS_DIFF_VEC3(CVT)  ABS_DIFF_I(CVT, 0), ABS_DIFF_I(CVT, 1), ABS_DIFF_I(CVT, 2)
-#define ABS_DIFF_VEC4(CVT)  ABS_DIFF_VEC2(CVT), ABS_DIFF_I(CVT, 2), ABS_DIFF_I(CVT, 3)
-#define ABS_DIFF_VEC8(CVT)  ABS_DIFF_VEC4(CVT), ABS_DIFF_I(CVT, 4), ABS_DIFF_I(CVT, 5), \
-                            ABS_DIFF_I(CVT, 6), ABS_DIFF_I(CVT, 7)
-#define ABS_DIFF_VEC16(CVT)  ABS_DIFF_VEC8(CVT), ABS_DIFF_I(CVT, 8), ABS_DIFF_I(CVT, 9), \
-                            ABS_DIFF_I(CVT, A), ABS_DIFF_I(CVT, B), \
-                            ABS_DIFF_I(CVT, C), ABS_DIFF_I(CVT, D), \
-                            ABS_DIFF_I(CVT, E), ABS_DIFF_I(CVT, F)
-
-#define DEC_1(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
-                           { return ABS_DIFF_VEC1(UTYPE); }
-#define DEC_N(TYPE, UTYPE, N) INLINE_OVERLOADABLE UTYPE##N abs_diff(TYPE##N x, TYPE##N y) \
-                              { return (UTYPE##N)(ABS_DIFF_VEC##N(UTYPE)); };
-#define DEC(TYPE, UTYPE)  DEC_1(TYPE, UTYPE) DEC_N(TYPE, UTYPE, 2)  DEC_N(TYPE, UTYPE, 3 ) \
-                          DEC_N(TYPE, UTYPE, 4) DEC_N(TYPE, UTYPE, 8) DEC_N(TYPE, UTYPE, 16)
-DEC(char, uchar)
-DEC(uchar, uchar)
-DEC(short, ushort)
-DEC(ushort, ushort)
-
-#undef DEC
-#undef DEC_1
-#undef DEC_N
-#undef ABS_DIFF
-#undef ABS_DIFF_I
-#undef ABS_DIFF_VEC1
-#undef ABS_DIFF_VEC2
-#undef ABS_DIFF_VEC3
-#undef ABS_DIFF_VEC4
-#undef ABS_DIFF_VEC8
-#undef ABS_DIFF_VEC16
-
-INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
-    /* same signed will never overflow. */
-    return y > x ? (y -x) : (x - y);
-}
-
-INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
-    /* same signed will never module overflow. */
-    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
-        return abs(x - y);
-
-    return (abs(x) + abs(y));
-}
-
-#define ABS_DIFF_I(I)  abs_diff(x.s##I, y.s##I)
-
-#define ABS_DIFF_VEC2  ABS_DIFF_I(0), ABS_DIFF_I(1)
-#define ABS_DIFF_VEC3  ABS_DIFF_I(0), ABS_DIFF_I(1), ABS_DIFF_I(2)
-#define ABS_DIFF_VEC4  ABS_DIFF_VEC2, ABS_DIFF_I(2), ABS_DIFF_I(3)
-#define ABS_DIFF_VEC8  ABS_DIFF_VEC4, ABS_DIFF_I(4), ABS_DIFF_I(5), \
-                       ABS_DIFF_I(6), ABS_DIFF_I(7)
-#define ABS_DIFF_VEC16  ABS_DIFF_VEC8, ABS_DIFF_I(8), ABS_DIFF_I(9), \
-                            ABS_DIFF_I(A), ABS_DIFF_I(B), \
-                            ABS_DIFF_I(C), ABS_DIFF_I(D), \
-                            ABS_DIFF_I(E), ABS_DIFF_I(F)
-
-#define DEC_N(TYPE, N) INLINE_OVERLOADABLE uint##N abs_diff(TYPE##N x, TYPE##N y) \
-				      { return (uint##N)(ABS_DIFF_VEC##N); };
-#define DEC(TYPE)   DEC_N(TYPE, 2)  DEC_N(TYPE, 3 ) \
-                           DEC_N(TYPE, 4) DEC_N(TYPE, 8) DEC_N(TYPE, 16)
-DEC(int)
-DEC(uint)
-
-#undef DEC
-#undef DEC_1
-#undef DEC_N
-#undef ABS_DIFF
-#undef ABS_DIFF_I
-#undef ABS_DIFF_VEC1
-#undef ABS_DIFF_VEC2
-#undef ABS_DIFF_VEC3
-#undef ABS_DIFF_VEC4
-#undef ABS_DIFF_VEC8
-#undef ABS_DIFF_VEC16
-
-/////////////////////////////////////////////////////////////////////////////
-// Work Items functions (see 6.11.1 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-
-PURE CONST uint __gen_ocl_get_work_dim(void);
-INLINE uint get_work_dim(void) {
-  return __gen_ocl_get_work_dim();
-}
-
-#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
-PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
-PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
-PURE CONST unsigned int __gen_ocl_##NAME##2(void);
-DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
-DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
-#undef DECL_INTERNAL_WORK_ITEM_FN
-
-#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
-INLINE unsigned NAME(unsigned int dim) {             \
-  if (dim == 0) return __gen_ocl_##NAME##0();        \
-  else if (dim == 1) return __gen_ocl_##NAME##1();   \
-  else if (dim == 2) return __gen_ocl_##NAME##2();   \
-  else return OTHER_RET;                             \
-}
-
-DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
-DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
-DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
-#undef DECL_PUBLIC_WORK_ITEM_FN
-
-INLINE uint get_global_id(uint dim) {
-  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Math Functions (see 6.11.2 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-PURE CONST float __gen_ocl_fabs(float x);
-PURE CONST float __gen_ocl_sin(float x);
-PURE CONST float __gen_ocl_cos(float x);
-PURE CONST float __gen_ocl_sqrt(float x);
-PURE CONST float __gen_ocl_rsqrt(float x);
-PURE CONST float __gen_ocl_log(float x);
-PURE CONST float __gen_ocl_pow(float x, float y);
-PURE CONST float __gen_ocl_rcp(float x);
-PURE CONST float __gen_ocl_rndz(float x);
-PURE CONST float __gen_ocl_rnde(float x);
-PURE CONST float __gen_ocl_rndu(float x);
-PURE CONST float __gen_ocl_rndd(float x);
-INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
-INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
-  return __gen_ocl_cos(x * M_PI_F);
-}
-INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
-  return __gen_ocl_sin(x * M_PI_F);
-}
-INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-INLINE_OVERLOADABLE float native_log(float x) {
-  return native_log2(x) * 0.6931472002f;
-}
-INLINE_OVERLOADABLE float native_log10(float x) {
-  return native_log2(x) * 0.3010299956f;
-}
-INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
-INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE_OVERLOADABLE int2 ilogb(float2 x) {
-  return (int2)(ilogb(x.s0), ilogb(x.s1));
-}
-INLINE_OVERLOADABLE int4 ilogb(float4 x) {
-  return (int4)(ilogb(x.s01), ilogb(x.s23));
-}
-INLINE_OVERLOADABLE int8 ilogb(float8 x) {
-  return (int8)(ilogb(x.s0123), ilogb(x.s4567));
-}
-INLINE_OVERLOADABLE int16 ilogb(float16 x) {
-  return (int16)(ilogb(x.s01234567), ilogb(x.s89abcdef));
-}
-INLINE_OVERLOADABLE float nan(uint code) {
-  return NAN;
-}
-INLINE_OVERLOADABLE float2 nan(uint2 code) {
-  return (float2)(nan(code.s0), nan(code.s1));
-}
-INLINE_OVERLOADABLE float4 nan(uint4 code) {
-  return (float4)(nan(code.s01), nan(code.s23));
-}
-INLINE_OVERLOADABLE float8 nan(uint8 code) {
-  return (float8)(nan(code.s0123), nan(code.s4567));
-}
-INLINE_OVERLOADABLE float16 nan(uint16 code) {
-  return (float16)(nan(code.s01234567), nan(code.s89abcdef));
-}
-INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-INLINE_OVERLOADABLE float native_tan(float x) {
-  return native_sin(x) / native_cos(x);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  return native_tan(x * M_PI_F);
-}
-INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
-INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
-INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
-INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
-  return __gen_ocl_pow(x, 0.3333333333f);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
-  *cosval = native_cos(x);
-  return native_sin(x);
-}
-INLINE_OVERLOADABLE float2 __gen_ocl_internal_sincos(float2 x, float2 *cosval) {
-  return (float2)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval));
-}
-INLINE_OVERLOADABLE float4 __gen_ocl_internal_sincos(float4 x, float4 *cosval) {
-  return (float4)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval));
-}
-INLINE_OVERLOADABLE float8 __gen_ocl_internal_sincos(float8 x, float8 *cosval) {
-  return (float8)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
-                  __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval));
-}
-INLINE_OVERLOADABLE float16 __gen_ocl_internal_sincos(float16 x, float16 *cosval) {
-  return (float16)(__gen_ocl_internal_sincos(x.s0, (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s1, 1 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s2, 2 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s3, 3 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s4, 4 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s5, 5 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s6, 6 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s7, 7 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s8, 8 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.s9, 9 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.sa, 10 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.sb, 11 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.sc, 12 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.sd, 13 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.se, 14 + (float *)cosval),
-                   __gen_ocl_internal_sincos(x.sf, 15 + (float *)cosval));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
-  return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
-  return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  float y = native_exp(-2 * x);
-  return (1 - y) / (1 + y);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-  return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
-  return __gen_ocl_internal_asin(x) / M_PI_F;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
-  return M_PI_2_F - __gen_ocl_internal_asin(x);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
-  return __gen_ocl_internal_acos(x) / M_PI_F;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  float a = 0, c = 1;
-  if (x <= -1) {
-    a = - M_PI_2_F;
-    x = 1 / x;
-    c = -1;
-  }
-  if (x >= 1) {
-    a = M_PI_2_F;
-    x = 1 / x;
-    c = -1;
-  }
-  return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
-  return __gen_ocl_internal_atan(x) / M_PI_F;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
-  return native_log(x + native_sqrt(x * x + 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  return 0.5f * native_sqrt((1 + x) / (1 - x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
-  return x * y < 0 ? -x : x;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
-  return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
-  return 1 - __gen_ocl_internal_erf(x);
-}
-
-// XXX work-around PTX profile
-#define sqrt native_sqrt
-INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
-INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
-  return 2 * __gen_ocl_internal_round(x / 2);
-}
-// TODO use llvm intrinsics definitions
-#define cos native_cos
-#define cospi __gen_ocl_internal_cospi
-#define cosh __gen_ocl_internal_cosh
-#define acos __gen_ocl_internal_acos
-#define acospi __gen_ocl_internal_acospi
-#define acosh __gen_ocl_internal_acosh
-#define sin native_sin
-#define sinpi __gen_ocl_internal_sinpi
-#define sinh __gen_ocl_internal_sinh
-#define sincos __gen_ocl_internal_sincos
-#define asin __gen_ocl_internal_asin
-#define asinpi __gen_ocl_internal_asinpi
-#define asinh __gen_ocl_internal_asinh
-#define tan native_tan
-#define tanpi __gen_ocl_internal_tanpi
-#define tanh __gen_ocl_internal_tanh
-#define atan __gen_ocl_internal_atan
-#define atanpi __gen_ocl_internal_atanpi
-#define atanh __gen_ocl_internal_atanh
-#define pow powr
-#define cbrt __gen_ocl_internal_cbrt
-#define rint __gen_ocl_internal_rint
-#define copysign __gen_ocl_internal_copysign
-#define erf __gen_ocl_internal_erf
-#define erfc __gen_ocl_internal_erfc
-
-INLINE_OVERLOADABLE float mad(float a, float b, float c) {
-  return a*b+c;
-}
-
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
-  return cond ? src1 : src0;
-}
-INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
-  return cond ? src1 : src0;
-}
-
-// This will be optimized out by LLVM and will output LLVM select instructions
-#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
-INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
-  TYPE4 dst; \
-  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
-  const TYPE x1 = src1.x; \
-  const TYPE y0 = src0.y; \
-  const TYPE y1 = src1.y; \
-  const TYPE z0 = src0.z; \
-  const TYPE z1 = src1.z; \
-  const TYPE w0 = src0.w; \
-  const TYPE w1 = src1.w; \
-  dst.x = (cond.x & MASK) ? x1 : x0; \
-  dst.y = (cond.y & MASK) ? y1 : y0; \
-  dst.z = (cond.z & MASK) ? z1 : z0; \
-  dst.w = (cond.w & MASK) ? w1 : w0; \
-  return dst; \
-}
-DECL_SELECT4(int4, int, int4, 0x80000000)
-DECL_SELECT4(int4, int, uint4, 0x80000000)
-DECL_SELECT4(float4, float, int4, 0x80000000)
-DECL_SELECT4(float4, float, uint4, 0x80000000)
-#undef DECL_SELECT4
-
-/////////////////////////////////////////////////////////////////////////////
-// Common Functions (see 6.11.4 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-INLINE_OVERLOADABLE float step(float edge, float x) {
-  return x < edge ? 0.0 : 1.0;
-}
-#define STEP(I)  x.s##I < edge.s##I ? 0.0 : 1.0
-INLINE_OVERLOADABLE float2 step(float2 edge, float2 x) {
-  return (float2)(STEP(0), STEP(1));
-}
-INLINE_OVERLOADABLE float3 step(float3 edge, float3 x) {
-  return (float3)(STEP(0), STEP(1), STEP(2));
-}
-INLINE_OVERLOADABLE float4 step(float4 edge, float4 x) {
-  return (float4)(STEP(0), STEP(1), STEP(2), STEP(3));
-}
-INLINE_OVERLOADABLE float8 step(float8 edge, float8 x) {
-  return (float8)(STEP(0), STEP(1), STEP(2), STEP(3),
-                  STEP(4), STEP(5), STEP(6), STEP(7));
-}
-INLINE_OVERLOADABLE float16 step(float16 edge, float16 x) {
-  return (float16)(STEP(0), STEP(1), STEP(2), STEP(3),
-                   STEP(4), STEP(5), STEP(6), STEP(7),
-                   STEP(8), STEP(9), STEP(A), STEP(B),
-                   STEP(C), STEP(D), STEP(E), STEP(F));
-}
-#undef STEP
-#define STEP(I)  x.s##I < edge ? 0.0 : 1.0
-INLINE_OVERLOADABLE float2 step(float edge, float2 x) {
-  return (float2)(STEP(0), STEP(1));
-}
-INLINE_OVERLOADABLE float3 step(float edge, float3 x) {
-  return (float3)(STEP(0), STEP(1), STEP(2));
-}
-INLINE_OVERLOADABLE float4 step(float edge, float4 x) {
-  return (float4)(STEP(0), STEP(1), STEP(2), STEP(3));
-}
-INLINE_OVERLOADABLE float8 step(float edge, float8 x) {
-  return (float8)(STEP(0), STEP(1), STEP(2), STEP(3),
-                  STEP(4), STEP(5), STEP(6), STEP(7));
-}
-INLINE_OVERLOADABLE float16 step(float edge, float16 x) {
-  return (float16)(STEP(0), STEP(1), STEP(2), STEP(3),
-                   STEP(4), STEP(5), STEP(6), STEP(7),
-                   STEP(8), STEP(9), STEP(A), STEP(B),
-                   STEP(C), STEP(D), STEP(E), STEP(F));
-}
-#undef STEP
-
-#define DECL_MIN_MAX_CLAMP(TYPE) \
-INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
-  return a > b ? a : b; \
-} \
-INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
-  return a < b ? a : b; \
-} \
-INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
-  return max(min(v, u), l); \
-}
-DECL_MIN_MAX_CLAMP(float)
-DECL_MIN_MAX_CLAMP(int)
-DECL_MIN_MAX_CLAMP(short)
-DECL_MIN_MAX_CLAMP(char)
-DECL_MIN_MAX_CLAMP(uint)
-DECL_MIN_MAX_CLAMP(unsigned short)
-DECL_MIN_MAX_CLAMP(unsigned char)
-#undef DECL_MIN_MAX_CLAMP
-
-INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
-INLINE_OVERLOADABLE float2 degrees(float2 r) { return (float2)(degrees(r.s0), degrees(r.s1)); }
-INLINE_OVERLOADABLE float3 degrees(float3 r) { return (float3)(degrees(r.s0), degrees(r.s1), degrees(r.s2)); }
-INLINE_OVERLOADABLE float4 degrees(float4 r) { return (float4)(degrees(r.s0), degrees(r.s1), degrees(r.s2), degrees(r.s3)); }
-INLINE_OVERLOADABLE float8 degrees(float8 r) { return (float8)(degrees(r.s0), degrees(r.s1), degrees(r.s2), degrees(r.s3), degrees(r.s4), degrees(r.s5), degrees(r.s6), degrees(r.s7)); }
-INLINE_OVERLOADABLE float16 degrees(float16 r) { return (float16)(degrees(r.s0), degrees(r.s1), degrees(r.s2), degrees(r.s3), degrees(r.s4), degrees(r.s5), degrees(r.s6), degrees(r.s7), degrees(r.s8), degrees(r.s9), degrees(r.sa), degrees(r.sb), degrees(r.sc), degrees(r.sd), degrees(r.se), degrees(r.sf)); }
-INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
-INLINE_OVERLOADABLE float2 radians(float2 r) { return (float2)(radians(r.s0), radians(r.s1)); }
-INLINE_OVERLOADABLE float3 radians(float3 r) { return (float3)(radians(r.s0), radians(r.s1), radians(r.s2)); }
-INLINE_OVERLOADABLE float4 radians(float4 r) { return (float4)(radians(r.s0), radians(r.s1), radians(r.s2), radians(r.s3)); }
-INLINE_OVERLOADABLE float8 radians(float8 r) { return (float8)(radians(r.s0), radians(r.s1), radians(r.s2), radians(r.s3), radians(r.s4), radians(r.s5), radians(r.s6), radians(r.s7)); }
-INLINE_OVERLOADABLE float16 radians(float16 r) { return (float16)(radians(r.s0), radians(r.s1), radians(r.s2), radians(r.s3), radians(r.s4), radians(r.s5), radians(r.s6), radians(r.s7), radians(r.s8), radians(r.s9), radians(r.sa), radians(r.sb), radians(r.sc), radians(r.sd), radians(r.se), radians(r.sf)); }
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a > b ? x : b > a ? y : max(x, y);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a < b ? x : b < a ? y : min(x, y);
-}
-INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
-INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
-  return __gen_ocl_internal_fmax(x, y) - y;
-}
-INLINE_OVERLOADABLE float fract(float x, float *p) {
-  *p = __gen_ocl_internal_floor(x);
-  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-}
-INLINE_OVERLOADABLE float2 fract(float2 x, float2 *p) {
-  return (float2)(fract(x.s0, (float *)p),
-                  fract(x.s1, 1 + (float *)p));
-}
-INLINE_OVERLOADABLE float4 fract(float4 x, float4 *p) {
-  return (float4)(fract(x.s0, (float *)p),
-                  fract(x.s1, 1 + (float *)p),
-                  fract(x.s2, 2 + (float *)p),
-                  fract(x.s3, 3 + (float *)p));
-}
-INLINE_OVERLOADABLE float8 fract(float8 x, float8 *p) {
-  return (float8)(fract(x.s0, (float *)p),
-                  fract(x.s1, 1 + (float *)p),
-                  fract(x.s2, 2 + (float *)p),
-                  fract(x.s3, 3 + (float *)p),
-                  fract(x.s4, 4 + (float *)p),
-                  fract(x.s5, 5 + (float *)p),
-                  fract(x.s6, 6 + (float *)p),
-                  fract(x.s7, 7 + (float *)p));
-}
-INLINE_OVERLOADABLE float16 fract(float16 x, float16 *p) {
-  return (float16)(fract(x.s0, (float *)p),
-                   fract(x.s1, 1 + (float *)p),
-                   fract(x.s2, 2 + (float *)p),
-                   fract(x.s3, 3 + (float *)p),
-                   fract(x.s4, 4 + (float *)p),
-                   fract(x.s5, 5 + (float *)p),
-                   fract(x.s6, 6 + (float *)p),
-                   fract(x.s7, 7 + (float *)p),
-                   fract(x.s8, 8 + (float *)p),
-                   fract(x.s9, 9 + (float *)p),
-                   fract(x.sa, 10 + (float *)p),
-                   fract(x.sb, 11 + (float *)p),
-                   fract(x.sc, 12 + (float *)p),
-                   fract(x.sd, 13 + (float *)p),
-                   fract(x.se, 14 + (float *)p),
-                   fract(x.sf, 15 + (float *)p));
-}
-INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
-INLINE_OVERLOADABLE float ldexp(float x, int n) {
-  return __gen_ocl_pow(2, n) * x;
-}
-INLINE_OVERLOADABLE float pown(float x, int n) {
-  if (x == 0 && n == 0)
-    return 1;
-  return powr(x, n);
-}
-INLINE_OVERLOADABLE float rootn(float x, int n) {
-  return powr(x, 1.f / n);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Geometric functions (see 6.11.5 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
-  return mad(p0.x,p1.x,p0.y*p1.y);
-}
-INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
-  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
-}
-INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
-  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
-}
-
-INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
-  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
-}
-INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
-  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
-         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
-         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
-}
-
-INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
-INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
-INLINE_OVERLOADABLE float normalize(float x) { return 1.f; }
-INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
-INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_normalize(float x) { return 1.f; }
-INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
-
-INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
-   return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
-}
-INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
-   return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Vector loads and stores
-/////////////////////////////////////////////////////////////////////////////
-
-// These loads and stores will use untyped reads and writes, so we can just
-// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
-// Well we do not care, we do not activate TBAA in the compiler
-#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
-INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
-  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
-} \
-INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
-  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
-}
-
-#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
-
-#define DECL_UNTYPED_RW_ALL(TYPE) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
-
-DECL_UNTYPED_RW_ALL(char)
-DECL_UNTYPED_RW_ALL(uchar)
-DECL_UNTYPED_RW_ALL(short)
-DECL_UNTYPED_RW_ALL(ushort)
-DECL_UNTYPED_RW_ALL(int)
-DECL_UNTYPED_RW_ALL(uint)
-DECL_UNTYPED_RW_ALL(long)
-DECL_UNTYPED_RW_ALL(ulong)
-DECL_UNTYPED_RW_ALL(float)
-
-#undef DECL_UNTYPED_RW_ALL
-#undef DECL_UNTYPED_RW_ALL_SPACE
-#undef DECL_UNTYPED_RW_SPACE_N
-
-/////////////////////////////////////////////////////////////////////////////
-// Declare functions for vector types which are derived from scalar ones
-/////////////////////////////////////////////////////////////////////////////
-#define DECL_VECTOR_1OP(NAME, TYPE) \
-  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v) { \
-    return (TYPE##2)(NAME(v.x), NAME(v.y)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v) { \
-    return (TYPE##3)(NAME(v.x), NAME(v.y), NAME(v.z)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v) { \
-    return (TYPE##4)(NAME(v.x), NAME(v.y), NAME(v.z), NAME(v.w)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v) { \
-    TYPE##8 dst;\
-    dst.s0123 = NAME(v.s0123);\
-    dst.s4567 = NAME(v.s4567);\
-    return dst;\
-  }\
-  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v) { \
-    TYPE##16 dst;\
-    dst.s01234567 = NAME(v.s01234567);\
-    dst.s89abcdef = NAME(v.s89abcdef);\
-    return dst;\
-  }
-DECL_VECTOR_1OP(native_cos, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_cospi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_cosh, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_acos, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_acospi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_acosh, float);
-DECL_VECTOR_1OP(native_sin, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_sinpi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_sinh, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_asin, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_asinpi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_asinh, float);
-DECL_VECTOR_1OP(native_tan, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_tanpi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_tanh, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_atan, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_atanpi, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_atanh, float);
-DECL_VECTOR_1OP(native_sqrt, float);
-DECL_VECTOR_1OP(native_rsqrt, float);
-DECL_VECTOR_1OP(native_log2, float);
-DECL_VECTOR_1OP(log1p, float);
-DECL_VECTOR_1OP(logb, float);
-DECL_VECTOR_1OP(native_recip, float);
-DECL_VECTOR_1OP(native_exp2, float);
-DECL_VECTOR_1OP(native_exp10, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_expm1, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_cbrt, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_fabs, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_trunc, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_round, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_floor, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_ceil, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_log, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_log2, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_log10, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_rint, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_erf, float);
-DECL_VECTOR_1OP(__gen_ocl_internal_erfc, float);
-#undef DECL_VECTOR_1OP
-/////////////////////////////////////////////////////////////////////////////
-// Arithmetic functions
-/////////////////////////////////////////////////////////////////////////////
-
-#define DECL_VECTOR_2OP(NAME, TYPE) \
-  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1) { \
-    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v0.y, v1.y)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1) { \
-    return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1) { \
-    return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1) { \
-    TYPE##8 dst;\
-    dst.s0123 = NAME(v0.s0123, v1.s0123);\
-    dst.s4567 = NAME(v0.s4567, v1.s4567);\
-    return dst;\
-  }\
-  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1) { \
-    TYPE##16 dst;\
-    dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
-    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
-    return dst;\
-  }
-DECL_VECTOR_2OP(hypot, float);
-DECL_VECTOR_2OP(min, float);
-DECL_VECTOR_2OP(max, float);
-DECL_VECTOR_2OP(__gen_ocl_internal_fmin, float);
-DECL_VECTOR_2OP(__gen_ocl_internal_fmax, float);
-DECL_VECTOR_2OP(__gen_ocl_internal_fdim, float);
-DECL_VECTOR_2OP(fmod, float);
-DECL_VECTOR_2OP(remainder, float);
-DECL_VECTOR_2OP(powr, float);
-DECL_VECTOR_2OP(native_divide, float);
-DECL_VECTOR_2OP(copysign, float);
-DECL_VECTOR_2OP(__gen_ocl_internal_maxmag, float);
-DECL_VECTOR_2OP(__gen_ocl_internal_minmag, float);
-
-#define DECL_VECTOR_NOP_ALL_INT_TYPES(NOP, NAME) \
-NOP(NAME, char)   \
-NOP(NAME, uchar)  \
-NOP(NAME, short)  \
-NOP(NAME, ushort) \
-NOP(NAME, int)    \
-NOP(NAME, uint)   \
-NOP(NAME, long)   \
-NOP(NAME, ulong)
-
-DECL_VECTOR_NOP_ALL_INT_TYPES(DECL_VECTOR_2OP, add_sat)
-DECL_VECTOR_NOP_ALL_INT_TYPES(DECL_VECTOR_2OP, sub_sat)
-#undef DECL_VECTOR_NOP_ALL_INT_TYPES
-#undef DECL_VECTOR_2OP
-
-#define DECL_VECTOR_2OP(NAME, TYPE, TYPE2) \
-  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE2##2 v1) { \
-    return (TYPE##2)(NAME(v0.x, v1.x), NAME(v0.y, v1.y)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE2##3 v1) { \
-    return (TYPE##3)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE2##4 v1) { \
-    return (TYPE##4)(NAME(v0.x, v1.x), NAME(v0.y, v1.y), NAME(v0.z, v1.z), NAME(v0.w, v1.w)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE2##8 v1) { \
-    TYPE##8 dst;\
-    dst.s0123 = NAME(v0.s0123, v1.s0123);\
-    dst.s4567 = NAME(v0.s4567, v1.s4567);\
-    return dst;\
-  }\
-  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE2##16 v1) { \
-    TYPE##16 dst;\
-    dst.s01234567 = NAME(v0.s01234567, v1.s01234567);\
-    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef);\
-    return dst;\
-  }
-DECL_VECTOR_2OP(ldexp, float, int);
-DECL_VECTOR_2OP(pown, float, int);
-DECL_VECTOR_2OP(rootn, float, int);
-#undef DECL_VECTOR_2OP
-
-#define DECL_VECTOR_3OP(NAME, TYPE) \
-  INLINE_OVERLOADABLE TYPE##2 NAME(TYPE##2 v0, TYPE##2 v1, TYPE##2 v2) { \
-    return (TYPE##2)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##3 NAME(TYPE##3 v0, TYPE##3 v1, TYPE##3 v2) { \
-    return (TYPE##3)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##4 NAME(TYPE##4 v0, TYPE##4 v1, TYPE##4 v2) { \
-    return (TYPE##4)(NAME(v0.x, v1.x, v2.x), NAME(v0.y, v1.y, v2.y), NAME(v0.z, v1.z, v2.z), NAME(v0.w, v1.w, v2.w)); \
-  }\
-  INLINE_OVERLOADABLE TYPE##8 NAME(TYPE##8 v0, TYPE##8 v1, TYPE##8 v2) { \
-    TYPE##8 dst;\
-    dst.s0123 = NAME(v0.s0123, v1.s0123, v2.s0123);\
-    dst.s4567 = NAME(v0.s4567, v1.s4567, v2.s4567);\
-    return dst;\
-  }\
-  INLINE_OVERLOADABLE TYPE##16 NAME(TYPE##16 v0, TYPE##16 v1, TYPE##16 v2) { \
-    TYPE##16 dst;\
-    dst.s01234567 = NAME(v0.s01234567, v1.s01234567, v2.s01234567);\
-    dst.s89abcdef = NAME(v0.s89abcdef, v1.s89abcdef, v2.s89abcdef);\
-    return dst;\
-  }
-DECL_VECTOR_3OP(mad, float);
-DECL_VECTOR_3OP(mix, float);
-#undef DECL_VECTOR_3OP
-
-// mix requires more variants
-INLINE_OVERLOADABLE float2 mix(float2 x, float2 y, float a) { return mix(x,y,(float2)(a));}
-INLINE_OVERLOADABLE float3 mix(float3 x, float3 y, float a) { return mix(x,y,(float3)(a));}
-INLINE_OVERLOADABLE float4 mix(float4 x, float4 y, float a) { return mix(x,y,(float4)(a));}
-INLINE_OVERLOADABLE float8 mix(float8 x, float8 y, float a) { return mix(x,y,(float8)(a));}
-INLINE_OVERLOADABLE float16 mix(float16 x, float16 y, float a) { return mix(x,y,(float16)(a));}
-
-// XXX workaround ptx profile
-#define fabs __gen_ocl_internal_fabs
-#define trunc __gen_ocl_internal_trunc
-#define round __gen_ocl_internal_round
-#define floor __gen_ocl_internal_floor
-#define ceil __gen_ocl_internal_ceil
-#define log __gen_ocl_internal_log
-#define log2 __gen_ocl_internal_log2
-#define log10 __gen_ocl_internal_log10
-#define exp __gen_ocl_internal_exp
-#define exp2 native_exp2
-#define exp10 native_exp10
-#define expm1 __gen_ocl_internal_expm1
-#define fmin __gen_ocl_internal_fmin
-#define fmax __gen_ocl_internal_fmax
-#define fma mad
-#define fdim __gen_ocl_internal_fdim
-#define maxmag __gen_ocl_internal_maxmag
-#define minmag __gen_ocl_internal_minmag
-
-/////////////////////////////////////////////////////////////////////////////
-// Synchronization functions
-/////////////////////////////////////////////////////////////////////////////
-#define CLK_LOCAL_MEM_FENCE  (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-
-void __gen_ocl_barrier_local(void);
-void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
-
-typedef uint cl_mem_fence_flags;
-INLINE void barrier(cl_mem_fence_flags flags) {
-  if (flags == (CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE))
-    __gen_ocl_barrier_local_and_global();
-  else if (flags == CLK_LOCAL_MEM_FENCE)
-    __gen_ocl_barrier_local();
-  else if (flags == CLK_GLOBAL_MEM_FENCE)
-    __gen_ocl_barrier_global();
-}
-
-INLINE void mem_fence(cl_mem_fence_flags flags) {
-}
-INLINE void read_mem_fence(cl_mem_fence_flags flags) {
-}
-INLINE void write_mem_fence(cl_mem_fence_flags flags) {
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Atomic functions
-/////////////////////////////////////////////////////////////////////////////
-OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
-
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
-    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX) \
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
-  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
-
-DECL_ATOMIC_OP(add)
-DECL_ATOMIC_OP(sub)
-DECL_ATOMIC_OP(and)
-DECL_ATOMIC_OP(or)
-DECL_ATOMIC_OP(xor)
-DECL_ATOMIC_OP(xchg)
-DECL_ATOMIC_OP_TYPE(xchg, float, atomic_)
-DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
-DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
-DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
-DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
-    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-  DECL_ATOMIC_OP_TYPE(NAME, int)
-
-DECL_ATOMIC_OP(inc)
-DECL_ATOMIC_OP(dec)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
-    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-  DECL_ATOMIC_OP_TYPE(NAME, int)
-
-DECL_ATOMIC_OP(cmpxchg)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-/////////////////////////////////////////////////////////////////////////////
-// Force the compilation to SIMD8 or SIMD16
-/////////////////////////////////////////////////////////////////////////////
-
-int __gen_ocl_force_simd8(void);
-int __gen_ocl_force_simd16(void);
-
-#define NULL ((void*)0)
-
-/////////////////////////////////////////////////////////////////////////////
-// Image access functions
-/////////////////////////////////////////////////////////////////////////////
-
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
-
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
-
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
-
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
-int __gen_ocl_get_image_width(uint surface_id);
-int __gen_ocl_get_image_height(uint surface_id);
-int __gen_ocl_get_image_channel_data_type(uint surface_id);
-int __gen_ocl_get_image_channel_order(uint surface_id);
-int __gen_ocl_get_image_depth(uint surface_id);
-
-#define GET_IMAGE(cl_image, surface_id) \
-    uint surface_id = (uint)cl_image
-
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(surface_id, sampler, coord.s0, coord.s1);\
-  }
-
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
-  }
-
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE void write_image ##suffix(image2d_t cl_image, coord_type coord, type color)\
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ##suffix(surface_id, coord.s0, coord.s1, color);\
-  }
-
-#define DECL_IMAGE(type, suffix)        \
-  DECL_READ_IMAGE(type, suffix, int2)   \
-  DECL_READ_IMAGE(type, suffix, float2) \
-  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
-  DECL_WRITE_IMAGE(type, suffix, int2)   \
-  DECL_WRITE_IMAGE(type, suffix, float2)
-
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
-
-#undef DECL_IMAGE
-#undef DECL_READ_IMAGE
-#undef DECL_READ_IMAGE_NOSAMPLER
-#undef DECL_WRITE_IMAGE
-
-#define DECL_IMAGE_INFO(image_type)    \
-  INLINE_OVERLOADABLE  int get_image_width(image_type image) \
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_width(surface_id);\
-  } \
-  INLINE_OVERLOADABLE  int get_image_height(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_height(surface_id); \
-  } \
-  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_data_type(surface_id); \
-  }\
-  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
-  { \
-    GET_IMAGE(image, surface_id);\
-    return __gen_ocl_get_image_channel_order(surface_id); \
-  }
-
-DECL_IMAGE_INFO(image2d_t)
-DECL_IMAGE_INFO(image3d_t)
-
-INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
-  {
-   GET_IMAGE(image, surface_id);
-   return __gen_ocl_get_image_depth(surface_id);
-  }
-
-INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
-  { return (int2){get_image_width(image), get_image_height(image)}; }
-
-INLINE_OVERLOADABLE  int4 get_image_dim(image3d_t image)
-  { return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0}; }
-#if 0
-/* The following functions are not implemented yet. */
-DECL_IMAGE_INFO(image1d_t)
-DECL_IMAGE_INFO(image1d_buffer_t)
-DECL_IMAGE_INFO(image1d_array_t)
-DECL_IMAGE_INFO(image2d_array_t)
-
-INLINE_OVERLOADABLE  int2 get_image_dim(image2d_array_t image)
-  { return __gen_ocl_get_image_dim(image); }
-
-INLINE_OVERLOADABLE  int4 get_image_dim(image2d_array_t image)
-  { return __gen_ocl_get_image_dim(image); }
-
-INLINE_OVERLOADABLE  size_t get_image_array_size(image2d_array_t image)
-  { return __gen_ocl_get_image_array_size(image); }
-
-INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
-  { return __gen_ocl_get_image_array_size(image); }
-#endif
-
-#define DECL_READ_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
-  }
-
-#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
-  }
-
-#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
-  INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ## suffix(surface_id, coord.s0, coord.s1, coord.s2, color);\
-  }
-
-#define DECL_IMAGE(type, suffix)        \
-  DECL_READ_IMAGE(type, suffix, int4)   \
-  DECL_READ_IMAGE(type, suffix, float4) \
-  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
-  DECL_WRITE_IMAGE(type, suffix, int4)   \
-  DECL_WRITE_IMAGE(type, suffix, float4)
-
-DECL_IMAGE(int4, i)
-DECL_IMAGE(uint4, ui)
-DECL_IMAGE(float4, f)
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-#undef DECL_IMAGE
-#undef DECL_READ_IMAGE
-#undef DECL_READ_IMAGE_NOSAMPLER
-#undef DECL_WRITE_IMAGE
-
-#undef GET_IMAGE
-#undef INLINE_OVERLOADABLE
-
-#undef PURE
-#undef CONST
-#undef OVERLOADABLE
-#undef INLINE
-#endif /* __GEN_OCL_STDLIB_H__ */
-
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
new file mode 100644
index 0000000..c972a3e
--- /dev/null
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -0,0 +1,1378 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_OCL_STDLIB_H__
+#define __GEN_OCL_STDLIB_H__
+
+#define INLINE inline __attribute__((always_inline))
+#define OVERLOADABLE __attribute__((overloadable))
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in scalar data types
+/////////////////////////////////////////////////////////////////////////////
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef __typeof__(sizeof(int)) size_t;
+typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
+typedef signed int intptr_t;
+typedef unsigned int uintptr_t;
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL address space
+/////////////////////////////////////////////////////////////////////////////
+// These are built-ins in LLVM 3.3.
+#if 100*__clang_major__ + __clang_minor__ <= 302
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+#define __local __attribute__((address_space(3)))
+#define global __global
+#define local __local
+#define constant __constant
+#define private __private
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in vector data types
+/////////////////////////////////////////////////////////////////////////////
+#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
+                  typedef type type##3 __attribute__((ext_vector_type(3)));\
+                  typedef type type##4 __attribute__((ext_vector_type(4)));\
+                  typedef type type##8 __attribute__((ext_vector_type(8)));\
+                  typedef type type##16 __attribute__((ext_vector_type(16)));
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+DEF(double);
+#undef DEF
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL other built-in data types
+/////////////////////////////////////////////////////////////////////////////
+// FIXME:
+// This is a transitional hack to bypass the LLVM 3.3 built-in types.
+// See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
+struct _image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
+struct _image3d_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef uint __sampler_t;
+typedef size_t __event_t;
+#define image2d_t __image2d_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+#define event_t __event_t
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL conversions & type casting
+/////////////////////////////////////////////////////////////////////////////
+
+// ##BEGIN_AS##
+
+// ##END_AS##
+
+// ##BEGIN_CONVERT##
+
+// ##END_CONVERT##
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL preprocessor directives & macros
+/////////////////////////////////////////////////////////////////////////////
+#define __OPENCL_VERSION__ 110
+#define __CL_VERSION_1_0__ 100
+#define __CL_VERSION_1_1__ 110
+#define __ENDIAN_LITTLE__ 1
+#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
+                                        __attribute__((vec_type_hint(TYPE)))
+#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL floating-point macros and pragmas
+/////////////////////////////////////////////////////////////////////////////
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define MAXFLOAT     3.40282347e38F
+#define HUGE_VALF    (__builtin_huge_valf())
+#define INFINITY     (__builtin_inff())
+#define NAN          (__builtin_nanf(""))
+#define M_E_F        2.718281828459045F
+#define M_LOG2E_F    1.4426950408889634F
+#define M_LOG10E_F   0.43429448190325176F
+#define M_LN2_F      0.6931471805599453F
+#define M_LN10_F     2.302585092994046F
+#define M_PI_F       3.141592653589793F
+#define M_PI_2_F     1.5707963267948966F
+#define M_PI_4_F     0.7853981633974483F
+#define M_1_PI_F     0.3183098861837907F
+#define M_2_PI_F     0.6366197723675814F
+#define M_2_SQRTPI_F 1.1283791670955126F
+#define M_SQRT2_F    1.4142135623730951F
+#define M_SQRT1_2_F  0.7071067811865476F
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL integer built-in macros
+/////////////////////////////////////////////////////////////////////////////
+#define CHAR_BIT    8
+#define CHAR_MAX    SCHAR_MAX
+#define CHAR_MIN    SCHAR_MIN
+#define INT_MAX     2147483647
+#define INT_MIN     (-2147483647 - 1)
+#define LONG_MAX    0x7fffffffffffffffL
+#define LONG_MIN    (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX   127
+#define SCHAR_MIN   (-127 - 1)
+#define SHRT_MAX    32767
+#define SHRT_MIN    (-32767 - 1)
+#define UCHAR_MAX   255
+#define USHRT_MAX   65535
+#define UINT_MAX    0xffffffff
+#define ULONG_MAX   0xffffffffffffffffUL
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL relational built-in functions
+/////////////////////////////////////////////////////////////////////////////
+
+int INLINE_OVERLOADABLE isequal(float x, float y) { return x == y; }
+int INLINE_OVERLOADABLE isnotequal(float x, float y) { return x != y; }
+int INLINE_OVERLOADABLE isgreater(float x, float y) { return x > y; }
+int INLINE_OVERLOADABLE isgreaterequal(float x, float y) { return x >= y; }
+int INLINE_OVERLOADABLE isless(float x, float y) { return x < y; }
+int INLINE_OVERLOADABLE islessequal(float x, float y) { return x <= y; }
+int INLINE_OVERLOADABLE islessgreater(float x, float y) { return (x < y) || (x > y); }
+
+#define SDEF(TYPE)                                                              \
+OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
+SDEF(char);
+SDEF(short);
+SDEF(int);
+SDEF(long);
+#undef SDEF
+#define UDEF(TYPE)                                                              \
+OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
+UDEF(uchar);
+UDEF(ushort);
+UDEF(uint);
+UDEF(ulong);
+#undef UDEF
+
+
+uchar INLINE_OVERLOADABLE convert_uchar_sat(float x) {
+    return (uchar)(x >= 0.0f ? (x < 255.0f ? x : 255.0f) : 0.0f); /* clamp to [0, 255] before narrowing; NaN and negatives give 0 */
+}
+
+INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
+INLINE_OVERLOADABLE int isinf(float x) { return __builtin_isinf(x); }
+INLINE_OVERLOADABLE int isnan(float x) { return __builtin_isnan(x); }
+INLINE_OVERLOADABLE int isnormal(float x) { return __builtin_isnormal(x); }
+INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x, x) && isequal(y, y); }
+INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
+INLINE_OVERLOADABLE int signbit(float x) { return __builtin_signbit(x); }
+
+#define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+#define DEC1(type) INLINE_OVERLOADABLE int all(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEF(type) INLINE_OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
+DEF(char); DEF(uchar); DEF(short); DEF(ushort); DEF(int); DEF(uint)
+#undef DEF
+INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
+  return as_float(bitselect(as_int(a), as_int(b), as_int(c)));
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Integer built-in functions
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST uint __gen_ocl_fbh(uint);
+PURE CONST uint __gen_ocl_fbl(uint);
+
+INLINE_OVERLOADABLE char clz(char x) {
+  if (x < 0)                     /* sign bit set: no leading zeros */
+    return 0;
+  if (x == 0)                    /* all 8 bits are zero */
+    return 8;
+  return __gen_ocl_fbh(x) - 24;  /* fbh is what clz(uint) returns; drop the 24 bits added by widening to uint */
+}
+
+INLINE_OVERLOADABLE uchar clz(uchar x) {
+  if (x == 0)                    /* all 8 bits are zero */
+    return 8;
+  return __gen_ocl_fbh(x) - 24;  /* fbh is what clz(uint) returns; drop the 24 bits added by widening to uint */
+}
+
+INLINE_OVERLOADABLE short clz(short x) {
+  if (x < 0)
+    return 0;
+  if (x == 0)
+    return 16;
+  return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE ushort clz(ushort x) {
+  if (x == 0)
+    return 16;
+  return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE int clz(int x) {
+  if (x < 0)
+    return 0;
+  if (x == 0)
+    return 32;
+  return __gen_ocl_fbh(x);
+}
+
+INLINE_OVERLOADABLE uint clz(uint x) {
+  if (x == 0)
+    return 32;
+  return __gen_ocl_fbh(x);
+}
+
+OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
+OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return ((uint)x * (uint)y) >> 16; } /* multiply as uint: 65535 * 65535 does not fit in int */
+INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+
+#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+#undef DEF
+
+INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
+INLINE_OVERLOADABLE uint mul24(uint a, uint b) { return (a & 0xFFFFFF) * (b & 0xFFFFFF); }
+
+INLINE_OVERLOADABLE int mad24(int a, int b, int c) { return mul24(a, b) + c; }
+INLINE_OVERLOADABLE uint mad24(uint a, uint b, uint c) { return mul24(a, b) + c; }
+
+INLINE_OVERLOADABLE char mad_sat(char a, char b, char c) {
+  int x = (int)a * (int)b + (int)c;
+  if (x > 127)
+    x = 127;
+  if (x < -128)
+    x = -128;
+  return x;
+}
+
+INLINE_OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c) {
+  uint x = (uint)a * (uint)b + (uint)c;
+  if (x > 255)
+    x = 255;
+  return x;
+}
+
+INLINE_OVERLOADABLE short mad_sat(short a, short b, short c) {
+  int x = (int)a * (int)b + (int)c;
+  if (x > 32767)
+    x = 32767;
+  if (x < -32768)
+    x = -32768;
+  return x;
+}
+
+INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
+  uint x = (uint)a * (uint)b + (uint)c;
+  if (x > 65535)
+    x = 65535;
+  return x;
+}
+
+/* XXX not implemented. */
+INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
+  return 0;
+}
+
+INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+  return 0; /* XXX not implemented: placeholder that ignores a, b and c and always returns 0 */
+}
+
+INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
+INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
+INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
+INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
+INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
+INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
+DEF(char, 7)
+DEF(uchar, 7)
+DEF(short, 15)
+DEF(ushort, 15)
+DEF(int, 31)
+DEF(uint, 31)
+#undef DEF
+
+OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
+OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+
+PURE CONST uint __gen_ocl_hadd(uint x, uint y);
+PURE CONST uint __gen_ocl_rhadd(uint x, uint y);
+#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
+#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
+DEC
+#undef DEF
+#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
+DEC
+#undef DEF
+#undef DEC
+INLINE_OVERLOADABLE int hadd(int x, int y) { return (x < 0) != (y < 0) ? ((x + y) >> 1) : __gen_ocl_hadd(x, y); } /* mixed signs (0 counts as non-negative): x + y cannot overflow, so average in int */
+INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) { return (x < 0) != (y < 0) ? ((x + y + 1) >> 1) : __gen_ocl_rhadd(x, y); } /* mixed signs (0 counts as non-negative): x + y + 1 cannot overflow, so average in int */
+INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+
+int __gen_ocl_abs(int x);
+#define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
+DEC(int)
+DEC(short)
+DEC(char)
+#undef DEC
+/* For unsigned types, do nothing. */
+#define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
+DEC(uint)
+DEC(ushort)
+DEC(uchar)
+#undef DEC
+
+/* Char and short type abs diff */
+/* Promoting char and short operands to int means the subtraction cannot overflow. */
+#define DEC(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
+                         { return (UTYPE) (abs((int)x - (int)y)); }
+DEC(char, uchar)
+DEC(uchar, uchar)
+DEC(short, ushort)
+DEC(ushort, ushort)
+#undef DEC
+
+INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
+    /* Subtract the smaller operand from the larger so the unsigned difference never wraps. */
+    return y > x ? (y -x) : (x - y);
+}
+
+INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
+    /* Subtract in uint, larger minus smaller: the true distance between two
+     * ints always fits in 32 unsigned bits and modular arithmetic yields it
+     * exactly, whereas a signed x - y overflows for abs_diff(0, INT_MIN). */
+    uint a = (uint)x, b = (uint)y;
+    return x > y ? a - b : b - a;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Work Items functions (see 6.11.1 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+  return __gen_ocl_get_work_dim();
+}
+
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
+PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##2(void);
+DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
+DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+#undef DECL_INTERNAL_WORK_ITEM_FN
+
+/* Defines the public NAME(dim) query. Dimensions 0, 1 and 2 forward to the
+ * matching __gen_ocl_<NAME><dim> builtin; any other dim yields OTHER_RET. */
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
+INLINE unsigned NAME(unsigned int dim) {             \
+  switch (dim) {                                     \
+    case 0:  return __gen_ocl_##NAME##0();           \
+    case 1:  return __gen_ocl_##NAME##1();           \
+    case 2:  return __gen_ocl_##NAME##2();           \
+    default: return OTHER_RET;                       \
+  }                                                  \
+}
+
+DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
+#undef DECL_PUBLIC_WORK_ITEM_FN
+
+/* Global work-item id for dimension dim.
+ * The OpenCL definition is
+ *   group_id * local_size + local_id + global_offset,
+ * so the offset passed to clEnqueueNDRangeKernel must be added; without it
+ * every kernel enqueued with a non-zero global_work_offset sees ids starting
+ * at 0. For an out-of-range dim the helpers return 0, 1, 0 and 0, giving 0. */
+INLINE uint get_global_id(uint dim) {
+  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) +
+         get_global_offset(dim);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Math Functions (see 6.11.2 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+/* Backend builtins; only declared here, the wrappers below call them.
+ * NOTE(review): rndz/rnde/rndu/rndd are used below as trunc/round/ceil/floor,
+ * so they presumably round toward zero / to even / up / down - confirm against
+ * the backend. */
+PURE CONST float __gen_ocl_fabs(float x);
+PURE CONST float __gen_ocl_sin(float x);
+PURE CONST float __gen_ocl_cos(float x);
+PURE CONST float __gen_ocl_sqrt(float x);
+PURE CONST float __gen_ocl_rsqrt(float x);
+PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_rcp(float x);
+PURE CONST float __gen_ocl_rndz(float x);
+PURE CONST float __gen_ocl_rnde(float x);
+PURE CONST float __gen_ocl_rndu(float x);
+PURE CONST float __gen_ocl_rndd(float x);
+/* hypot: direct sqrt(x*x + y*y); the squares are not rescaled. */
+INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
+/* native_cos / native_sin: thin wrappers over the backend builtins. */
+INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+/* cospi / sinpi: the argument is multiplied by M_PI_F before the builtin call. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  return __gen_ocl_cos(x * M_PI_F);
+}
+INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  return __gen_ocl_sin(x * M_PI_F);
+}
+INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+/* native_log2 maps straight onto __gen_ocl_log; the other logarithms scale it.
+ * NOTE(review): this implies __gen_ocl_log is a base-2 logarithm - confirm. */
+INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+/* 0.6931472002f approximates ln(2): log2(x) * ln(2) = ln(x). */
+INLINE_OVERLOADABLE float native_log(float x) {
+  return native_log2(x) * 0.6931472002f;
+}
+/* 0.3010299956f approximates log10(2): log2(x) * log10(2) = log10(x). */
+INLINE_OVERLOADABLE float native_log10(float x) {
+  return native_log2(x) * 0.3010299956f;
+}
+/* log1p: computed as native_log(x + 1), with no special handling of small x. */
+INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
+/* logb / ilogb: __gen_ocl_rndd applied to native_log2(x); ilogb converts the
+ * float result to int on return. */
+INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+/* nan: the code argument is ignored; NAN is always returned. */
+INLINE_OVERLOADABLE float nan(uint code) {
+  return NAN;
+}
+/* native_powr / native_recip: thin wrappers over __gen_ocl_pow / __gen_ocl_rcp. */
+INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+/* native_tan: quotient of native_sin and native_cos. */
+INLINE_OVERLOADABLE float native_tan(float x) {
+  return native_sin(x) / native_cos(x);
+}
+/* tanpi: native_tan of x * M_PI_F. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+  return native_tan(x * M_PI_F);
+}
+/* The exponentials are all expressed through __gen_ocl_pow with a fixed base
+ * (M_E_F, 2 or 10). */
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
+INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+/* expm1: pow(e, x) - 1, with no special handling of small x. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
+/* cbrt: pow(x, 0.3333333333f).
+ * NOTE(review): the result for negative x depends on how __gen_ocl_pow treats
+ * a negative base - confirm. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  return __gen_ocl_pow(x, 0.3333333333f);
+}
+/* Shared sincos implementation: stores native_cos(x) through cosval and
+ * returns native_sin(x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_sincos(float x, float *cosval) {
+  *cosval = native_cos(x);
+  return native_sin(x);
+}
+/* Public overloads, one per address space; each casts the pointer to a plain
+ * float* and forwards to the shared implementation. */
+INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
+INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
+INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { return __gen_ocl_internal_sincos(x, (float*)cosval); }
+
+/* sinh(x) = (e^x - e^-x) / 2, written as (1 - e^-2x) / (2 * e^-x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
+  return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+/* cosh(x) = (e^x + e^-x) / 2, written as (1 + e^-2x) / (2 * e^-x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+/* tanh(x) = (1 - e^-2x) / (1 + e^-2x); the exponential is computed once. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  float y = native_exp(-2 * x);
+  return (1 - y) / (1 + y);
+}
+/* asin: first four terms of the Maclaurin series
+ *   x + x^3/6 + 3*x^5/40 + 5*x^7/112.
+ * NOTE(review): a truncated series loses accuracy as |x| approaches 1, and the
+ * powers go through __gen_ocl_pow, whose behaviour for negative x is not
+ * visible here - confirm. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  return x + __gen_ocl_pow(x, 3) / 6 + __gen_ocl_pow(x, 5) * 3 / 40 + __gen_ocl_pow(x, 7) * 5 / 112;
+}
+/* asinpi: asin(x) / pi. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+  return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+/* acos: pi/2 - asin(x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+  return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+/* acospi: acos(x) / pi. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+  return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+/* atan: alternating series x - x^3/3 + x^5/5 - ... - x^11/11.
+ * For |x| >= 1 the argument is replaced by 1/x and the result is rebuilt as
+ * a + c * series with a = +-pi/2 and c = -1, i.e. atan(x) = +-pi/2 - atan(1/x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  float a = 0, c = 1;
+  if (x <= -1) {
+    a = - M_PI_2_F;
+    x = 1 / x;
+    c = -1;
+  }
+  /* After the branch above x is in [-1, 0), so this test only matches an
+   * original x >= 1. */
+  if (x >= 1) {
+    a = M_PI_2_F;
+    x = 1 / x;
+    c = -1;
+  }
+  return a + c * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 5 - __gen_ocl_pow(x, 7) / 7 + __gen_ocl_pow(x, 9) / 9 - __gen_ocl_pow(x, 11) / 11);
+}
+/* atanpi: atan(x) / pi. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+  return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+/* asinh(x) = ln(x + sqrt(x^2 + 1)). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
+  return native_log(x + native_sqrt(x * x + 1));
+}
+/* acosh(x) = ln(x + sqrt(x + 1) * sqrt(x - 1)). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+/* Inverse hyperbolic tangent: atanh(x) = 0.5 * ln((1 + x) / (1 - x)).
+ * The logarithm is required here; taking a square root of the ratio instead
+ * yields 0.5 for x = 0, where atanh(0) must be 0. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  return 0.5f * native_log((1 + x) / (1 - x));
+}
+/* copysign: returns x with its sign replaced by the sign of y.
+ * Works on the IEEE-754 bit patterns (magnitude bits of x, sign bit of y) so
+ * that zeros are handled too: a product test such as x * y < 0 cannot see the
+ * sign of -0.0 and never flips the sign when x or y is zero. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  return as_float((as_uint(x) & 0x7FFFFFFFu) | (as_uint(y) & 0x80000000u));
+}
+/* erf: first five terms of the Maclaurin series
+ *   2/sqrt(pi) * (x - x^3/3 + x^5/10 - x^7/42 + x^9/216).
+ * NOTE(review): a truncated series is only accurate for small |x| - confirm
+ * the intended input range. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+  return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
+}
+/* erfc: 1 - erf(x). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+  return 1 - __gen_ocl_internal_erf(x);
+}
+
+// XXX work-around PTX profile
+// From here on every use of sqrt in this header expands to native_sqrt.
+#define sqrt native_sqrt
+// rsqrt, fabs and trunc forward to native_rsqrt, __gen_ocl_fabs and __gen_ocl_rndz.
+INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+/* round: nearest integral value, halfway cases rounded away from zero (the
+ * OpenCL definition). Round-to-nearest-even alone is not enough: it maps 0.5
+ * to 0 and 2.5 to 2. The value is truncated first and then moved one step
+ * away from zero when the discarded fraction is at least 0.5. A NaN input
+ * fails the comparison and is returned as produced by __gen_ocl_rndz. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
+  float y = __gen_ocl_rndz(x);
+  if (__gen_ocl_fabs(x - y) >= 0.5f)
+    y += (x > 0) ? 1.f : -1.f;
+  return y;
+}
+/* floor / ceil forward to the rndd / rndu builtins. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+/* log, log2, log10 and exp are implemented by their native_ counterparts. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
+/* powr forwards to __gen_ocl_pow. */
+INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+/* fmod: x - y * rndz(x / y); remainder: x - y * rnde(x / y). */
+INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
+INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
+/* rint: round to the nearest integral value, ties to even, which is what the
+ * rnde builtin is used for. The former 2 * round(x / 2) form returned only
+ * even numbers (for example 4 for an input of 3). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  return __gen_ocl_rnde(x);
+}
+// TODO use llvm intrinsics definitions
+// Public OpenCL math names are mapped by the preprocessor onto the native_ or
+// __gen_ocl_internal_ implementations defined above. Note that pow is mapped
+// to powr.
+#define cos native_cos
+#define cospi __gen_ocl_internal_cospi
+#define cosh __gen_ocl_internal_cosh
+#define acos __gen_ocl_internal_acos
+#define acospi __gen_ocl_internal_acospi
+#define acosh __gen_ocl_internal_acosh
+#define sin native_sin
+#define sinpi __gen_ocl_internal_sinpi
+#define sinh __gen_ocl_internal_sinh
+#define asin __gen_ocl_internal_asin
+#define asinpi __gen_ocl_internal_asinpi
+#define asinh __gen_ocl_internal_asinh
+#define tan native_tan
+#define tanpi __gen_ocl_internal_tanpi
+#define tanh __gen_ocl_internal_tanh
+#define atan __gen_ocl_internal_atan
+#define atanpi __gen_ocl_internal_atanpi
+#define atanh __gen_ocl_internal_atanh
+#define pow powr
+#define cbrt __gen_ocl_internal_cbrt
+#define rint __gen_ocl_internal_rint
+#define copysign __gen_ocl_internal_copysign
+#define erf __gen_ocl_internal_erf
+#define erfc __gen_ocl_internal_erfc
+
+/* mad: a * b + c written as a single expression. */
+INLINE_OVERLOADABLE float mad(float a, float b, float c) {
+  return a*b+c;
+}
+
+/* Scalar select: yields src1 when cond is non-zero, src0 otherwise.
+ * One overload per (value type, condition type) pair. */
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, int cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+INLINE_OVERLOADABLE uint select(uint src0, uint src1, uint cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+INLINE_OVERLOADABLE int select(int src0, int src1, int cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+INLINE_OVERLOADABLE int select(int src0, int src1, uint cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+INLINE_OVERLOADABLE float select(float src0, float src1, int cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+INLINE_OVERLOADABLE float select(float src0, float src1, uint cond) {
+  if (cond)
+    return src1;
+  return src0;
+}
+
+// This will be optimized out by LLVM and will output LLVM select instructions
+// DECL_SELECT4 defines a 4-component select: each lane takes src1 when
+// (cond.lane & MASK) is non-zero and src0 otherwise. Every instantiation below
+// passes 0x80000000 as MASK, i.e. only the top bit of each condition lane is
+// tested. Components are copied into scalars before the selection.
+#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
+INLINE_OVERLOADABLE TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+  TYPE4 dst; \
+  const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
+  const TYPE x1 = src1.x; \
+  const TYPE y0 = src0.y; \
+  const TYPE y1 = src1.y; \
+  const TYPE z0 = src0.z; \
+  const TYPE z1 = src1.z; \
+  const TYPE w0 = src0.w; \
+  const TYPE w1 = src1.w; \
+  dst.x = (cond.x & MASK) ? x1 : x0; \
+  dst.y = (cond.y & MASK) ? y1 : y0; \
+  dst.z = (cond.z & MASK) ? z1 : z0; \
+  dst.w = (cond.w & MASK) ? w1 : w0; \
+  return dst; \
+}
+DECL_SELECT4(int4, int, int4, 0x80000000)
+DECL_SELECT4(int4, int, uint4, 0x80000000)
+DECL_SELECT4(float4, float, int4, 0x80000000)
+DECL_SELECT4(float4, float, uint4, 0x80000000)
+#undef DECL_SELECT4
+
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions (see 6.11.4 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+/* step: 0 when x < edge, 1 otherwise.
+ * The constants carry an f suffix so the function stays in single precision;
+ * unsuffixed literals are double constants and depend on fp64 support. */
+INLINE_OVERLOADABLE float step(float edge, float x) {
+  return x < edge ? 0.0f : 1.0f;
+}
+
+/* Defines max, min and clamp for one scalar TYPE. When the comparison is
+ * false (including ties) the second argument is returned. clamp first caps v
+ * at the upper bound u and then raises the result to the lower bound l. */
+#define DECL_MIN_MAX_CLAMP(TYPE) \
+INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+  if (a > b) \
+    return a; \
+  return b; \
+} \
+INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+  if (a < b) \
+    return a; \
+  return b; \
+} \
+INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+  const TYPE capped = min(v, u); \
+  return max(capped, l); \
+}
+DECL_MIN_MAX_CLAMP(float)
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+#undef DECL_MIN_MAX_CLAMP
+
+/* frexp core: splits x into a fraction and a power-of-two exponent by working
+ * on the raw bits of the float. The exponent is stored through exp and the
+ * fraction is returned. */
+INLINE_OVERLOADABLE float __gen_ocl_frexp(float x, int *exp) {
+  uint u = as_uint(x);
+  /* All bits except the sign are zero: x is +0 or -0. */
+  if ((u & 0x7FFFFFFFu) == 0) {
+    *exp = 0;
+    return x;
+  }
+  /* Bits 23..30 hold the biased exponent. */
+  int e = (u >> 23) & 255;
+  /* An all-ones exponent field (infinity or NaN): x is returned and *exp is
+   * left unwritten. */
+  if (e == 255)
+    return x;
+  *exp = e - 126;
+  /* Keep the sign and fraction bits and force the exponent field to 0x3F000000
+   * (biased exponent 126). */
+  u = (u & (0x807FFFFFu)) | 0x3F000000;
+  return as_float(u);
+}
+
+/* Public overloads, one per address space; the pointer is cast to a plain int*. */
+INLINE_OVERLOADABLE float frexp(float x, global int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
+INLINE_OVERLOADABLE float frexp(float x, local int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
+INLINE_OVERLOADABLE float frexp(float x, private int *exp) { return __gen_ocl_frexp(x, (int *)exp); }
+
+/* nextafter: the next representable float after x in the direction of y.
+ *  - either operand NaN          -> NaN
+ *  - x equal to y                -> y (this includes +0 versus -0, which
+ *                                   compare equal but have different bits)
+ *  - x is zero, y is not         -> smallest-magnitude value with the sign of y
+ *  - otherwise the bit pattern of x is stepped by one: incrementing it grows
+ *    the magnitude, decrementing it shrinks the magnitude. */
+INLINE_OVERLOADABLE float nextafter(float x, float y) {
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
+  uint hy = as_uint(y), iy = hy & 0x7FFFFFFF;
+  if (ix > 0x7F800000 || iy > 0x7F800000)
+    return nan(0u);
+  if (hx == hy)
+    return x;
+  /* Both operands are zeros of opposite sign: numerically equal, so the result
+   * is y rather than a step to the smallest denormal. */
+  if ((ix | iy) == 0)
+    return y;
+  if (ix == 0)
+    return as_float((hy & 0x80000000u) | 1);
+  /* Move away from zero when y lies beyond x on x's own side. */
+  if (((0 == (hx & 0x80000000u)) && y > x) || ((hx & 0x80000000u) && y < x))
+    hx ++;
+  else
+    hx --;
+  return as_float(hx);
+}
+
+/* modf core: stores the integral part of x through i and returns the
+ * fractional part. */
+INLINE_OVERLOADABLE float __gen_ocl_modf(float x, float *i) {
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
+  /* Magnitude bits above the infinity pattern: x is NaN; both results are NaN. */
+  if (ix > 0x7F800000) {
+    *i = nan(0u);
+    return nan(0u);
+  }
+  /* Infinity: the integral part is x itself, the fraction is a zero carrying
+   * the sign of x. */
+  if (ix == 0x7F800000) {
+    *i = x;
+    return as_float(hx & 0x80000000u);
+  }
+  *i = __gen_ocl_rndz(x);
+  return x - *i;
+}
+
+/* Public overloads, one per address space; the pointer is cast to a plain float*. */
+INLINE_OVERLOADABLE float modf(float x, global float *i) { return __gen_ocl_modf(x, (float *)i); }
+INLINE_OVERLOADABLE float modf(float x, local float *i) { return __gen_ocl_modf(x, (float *)i); }
+INLINE_OVERLOADABLE float modf(float x, private float *i) { return __gen_ocl_modf(x, (float *)i); }
+
+/* degrees: radians * (180 / pi); radians: degrees * (pi / 180). */
+INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
+INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
+
+/* smoothstep: Hermite interpolation t*t*(3 - 2*t), where t is the position of
+ * x between e0 and e1 clamped to [0, 1]. */
+INLINE_OVERLOADABLE float smoothstep(float e0, float e1, float x) {
+  const float t = clamp((x - e0) / (e1 - e0), 0.f, 1.f);
+  return t * t * (3 - 2 * t);
+}
+
+/* sign: 1 for x > 0, -1 for x < 0, -0.0 for x = -0.0, +0.0 for x = +0.0 and
+ * 0.0 for NaN.
+ * The two zeros compare equal (-0.f == 0.f), so a floating-point comparison
+ * cannot tell them apart; the bit pattern is inspected instead. */
+INLINE_OVERLOADABLE float sign(float x) {
+  if(x > 0)
+    return 1;
+  if(x < 0)
+    return -1;
+  /* Only a zero or a NaN reaches this point. */
+  if(as_uint(x) == 0x80000000u)
+    return -0.f;
+  return 0.f;
+}
+
+/* fmax / fmin: forward to the float max / min overloads. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+/* maxmag: the argument with the larger magnitude; when neither magnitude is
+ * larger, max(x, y). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a > b ? x : b > a ? y : max(x, y);
+}
+/* minmag: the argument with the smaller magnitude; when neither magnitude is
+ * smaller, min(x, y). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a < b ? x : b < a ? y : min(x, y);
+}
+/* mix: linear blend x + (y - x) * a. */
+INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+/* fdim: fmax(x, y) - y, i.e. x - y when x > y and y - y otherwise. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  return __gen_ocl_internal_fmax(x, y) - y;
+}
+/* fract core: stores floor(x) through p and returns x - floor(x), capped at
+ * 0x1.FFFFFep-1F (the largest float below 1). */
+INLINE_OVERLOADABLE float __gen_ocl_fract(float x, float *p) {
+  *p = __gen_ocl_internal_floor(x);
+  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+}
+/* Public overloads, one per address space; the pointer is cast to a plain float*. */
+INLINE_OVERLOADABLE float fract(float x, global float *p) { return __gen_ocl_fract(x, (float *)p); }
+INLINE_OVERLOADABLE float fract(float x, local float *p) { return __gen_ocl_fract(x, (float *)p); }
+INLINE_OVERLOADABLE float fract(float x, private float *p) { return __gen_ocl_fract(x, (float *)p); }
+
+/* remquo core: returns x - q * y with q = rnde(x / y), and stores a reduced
+ * form of q through quo. */
+INLINE_OVERLOADABLE float __gen_ocl_remquo(float x, float y, int *quo) {
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF;
+  /* NaN operand, infinite x or zero y: the result is NaN and *quo is left
+   * unwritten. */
+  if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0)
+    return nan(0u);
+  float k = x / y;
+  int q =  __gen_ocl_rnde(k);
+  /* Keep the low seven bits of q; for a negative q the upper bits are set so
+   * the stored value stays negative. */
+  *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u);
+  float r = x - q * y;
+  uint hr = as_uint(r), ir = hr & 0x7FFFFFFF;
+  /* A zero remainder takes the sign of x. */
+  if (ir == 0)
+    hr = ir | (hx & 0x80000000u);
+  return as_float(hr);
+}
+
+/* Public overloads, one per address space; the pointer is cast to a plain int*. */
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
+INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
+INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { return __gen_ocl_remquo(x, y, (int *)quo); }
+
+/* native_divide: plain x / y. */
+INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+/* ldexp: x * pow(2, n). */
+INLINE_OVERLOADABLE float ldexp(float x, int n) {
+  return __gen_ocl_pow(2, n) * x;
+}
+/* pown: 1 for x == 0 with n == 0, otherwise powr(x, n).
+ * NOTE(review): the result for negative x depends on how __gen_ocl_pow treats
+ * a negative base - confirm. */
+INLINE_OVERLOADABLE float pown(float x, int n) {
+  if (x == 0 && n == 0)
+    return 1;
+  return powr(x, n);
+}
+/* rootn: powr(x, 1 / n). */
+INLINE_OVERLOADABLE float rootn(float x, int n) {
+  return powr(x, 1.f / n);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Geometric functions (see 6.11.5 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+/* dot products for 2, 3, 4, 8 and 16 component vectors, written as a chain of
+ * nested mad() calls whose innermost term is p0.y * p1.y. */
+INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
+  return mad(p0.x,p1.x,p0.y*p1.y);
+}
+INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
+  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
+}
+INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
+  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
+}
+
+INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
+  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
+         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
+}
+INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
+  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
+         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
+         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
+         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
+}
+
+/* length: |x| for a scalar, sqrt(dot(x, x)) for vectors. */
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
+/* distance: length of the difference of the two arguments. */
+INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
+/* normalize: a vector with the same direction and length 1. For a scalar the
+ * direction is its sign, so the result is 1 or -1 (and 0 for a zero or NaN
+ * input) rather than a constant 1. Vectors are scaled by rsqrt(dot(x, x)). */
+INLINE_OVERLOADABLE float normalize(float x) { return x > 0 ? 1.f : (x < 0 ? -1.f : 0.f); }
+INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+/* fast_length: |x| for a scalar, sqrt(dot(x, x)) for vectors. */
+INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
+/* fast_distance: fast_length of the difference of the two arguments. */
+INLINE_OVERLOADABLE float fast_distance(float x, float y) { return fast_length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return fast_length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return fast_length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return fast_length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return fast_length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return fast_length(x-y); }
+/* fast_normalize: for a scalar the direction is its sign, so the result is
+ * 1 or -1 (and 0 for a zero or NaN input) rather than a constant 1. Vectors
+ * are scaled by rsqrt(dot(x, x)). */
+INLINE_OVERLOADABLE float fast_normalize(float x) { return x > 0 ? 1.f : (x < 0 ? -1.f : 0.f); }
+INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+
+/* cross product of two 3-component vectors, expressed with swizzles. */
+INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+   return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
+}
+/* float4 variant: the cross product of the xyz parts, with w set to 0. */
+INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+   return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores will use untyped reads and writes, so we can just
+// cast to vector loads / stores. Not C99 compliant BTW due to aliasing issue.
+// Well we do not care, we do not activate TBAA in the compiler
+//
+// DECL_UNTYPED_RW_SPACE_N defines vload<DIM> and vstore<DIM> for one element
+// TYPE in one address SPACE: the element pointer is advanced by DIM * offset
+// elements and then accessed through a pointer to the vector type. The cast
+// in vload also drops the const qualifier of p.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+// Instantiates the pair above for vector widths 2, 3, 4, 8 and 16.
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 3, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+// Instantiates every width for the four address spaces.
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+DECL_UNTYPED_RW_ALL(char)
+DECL_UNTYPED_RW_ALL(uchar)
+DECL_UNTYPED_RW_ALL(short)
+DECL_UNTYPED_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+
+// XXX workaround ptx profile
+// Second group of public-name aliases. They are placed after the code above,
+// which calls the __gen_ocl_ names directly. Note that fma is mapped to mad.
+#define fabs __gen_ocl_internal_fabs
+#define trunc __gen_ocl_internal_trunc
+#define round __gen_ocl_internal_round
+#define floor __gen_ocl_internal_floor
+#define ceil __gen_ocl_internal_ceil
+#define log __gen_ocl_internal_log
+#define log2 __gen_ocl_internal_log2
+#define log10 __gen_ocl_internal_log10
+#define exp __gen_ocl_internal_exp
+#define exp2 native_exp2
+#define exp10 native_exp10
+#define expm1 __gen_ocl_internal_expm1
+#define fmin __gen_ocl_internal_fmin
+#define fmax __gen_ocl_internal_fmax
+#define fma mad
+#define fdim __gen_ocl_internal_fdim
+#define maxmag __gen_ocl_internal_maxmag
+#define minmag __gen_ocl_internal_minmag
+
+/////////////////////////////////////////////////////////////////////////////
+// Miscellaneous Vector Functions (see 6.11.12 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+/* shuffle(x, mask): builds a result vector whose width is that of mask; each
+ * result component is the element of x selected by the matching mask
+ * component. The index is reduced with & (vec_step(x) - 1), and x is read
+ * through a pointer to its scalar element type. DEC2/DEC4/DEC8/DEC16 define
+ * the overload for a 2/4/8/16-wide mask and a source vector type XTYPE. */
+#define DEC2(TYPE, XTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, uint2 mask) { \
+    TYPE##2 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC4(TYPE, XTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, uint4 mask) { \
+    TYPE##4 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC8(TYPE, XTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, uint8 mask) { \
+    TYPE##8 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+    y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+    y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+    y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC16(TYPE, XTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, uint16 mask) { \
+    TYPE##16 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+    y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+    y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+    y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+    y.s8 = ((TYPE *) &x)[mask.s8 & (vec_step(x) - 1)]; \
+    y.s9 = ((TYPE *) &x)[mask.s9 & (vec_step(x) - 1)]; \
+    y.sa = ((TYPE *) &x)[mask.sa & (vec_step(x) - 1)]; \
+    y.sb = ((TYPE *) &x)[mask.sb & (vec_step(x) - 1)]; \
+    y.sc = ((TYPE *) &x)[mask.sc & (vec_step(x) - 1)]; \
+    y.sd = ((TYPE *) &x)[mask.sd & (vec_step(x) - 1)]; \
+    y.se = ((TYPE *) &x)[mask.se & (vec_step(x) - 1)]; \
+    y.sf = ((TYPE *) &x)[mask.sf & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+/* DEF instantiates every (mask width, source width) combination for one
+ * element TYPE; source widths are 2, 4, 8 and 16. */
+#define DEF(TYPE) \
+  DEC2(TYPE, TYPE##2); DEC2(TYPE, TYPE##4); DEC2(TYPE, TYPE##8); DEC2(TYPE, TYPE##16) \
+  DEC4(TYPE, TYPE##2); DEC4(TYPE, TYPE##4); DEC4(TYPE, TYPE##8); DEC4(TYPE, TYPE##16) \
+  DEC8(TYPE, TYPE##2); DEC8(TYPE, TYPE##4); DEC8(TYPE, TYPE##8); DEC8(TYPE, TYPE##16) \
+  DEC16(TYPE, TYPE##2); DEC16(TYPE, TYPE##4); DEC16(TYPE, TYPE##8); DEC16(TYPE, TYPE##16)
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef DEF
+#undef DEC2
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+/////////////////////////////////////////////////////////////////////////////
+// Synchronization functions
+/////////////////////////////////////////////////////////////////////////////
+/* Fence flags are single bits so they can be OR-ed together. */
+#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+/* Backend barrier builtins, one per flag combination handled by barrier(). */
+void __gen_ocl_barrier_local(void);
+void __gen_ocl_barrier_global(void);
+void __gen_ocl_barrier_local_and_global(void);
+
+typedef uint cl_mem_fence_flags;
+/* barrier: picks the backend barrier matching flags exactly. Any value other
+ * than the three recognised combinations results in no call at all. */
+INLINE void barrier(cl_mem_fence_flags flags) {
+  switch (flags) {
+    case (CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE):
+      __gen_ocl_barrier_local_and_global();
+      break;
+    case CLK_LOCAL_MEM_FENCE:
+      __gen_ocl_barrier_local();
+      break;
+    case CLK_GLOBAL_MEM_FENCE:
+      __gen_ocl_barrier_global();
+      break;
+    default:
+      break;
+  }
+}
+
+/* The three memory-fence entry points have empty bodies: they accept flags
+ * and do nothing.
+ * NOTE(review): presumably sufficient for this backend's memory model -
+ * confirm. */
+INLINE void mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void read_mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void write_mem_fence(cl_mem_fence_flags flags) {
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+/* Backend atomic builtins. All of them operate on uint and come in a __global
+ * and a __local overload; the typed wrappers below cast onto these. Signed
+ * and unsigned min/max have separate builtins (imin/imax versus umin/umax). */
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+/* Two-operand atomics (pointer, value).
+ * DECL_ATOMIC_OP_SPACE defines atomic_<NAME> for one integer TYPE in one
+ * address SPACE by casting the pointer to uint* and calling the builtin
+ * __gen_ocl_<PREFIX><NAME>; the uint result is converted back to TYPE. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+
+/* Both address spaces for one type. */
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+/* uint and int variants of a sign-agnostic operation. */
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)              \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+/* float exchange: the value has to travel through the uint builtin as a bit
+ * pattern. A value conversion (float -> uint and back) would truncate
+ * fractions and corrupt negative or large values, so as_uint / as_float are
+ * used instead of casts. */
+#define DECL_ATOMIC_XCHG_FLOAT(SPACE) \
+  INLINE_OVERLOADABLE float atomic_xchg (volatile SPACE float *p, float val) { \
+    return as_float(__gen_ocl_atomic_xchg((SPACE uint *)p, as_uint(val))); \
+  }
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_XCHG_FLOAT(__global)
+DECL_ATOMIC_XCHG_FLOAT(__local)
+/* min / max pick the signed (atomic_i*) or unsigned (atomic_u*) builtin. */
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_XCHG_FLOAT
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+/* One-operand atomics (pointer only): atomic_inc and atomic_dec. The helper
+ * macros are redefined with a shorter parameter list; the pointer is cast to
+ * uint* and the uint result is converted back to TYPE. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+/* Three-operand atomic (pointer, compare value, new value): atomic_cmpxchg.
+ * Both values are cast to uint before the builtin call and the uint result is
+ * converted back to TYPE. Defined for uint and int in __global and __local. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+/////////////////////////////////////////////////////////////////////////////
+// Force the compilation to SIMD8 or SIMD16
+/////////////////////////////////////////////////////////////////////////////
+
+/* Backend hooks named for forcing the SIMD width; only declared here. */
+int __gen_ocl_force_simd8(void);
+int __gen_ocl_force_simd16(void);
+
+/* Null pointer constant for kernel code. */
+#define NULL ((void*)0)
+
+// ##BEGIN_COMMON_DEFINES##
+// ##END_COMMON_DEFINES##
+
+/////////////////////////////////////////////////////////////////////////////
+// Image access functions
+/////////////////////////////////////////////////////////////////////////////
+
+/* Backend image builtins; only declared here. Images are identified by a
+ * uint surface_id. Reads take a sampler value and come with int or float
+ * coordinates, in a two-coordinate (u, v) and a three-coordinate (u, v, w)
+ * form. */
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v);
+
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w);
+
+/* Writes mirror the reads without a sampler; the colour is the last argument. */
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
+
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+/* Image property queries, keyed by surface id. */
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+int __gen_ocl_get_image_channel_data_type(uint surface_id);
+int __gen_ocl_get_image_channel_order(uint surface_id);
+int __gen_ocl_get_image_depth(uint surface_id);
+
+#define GET_IMAGE(cl_image, surface_id) \
+    uint surface_id = (uint)cl_image
+
+#define DECL_READ_IMAGE(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, sampler_t sampler, coord_type coord) \
+  { /* 2D read: forwards coord.s0/coord.s1; sampler is cast to uint as in the image3d_t reader */ \
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ##suffix(surface_id, (uint)sampler, coord.s0, coord.s1);\
+  }
+
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ##suffix(image2d_t cl_image, coord_type coord) \
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ##suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1);\
+  }
+
+#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE void write_image ##suffix(image2d_t cl_image, coord_type coord, type color)\
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    __gen_ocl_write_image ##suffix(surface_id, coord.s0, coord.s1, color);\
+  }
+
+#define DECL_IMAGE(type, suffix)        \
+  DECL_READ_IMAGE(type, suffix, int2)   \
+  DECL_READ_IMAGE(type, suffix, float2) \
+  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int2) \
+  DECL_WRITE_IMAGE(type, suffix, int2)   \
+  DECL_WRITE_IMAGE(type, suffix, float2)
+
+DECL_IMAGE(int4, i)
+DECL_IMAGE(uint4, ui)
+DECL_IMAGE(float4, f)
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+
+#define DECL_IMAGE_INFO(image_type)    \
+  INLINE_OVERLOADABLE  int get_image_width(image_type image) \
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_width(surface_id);\
+  } \
+  INLINE_OVERLOADABLE  int get_image_height(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_height(surface_id); \
+  } \
+  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_data_type(surface_id); \
+  }\
+  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_order(surface_id); \
+  }
+
+DECL_IMAGE_INFO(image2d_t)
+DECL_IMAGE_INFO(image3d_t)
+
+INLINE_OVERLOADABLE  int get_image_depth(image3d_t image)
+  {
+   GET_IMAGE(image, surface_id);
+   return __gen_ocl_get_image_depth(surface_id);
+  }
+
+INLINE_OVERLOADABLE  int2 get_image_dim(image2d_t image)
+  { return (int2){get_image_width(image), get_image_height(image)}; }
+
+INLINE_OVERLOADABLE  int4 get_image_dim(image3d_t image)
+  { return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0}; }
+#if 0
+/* The following functions are not implemented yet. */
+DECL_IMAGE_INFO(image1d_t)
+DECL_IMAGE_INFO(image1d_buffer_t)
+DECL_IMAGE_INFO(image1d_array_t)
+DECL_IMAGE_INFO(image2d_array_t)
+
+INLINE_OVERLOADABLE  int2 get_image_dim(image2d_array_t image)
+  { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE  int4 get_image_dim(image2d_array_t image)
+  { return __gen_ocl_get_image_dim(image); }
+
+INLINE_OVERLOADABLE  size_t get_image_array_size(image2d_array_t image)
+  { return __gen_ocl_get_image_array_size(image); }
+
+INLINE_OVERLOADABLE  size_t get_image_array_size(image1d_array_t image)
+  { return __gen_ocl_get_image_array_size(image); }
+#endif
+
+#define DECL_READ_IMAGE(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, sampler_t sampler, coord_type coord) \
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ## suffix(surface_id, (uint)sampler, coord.s0, coord.s1, coord.s2);\
+  }
+
+#define DECL_READ_IMAGE_NOSAMPLER(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE type read_image ## suffix(image3d_t cl_image, coord_type coord) \
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    return __gen_ocl_read_image ## suffix(surface_id, CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST, coord.s0, coord.s1, coord.s2);\
+  }
+
+#define DECL_WRITE_IMAGE(type, suffix, coord_type) \
+  INLINE_OVERLOADABLE void write_image ## suffix(image3d_t cl_image, coord_type coord, type color)\
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    __gen_ocl_write_image ## suffix(surface_id, coord.s0, coord.s1, coord.s2, color);\
+  }
+
+#define DECL_IMAGE(type, suffix)        \
+  DECL_READ_IMAGE(type, suffix, int4)   \
+  DECL_READ_IMAGE(type, suffix, float4) \
+  DECL_READ_IMAGE_NOSAMPLER(type, suffix, int4) \
+  DECL_WRITE_IMAGE(type, suffix, int4)   \
+  DECL_WRITE_IMAGE(type, suffix, float4)
+
+DECL_IMAGE(int4, i)
+DECL_IMAGE(uint4, ui)
+DECL_IMAGE(float4, f)
+
+
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+
+#undef GET_IMAGE
+// ##BEGIN_VECTOR##
+// ##END_VECTOR##
+
+#undef INLINE_OVERLOADABLE
+#undef PURE
+#undef CONST
+#undef OVERLOADABLE
+#undef INLINE
+#endif /* __GEN_OCL_STDLIB_H__ */
diff --git a/backend/src/update_as.sh b/backend/src/update_as.sh
index 54b4191..c68e789 100755
--- a/backend/src/update_as.sh
+++ b/backend/src/update_as.sh
@@ -1,11 +1,11 @@
 #! /bin/sh -e
 
-STDLIB_HEADER=ocl_stdlib.h
+AS_HEADER=ocl_as.h
 
-exec >$STDLIB_HEADER.tmp
-sed -n -e '1,/##BEGIN_AS##/p' $STDLIB_HEADER
+exec >$AS_HEADER.tmp
+echo "// This file is autogenerated by gen_as.sh."
+echo "// Don't modify it manually."
 ./gen_as.sh
-sed -n -e '/##END_AS##/,$p' $STDLIB_HEADER
 exec >&2
 
-mv $STDLIB_HEADER.tmp $STDLIB_HEADER
+mv $AS_HEADER.tmp $AS_HEADER
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
new file mode 100755
index 0000000..197f16c
--- /dev/null
+++ b/backend/src/update_blob_ocl_header.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+import sys
+import os
+
+if len(sys.argv) != 3:
+    print "Invalid argument {}".format(sys.argv)
+    print "use {} tmpl_file_name output_file_name".format(sys.argv[0])
+    raise
+
+def safeUnlink(filename):
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+
+header_segments = [ "vector", "as", "convert", "common_defines"]
+blobFileName = sys.argv[2]
+blobTempName = sys.argv[2] + '.tmp'
+safeUnlink(blobFileName)
+tmplFile = open(sys.argv[1], 'r')
+blob = open(sys.argv[2] + '.tmp', 'w')
+path = os.path.dirname(sys.argv[1])
+if path == '':
+    path = '.'
+
+matched_header = ""
+for tline in tmplFile:
+    if matched_header == "":
+        blob.write(tline)
+        for header in header_segments:
+            if tline.strip() == '// ##BEGIN_{}##'.format(header.upper()) :
+                hFile = open(path + '/ocl_' + header + '.h', 'r')
+                lineNr = 0
+                for hline in hFile:
+                    if lineNr >= 2:  #ignore the 2 lines of comment at the top of file.
+                        blob.write(hline)
+                    lineNr += 1
+                hFile.close()
+                matched_header = header
+    else:
+        if tline.strip() == '// ##END_{}##'.format(matched_header.upper()) :
+            blob.write(tline)
+            matched_header = "";
+
+tmplFile.close()
+blob.close()
+os.rename(blobTempName, blobFileName)
diff --git a/backend/src/update_convert.sh b/backend/src/update_convert.sh
index f1fcd36..3c47917 100755
--- a/backend/src/update_convert.sh
+++ b/backend/src/update_convert.sh
@@ -1,11 +1,12 @@
 #! /bin/sh -e
 
-STDLIB_HEADER=ocl_stdlib.h
+CONVERT_HEADER=ocl_convert.h
 
-exec >$STDLIB_HEADER.tmp
-sed -n -e '1,/##BEGIN_CONVERT##/p' $STDLIB_HEADER
+
+exec >$CONVERT_HEADER.tmp
+echo "// This file is autogenerated by gen_convert.sh."
+echo "// Don't modify it manually."
 ./gen_convert.sh
-sed -n -e '/##END_CONVERT##/,$p' $STDLIB_HEADER
 exec >&2
 
-mv $STDLIB_HEADER.tmp $STDLIB_HEADER
+mv $CONVERT_HEADER.tmp $CONVERT_HEADER
diff --git a/kernels/builtin_bitselect.cl b/kernels/builtin_bitselect.cl
new file mode 100644
index 0000000..9b60cbe
--- /dev/null
+++ b/kernels/builtin_bitselect.cl
@@ -0,0 +1,4 @@
+kernel void builtin_bitselect(global float *src1, global float *src2, global float *src3, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = bitselect(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_frexp.cl b/kernels/builtin_frexp.cl
new file mode 100644
index 0000000..766695a
--- /dev/null
+++ b/kernels/builtin_frexp.cl
@@ -0,0 +1,4 @@
+kernel void builtin_frexp(global float *src, global float *dst, global int *e) {
+  int i = get_global_id(0);
+  dst[i] = frexp(src[i], &e[i]);
+}
diff --git a/kernels/builtin_global_id.cl b/kernels/builtin_global_id.cl
new file mode 100644
index 0000000..5b82f9f
--- /dev/null
+++ b/kernels/builtin_global_id.cl
@@ -0,0 +1,4 @@
+kernel void builtin_global_id( __global int *ret) {
+  int id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4;
+  ret[id] = id;
+}
diff --git a/kernels/builtin_local_id.cl b/kernels/builtin_local_id.cl
new file mode 100644
index 0000000..489833d
--- /dev/null
+++ b/kernels/builtin_local_id.cl
@@ -0,0 +1,6 @@
+kernel void builtin_local_id( __global int *ret) {
+  int id = get_local_id(0) +  get_group_id(0) * 2 + \
+           get_local_id(1) * 4 + get_group_id(1) * 12 +\
+           get_local_id(2) * 36 + get_group_id(2) * 144;
+  ret[id] = id;
+}
diff --git a/kernels/builtin_local_size.cl b/kernels/builtin_local_size.cl
new file mode 100644
index 0000000..979d907
--- /dev/null
+++ b/kernels/builtin_local_size.cl
@@ -0,0 +1,3 @@
+kernel void builtin_local_size( __global int *ret, __global int *i_dim ) {
+  *ret = get_local_size( *i_dim);
+}
diff --git a/kernels/builtin_mad_sat.cl b/kernels/builtin_mad_sat.cl
new file mode 100644
index 0000000..1739a4d
--- /dev/null
+++ b/kernels/builtin_mad_sat.cl
@@ -0,0 +1,4 @@
+kernel void builtin_mad_sat(global short *src1, global short *src2, global short *src3, global short *dst) {
+  int i = get_global_id(0);  /* int index like the other builtin_* kernels; a short would wrap past 32767 */
+  dst[i] = mad_sat(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_modf.cl b/kernels/builtin_modf.cl
new file mode 100644
index 0000000..43630ed
--- /dev/null
+++ b/kernels/builtin_modf.cl
@@ -0,0 +1,6 @@
+kernel void builtin_modf(global float *src, global float *dst, global float *it) {
+  int i = get_global_id(0);
+  float x;
+  dst[i] = modf(src[i], &x);
+  it[i] = x;
+}
diff --git a/kernels/builtin_nextafter.cl b/kernels/builtin_nextafter.cl
new file mode 100644
index 0000000..3945e34
--- /dev/null
+++ b/kernels/builtin_nextafter.cl
@@ -0,0 +1,4 @@
+kernel void builtin_nextafter(global float *src1, global float *src2, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = nextafter(src1[i], src2[i]);
+}
diff --git a/kernels/builtin_num_groups.cl b/kernels/builtin_num_groups.cl
new file mode 100644
index 0000000..719d25d
--- /dev/null
+++ b/kernels/builtin_num_groups.cl
@@ -0,0 +1,3 @@
+kernel void builtin_num_groups( __global int *ret, __global int *i_dim ) {
+  *ret = get_num_groups( *i_dim);
+}
diff --git a/kernels/builtin_remquo.cl b/kernels/builtin_remquo.cl
new file mode 100644
index 0000000..d66c164
--- /dev/null
+++ b/kernels/builtin_remquo.cl
@@ -0,0 +1,6 @@
+kernel void builtin_remquo(global float *x, global float *y, global float *dst, global int *quo) {
+  int i = get_global_id(0);
+  int q;
+  dst[i] = remquo(x[i], y[i], & q);
+  quo[i] = q;
+}
diff --git a/kernels/builtin_shuffle.cl b/kernels/builtin_shuffle.cl
new file mode 100644
index 0000000..ad988b9
--- /dev/null
+++ b/kernels/builtin_shuffle.cl
@@ -0,0 +1,8 @@
+kernel void builtin_shuffle(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+  int i = get_global_id(0);
+  float2 src = (float2)(src1[i], src2[i]);
+  uint2 mask = (uint2)(1, 0);
+  float2 dst = shuffle(src, mask);
+  dst1[i] = dst.s0;
+  dst2[i] = dst.s1;
+}
diff --git a/kernels/builtin_sign.cl b/kernels/builtin_sign.cl
new file mode 100644
index 0000000..ff9a66b
--- /dev/null
+++ b/kernels/builtin_sign.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sign(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = sign(src[i]);
+}
diff --git a/kernels/compiler_smoothstep.cl b/kernels/compiler_smoothstep.cl
new file mode 100644
index 0000000..d3b7da4
--- /dev/null
+++ b/kernels/compiler_smoothstep.cl
@@ -0,0 +1,4 @@
+kernel void compiler_smoothstep(global float *src1, global float *src2, global float *src3, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = smoothstep(src1[i], src2[i], src3[i]);
+}
diff --git a/src/OCLConfig.h.in b/src/OCLConfig.h.in
index cfd0378..8662584 100644
--- a/src/OCLConfig.h.in
+++ b/src/OCLConfig.h.in
@@ -1,3 +1,5 @@
 // the configured options and settings for LIBCL
-#define LIBCL_VERSION_MAJOR @LIBCL_VERSION_MAJOR@
-#define LIBCL_VERSION_MINOR @LIBCL_VERSION_MINOR@
+#define LIBCL_DRIVER_VERSION_MAJOR @LIBCL_DRIVER_VERSION_MAJOR@
+#define LIBCL_DRIVER_VERSION_MINOR @LIBCL_DRIVER_VERSION_MINOR@
+#define LIBCL_C_VERSION_MAJOR @LIBCL_C_VERSION_MAJOR@
+#define LIBCL_C_VERSION_MINOR @LIBCL_C_VERSION_MINOR@
diff --git a/src/cl_api.c b/src/cl_api.c
index dc52f0a..146c010 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -559,11 +559,16 @@ clGetMemObjectInfo(cl_mem      memobj,
                    void *      param_value,
                    size_t *    param_value_size_ret)
 {
-  return cl_get_mem_object_info(memobj,
-                                param_name,
-                                param_value_size,
-                                param_value,
-                                param_value_size_ret);
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM(memobj);
+
+  err = cl_get_mem_object_info(memobj,
+                               param_name,
+                               param_value_size,
+                               param_value,
+                               param_value_size_ret);
+error:
+  return err;
 }
 
 cl_int
@@ -573,8 +578,11 @@ clGetImageInfo(cl_mem         image,
                void *         param_value,
                size_t *       param_value_size_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  return cl_get_image_info(image,
+                           param_name,
+                           param_value_size,
+                           param_value,
+                           param_value_size_ret);
 }
 
 cl_int
@@ -582,8 +590,24 @@ clSetMemObjectDestructorCallback(cl_mem  memobj,
                                  void (CL_CALLBACK *pfn_notify) (cl_mem, void*),
                                  void * user_data)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM(memobj);
+  INVALID_VALUE_IF (pfn_notify == 0);
+
+  cl_mem_dstr_cb *cb = (cl_mem_dstr_cb*)malloc(sizeof(cl_mem_dstr_cb));
+  if (!cb) {
+    err = CL_OUT_OF_HOST_MEMORY;
+    goto error;
+  }
+
+  memset(cb, 0, sizeof(cl_mem_dstr_cb));
+  cb->pfn_notify = pfn_notify;
+  cb->user_data = user_data;
+  cb->next = memobj->dstr_cb;
+  memobj->dstr_cb = cb;
+
+error:
+  return err;
 }
 
 cl_sampler
@@ -1467,7 +1491,9 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
                    cl_int *          errcode_ret)
 {
   void *ptr = NULL;
+  void *mem_ptr = NULL;
   cl_int err = CL_SUCCESS;
+  int slot = -1;
 
   CHECK_QUEUE(command_queue);
   CHECK_MEM(buffer);
@@ -1500,10 +1526,66 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
 
   ptr = (char*)ptr + offset;
 
+  if(buffer->flags & CL_MEM_USE_HOST_PTR) {
+    assert(buffer->host_ptr);
+    memcpy(buffer->host_ptr + offset, ptr, size);
+    mem_ptr = buffer->host_ptr + offset;
+  } else {
+    mem_ptr = ptr;
+  }
+
+  /* Record the mapped address. */
+  if (!buffer->mapped_ptr_sz) {
+    buffer->mapped_ptr = (cl_mapped_ptr *)malloc(sizeof(cl_mapped_ptr) * 16);
+    if (!buffer->mapped_ptr) {
+      cl_mem_unmap_auto (buffer);
+      err = CL_OUT_OF_HOST_MEMORY;
+      mem_ptr = ptr = NULL; /* mem_ptr is what gets returned; it must be NULL on failure */
+      goto error;
+    }
+
+    /* Set the table size only after the allocation succeeded, so a failed call leaves it at 0. */
+    buffer->mapped_ptr_sz = 16;
+    memset(buffer->mapped_ptr, 0, buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+    slot = 0;
+  } else {
+    int i = 0;
+    for (; i < buffer->mapped_ptr_sz; i++) { /* a NULL .ptr marks a free slot */
+      if (buffer->mapped_ptr[i].ptr == NULL) {
+        slot = i;
+        break;
+      }
+    }
+
+    if (i == buffer->mapped_ptr_sz) { /* table is full: grow it to twice the size */
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * buffer->mapped_ptr_sz * 2);
+      if (!new_ptr) {
+        cl_mem_unmap_auto (buffer);
+        err = CL_OUT_OF_HOST_MEMORY;
+        mem_ptr = ptr = NULL; /* mem_ptr is what gets returned; it must be NULL on failure */
+        goto error;
+      }
+      memset(new_ptr, 0, 2 * buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      memcpy(new_ptr, buffer->mapped_ptr,
+             buffer->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      slot = buffer->mapped_ptr_sz; /* first slot of the newly added half */
+      buffer->mapped_ptr_sz *= 2;
+      free(buffer->mapped_ptr);
+      buffer->mapped_ptr = new_ptr;
+    }
+  }
+
+  assert(slot != -1);
+  buffer->mapped_ptr[slot].ptr = mem_ptr;
+  buffer->mapped_ptr[slot].v_ptr = ptr;
+  buffer->mapped_ptr[slot].size = size;
+  buffer->map_ref++;
+
 error:
   if (errcode_ret)
     *errcode_ret = err;
-  return ptr;
+  return mem_ptr;
 }
 
 void *
@@ -1578,7 +1660,70 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
                         const cl_event *  event_wait_list,
                         cl_event *        event)
 {
-  return cl_mem_unmap_auto(memobj);
+  cl_int err = CL_SUCCESS;
+  int i;
+  size_t mapped_size = 0;
+  void * v_ptr = NULL;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(memobj);
+  if (command_queue->ctx != memobj->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  INVALID_VALUE_IF(!mapped_ptr);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+    if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+      memobj->mapped_ptr[i].ptr = NULL;
+      mapped_size = memobj->mapped_ptr[i].size;
+      v_ptr = memobj->mapped_ptr[i].v_ptr;
+      memobj->mapped_ptr[i].size = 0;
+      memobj->mapped_ptr[i].v_ptr = NULL;
+      memobj->map_ref--;
+      break;
+    }
+  }
+  /* can not find a mapped address? */
+  INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
+
+  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mapped_ptr >= memobj->host_ptr &&
+      mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+    /* Sync the data. */
+    memcpy(v_ptr, mapped_ptr, mapped_size);
+  } else {
+    assert(v_ptr == mapped_ptr);
+  }
+
+  cl_mem_unmap_auto(memobj);
+
+  /* shrink the mapped slot. */
+  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+    int j = 0;
+    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+	sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+    if (!new_ptr) {
+      /* Just do nothing. */
+      goto error;
+    }
+    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+
+    for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+      if (memobj->mapped_ptr[i].ptr) {
+        new_ptr[j] = memobj->mapped_ptr[i];
+        j++;
+        assert(j < memobj->mapped_ptr_sz/2);
+      }
+    }
+    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+    free(memobj->mapped_ptr);
+    memobj->mapped_ptr = new_ptr;
+  }
+
+error:
+  return err;
 }
 
 cl_int
diff --git a/src/cl_context.c b/src/cl_context.c
index 338706b..a48436c 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -36,10 +36,22 @@
 #include <assert.h>
 #include <string.h>
 
+#define CHECK(var) \
+  if (var) \
+    return CL_INVALID_PROPERTY; \
+  else \
+    var = 1;
+
 static cl_int
 cl_context_properties_process(const cl_context_properties *prop,
                               struct _cl_context_prop *cl_props, cl_uint * prop_len)
 {
+  int set_cl_context_platform = 0,
+      set_cl_gl_context_khr = 0,
+      set_cl_egl_display_khr = 0,
+      set_cl_glx_display_khr = 0,
+      set_cl_wgl_hdc_khr = 0,
+      set_cl_cgl_sharegroup_khr = 0;
   cl_int err = CL_SUCCESS;
 
   cl_props->gl_type = CL_GL_NOSHARE;
@@ -52,6 +64,7 @@ cl_context_properties_process(const cl_context_properties *prop,
   while(*prop) {
     switch (*prop) {
     case CL_CONTEXT_PLATFORM:
+      CHECK (set_cl_context_platform);
       cl_props->platform_id = *(prop + 1);
       if (UNLIKELY((cl_platform_id) cl_props->platform_id != intel_platform)) {
         err = CL_INVALID_PLATFORM;
@@ -59,21 +72,26 @@ cl_context_properties_process(const cl_context_properties *prop,
       }
       break;
     case CL_GL_CONTEXT_KHR:
+      CHECK (set_cl_gl_context_khr);
       cl_props->gl_context = *(prop + 1);
       break;
     case CL_EGL_DISPLAY_KHR:
+      CHECK (set_cl_egl_display_khr);
       cl_props->gl_type = CL_GL_EGL_DISPLAY;
       cl_props->egl_display = *(prop + 1);
       break;
     case CL_GLX_DISPLAY_KHR:
+      CHECK (set_cl_glx_display_khr);
       cl_props->gl_type = CL_GL_GLX_DISPLAY;
       cl_props->glx_display = *(prop + 1);
       break;
     case CL_WGL_HDC_KHR:
+      CHECK (set_cl_wgl_hdc_khr);
       cl_props->gl_type = CL_GL_WGL_HDC;
       cl_props->wgl_hdc = *(prop + 1);
       break;
     case CL_CGL_SHAREGROUP_KHR:
+      CHECK (set_cl_cgl_sharegroup_khr);
       cl_props->gl_type = CL_GL_CGL_SHAREGROUP;
       cl_props->cgl_sharegroup = *(prop + 1);
       break;
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 8fe863a..f58e1fd 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -68,12 +68,12 @@
     .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
 DECL_INFO_STRING(name, "Intel HD Graphics Family")
 DECL_INFO_STRING(vendor, "Intel")
-DECL_INFO_STRING(version, OCL_VERSION_STRING)
+DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
 DECL_INFO_STRING(profile, "FULL_PROFILE")
-DECL_INFO_STRING(opencl_c_version, "OpenCL C 1.1")
+DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)
 DECL_INFO_STRING(extensions, "")
 DECL_INFO_STRING(built_in_kernels, "")
-DECL_INFO_STRING(driver_version, LIBCL_VERSION_STRING)
+DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
 #undef DECL_INFO_STRING
 
 
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 5465aa9..f794ce7 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -58,6 +58,8 @@ cl_get_mem_object_info(cl_mem mem,
     FIELD_SIZE(MEM_MAP_COUNT, cl_uint);
     FIELD_SIZE(MEM_REFERENCE_COUNT, cl_uint);
     FIELD_SIZE(MEM_CONTEXT, cl_context);
+    FIELD_SIZE(MEM_ASSOCIATED_MEMOBJECT, cl_mem);
+    FIELD_SIZE(MEM_OFFSET, size_t);
   default:
     return CL_INVALID_VALUE;
   }
@@ -71,18 +73,25 @@ cl_get_mem_object_info(cl_mem mem,
     *((cl_mem_flags *)param_value) = mem->flags;
     break;
   case CL_MEM_SIZE:
-    *((size_t *)param_value) = cl_buffer_get_size(mem->bo);
+    *((size_t *)param_value) = mem->size;
     break;
   case CL_MEM_HOST_PTR:
-    NOT_IMPLEMENTED;
+    *((size_t *)param_value) = (size_t)mem->host_ptr;
     break;
   case CL_MEM_MAP_COUNT:
-    NOT_IMPLEMENTED;
+    *((cl_uint *)param_value) = mem->map_ref;
     break;
   case CL_MEM_REFERENCE_COUNT:
-    NOT_IMPLEMENTED;
+    *((cl_uint *)param_value) = mem->ref_n;
     break;
   case CL_MEM_CONTEXT:
+    *((cl_context *)param_value) = mem->ctx;
+    break;
+  // TODO: Need to implement sub buffer first.
+  case CL_MEM_ASSOCIATED_MEMOBJECT:
+    NOT_IMPLEMENTED;
+    break;
+  case CL_MEM_OFFSET:
     NOT_IMPLEMENTED;
     break;
   }
@@ -90,6 +99,57 @@ cl_get_mem_object_info(cl_mem mem,
   return CL_SUCCESS;
 }
 
+LOCAL cl_int
+cl_get_image_info(cl_mem mem,  /* called by clGetImageInfo: copies one image attribute into param_value */
+                  cl_image_info param_name,
+                  size_t param_value_size,
+                  void *param_value,
+                  size_t *param_value_size_ret)
+{
+  if(!mem || !mem->is_image)  /* NULL handle, or a mem object that is not an image */
+    return CL_INVALID_MEM_OBJECT;
+
+  switch(param_name)  /* FIELD_SIZE is defined outside this excerpt; presumably checks the size per query - confirm */
+  {
+    FIELD_SIZE(IMAGE_FORMAT, cl_image_format);
+    FIELD_SIZE(IMAGE_ELEMENT_SIZE, size_t);
+    FIELD_SIZE(IMAGE_ROW_PITCH, size_t);
+    FIELD_SIZE(IMAGE_SLICE_PITCH, size_t);
+    FIELD_SIZE(IMAGE_WIDTH, size_t);
+    FIELD_SIZE(IMAGE_HEIGHT, size_t);
+    FIELD_SIZE(IMAGE_DEPTH, size_t);
+  default:  /* any other query name is rejected */
+    return CL_INVALID_VALUE;
+  }
+
+  switch(param_name)  /* NOTE(review): param_value is dereferenced below; confirm FIELD_SIZE handles a NULL param_value */
+  {
+  case CL_IMAGE_FORMAT:
+    *(cl_image_format *)param_value = mem->fmt;
+    break;
+  case CL_IMAGE_ELEMENT_SIZE:
+    *(size_t *)param_value = mem->bpp;  /* bpp is documented in struct _cl_mem as bytes per pixel */
+    break;
+  case CL_IMAGE_ROW_PITCH:
+    *(size_t *)param_value = mem->row_pitch;
+    break;
+  case CL_IMAGE_SLICE_PITCH:
+    *(size_t *)param_value = mem->slice_pitch;
+    break;
+  case CL_IMAGE_WIDTH:
+    *(size_t *)param_value = mem->w;
+    break;
+  case CL_IMAGE_HEIGHT:
+    *(size_t *)param_value = mem->h;
+    break;
+  case CL_IMAGE_DEPTH:
+    *(size_t *)param_value = mem->depth;
+    break;
+  }
+
+  return CL_SUCCESS;
+}
+
 #undef FIELD_SIZE
 
 static cl_mem
@@ -106,10 +166,6 @@ cl_mem_allocate(cl_context ctx,
   cl_ulong max_mem_size;
 
   assert(ctx);
-  FATAL_IF (flags & CL_MEM_ALLOC_HOST_PTR,
-            "CL_MEM_ALLOC_HOST_PTR unsupported"); /* XXX */
-  FATAL_IF (flags & CL_MEM_USE_HOST_PTR,
-            "CL_MEM_USE_HOST_PTR unsupported");   /* XXX */
 
   if ((err = cl_get_device_info(ctx->device,
                                 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
@@ -172,11 +228,35 @@ cl_mem_new(cl_context ctx,
            void *data,
            cl_int *errcode_ret)
 {
+  /* Possible mem type combination:
+       CL_MEM_ALLOC_HOST_PTR
+       CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+       CL_MEM_USE_HOST_PTR
+       CL_MEM_COPY_HOST_PTR   */
+
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
 
-  /* Check flags consistency */
-  if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR && data == NULL)) {
+  /* This flag is valid only if host_ptr is not NULL */
+  if (UNLIKELY((flags & CL_MEM_COPY_HOST_PTR ||
+                flags & CL_MEM_USE_HOST_PTR) &&
+                data == NULL)) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  /* CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR
+     are mutually exclusive. */
+  if (UNLIKELY(flags & CL_MEM_ALLOC_HOST_PTR &&
+               flags & CL_MEM_USE_HOST_PTR)) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  /* CL_MEM_COPY_HOST_PTR and CL_MEM_USE_HOST_PTR
+     are mutually exclusive. */
+  if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR &&
+               flags & CL_MEM_USE_HOST_PTR)) {
     err = CL_INVALID_HOST_PTR;
     goto error;
   }
@@ -186,10 +266,15 @@ cl_mem_new(cl_context ctx,
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
+  mem->type = CL_MEM_OBJECT_BUFFER;
+
   /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR) /* TODO check other flags too */
+  if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
     cl_buffer_subdata(mem->bo, 0, sz, data);
 
+  if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
+    mem->host_ptr = data;
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
@@ -418,6 +503,22 @@ cl_mem_delete(cl_mem mem)
   pthread_mutex_unlock(&mem->ctx->buffer_lock);
   cl_context_delete(mem->ctx);
 
+  /* Someone still mapped? */
+  assert(!mem->map_ref);
+
+  if (mem->mapped_ptr)
+    free(mem->mapped_ptr);
+
+  if (mem->dstr_cb) {
+    cl_mem_dstr_cb *cb = mem->dstr_cb;
+    while (mem->dstr_cb) {
+      cb = mem->dstr_cb;
+      cb->pfn_notify(mem, cb->user_data);
+      mem->dstr_cb = cb->next;
+      free(cb);
+    }
+  }
+
   cl_free(mem);
 }
 
diff --git a/src/cl_mem.h b/src/cl_mem.h
index c204992..1b1709a 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -49,6 +49,18 @@ typedef enum cl_image_tiling {
   CL_TILE_Y  = 2
 } cl_image_tiling_t;
 
+typedef struct _cl_mapped_ptr {
+  void * ptr;
+  void * v_ptr;
+  size_t size;
+}cl_mapped_ptr;
+
+typedef struct _cl_mem_dstr_cb {
+  struct _cl_mem_dstr_cb * next;
+  void (CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
+  void *user_data;
+}cl_mem_dstr_cb;
+
 /* Used for buffers and images */
 struct _cl_mem {
   DEFINE_ICD(dispatch)
@@ -68,11 +80,19 @@ struct _cl_mem {
   uint32_t intel_fmt;       /* format to provide in the surface state */
   uint32_t bpp;             /* number of bytes per pixel */
   cl_image_tiling_t tiling; /* only IVB+ supports TILE_[X,Y] (image only) */
+  void * host_ptr;          /* Pointer of the host mem specified by CL_MEM_ALLOC_HOST_PTR */
+  cl_mapped_ptr* mapped_ptr;/* Store the mapped addresses and size by caller. */
+  int mapped_ptr_sz;        /* The array size of mapped_ptr. */
+  int map_ref;              /* The mapped count. */
+  cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
 };
 
 /* Query information about a memory object */
 extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
 
+/* Query information about an image */
+extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
+
 /* Create a new memory object and initialize it with possible user data */
 extern cl_mem cl_mem_new(cl_context, cl_mem_flags, size_t, void*, cl_int*);
 
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index 04641a5..f247171 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -149,7 +149,7 @@ EGLImageKHR cl_create_textured_egl_image(cl_context ctx,
   egl_context = (EGLDisplay)ctx->props.gl_context;
   return egl_funcs->eglCreateImageKHR_func(egl_display, egl_context,
                                            EGL_GL_TEXTURE_2D_KHR,
-                                           (EGLClientBuffer)texture,
+                                           (EGLClientBuffer)(uintptr_t)texture,
                                            &egl_attribs[0]);
 }
 
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index 2e0a86a..33915ce 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -33,7 +33,7 @@
 static struct _cl_platform_id intel_platform_data = {
   INIT_ICD(dispatch)
   DECL_INFO_STRING(profile, "FULL_PROFILE")
-  DECL_INFO_STRING(version, OCL_VERSION_STRING)
+  DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
   DECL_INFO_STRING(name, "Experiment Intel Gen OCL Driver")
   DECL_INFO_STRING(vendor, "Intel")
   DECL_INFO_STRING(icd_suffix_khr, "Intel")
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 5701a50..b8f7d61 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -59,14 +59,12 @@ extern cl_int cl_get_platform_info(cl_platform_id    platform,
                                    void *            param_value,
                                    size_t *          param_value_size_ret);
 
-#define OCL_VERSION_MAJOR 1
-#define OCL_VERSION_MINOR 1
-
 #define _STR(x) #x
 #define _JOINT(x, y) _STR(x) "." _STR(y)
 
-#define OCL_VERSION_STRING "OpenCL " _JOINT(OCL_VERSION_MAJOR, OCL_VERSION_MINOR)
-#define LIBCL_VERSION_STRING _JOINT(LIBCL_VERSION_MAJOR, LIBCL_VERSION_MINOR)
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR)
+#define LIBCL_DRIVER_VERSION_STRING _JOINT(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR)
 
 #endif /* __CL_PLATFORM_ID_H__ */
 
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 621acad..5cd20c3 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -67,6 +67,7 @@ set (utests_sources
   compiler_saturate_sub.cpp
   compiler_shift_right.cpp
   compiler_short_scatter.cpp
+  compiler_smoothstep.cpp
   compiler_uint2_copy.cpp
   compiler_uint3_copy.cpp
   compiler_uint8_copy.cpp
@@ -99,8 +100,20 @@ set (utests_sources
   compiler_vector_load_store.cpp
   compiler_cl_finish.cpp
   get_cl_info.cpp
+  builtin_bitselect.cpp
+  builtin_frexp.cpp
+  builtin_mad_sat.cpp
+  builtin_modf.cpp
+  builtin_nextafter.cpp
+  builtin_remquo.cpp
+  builtin_shuffle.cpp
+  builtin_sign.cpp
   buildin_work_dim.cpp
   builtin_global_size.cpp
+  builtin_local_size.cpp
+  builtin_global_id.cpp
+  builtin_num_groups.cpp
+  builtin_local_id.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
   compiler_double.cpp
diff --git a/utests/builtin_bitselect.cpp b/utests/builtin_bitselect.cpp
new file mode 100644
index 0000000..37fb8df
--- /dev/null
+++ b/utests/builtin_bitselect.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+int as_int(float f) {
+  void *p = &f;
+  return *(int *)p;
+}
+
+int cpu(int a, int b, int c) {
+  return (a & ~c) | (b & c);
+}
+
+void builtin_bitselect(void)
+{
+  const int n = 32;
+  float src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_bitselect");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((float*)buf_data[0])[i] = rand() * 0.1f;
+    src2[i] = ((float*)buf_data[1])[i] = rand() * 0.1f;
+    src3[i] = ((float*)buf_data[2])[i] = rand() * 0.1f;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[3])[i] == cpu(as_int(src1[i]), as_int(src2[i]), as_int(src3[i])));
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_bitselect);
diff --git a/utests/builtin_frexp.cpp b/utests/builtin_frexp.cpp
new file mode 100644
index 0000000..75dac3b
--- /dev/null
+++ b/utests/builtin_frexp.cpp
@@ -0,0 +1,50 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_frexp(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_frexp");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  src[0] = ((float*)buf_data[0])[0] = 0.f;
+  src[1] = ((float*)buf_data[0])[1] = -0.f;
+  src[2] = ((float*)buf_data[0])[2] = nanf("");
+  src[3] = ((float*)buf_data[0])[3] = INFINITY;
+  src[4] = ((float*)buf_data[0])[4] = -INFINITY;
+  for (int i = 5; i < n; ++i)
+    src[i] = ((float*)buf_data[0])[i] = (rand() & 255) * 0.1f - 12.8f;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  float *dst = (float*)buf_data[1];
+  int *exp = (int*)buf_data[2];
+  int w;
+  OCL_ASSERT(dst[0] == 0.f && exp[0] == 0);
+  OCL_ASSERT(dst[1] == -0.f && exp[1] == 0);
+  OCL_ASSERT(isnanf(dst[2]));
+  OCL_ASSERT(dst[3] == INFINITY);
+  OCL_ASSERT(dst[4] == -INFINITY);
+  for (int i = 5; i < n; ++i) {
+    OCL_ASSERT(fabsf(dst[i] - frexpf(src[i], &w)) < 1e-5);
+    OCL_ASSERT(exp[i] == w);
+  }
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_frexp);
diff --git a/utests/builtin_global_id.cpp b/utests/builtin_global_id.cpp
new file mode 100644
index 0000000..9601cab
--- /dev/null
+++ b/utests/builtin_global_id.cpp
@@ -0,0 +1,77 @@
+/*
+See chapter 6.11 of the OpenCL v1.1 and v1.2 specifications.
+The global size is defined as follows:
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+
+Kernel:
+id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4
+
+dimension:1
+ 0  1  2
+dimension:2
+ 0  1  2
+ 3  4  5
+ 6  7  8
+ 9 10 11
+dimension:3
+ 0  1  2   12 13 14   24 25 26   36 37 38   48 49 50
+ 3  4  5   15 16 17   27 28 29   39 40 41   51 52 53
+ 6  7  8   18 19 20   30 31 32   42 43 44   54 55 56
+ 9 10 11   21 22 23   33 34 35   45 46 47   57 58 59
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_global_id(void)
+{
+
+  // Setup kernel and buffers
+  int dim, global_id[80], err, i, buf_len=1;
+  OCL_CREATE_KERNEL("builtin_global_id");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*80, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+    buf_len = 1;
+    for(i=1; i <= dim; i++)
+    {
+      globals[i - 1] = 2 + i;
+      locals[i - 1] = 2 + i;
+      buf_len *= 2 + i;
+    }
+    for(i=dim+1; i <= 3; i++)
+    {
+      globals[i - 1] = 0;
+      locals[i - 1] = 0;
+    }
+
+    // Run the kernel
+    OCL_NDRANGE( dim );
+    clFinish(queue);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &global_id, 0, NULL, NULL);
+
+    if (err != CL_SUCCESS)
+    {
+      printf("Error: Failed to read output array! %d\n", err);
+      exit(1);
+    }
+
+#if udebug
+    for(i = 0; i < buf_len; i++)
+    {
+      printf("%2d ", global_id[i]);
+      if ((i + 1) % 3 == 0) printf("\n");
+    }
+#endif
+
+    for( i = 0; i < buf_len; i++)
+      OCL_ASSERT( global_id[i] == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_global_id);
diff --git a/utests/builtin_local_id.cpp b/utests/builtin_local_id.cpp
new file mode 100644
index 0000000..1f07615
--- /dev/null
+++ b/utests/builtin_local_id.cpp
@@ -0,0 +1,81 @@
+/*
+See chapter 6.11 of the OpenCL v1.1 and v1.2 specifications.
+The local and global sizes are defined as follows:
+  globals[0] = 4;
+  globals[1] = 9;
+  globals[2] = 16;
+  locals[0] = 2;
+  locals[1] = 3;
+  locals[2] = 4;
+
+Kernel:
+int id = get_local_id(0) +  get_group_id(0)*2 + \
+         get_local_id(1) * 4 + get_group_id(1)*12 +\
+         get_local_id(2) *36 + get_group_id(2)*144;
+
+dimension:1
+ 0  1  2  3
+dimension:2
+ 0  1  2  3  4  5  6  7  8  9 10 11
+12 13 14 15 16 17 18 19 20 21 22 23
+24 25 26 27 28 29 30 31 32 33 34 35
+dimension:3
+ 0  1  2  3  4  5  6  7 ... 139 140 141 142 143
+...
+...
+432 433 434 435 436 437 ... 571 572 573 574 575
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_local_id(void)
+{
+
+  // Setup kernel and buffers
+  int dim, local_id[576], err, i, buf_len=1;
+  OCL_CREATE_KERNEL("builtin_local_id");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+    buf_len = 1;
+    for(i=1; i <= dim; i++)
+    {
+      locals[i - 1] = i + 1;
+      globals[i - 1] = (i + 1) * (i + 1);
+      buf_len *= ((i + 1) * (i + 1));
+    }
+    for(i = dim+1; i <= 3; i++)
+    {
+      globals[i - 1] = 0;
+      locals[i - 1] = 0;
+    }
+
+    // Run the kernel
+    OCL_NDRANGE( dim );
+    clFinish(queue);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &local_id, 0, NULL, NULL);
+
+    if (err != CL_SUCCESS)
+    {
+      printf("Error: Failed to read output array! %d\n", err);
+      exit(1);
+    }
+
+#if udebug
+    for(i = 0; i < buf_len; i++)
+    {
+      printf("%2d ", local_id[i]);
+      if ((i + 1) % 4  == 0) printf("\n");
+    }
+#endif
+
+    for( i = 0; i < buf_len; i++)
+      OCL_ASSERT( local_id[i] == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_id);
diff --git a/utests/builtin_local_size.cpp b/utests/builtin_local_size.cpp
new file mode 100644
index 0000000..a9dac2e
--- /dev/null
+++ b/utests/builtin_local_size.cpp
@@ -0,0 +1,88 @@
+/*
+According to chapter 6.11 of the OpenCL v1.1 and v1.2 specifications, get_local_size should behave as follows:
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+  locals[0] = 3;
+  locals[1] = 4;
+  locals[2] = 5;
+
+get_local_size(-1) = 1 (dimension:1)
+get_local_size(0) = 3 (dimension:1)
+get_local_size(1) = 1 (dimension:1)
+get_local_size(2) = 1 (dimension:1)
+
+get_local_size(-1) = 1 (dimension:2)
+get_local_size(0) = 3 (dimension:2)
+get_local_size(1) = 4 (dimension:2)
+get_local_size(2) = 1 (dimension:2)
+get_local_size(3) = 1 (dimension:2)
+
+get_local_size(-1) = 1 (dimension:3)
+get_local_size(0) = 3 (dimension:3)
+get_local_size(1) = 4 (dimension:3)
+get_local_size(2) = 5 (dimension:3)
+get_local_size(3) = 1 (dimension:3)
+get_local_size(4) = 1 (dimension:3)
+
+*/
+#include "utest_helper.hpp"
+#define udebug 0
+
+static void builtin_local_size(void)
+{
+
+  // Setup kernel and buffers
+  int dim, dim_arg_global, local_size, err;
+  OCL_CREATE_KERNEL("builtin_local_size");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+  locals[0] = 3;
+  locals[1] = 4;
+  locals[2] = 5;
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+
+    for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+    {
+
+      err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+      }
+
+      // Run the kernel
+      OCL_NDRANGE( dim );
+
+      err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &local_size, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+      }
+
+#if udebug
+      printf("get_local_size(%d) = %d (dimension:%d)\n", dim_arg_global, local_size, dim);
+#endif
+      if ( dim_arg_global >= 0 && dim_arg_global < dim)
+        OCL_ASSERT( local_size == dim_arg_global + 3);
+      else
+      {
+        OCL_ASSERT( local_size == 1);
+      }
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_size);
diff --git a/utests/builtin_mad_sat.cpp b/utests/builtin_mad_sat.cpp
new file mode 100644
index 0000000..ed9a558
--- /dev/null
+++ b/utests/builtin_mad_sat.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+void builtin_mad_sat(void)
+{
+  const int n = 32;
+  short src1[n], src2[n], src3[n];
+srand(0);
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_mad_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((short*)buf_data[0])[i] = rand();
+    src2[i] = ((short*)buf_data[1])[i] = rand();
+    src3[i] = ((short*)buf_data[2])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    int a = (int)src1[i] * (int)src2[i] + (int)src3[i];
+    a = a > 0x7FFF ? 0x7FFF : (a < -0x8000 ? -0x8000 : a);
+    OCL_ASSERT(((short*)buf_data[3])[i] == (short)a);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_mad_sat);
diff --git a/utests/builtin_modf.cpp b/utests/builtin_modf.cpp
new file mode 100644
index 0000000..057e95e
--- /dev/null
+++ b/utests/builtin_modf.cpp
@@ -0,0 +1,56 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+void builtin_modf(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_modf");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src[0] = INFINITY;
+  src[1] = -INFINITY;
+  src[2] = nanf("");
+  src[3] = 0;
+  src[4] = 1.5f;
+  src[5] = 2.5f;
+  src[6] = -2.5f;
+  src[7] = 20;
+  src[8] = 21;
+  src[9] = 89.5f;
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  float *dst = (float *)buf_data[1];
+  float *it = (float *)buf_data[2];
+  OCL_ASSERT(dst[0] == 0 && it[0] == INFINITY);
+  OCL_ASSERT(dst[1] == -0.f && it[1] == -INFINITY);
+  OCL_ASSERT(isnanf(dst[2]) && isnanf(it[2]));
+  OCL_ASSERT(dst[3] == 0 && it[3] == 0);
+  OCL_ASSERT(dst[4] == 0.5f && it[4] == 1);
+  OCL_ASSERT(dst[5] == 0.5f && it[5] == 2);
+  OCL_ASSERT(dst[6] == -0.5f && it[6] == -2);
+  OCL_ASSERT(dst[7] == 0 && it[7] == 20);
+  OCL_ASSERT(dst[8] == 0 && it[8] == 21);
+  OCL_ASSERT(dst[9] == 0.5f && it[9] == 89);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_modf);
diff --git a/utests/builtin_nextafter.cpp b/utests/builtin_nextafter.cpp
new file mode 100644
index 0000000..ae95497
--- /dev/null
+++ b/utests/builtin_nextafter.cpp
@@ -0,0 +1,60 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+static int as_int(float f) {
+  void *p = &f;
+  return *(int *)p;
+}
+
+void builtin_nextafter(void)
+{
+  const int n = 16;
+  float src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_nextafter");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src1[0] = nanf(""), src2[0] = 1.1f;
+  src1[1] = 2.2f,     src2[1] = nanf("");
+  src1[2] = nanf(""), src2[2] = nanf("");
+  src1[3] = 123.4f,   src2[3] = 123.4f;
+  src1[4] = 0.f,      src2[4] = 1.f;
+  src1[5] = -0.f,     src2[5] = -1.f;
+  for (int i = 6; i < n; ++i) {
+    src1[i] = (rand() & 255) * 0.1f - 12.8f;
+    src2[i] = (rand() & 255) * 0.1f - 12.8f;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, n * sizeof(float));
+  memcpy(buf_data[1], src2, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  float *dest = (float *)buf_data[2];
+  if (0)
+    for (int i = 0; i < n; ++i)
+      printf("%d %x %x %x %x\n", i, as_int(src1[i]), as_int(src2[i]),
+             as_int(dest[i]), as_int(nextafterf(src1[i], src2[i])));
+  OCL_ASSERT(isnanf(dest[0]));
+  OCL_ASSERT(isnanf(dest[1]));
+  OCL_ASSERT(isnanf(dest[2]));
+  for (int i = 3; i < n; ++i)
+    OCL_ASSERT(dest[i] == nextafterf(src1[i], src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_nextafter);
diff --git a/utests/builtin_num_groups.cpp b/utests/builtin_num_groups.cpp
new file mode 100644
index 0000000..bbff435
--- /dev/null
+++ b/utests/builtin_num_groups.cpp
@@ -0,0 +1,85 @@
+/*
+According to chapter 6.11 of the OpenCL v1.1 and v1.2 specifications, get_num_groups should behave as follows:
+
+  globals[0] = 1;
+  globals[1] = 4;
+  globals[2] = 9;
+  locals[0] = 1;
+  locals[1] = 2;
+  locals[2] = 3;
+
+Expected results for both CL_VERSION_1_1 and CL_VERSION_1_2:
+get_num_groups(-1) = 1 (dimension:1)
+get_num_groups(0) = 1 (dimension:1)
+get_num_groups(1) = 1 (dimension:1)
+
+get_num_groups(-1) = 1 (dimension:2)
+get_num_groups(0) = 1 (dimension:2)
+get_num_groups(1) = 2 (dimension:2)
+get_num_groups(2) = 1 (dimension:2)
+
+get_num_groups(-1) = 1 (dimension:3)
+get_num_groups(0) = 1 (dimension:3)
+get_num_groups(1) = 2 (dimension:3)
+get_num_groups(2) = 3 (dimension:3)
+get_num_groups(3) = 1 (dimension:3)
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_num_groups(void)
+{
+
+  // Setup kernel and buffers
+  int dim, dim_arg_global, num_groups, err;
+  OCL_CREATE_KERNEL("builtin_num_groups");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = 1;
+  globals[1] = 4;
+  globals[2] = 9;
+  locals[0] = 1;
+  locals[1] = 2;
+  locals[2] = 3;
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+
+    for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+    {
+
+      err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+      }
+
+      // Run the kernel
+      OCL_NDRANGE( dim );
+
+      err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &num_groups, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+      }
+
+#if udebug
+      printf("get_num_groups(%d) = %d (dimension:%d)\n", dim_arg_global, num_groups, dim);
+#endif
+      if ( dim_arg_global >= 0 && dim_arg_global < dim)
+        OCL_ASSERT( num_groups == dim_arg_global + 1 );
+      else
+      {
+        OCL_ASSERT( num_groups == 1);
+      }
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_num_groups);
diff --git a/utests/builtin_remquo.cpp b/utests/builtin_remquo.cpp
new file mode 100644
index 0000000..f67be12
--- /dev/null
+++ b/utests/builtin_remquo.cpp
@@ -0,0 +1,65 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+void builtin_remquo(void)
+{
+  const int n = 16;
+  float src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_remquo");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src1[0] = 1,         src2[0] = 0;
+  src1[1] = 1,         src2[1] = -0.f;
+  src1[2] = INFINITY,  src2[2] = 1;
+  src1[3] = -INFINITY, src2[3] = 1;
+  src1[4] = nanf(""),  src2[4] = nanf("");
+  src1[5] = 1.625f,    src2[5] = 1;
+  src1[6] = -1.625f,   src2[6] = 1;
+  src1[7] = 1.625f,    src2[7] = -1;
+  src1[8] = -1.625f,   src2[8] = -1;
+  src1[9] = 5,         src2[9] = 2;
+  src1[10] = 3,        src2[10] = 2;
+  src1[11] = -0.f,     src2[11] = 1;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, n * sizeof(float));
+  memcpy(buf_data[1], src2, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  float *dest = (float *)buf_data[2];
+  int *quo = (int *)buf_data[3];
+  OCL_ASSERT(isnanf(dest[0]));
+  OCL_ASSERT(isnanf(dest[1]));
+  OCL_ASSERT(isnanf(dest[2]));
+  OCL_ASSERT(isnanf(dest[3]));
+  OCL_ASSERT(isnanf(dest[4]));
+  OCL_ASSERT(dest[5] == -0.375f && quo[5] ==  2);
+  OCL_ASSERT(dest[6] ==  0.375f && quo[6] == -2);
+  OCL_ASSERT(dest[7] == -0.375f && quo[7] == -2);
+  OCL_ASSERT(dest[8] ==  0.375f && quo[8] ==  2);
+  OCL_ASSERT(dest[9] == 1       && quo[9] ==  2);
+  OCL_ASSERT(dest[10] == -1     && quo[10] == 2);
+  OCL_ASSERT(dest[11] == -0.f   && quo[11] == 0);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_remquo);
diff --git a/utests/builtin_shuffle.cpp b/utests/builtin_shuffle.cpp
new file mode 100644
index 0000000..c7fa86b
--- /dev/null
+++ b/utests/builtin_shuffle.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void builtin_shuffle(void)
+{
+  const int n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_shuffle");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; i ++) {
+    ((float *)(buf_data[0]))[i] = rand();
+    ((float *)(buf_data[1]))[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; i ++) {
+    OCL_ASSERT(((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+    OCL_ASSERT(((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle);
diff --git a/utests/builtin_sign.cpp b/utests/builtin_sign.cpp
new file mode 100644
index 0000000..426de36
--- /dev/null
+++ b/utests/builtin_sign.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_sign(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_sign");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  src[0] = ((float*)buf_data[0])[0] = nanf("");
+  src[1] = ((float*)buf_data[0])[1] = INFINITY;
+  src[2] = ((float*)buf_data[0])[2] = 0.f;
+  src[3] = ((float*)buf_data[0])[3] = -0.f;
+  for (int i = 4; i < n; ++i) {
+    src[i] = ((float*)buf_data[0])[i] = (rand() & 15) * 0.1 - 0.75;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  float *dst = (float*)buf_data[1];
+  OCL_ASSERT(dst[0] == 0);
+  OCL_ASSERT(dst[1] == 1.f);
+  OCL_ASSERT(dst[2] == 0.f);
+  OCL_ASSERT(dst[3] == -0.f);
+  for (int i = 4; i < n; ++i) {
+    if (src[i] == 0.f)
+      OCL_ASSERT(dst[i] == 0.f);
+    else if (src[i] == -0.f)
+      OCL_ASSERT(dst[i] == -0.f);
+    else
+      OCL_ASSERT(dst[i] == (src[i] > 0 ? 1 : -1));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sign);
diff --git a/utests/compiler_smoothstep.cpp b/utests/compiler_smoothstep.cpp
new file mode 100644
index 0000000..760063b
--- /dev/null
+++ b/utests/compiler_smoothstep.cpp
@@ -0,0 +1,58 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+float cpu(float e0, float e1, float x)
+{
+  x = (x - e0) / (e1 - e0);
+  if (x >= 1)
+    x = 1.f;
+  if (x <= 0)
+    x = 0.f;
+  return x * x * (3 - 2 * x);
+}
+
+void compiler_smoothstep(void)
+{
+  const int n = 32;
+  float src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_smoothstep");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    float a = 0.1f * (rand() & 15) - 0.75f;
+    float b = a + 0.1f * (rand() & 15);
+    float c = 0.1f * (rand() & 15) - 0.75f;
+    src1[i] = ((float*)buf_data[0])[i] = a;
+    src2[i] = ((float*)buf_data[1])[i] = b;
+    src3[i] = ((float*)buf_data[2])[i] = c;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    float a = ((float*)buf_data[3])[i];
+    float b = cpu(src1[i], src2[i], src3[i]);
+    OCL_ASSERT(fabsf(a - b) < 1e-4f);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_smoothstep);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index 4c6f404..ec02ce9 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -30,6 +30,7 @@ struct Info_Result {
     }
 
     bool check_result (void) {
+        //printf("The refer is %d, we get result is %d\n", refer, ret);
         if (ret != refer && refer != (T)NO_STANDARD_REF)
             return false;
 
@@ -492,3 +493,116 @@ void get_kernel_info(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(get_kernel_info);
+
+/* ***************************************************** *
+ * clGetImageInfo                                        *
+ * ***************************************************** */
+void get_image_info(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  OCL_CREATE_IMAGE2D(buf[0], 0, &format, w, h, 0, NULL);
+  cl_mem image = buf[0];
+
+  cl_image_format ret_format;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_FORMAT, sizeof(ret_format), &ret_format, NULL);
+  OCL_ASSERT(format.image_channel_order == ret_format.image_channel_order);
+  OCL_ASSERT(format.image_channel_data_type == ret_format.image_channel_data_type);
+
+  size_t element_size;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_ELEMENT_SIZE, sizeof(element_size), &element_size, NULL);
+  OCL_ASSERT(element_size == 4);
+
+  size_t row_pitch;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
+  OCL_ASSERT(row_pitch == 4 * w);
+
+  size_t slice_pitch;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
+  OCL_ASSERT(slice_pitch == 0);
+
+  size_t width;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+  OCL_ASSERT(width == w);
+
+  size_t height;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  OCL_ASSERT(height == h);
+
+  size_t depth;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
+  OCL_ASSERT(depth == 1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_image_info);
+
+/* ***************************************************** *
+ * clGetMemObjectInfo                                    *
+ * ***************************************************** */
+#define CALL_GETMEMINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetMemObjectInfo, (buf[0]))
+
+void get_mem_info(void)
+{
+    map<cl_mem_info, void *> maps;
+    int expect_ref;
+
+    OCL_CREATE_BUFFER(buf[0], 0, 64, NULL);
+    void * map_ptr = clEnqueueMapBuffer(queue, buf[0], 1, CL_MAP_READ, 0, 64, 0, NULL, NULL, NULL);
+
+    expect_ref = CL_MEM_OBJECT_BUFFER;
+    maps.insert(make_pair(CL_MEM_TYPE,
+                          (void *)(new Info_Result<cl_mem_object_type>((cl_mem_object_type)expect_ref))));
+    expect_ref = 0;
+    maps.insert(make_pair(CL_MEM_FLAGS,
+                          (void *)(new Info_Result<cl_mem_flags>(expect_ref))));
+    expect_ref = 64;
+    maps.insert(make_pair(CL_MEM_SIZE,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+    expect_ref = 0;
+    maps.insert(make_pair(CL_MEM_HOST_PTR,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+    expect_ref = 1;
+    maps.insert(make_pair(CL_MEM_MAP_COUNT,
+                          (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+    expect_ref = 1;
+    maps.insert(make_pair(CL_MEM_REFERENCE_COUNT,
+                          (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+    maps.insert(make_pair(CL_MEM_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(((cl_context)ctx)))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_mem_info, void *> x) {
+        switch (x.first) {
+        case CL_MEM_TYPE:
+            CALL_GETMEMINFO_AND_RET(cl_mem_object_type);
+            break;
+        case CL_MEM_FLAGS:
+            CALL_GETMEMINFO_AND_RET(cl_mem_flags);
+            break;
+        case CL_MEM_SIZE:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
+        case CL_MEM_HOST_PTR:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
+        case CL_MEM_MAP_COUNT:
+            CALL_GETMEMINFO_AND_RET(cl_uint);
+            break;
+        case CL_MEM_REFERENCE_COUNT:
+            CALL_GETMEMINFO_AND_RET(cl_uint);
+            break;
+        case CL_MEM_CONTEXT:
+            CALL_GETMEMINFO_AND_RET(cl_context);
+            break;
+
+        default:
+            break;
+        }
+    });
+
+    clEnqueueUnmapMemObject(queue, buf[0], map_ptr, 0, NULL, NULL);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_mem_info);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 504f80f..9069db2 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -294,10 +294,10 @@ error:
 #include <cstring>
 #define GET_DEVICE_STR_INFO(LOWER_NAME, NAME) \
     std::string LOWER_NAME ##Str; \
-    OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, 0, 0, &param_value_size); \
+    OCL_CALL (clGetDeviceInfo, device, NAME, 0, 0, &param_value_size); \
     { \
       std::vector<char> param_value(param_value_size); \
-      OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, \
+      OCL_CALL (clGetDeviceInfo, device, NAME, \
                 param_value_size, param_value.empty() ? NULL : &param_value.front(), \
                 &param_value_size); \
       if (!param_value.empty()) \
@@ -305,7 +305,6 @@ error:
     } \
     printf("device_" #LOWER_NAME " \"%s\"\n", LOWER_NAME ##Str.c_str());
 
-
 int
 cl_ocl_init(void)
 {
@@ -332,11 +331,13 @@ cl_ocl_init(void)
   OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   {
     size_t param_value_size;
-    GET_DEVICE_STR_INFO(profile, PROFILE);
-    GET_DEVICE_STR_INFO(name, NAME);
-    GET_DEVICE_STR_INFO(vendor, VENDOR);
-    GET_DEVICE_STR_INFO(version, VERSION);
-    GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+    GET_DEVICE_STR_INFO(profile, CL_DEVICE_PROFILE);
+    GET_DEVICE_STR_INFO(name, CL_DEVICE_NAME);
+    GET_DEVICE_STR_INFO(vendor, CL_DEVICE_VENDOR);
+    GET_DEVICE_STR_INFO(version, CL_DEVICE_VERSION);
+    GET_DEVICE_STR_INFO(opencl_c_version, CL_DEVICE_OPENCL_C_VERSION);
+    GET_DEVICE_STR_INFO(driver_version, CL_DRIVER_VERSION);
+    GET_DEVICE_STR_INFO(extensions, CL_DEVICE_EXTENSIONS);
     if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
       hasGLExt = true;
     }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git